From bb756eebdac6fd24e8919e2c43f7d2c8c4091f59 Mon Sep 17 00:00:00 2001 From: RajithaY Date: Tue, 25 Apr 2017 03:31:15 -0700 Subject: Adding qemu as a submodule of KVMFORNFV This Patch includes the changes to add qemu as a submodule to kvmfornfv repo and make use of the updated latest qemu for the execution of all testcase Change-Id: I1280af507a857675c7f81d30c95255635667bdd7 Signed-off-by:RajithaY --- qemu/block/Makefile.objs | 46 - qemu/block/accounting.c | 173 --- qemu/block/archipelago.c | 1084 -------------- qemu/block/backup.c | 613 -------- qemu/block/blkdebug.c | 759 ---------- qemu/block/blkreplay.c | 160 -- qemu/block/blkverify.c | 371 ----- qemu/block/block-backend.c | 1635 -------------------- qemu/block/bochs.c | 279 ---- qemu/block/cloop.c | 285 ---- qemu/block/commit.c | 277 ---- qemu/block/crypto.c | 586 -------- qemu/block/curl.c | 896 ----------- qemu/block/dirty-bitmap.c | 387 ----- qemu/block/dmg.c | 727 --------- qemu/block/gluster.c | 866 ----------- qemu/block/io.c | 2810 ----------------------------------- qemu/block/iscsi.c | 1904 ------------------------ qemu/block/linux-aio.c | 339 ----- qemu/block/mirror.c | 976 ------------ qemu/block/nbd-client.c | 436 ------ qemu/block/nbd-client.h | 59 - qemu/block/nbd.c | 539 ------- qemu/block/nfs.c | 563 ------- qemu/block/null.c | 266 ---- qemu/block/parallels.c | 766 ---------- qemu/block/qapi.c | 783 ---------- qemu/block/qcow.c | 1050 ------------- qemu/block/qcow2-cache.c | 411 ----- qemu/block/qcow2-cluster.c | 1899 ------------------------ qemu/block/qcow2-refcount.c | 2921 ------------------------------------ qemu/block/qcow2-snapshot.c | 738 --------- qemu/block/qcow2.c | 3373 ------------------------------------------ qemu/block/qcow2.h | 599 -------- qemu/block/qed-check.c | 251 ---- qemu/block/qed-cluster.c | 166 --- qemu/block/qed-gencb.c | 33 - qemu/block/qed-l2-cache.c | 188 --- qemu/block/qed-table.c | 297 ---- qemu/block/qed.c | 1689 --------------------- qemu/block/qed.h | 344 
----- qemu/block/quorum.c | 1091 -------------- qemu/block/raw-aio.h | 64 - qemu/block/raw-posix.c | 2701 --------------------------------- qemu/block/raw-win32.c | 731 --------- qemu/block/raw_bsd.c | 285 ---- qemu/block/rbd.c | 1015 ------------- qemu/block/sheepdog.c | 3042 ------------------------------------- qemu/block/snapshot.c | 493 ------ qemu/block/ssh.c | 1111 -------------- qemu/block/stream.c | 246 --- qemu/block/throttle-groups.c | 483 ------ qemu/block/vdi.c | 923 ------------ qemu/block/vhdx-endian.c | 224 --- qemu/block/vhdx-log.c | 1043 ------------- qemu/block/vhdx.c | 1981 ------------------------- qemu/block/vhdx.h | 453 ------ qemu/block/vmdk.c | 2349 ----------------------------- qemu/block/vpc.c | 1074 -------------- qemu/block/vvfat.c | 3050 -------------------------------------- qemu/block/win32-aio.c | 219 --- qemu/block/write-threshold.c | 126 -- 62 files changed, 55248 deletions(-) delete mode 100644 qemu/block/Makefile.objs delete mode 100644 qemu/block/accounting.c delete mode 100644 qemu/block/archipelago.c delete mode 100644 qemu/block/backup.c delete mode 100644 qemu/block/blkdebug.c delete mode 100755 qemu/block/blkreplay.c delete mode 100644 qemu/block/blkverify.c delete mode 100644 qemu/block/block-backend.c delete mode 100644 qemu/block/bochs.c delete mode 100644 qemu/block/cloop.c delete mode 100644 qemu/block/commit.c delete mode 100644 qemu/block/crypto.c delete mode 100644 qemu/block/curl.c delete mode 100644 qemu/block/dirty-bitmap.c delete mode 100644 qemu/block/dmg.c delete mode 100644 qemu/block/gluster.c delete mode 100644 qemu/block/io.c delete mode 100644 qemu/block/iscsi.c delete mode 100644 qemu/block/linux-aio.c delete mode 100644 qemu/block/mirror.c delete mode 100644 qemu/block/nbd-client.c delete mode 100644 qemu/block/nbd-client.h delete mode 100644 qemu/block/nbd.c delete mode 100644 qemu/block/nfs.c delete mode 100644 qemu/block/null.c delete mode 100644 qemu/block/parallels.c delete mode 100644 
qemu/block/qapi.c delete mode 100644 qemu/block/qcow.c delete mode 100644 qemu/block/qcow2-cache.c delete mode 100644 qemu/block/qcow2-cluster.c delete mode 100644 qemu/block/qcow2-refcount.c delete mode 100644 qemu/block/qcow2-snapshot.c delete mode 100644 qemu/block/qcow2.c delete mode 100644 qemu/block/qcow2.h delete mode 100644 qemu/block/qed-check.c delete mode 100644 qemu/block/qed-cluster.c delete mode 100644 qemu/block/qed-gencb.c delete mode 100644 qemu/block/qed-l2-cache.c delete mode 100644 qemu/block/qed-table.c delete mode 100644 qemu/block/qed.c delete mode 100644 qemu/block/qed.h delete mode 100644 qemu/block/quorum.c delete mode 100644 qemu/block/raw-aio.h delete mode 100644 qemu/block/raw-posix.c delete mode 100644 qemu/block/raw-win32.c delete mode 100644 qemu/block/raw_bsd.c delete mode 100644 qemu/block/rbd.c delete mode 100644 qemu/block/sheepdog.c delete mode 100644 qemu/block/snapshot.c delete mode 100644 qemu/block/ssh.c delete mode 100644 qemu/block/stream.c delete mode 100644 qemu/block/throttle-groups.c delete mode 100644 qemu/block/vdi.c delete mode 100644 qemu/block/vhdx-endian.c delete mode 100644 qemu/block/vhdx-log.c delete mode 100644 qemu/block/vhdx.c delete mode 100644 qemu/block/vhdx.h delete mode 100644 qemu/block/vmdk.c delete mode 100644 qemu/block/vpc.c delete mode 100644 qemu/block/vvfat.c delete mode 100644 qemu/block/win32-aio.c delete mode 100644 qemu/block/write-threshold.c (limited to 'qemu/block') diff --git a/qemu/block/Makefile.objs b/qemu/block/Makefile.objs deleted file mode 100644 index 44a541622..000000000 --- a/qemu/block/Makefile.objs +++ /dev/null @@ -1,46 +0,0 @@ -block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o -block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o -block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o -block-obj-y += qed-check.o -block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o -block-obj-y += quorum.o 
-block-obj-y += parallels.o blkdebug.o blkverify.o blkreplay.o -block-obj-y += block-backend.o snapshot.o qapi.o -block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o -block-obj-$(CONFIG_POSIX) += raw-posix.o -block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o -block-obj-y += null.o mirror.o io.o -block-obj-y += throttle-groups.o - -block-obj-y += nbd.o nbd-client.o sheepdog.o -block-obj-$(CONFIG_LIBISCSI) += iscsi.o -block-obj-$(CONFIG_LIBNFS) += nfs.o -block-obj-$(CONFIG_CURL) += curl.o -block-obj-$(CONFIG_RBD) += rbd.o -block-obj-$(CONFIG_GLUSTERFS) += gluster.o -block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o -block-obj-$(CONFIG_LIBSSH2) += ssh.o -block-obj-y += accounting.o dirty-bitmap.o -block-obj-y += write-threshold.o - -block-obj-y += crypto.o - -common-obj-y += stream.o -common-obj-y += commit.o -common-obj-y += backup.o - -iscsi.o-cflags := $(LIBISCSI_CFLAGS) -iscsi.o-libs := $(LIBISCSI_LIBS) -curl.o-cflags := $(CURL_CFLAGS) -curl.o-libs := $(CURL_LIBS) -rbd.o-cflags := $(RBD_CFLAGS) -rbd.o-libs := $(RBD_LIBS) -gluster.o-cflags := $(GLUSTERFS_CFLAGS) -gluster.o-libs := $(GLUSTERFS_LIBS) -ssh.o-cflags := $(LIBSSH2_CFLAGS) -ssh.o-libs := $(LIBSSH2_LIBS) -archipelago.o-libs := $(ARCHIPELAGO_LIBS) -block-obj-m += dmg.o -dmg.o-libs := $(BZIP2_LIBS) -qcow.o-libs := -lz -linux-aio.o-libs := -laio diff --git a/qemu/block/accounting.c b/qemu/block/accounting.c deleted file mode 100644 index 3f457c4e7..000000000 --- a/qemu/block/accounting.c +++ /dev/null @@ -1,173 +0,0 @@ -/* - * QEMU System Emulator block accounting - * - * Copyright (c) 2011 Christoph Hellwig - * Copyright (c) 2015 Igalia, S.L. 
- * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -#include "qemu/osdep.h" -#include "block/accounting.h" -#include "block/block_int.h" -#include "qemu/timer.h" -#include "sysemu/qtest.h" - -static QEMUClockType clock_type = QEMU_CLOCK_REALTIME; -static const int qtest_latency_ns = NANOSECONDS_PER_SECOND / 1000; - -void block_acct_init(BlockAcctStats *stats, bool account_invalid, - bool account_failed) -{ - stats->account_invalid = account_invalid; - stats->account_failed = account_failed; - - if (qtest_enabled()) { - clock_type = QEMU_CLOCK_VIRTUAL; - } -} - -void block_acct_cleanup(BlockAcctStats *stats) -{ - BlockAcctTimedStats *s, *next; - QSLIST_FOREACH_SAFE(s, &stats->intervals, entries, next) { - g_free(s); - } -} - -void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length) -{ - BlockAcctTimedStats *s; - unsigned i; - - s = g_new0(BlockAcctTimedStats, 1); - s->interval_length = interval_length; - QSLIST_INSERT_HEAD(&stats->intervals, s, entries); - - for (i = 0; i < BLOCK_MAX_IOTYPE; i++) { - timed_average_init(&s->latency[i], clock_type, - (uint64_t) interval_length * NANOSECONDS_PER_SECOND); - } -} - -BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats, - BlockAcctTimedStats *s) -{ - if (s == NULL) { - return QSLIST_FIRST(&stats->intervals); - } else { - return QSLIST_NEXT(s, entries); - } -} - -void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie, - int64_t bytes, enum BlockAcctType type) -{ - assert(type < BLOCK_MAX_IOTYPE); - - cookie->bytes = bytes; - cookie->start_time_ns = qemu_clock_get_ns(clock_type); - cookie->type = type; -} - -void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie) -{ - BlockAcctTimedStats *s; - int64_t time_ns = qemu_clock_get_ns(clock_type); - int64_t latency_ns = time_ns - cookie->start_time_ns; - - if (qtest_enabled()) { - latency_ns = qtest_latency_ns; - } - - assert(cookie->type < BLOCK_MAX_IOTYPE); - - stats->nr_bytes[cookie->type] += cookie->bytes; - stats->nr_ops[cookie->type]++; - 
stats->total_time_ns[cookie->type] += latency_ns; - stats->last_access_time_ns = time_ns; - - QSLIST_FOREACH(s, &stats->intervals, entries) { - timed_average_account(&s->latency[cookie->type], latency_ns); - } -} - -void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie) -{ - assert(cookie->type < BLOCK_MAX_IOTYPE); - - stats->failed_ops[cookie->type]++; - - if (stats->account_failed) { - BlockAcctTimedStats *s; - int64_t time_ns = qemu_clock_get_ns(clock_type); - int64_t latency_ns = time_ns - cookie->start_time_ns; - - if (qtest_enabled()) { - latency_ns = qtest_latency_ns; - } - - stats->total_time_ns[cookie->type] += latency_ns; - stats->last_access_time_ns = time_ns; - - QSLIST_FOREACH(s, &stats->intervals, entries) { - timed_average_account(&s->latency[cookie->type], latency_ns); - } - } -} - -void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type) -{ - assert(type < BLOCK_MAX_IOTYPE); - - /* block_acct_done() and block_acct_failed() update - * total_time_ns[], but this one does not. The reason is that - * invalid requests are accounted during their submission, - * therefore there's no actual I/O involved. 
*/ - - stats->invalid_ops[type]++; - - if (stats->account_invalid) { - stats->last_access_time_ns = qemu_clock_get_ns(clock_type); - } -} - -void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type, - int num_requests) -{ - assert(type < BLOCK_MAX_IOTYPE); - stats->merged[type] += num_requests; -} - -int64_t block_acct_idle_time_ns(BlockAcctStats *stats) -{ - return qemu_clock_get_ns(clock_type) - stats->last_access_time_ns; -} - -double block_acct_queue_depth(BlockAcctTimedStats *stats, - enum BlockAcctType type) -{ - uint64_t sum, elapsed; - - assert(type < BLOCK_MAX_IOTYPE); - - sum = timed_average_sum(&stats->latency[type], &elapsed); - - return (double) sum / elapsed; -} diff --git a/qemu/block/archipelago.c b/qemu/block/archipelago.c deleted file mode 100644 index b9f5e69d4..000000000 --- a/qemu/block/archipelago.c +++ /dev/null @@ -1,1084 +0,0 @@ -/* - * QEMU Block driver for Archipelago - * - * Copyright (C) 2014 Chrysostomos Nanakos - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - * - */ - -/* - * VM Image on Archipelago volume is specified like this: - * - * file.driver=archipelago,file.volume= - * [,file.mport=[,file.vport=] - * [,file.segment=]] - * - * or - * - * file=archipelago:[/mport=[:vport=][: - * segment=]] - * - * 'archipelago' is the protocol. - * - * 'mport' is the port number on which mapperd is listening. This is optional - * and if not specified, QEMU will make Archipelago to use the default port. - * - * 'vport' is the port number on which vlmcd is listening. This is optional - * and if not specified, QEMU will make Archipelago to use the default port. - * - * 'segment' is the name of the shared memory segment Archipelago stack - * is using. This is optional and if not specified, QEMU will make Archipelago - * to use the default value, 'archipelago'. 
- * - * Examples: - * - * file.driver=archipelago,file.volume=my_vm_volume - * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123 - * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123, - * file.vport=1234 - * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123, - * file.vport=1234,file.segment=my_segment - * - * or - * - * file=archipelago:my_vm_volume - * file=archipelago:my_vm_volume/mport=123 - * file=archipelago:my_vm_volume/mport=123:vport=1234 - * file=archipelago:my_vm_volume/mport=123:vport=1234:segment=my_segment - * - */ - -#include "qemu/osdep.h" -#include "qemu/cutils.h" -#include "block/block_int.h" -#include "qemu/error-report.h" -#include "qemu/thread.h" -#include "qapi/qmp/qint.h" -#include "qapi/qmp/qstring.h" -#include "qapi/qmp/qjson.h" -#include "qemu/atomic.h" - -#include -#include - -#define MAX_REQUEST_SIZE 524288 - -#define ARCHIPELAGO_OPT_VOLUME "volume" -#define ARCHIPELAGO_OPT_SEGMENT "segment" -#define ARCHIPELAGO_OPT_MPORT "mport" -#define ARCHIPELAGO_OPT_VPORT "vport" -#define ARCHIPELAGO_DFL_MPORT 1001 -#define ARCHIPELAGO_DFL_VPORT 501 - -#define archipelagolog(fmt, ...) 
\ - do { \ - fprintf(stderr, "archipelago\t%-24s: " fmt, __func__, ##__VA_ARGS__); \ - } while (0) - -typedef enum { - ARCHIP_OP_READ, - ARCHIP_OP_WRITE, - ARCHIP_OP_FLUSH, - ARCHIP_OP_VOLINFO, - ARCHIP_OP_TRUNCATE, -} ARCHIPCmd; - -typedef struct ArchipelagoAIOCB { - BlockAIOCB common; - QEMUBH *bh; - struct BDRVArchipelagoState *s; - QEMUIOVector *qiov; - ARCHIPCmd cmd; - int status; - int64_t size; - int64_t ret; -} ArchipelagoAIOCB; - -typedef struct BDRVArchipelagoState { - ArchipelagoAIOCB *event_acb; - char *volname; - char *segment_name; - uint64_t size; - /* Archipelago specific */ - struct xseg *xseg; - struct xseg_port *port; - xport srcport; - xport sport; - xport mportno; - xport vportno; - QemuMutex archip_mutex; - QemuCond archip_cond; - bool is_signaled; - /* Request handler specific */ - QemuThread request_th; - QemuCond request_cond; - QemuMutex request_mutex; - bool th_is_signaled; - bool stopping; -} BDRVArchipelagoState; - -typedef struct ArchipelagoSegmentedRequest { - size_t count; - size_t total; - int ref; - int failed; -} ArchipelagoSegmentedRequest; - -typedef struct AIORequestData { - const char *volname; - off_t offset; - size_t size; - uint64_t bufidx; - int ret; - int op; - ArchipelagoAIOCB *aio_cb; - ArchipelagoSegmentedRequest *segreq; -} AIORequestData; - -static void qemu_archipelago_complete_aio(void *opaque); - -static void init_local_signal(struct xseg *xseg, xport sport, xport srcport) -{ - if (xseg && (sport != srcport)) { - xseg_init_local_signal(xseg, srcport); - sport = srcport; - } -} - -static void archipelago_finish_aiocb(AIORequestData *reqdata) -{ - if (reqdata->aio_cb->ret != reqdata->segreq->total) { - reqdata->aio_cb->ret = -EIO; - } else if (reqdata->aio_cb->ret == reqdata->segreq->total) { - reqdata->aio_cb->ret = 0; - } - reqdata->aio_cb->bh = aio_bh_new( - bdrv_get_aio_context(reqdata->aio_cb->common.bs), - qemu_archipelago_complete_aio, reqdata - ); - qemu_bh_schedule(reqdata->aio_cb->bh); -} - -static int 
wait_reply(struct xseg *xseg, xport srcport, struct xseg_port *port, - struct xseg_request *expected_req) -{ - struct xseg_request *req; - xseg_prepare_wait(xseg, srcport); - void *psd = xseg_get_signal_desc(xseg, port); - while (1) { - req = xseg_receive(xseg, srcport, X_NONBLOCK); - if (req) { - if (req != expected_req) { - archipelagolog("Unknown received request\n"); - xseg_put_request(xseg, req, srcport); - } else if (!(req->state & XS_SERVED)) { - return -1; - } else { - break; - } - } - xseg_wait_signal(xseg, psd, 100000UL); - } - xseg_cancel_wait(xseg, srcport); - return 0; -} - -static void xseg_request_handler(void *state) -{ - BDRVArchipelagoState *s = (BDRVArchipelagoState *) state; - void *psd = xseg_get_signal_desc(s->xseg, s->port); - qemu_mutex_lock(&s->request_mutex); - - while (!s->stopping) { - struct xseg_request *req; - void *data; - xseg_prepare_wait(s->xseg, s->srcport); - req = xseg_receive(s->xseg, s->srcport, X_NONBLOCK); - if (req) { - AIORequestData *reqdata; - ArchipelagoSegmentedRequest *segreq; - xseg_get_req_data(s->xseg, req, (void **)&reqdata); - - switch (reqdata->op) { - case ARCHIP_OP_READ: - data = xseg_get_data(s->xseg, req); - segreq = reqdata->segreq; - segreq->count += req->serviced; - - qemu_iovec_from_buf(reqdata->aio_cb->qiov, reqdata->bufidx, - data, - req->serviced); - - xseg_put_request(s->xseg, req, s->srcport); - - if (atomic_fetch_dec(&segreq->ref) == 1) { - if (!segreq->failed) { - reqdata->aio_cb->ret = segreq->count; - archipelago_finish_aiocb(reqdata); - g_free(segreq); - } else { - g_free(segreq); - g_free(reqdata); - } - } else { - g_free(reqdata); - } - break; - case ARCHIP_OP_WRITE: - case ARCHIP_OP_FLUSH: - segreq = reqdata->segreq; - segreq->count += req->serviced; - xseg_put_request(s->xseg, req, s->srcport); - - if (atomic_fetch_dec(&segreq->ref) == 1) { - if (!segreq->failed) { - reqdata->aio_cb->ret = segreq->count; - archipelago_finish_aiocb(reqdata); - g_free(segreq); - } else { - g_free(segreq); - 
g_free(reqdata); - } - } else { - g_free(reqdata); - } - break; - case ARCHIP_OP_VOLINFO: - case ARCHIP_OP_TRUNCATE: - s->is_signaled = true; - qemu_cond_signal(&s->archip_cond); - break; - } - } else { - xseg_wait_signal(s->xseg, psd, 100000UL); - } - xseg_cancel_wait(s->xseg, s->srcport); - } - - s->th_is_signaled = true; - qemu_cond_signal(&s->request_cond); - qemu_mutex_unlock(&s->request_mutex); - qemu_thread_exit(NULL); -} - -static int qemu_archipelago_xseg_init(BDRVArchipelagoState *s) -{ - if (xseg_initialize()) { - archipelagolog("Cannot initialize XSEG\n"); - goto err_exit; - } - - s->xseg = xseg_join("posix", s->segment_name, - "posixfd", NULL); - if (!s->xseg) { - archipelagolog("Cannot join XSEG shared memory segment\n"); - goto err_exit; - } - s->port = xseg_bind_dynport(s->xseg); - s->srcport = s->port->portno; - init_local_signal(s->xseg, s->sport, s->srcport); - return 0; - -err_exit: - return -1; -} - -static int qemu_archipelago_init(BDRVArchipelagoState *s) -{ - int ret; - - ret = qemu_archipelago_xseg_init(s); - if (ret < 0) { - error_report("Cannot initialize XSEG. 
Aborting..."); - goto err_exit; - } - - qemu_cond_init(&s->archip_cond); - qemu_mutex_init(&s->archip_mutex); - qemu_cond_init(&s->request_cond); - qemu_mutex_init(&s->request_mutex); - s->th_is_signaled = false; - qemu_thread_create(&s->request_th, "xseg_io_th", - (void *) xseg_request_handler, - (void *) s, QEMU_THREAD_JOINABLE); - -err_exit: - return ret; -} - -static void qemu_archipelago_complete_aio(void *opaque) -{ - AIORequestData *reqdata = (AIORequestData *) opaque; - ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb; - - qemu_bh_delete(aio_cb->bh); - aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret); - aio_cb->status = 0; - - qemu_aio_unref(aio_cb); - g_free(reqdata); -} - -static void xseg_find_port(char *pstr, const char *needle, xport *aport) -{ - const char *a; - char *endptr = NULL; - unsigned long port; - if (strstart(pstr, needle, &a)) { - if (strlen(a) > 0) { - port = strtoul(a, &endptr, 10); - if (strlen(endptr)) { - *aport = -2; - return; - } - *aport = (xport) port; - } - } -} - -static void xseg_find_segment(char *pstr, const char *needle, - char **segment_name) -{ - const char *a; - if (strstart(pstr, needle, &a)) { - if (strlen(a) > 0) { - *segment_name = g_strdup(a); - } - } -} - -static void parse_filename_opts(const char *filename, Error **errp, - char **volume, char **segment_name, - xport *mport, xport *vport) -{ - const char *start; - char *tokens[4], *ds; - int idx; - xport lmport = NoPort, lvport = NoPort; - - strstart(filename, "archipelago:", &start); - - ds = g_strdup(start); - tokens[0] = strtok(ds, "/"); - tokens[1] = strtok(NULL, ":"); - tokens[2] = strtok(NULL, ":"); - tokens[3] = strtok(NULL, "\0"); - - if (!strlen(tokens[0])) { - error_setg(errp, "volume name must be specified first"); - g_free(ds); - return; - } - - for (idx = 1; idx < 4; idx++) { - if (tokens[idx] != NULL) { - if (strstart(tokens[idx], "mport=", NULL)) { - xseg_find_port(tokens[idx], "mport=", &lmport); - } - if (strstart(tokens[idx], 
"vport=", NULL)) { - xseg_find_port(tokens[idx], "vport=", &lvport); - } - if (strstart(tokens[idx], "segment=", NULL)) { - xseg_find_segment(tokens[idx], "segment=", segment_name); - } - } - } - - if ((lmport == -2) || (lvport == -2)) { - error_setg(errp, "mport and/or vport must be set"); - g_free(ds); - return; - } - *volume = g_strdup(tokens[0]); - *mport = lmport; - *vport = lvport; - g_free(ds); -} - -static void archipelago_parse_filename(const char *filename, QDict *options, - Error **errp) -{ - const char *start; - char *volume = NULL, *segment_name = NULL; - xport mport = NoPort, vport = NoPort; - - if (qdict_haskey(options, ARCHIPELAGO_OPT_VOLUME) - || qdict_haskey(options, ARCHIPELAGO_OPT_SEGMENT) - || qdict_haskey(options, ARCHIPELAGO_OPT_MPORT) - || qdict_haskey(options, ARCHIPELAGO_OPT_VPORT)) { - error_setg(errp, "volume/mport/vport/segment and a file name may not" - " be specified at the same time"); - return; - } - - if (!strstart(filename, "archipelago:", &start)) { - error_setg(errp, "File name must start with 'archipelago:'"); - return; - } - - if (!strlen(start) || strstart(start, "/", NULL)) { - error_setg(errp, "volume name must be specified"); - return; - } - - parse_filename_opts(filename, errp, &volume, &segment_name, &mport, &vport); - - if (volume) { - qdict_put(options, ARCHIPELAGO_OPT_VOLUME, qstring_from_str(volume)); - g_free(volume); - } - if (segment_name) { - qdict_put(options, ARCHIPELAGO_OPT_SEGMENT, - qstring_from_str(segment_name)); - g_free(segment_name); - } - if (mport != NoPort) { - qdict_put(options, ARCHIPELAGO_OPT_MPORT, qint_from_int(mport)); - } - if (vport != NoPort) { - qdict_put(options, ARCHIPELAGO_OPT_VPORT, qint_from_int(vport)); - } -} - -static QemuOptsList archipelago_runtime_opts = { - .name = "archipelago", - .head = QTAILQ_HEAD_INITIALIZER(archipelago_runtime_opts.head), - .desc = { - { - .name = ARCHIPELAGO_OPT_VOLUME, - .type = QEMU_OPT_STRING, - .help = "Name of the volume image", - }, - { - .name = 
ARCHIPELAGO_OPT_SEGMENT, - .type = QEMU_OPT_STRING, - .help = "Name of the Archipelago shared memory segment", - }, - { - .name = ARCHIPELAGO_OPT_MPORT, - .type = QEMU_OPT_NUMBER, - .help = "Archipelago mapperd port number" - }, - { - .name = ARCHIPELAGO_OPT_VPORT, - .type = QEMU_OPT_NUMBER, - .help = "Archipelago vlmcd port number" - - }, - { /* end of list */ } - }, -}; - -static int qemu_archipelago_open(BlockDriverState *bs, - QDict *options, - int bdrv_flags, - Error **errp) -{ - int ret = 0; - const char *volume, *segment_name; - QemuOpts *opts; - Error *local_err = NULL; - BDRVArchipelagoState *s = bs->opaque; - - opts = qemu_opts_create(&archipelago_runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto err_exit; - } - - s->mportno = qemu_opt_get_number(opts, ARCHIPELAGO_OPT_MPORT, - ARCHIPELAGO_DFL_MPORT); - s->vportno = qemu_opt_get_number(opts, ARCHIPELAGO_OPT_VPORT, - ARCHIPELAGO_DFL_VPORT); - - segment_name = qemu_opt_get(opts, ARCHIPELAGO_OPT_SEGMENT); - if (segment_name == NULL) { - s->segment_name = g_strdup("archipelago"); - } else { - s->segment_name = g_strdup(segment_name); - } - - volume = qemu_opt_get(opts, ARCHIPELAGO_OPT_VOLUME); - if (volume == NULL) { - error_setg(errp, "archipelago block driver requires the 'volume'" - " option"); - ret = -EINVAL; - goto err_exit; - } - s->volname = g_strdup(volume); - - /* Initialize XSEG, join shared memory segment */ - ret = qemu_archipelago_init(s); - if (ret < 0) { - error_setg(errp, "cannot initialize XSEG and join shared " - "memory segment"); - goto err_exit; - } - - qemu_opts_del(opts); - return 0; - -err_exit: - g_free(s->volname); - g_free(s->segment_name); - qemu_opts_del(opts); - return ret; -} - -static void qemu_archipelago_close(BlockDriverState *bs) -{ - int r, targetlen; - char *target; - struct xseg_request *req; - BDRVArchipelagoState *s = bs->opaque; - - s->stopping = 
true; - - qemu_mutex_lock(&s->request_mutex); - while (!s->th_is_signaled) { - qemu_cond_wait(&s->request_cond, - &s->request_mutex); - } - qemu_mutex_unlock(&s->request_mutex); - qemu_thread_join(&s->request_th); - qemu_cond_destroy(&s->request_cond); - qemu_mutex_destroy(&s->request_mutex); - - qemu_cond_destroy(&s->archip_cond); - qemu_mutex_destroy(&s->archip_mutex); - - targetlen = strlen(s->volname); - req = xseg_get_request(s->xseg, s->srcport, s->vportno, X_ALLOC); - if (!req) { - archipelagolog("Cannot get XSEG request\n"); - goto err_exit; - } - r = xseg_prep_request(s->xseg, req, targetlen, 0); - if (r < 0) { - xseg_put_request(s->xseg, req, s->srcport); - archipelagolog("Cannot prepare XSEG close request\n"); - goto err_exit; - } - - target = xseg_get_target(s->xseg, req); - memcpy(target, s->volname, targetlen); - req->size = req->datalen; - req->offset = 0; - req->op = X_CLOSE; - - xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); - if (p == NoPort) { - xseg_put_request(s->xseg, req, s->srcport); - archipelagolog("Cannot submit XSEG close request\n"); - goto err_exit; - } - - xseg_signal(s->xseg, p); - wait_reply(s->xseg, s->srcport, s->port, req); - - xseg_put_request(s->xseg, req, s->srcport); - -err_exit: - g_free(s->volname); - g_free(s->segment_name); - xseg_quit_local_signal(s->xseg, s->srcport); - xseg_leave_dynport(s->xseg, s->port); - xseg_leave(s->xseg); -} - -static int qemu_archipelago_create_volume(Error **errp, const char *volname, - char *segment_name, - uint64_t size, xport mportno, - xport vportno) -{ - int ret, targetlen; - struct xseg *xseg = NULL; - struct xseg_request *req; - struct xseg_request_clone *xclone; - struct xseg_port *port; - xport srcport = NoPort, sport = NoPort; - char *target; - - /* Try default values if none has been set */ - if (mportno == (xport) -1) { - mportno = ARCHIPELAGO_DFL_MPORT; - } - - if (vportno == (xport) -1) { - vportno = ARCHIPELAGO_DFL_VPORT; - } - - if (xseg_initialize()) { - 
error_setg(errp, "Cannot initialize XSEG"); - return -1; - } - - xseg = xseg_join("posix", segment_name, - "posixfd", NULL); - - if (!xseg) { - error_setg(errp, "Cannot join XSEG shared memory segment"); - return -1; - } - - port = xseg_bind_dynport(xseg); - srcport = port->portno; - init_local_signal(xseg, sport, srcport); - - req = xseg_get_request(xseg, srcport, mportno, X_ALLOC); - if (!req) { - error_setg(errp, "Cannot get XSEG request"); - return -1; - } - - targetlen = strlen(volname); - ret = xseg_prep_request(xseg, req, targetlen, - sizeof(struct xseg_request_clone)); - if (ret < 0) { - error_setg(errp, "Cannot prepare XSEG request"); - goto err_exit; - } - - target = xseg_get_target(xseg, req); - if (!target) { - error_setg(errp, "Cannot get XSEG target."); - goto err_exit; - } - memcpy(target, volname, targetlen); - xclone = (struct xseg_request_clone *) xseg_get_data(xseg, req); - memset(xclone->target, 0 , XSEG_MAX_TARGETLEN); - xclone->targetlen = 0; - xclone->size = size; - req->offset = 0; - req->size = req->datalen; - req->op = X_CLONE; - - xport p = xseg_submit(xseg, req, srcport, X_ALLOC); - if (p == NoPort) { - error_setg(errp, "Could not submit XSEG request"); - goto err_exit; - } - xseg_signal(xseg, p); - - ret = wait_reply(xseg, srcport, port, req); - if (ret < 0) { - error_setg(errp, "wait_reply() error."); - } - - xseg_put_request(xseg, req, srcport); - xseg_quit_local_signal(xseg, srcport); - xseg_leave_dynport(xseg, port); - xseg_leave(xseg); - return ret; - -err_exit: - xseg_put_request(xseg, req, srcport); - xseg_quit_local_signal(xseg, srcport); - xseg_leave_dynport(xseg, port); - xseg_leave(xseg); - return -1; -} - -static int qemu_archipelago_create(const char *filename, - QemuOpts *options, - Error **errp) -{ - int ret = 0; - uint64_t total_size = 0; - char *volname = NULL, *segment_name = NULL; - const char *start; - xport mport = NoPort, vport = NoPort; - - if (!strstart(filename, "archipelago:", &start)) { - error_setg(errp, 
"File name must start with 'archipelago:'"); - return -1; - } - - if (!strlen(start) || strstart(start, "/", NULL)) { - error_setg(errp, "volume name must be specified"); - return -1; - } - - parse_filename_opts(filename, errp, &volname, &segment_name, &mport, - &vport); - total_size = ROUND_UP(qemu_opt_get_size_del(options, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - - if (segment_name == NULL) { - segment_name = g_strdup("archipelago"); - } - - /* Create an Archipelago volume */ - ret = qemu_archipelago_create_volume(errp, volname, segment_name, - total_size, mport, - vport); - - g_free(volname); - g_free(segment_name); - return ret; -} - -static const AIOCBInfo archipelago_aiocb_info = { - .aiocb_size = sizeof(ArchipelagoAIOCB), -}; - -static int archipelago_submit_request(BDRVArchipelagoState *s, - uint64_t bufidx, - size_t count, - off_t offset, - ArchipelagoAIOCB *aio_cb, - ArchipelagoSegmentedRequest *segreq, - int op) -{ - int ret, targetlen; - char *target; - void *data = NULL; - struct xseg_request *req; - AIORequestData *reqdata = g_new(AIORequestData, 1); - - targetlen = strlen(s->volname); - req = xseg_get_request(s->xseg, s->srcport, s->vportno, X_ALLOC); - if (!req) { - archipelagolog("Cannot get XSEG request\n"); - goto err_exit2; - } - ret = xseg_prep_request(s->xseg, req, targetlen, count); - if (ret < 0) { - archipelagolog("Cannot prepare XSEG request\n"); - goto err_exit; - } - target = xseg_get_target(s->xseg, req); - if (!target) { - archipelagolog("Cannot get XSEG target\n"); - goto err_exit; - } - memcpy(target, s->volname, targetlen); - req->size = count; - req->offset = offset; - - switch (op) { - case ARCHIP_OP_READ: - req->op = X_READ; - break; - case ARCHIP_OP_WRITE: - req->op = X_WRITE; - break; - case ARCHIP_OP_FLUSH: - req->op = X_FLUSH; - break; - } - reqdata->volname = s->volname; - reqdata->offset = offset; - reqdata->size = count; - reqdata->bufidx = bufidx; - reqdata->aio_cb = aio_cb; - reqdata->segreq = segreq; - reqdata->op = 
op; - - xseg_set_req_data(s->xseg, req, reqdata); - if (op == ARCHIP_OP_WRITE) { - data = xseg_get_data(s->xseg, req); - if (!data) { - archipelagolog("Cannot get XSEG data\n"); - goto err_exit; - } - qemu_iovec_to_buf(aio_cb->qiov, bufidx, data, count); - } - - xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); - if (p == NoPort) { - archipelagolog("Could not submit XSEG request\n"); - goto err_exit; - } - xseg_signal(s->xseg, p); - return 0; - -err_exit: - g_free(reqdata); - xseg_put_request(s->xseg, req, s->srcport); - return -EIO; -err_exit2: - g_free(reqdata); - return -EIO; -} - -static int archipelago_aio_segmented_rw(BDRVArchipelagoState *s, - size_t count, - off_t offset, - ArchipelagoAIOCB *aio_cb, - int op) -{ - int ret, segments_nr; - size_t pos = 0; - ArchipelagoSegmentedRequest *segreq; - - segreq = g_new0(ArchipelagoSegmentedRequest, 1); - - if (op == ARCHIP_OP_FLUSH) { - segments_nr = 1; - } else { - segments_nr = (int)(count / MAX_REQUEST_SIZE) + \ - ((count % MAX_REQUEST_SIZE) ? 
1 : 0); - } - segreq->total = count; - atomic_mb_set(&segreq->ref, segments_nr); - - while (segments_nr > 1) { - ret = archipelago_submit_request(s, pos, - MAX_REQUEST_SIZE, - offset + pos, - aio_cb, segreq, op); - - if (ret < 0) { - goto err_exit; - } - count -= MAX_REQUEST_SIZE; - pos += MAX_REQUEST_SIZE; - segments_nr--; - } - ret = archipelago_submit_request(s, pos, count, offset + pos, - aio_cb, segreq, op); - - if (ret < 0) { - goto err_exit; - } - return 0; - -err_exit: - segreq->failed = 1; - if (atomic_fetch_sub(&segreq->ref, segments_nr) == segments_nr) { - g_free(segreq); - } - return ret; -} - -static BlockAIOCB *qemu_archipelago_aio_rw(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque, - int op) -{ - ArchipelagoAIOCB *aio_cb; - BDRVArchipelagoState *s = bs->opaque; - int64_t size, off; - int ret; - - aio_cb = qemu_aio_get(&archipelago_aiocb_info, bs, cb, opaque); - aio_cb->cmd = op; - aio_cb->qiov = qiov; - - aio_cb->ret = 0; - aio_cb->s = s; - aio_cb->status = -EINPROGRESS; - - off = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * BDRV_SECTOR_SIZE; - aio_cb->size = size; - - ret = archipelago_aio_segmented_rw(s, size, off, - aio_cb, op); - if (ret < 0) { - goto err_exit; - } - return &aio_cb->common; - -err_exit: - error_report("qemu_archipelago_aio_rw(): I/O Error"); - qemu_aio_unref(aio_cb); - return NULL; -} - -static BlockAIOCB *qemu_archipelago_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return qemu_archipelago_aio_rw(bs, sector_num, qiov, nb_sectors, cb, - opaque, ARCHIP_OP_READ); -} - -static BlockAIOCB *qemu_archipelago_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return qemu_archipelago_aio_rw(bs, sector_num, qiov, nb_sectors, cb, - opaque, ARCHIP_OP_WRITE); -} - -static int64_t 
archipelago_volume_info(BDRVArchipelagoState *s) -{ - uint64_t size; - int ret, targetlen; - struct xseg_request *req; - struct xseg_reply_info *xinfo; - AIORequestData *reqdata = g_new(AIORequestData, 1); - - const char *volname = s->volname; - targetlen = strlen(volname); - req = xseg_get_request(s->xseg, s->srcport, s->mportno, X_ALLOC); - if (!req) { - archipelagolog("Cannot get XSEG request\n"); - goto err_exit2; - } - ret = xseg_prep_request(s->xseg, req, targetlen, - sizeof(struct xseg_reply_info)); - if (ret < 0) { - archipelagolog("Cannot prepare XSEG request\n"); - goto err_exit; - } - char *target = xseg_get_target(s->xseg, req); - if (!target) { - archipelagolog("Cannot get XSEG target\n"); - goto err_exit; - } - memcpy(target, volname, targetlen); - req->size = req->datalen; - req->offset = 0; - req->op = X_INFO; - - reqdata->op = ARCHIP_OP_VOLINFO; - reqdata->volname = volname; - xseg_set_req_data(s->xseg, req, reqdata); - - xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); - if (p == NoPort) { - archipelagolog("Cannot submit XSEG request\n"); - goto err_exit; - } - xseg_signal(s->xseg, p); - qemu_mutex_lock(&s->archip_mutex); - while (!s->is_signaled) { - qemu_cond_wait(&s->archip_cond, &s->archip_mutex); - } - s->is_signaled = false; - qemu_mutex_unlock(&s->archip_mutex); - - xinfo = (struct xseg_reply_info *) xseg_get_data(s->xseg, req); - size = xinfo->size; - xseg_put_request(s->xseg, req, s->srcport); - g_free(reqdata); - s->size = size; - return size; - -err_exit: - xseg_put_request(s->xseg, req, s->srcport); -err_exit2: - g_free(reqdata); - return -EIO; -} - -static int64_t qemu_archipelago_getlength(BlockDriverState *bs) -{ - int64_t ret; - BDRVArchipelagoState *s = bs->opaque; - - ret = archipelago_volume_info(s); - return ret; -} - -static int qemu_archipelago_truncate(BlockDriverState *bs, int64_t offset) -{ - int ret, targetlen; - struct xseg_request *req; - BDRVArchipelagoState *s = bs->opaque; - AIORequestData *reqdata = 
g_new(AIORequestData, 1); - - const char *volname = s->volname; - targetlen = strlen(volname); - req = xseg_get_request(s->xseg, s->srcport, s->mportno, X_ALLOC); - if (!req) { - archipelagolog("Cannot get XSEG request\n"); - goto err_exit2; - } - - ret = xseg_prep_request(s->xseg, req, targetlen, 0); - if (ret < 0) { - archipelagolog("Cannot prepare XSEG request\n"); - goto err_exit; - } - char *target = xseg_get_target(s->xseg, req); - if (!target) { - archipelagolog("Cannot get XSEG target\n"); - goto err_exit; - } - memcpy(target, volname, targetlen); - req->offset = offset; - req->op = X_TRUNCATE; - - reqdata->op = ARCHIP_OP_TRUNCATE; - reqdata->volname = volname; - - xseg_set_req_data(s->xseg, req, reqdata); - - xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); - if (p == NoPort) { - archipelagolog("Cannot submit XSEG request\n"); - goto err_exit; - } - - xseg_signal(s->xseg, p); - qemu_mutex_lock(&s->archip_mutex); - while (!s->is_signaled) { - qemu_cond_wait(&s->archip_cond, &s->archip_mutex); - } - s->is_signaled = false; - qemu_mutex_unlock(&s->archip_mutex); - xseg_put_request(s->xseg, req, s->srcport); - g_free(reqdata); - return 0; - -err_exit: - xseg_put_request(s->xseg, req, s->srcport); -err_exit2: - g_free(reqdata); - return -EIO; -} - -static QemuOptsList qemu_archipelago_create_opts = { - .name = "archipelago-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(qemu_archipelago_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { /* end of list */ } - } -}; - -static BlockAIOCB *qemu_archipelago_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) -{ - return qemu_archipelago_aio_rw(bs, 0, NULL, 0, cb, opaque, - ARCHIP_OP_FLUSH); -} - -static BlockDriver bdrv_archipelago = { - .format_name = "archipelago", - .protocol_name = "archipelago", - .instance_size = sizeof(BDRVArchipelagoState), - .bdrv_parse_filename = archipelago_parse_filename, - 
.bdrv_file_open = qemu_archipelago_open, - .bdrv_close = qemu_archipelago_close, - .bdrv_create = qemu_archipelago_create, - .bdrv_getlength = qemu_archipelago_getlength, - .bdrv_truncate = qemu_archipelago_truncate, - .bdrv_aio_readv = qemu_archipelago_aio_readv, - .bdrv_aio_writev = qemu_archipelago_aio_writev, - .bdrv_aio_flush = qemu_archipelago_aio_flush, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - .create_opts = &qemu_archipelago_create_opts, -}; - -static void bdrv_archipelago_init(void) -{ - bdrv_register(&bdrv_archipelago); -} - -block_init(bdrv_archipelago_init); diff --git a/qemu/block/backup.c b/qemu/block/backup.c deleted file mode 100644 index 491fd1406..000000000 --- a/qemu/block/backup.c +++ /dev/null @@ -1,613 +0,0 @@ -/* - * QEMU backup - * - * Copyright (C) 2013 Proxmox Server Solutions - * - * Authors: - * Dietmar Maurer (dietmar@proxmox.com) - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. 
- * - */ - -#include "qemu/osdep.h" - -#include "trace.h" -#include "block/block.h" -#include "block/block_int.h" -#include "block/blockjob.h" -#include "qapi/error.h" -#include "qapi/qmp/qerror.h" -#include "qemu/ratelimit.h" -#include "qemu/cutils.h" -#include "sysemu/block-backend.h" -#include "qemu/bitmap.h" - -#define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16) -#define SLICE_TIME 100000000ULL /* ns */ - -typedef struct CowRequest { - int64_t start; - int64_t end; - QLIST_ENTRY(CowRequest) list; - CoQueue wait_queue; /* coroutines blocked on this request */ -} CowRequest; - -typedef struct BackupBlockJob { - BlockJob common; - BlockDriverState *target; - /* bitmap for sync=incremental */ - BdrvDirtyBitmap *sync_bitmap; - MirrorSyncMode sync_mode; - RateLimit limit; - BlockdevOnError on_source_error; - BlockdevOnError on_target_error; - CoRwlock flush_rwlock; - uint64_t sectors_read; - unsigned long *done_bitmap; - int64_t cluster_size; - QLIST_HEAD(, CowRequest) inflight_reqs; -} BackupBlockJob; - -/* Size of a cluster in sectors, instead of bytes. 
*/ -static inline int64_t cluster_size_sectors(BackupBlockJob *job) -{ - return job->cluster_size / BDRV_SECTOR_SIZE; -} - -/* See if in-flight requests overlap and wait for them to complete */ -static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job, - int64_t start, - int64_t end) -{ - CowRequest *req; - bool retry; - - do { - retry = false; - QLIST_FOREACH(req, &job->inflight_reqs, list) { - if (end > req->start && start < req->end) { - qemu_co_queue_wait(&req->wait_queue); - retry = true; - break; - } - } - } while (retry); -} - -/* Keep track of an in-flight request */ -static void cow_request_begin(CowRequest *req, BackupBlockJob *job, - int64_t start, int64_t end) -{ - req->start = start; - req->end = end; - qemu_co_queue_init(&req->wait_queue); - QLIST_INSERT_HEAD(&job->inflight_reqs, req, list); -} - -/* Forget about a completed request */ -static void cow_request_end(CowRequest *req) -{ - QLIST_REMOVE(req, list); - qemu_co_queue_restart_all(&req->wait_queue); -} - -static int coroutine_fn backup_do_cow(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - bool *error_is_read, - bool is_write_notifier) -{ - BackupBlockJob *job = (BackupBlockJob *)bs->job; - CowRequest cow_request; - struct iovec iov; - QEMUIOVector bounce_qiov; - void *bounce_buffer = NULL; - int ret = 0; - int64_t sectors_per_cluster = cluster_size_sectors(job); - int64_t start, end; - int n; - - qemu_co_rwlock_rdlock(&job->flush_rwlock); - - start = sector_num / sectors_per_cluster; - end = DIV_ROUND_UP(sector_num + nb_sectors, sectors_per_cluster); - - trace_backup_do_cow_enter(job, start, sector_num, nb_sectors); - - wait_for_overlapping_requests(job, start, end); - cow_request_begin(&cow_request, job, start, end); - - for (; start < end; start++) { - if (test_bit(start, job->done_bitmap)) { - trace_backup_do_cow_skip(job, start); - continue; /* already copied */ - } - - trace_backup_do_cow_process(job, start); - - n = MIN(sectors_per_cluster, - 
job->common.len / BDRV_SECTOR_SIZE - - start * sectors_per_cluster); - - if (!bounce_buffer) { - bounce_buffer = qemu_blockalign(bs, job->cluster_size); - } - iov.iov_base = bounce_buffer; - iov.iov_len = n * BDRV_SECTOR_SIZE; - qemu_iovec_init_external(&bounce_qiov, &iov, 1); - - if (is_write_notifier) { - ret = bdrv_co_readv_no_serialising(bs, - start * sectors_per_cluster, - n, &bounce_qiov); - } else { - ret = bdrv_co_readv(bs, start * sectors_per_cluster, n, - &bounce_qiov); - } - if (ret < 0) { - trace_backup_do_cow_read_fail(job, start, ret); - if (error_is_read) { - *error_is_read = true; - } - goto out; - } - - if (buffer_is_zero(iov.iov_base, iov.iov_len)) { - ret = bdrv_co_write_zeroes(job->target, - start * sectors_per_cluster, - n, BDRV_REQ_MAY_UNMAP); - } else { - ret = bdrv_co_writev(job->target, - start * sectors_per_cluster, n, - &bounce_qiov); - } - if (ret < 0) { - trace_backup_do_cow_write_fail(job, start, ret); - if (error_is_read) { - *error_is_read = false; - } - goto out; - } - - set_bit(start, job->done_bitmap); - - /* Publish progress, guest I/O counts as progress too. Note that the - * offset field is an opaque progress value, it is not a disk offset. 
- */ - job->sectors_read += n; - job->common.offset += n * BDRV_SECTOR_SIZE; - } - -out: - if (bounce_buffer) { - qemu_vfree(bounce_buffer); - } - - cow_request_end(&cow_request); - - trace_backup_do_cow_return(job, sector_num, nb_sectors, ret); - - qemu_co_rwlock_unlock(&job->flush_rwlock); - - return ret; -} - -static int coroutine_fn backup_before_write_notify( - NotifierWithReturn *notifier, - void *opaque) -{ - BdrvTrackedRequest *req = opaque; - int64_t sector_num = req->offset >> BDRV_SECTOR_BITS; - int nb_sectors = req->bytes >> BDRV_SECTOR_BITS; - - assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - - return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true); -} - -static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) -{ - BackupBlockJob *s = container_of(job, BackupBlockJob, common); - - if (speed < 0) { - error_setg(errp, QERR_INVALID_PARAMETER, "speed"); - return; - } - ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); -} - -static void backup_iostatus_reset(BlockJob *job) -{ - BackupBlockJob *s = container_of(job, BackupBlockJob, common); - - if (s->target->blk) { - blk_iostatus_reset(s->target->blk); - } -} - -static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret) -{ - BdrvDirtyBitmap *bm; - BlockDriverState *bs = job->common.bs; - - if (ret < 0 || block_job_is_cancelled(&job->common)) { - /* Merge the successor back into the parent, delete nothing. */ - bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL); - assert(bm); - } else { - /* Everything is fine, delete this bitmap and install the backup. 
*/ - bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL); - assert(bm); - } -} - -static void backup_commit(BlockJob *job) -{ - BackupBlockJob *s = container_of(job, BackupBlockJob, common); - if (s->sync_bitmap) { - backup_cleanup_sync_bitmap(s, 0); - } -} - -static void backup_abort(BlockJob *job) -{ - BackupBlockJob *s = container_of(job, BackupBlockJob, common); - if (s->sync_bitmap) { - backup_cleanup_sync_bitmap(s, -1); - } -} - -static const BlockJobDriver backup_job_driver = { - .instance_size = sizeof(BackupBlockJob), - .job_type = BLOCK_JOB_TYPE_BACKUP, - .set_speed = backup_set_speed, - .iostatus_reset = backup_iostatus_reset, - .commit = backup_commit, - .abort = backup_abort, -}; - -static BlockErrorAction backup_error_action(BackupBlockJob *job, - bool read, int error) -{ - if (read) { - return block_job_error_action(&job->common, job->common.bs, - job->on_source_error, true, error); - } else { - return block_job_error_action(&job->common, job->target, - job->on_target_error, false, error); - } -} - -typedef struct { - int ret; -} BackupCompleteData; - -static void backup_complete(BlockJob *job, void *opaque) -{ - BackupBlockJob *s = container_of(job, BackupBlockJob, common); - BackupCompleteData *data = opaque; - - bdrv_unref(s->target); - - block_job_completed(job, data->ret); - g_free(data); -} - -static bool coroutine_fn yield_and_check(BackupBlockJob *job) -{ - if (block_job_is_cancelled(&job->common)) { - return true; - } - - /* we need to yield so that bdrv_drain_all() returns. 
- * (without, VM does not reboot) - */ - if (job->common.speed) { - uint64_t delay_ns = ratelimit_calculate_delay(&job->limit, - job->sectors_read); - job->sectors_read = 0; - block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); - } else { - block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); - } - - if (block_job_is_cancelled(&job->common)) { - return true; - } - - return false; -} - -static int coroutine_fn backup_run_incremental(BackupBlockJob *job) -{ - bool error_is_read; - int ret = 0; - int clusters_per_iter; - uint32_t granularity; - int64_t sector; - int64_t cluster; - int64_t end; - int64_t last_cluster = -1; - int64_t sectors_per_cluster = cluster_size_sectors(job); - BlockDriverState *bs = job->common.bs; - HBitmapIter hbi; - - granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap); - clusters_per_iter = MAX((granularity / job->cluster_size), 1); - bdrv_dirty_iter_init(job->sync_bitmap, &hbi); - - /* Find the next dirty sector(s) */ - while ((sector = hbitmap_iter_next(&hbi)) != -1) { - cluster = sector / sectors_per_cluster; - - /* Fake progress updates for any clusters we skipped */ - if (cluster != last_cluster + 1) { - job->common.offset += ((cluster - last_cluster - 1) * - job->cluster_size); - } - - for (end = cluster + clusters_per_iter; cluster < end; cluster++) { - do { - if (yield_and_check(job)) { - return ret; - } - ret = backup_do_cow(bs, cluster * sectors_per_cluster, - sectors_per_cluster, &error_is_read, - false); - if ((ret < 0) && - backup_error_action(job, error_is_read, -ret) == - BLOCK_ERROR_ACTION_REPORT) { - return ret; - } - } while (ret < 0); - } - - /* If the bitmap granularity is smaller than the backup granularity, - * we need to advance the iterator pointer to the next cluster. 
*/ - if (granularity < job->cluster_size) { - bdrv_set_dirty_iter(&hbi, cluster * sectors_per_cluster); - } - - last_cluster = cluster - 1; - } - - /* Play some final catchup with the progress meter */ - end = DIV_ROUND_UP(job->common.len, job->cluster_size); - if (last_cluster + 1 < end) { - job->common.offset += ((end - last_cluster - 1) * job->cluster_size); - } - - return ret; -} - -static void coroutine_fn backup_run(void *opaque) -{ - BackupBlockJob *job = opaque; - BackupCompleteData *data; - BlockDriverState *bs = job->common.bs; - BlockDriverState *target = job->target; - BlockdevOnError on_target_error = job->on_target_error; - NotifierWithReturn before_write = { - .notify = backup_before_write_notify, - }; - int64_t start, end; - int64_t sectors_per_cluster = cluster_size_sectors(job); - int ret = 0; - - QLIST_INIT(&job->inflight_reqs); - qemu_co_rwlock_init(&job->flush_rwlock); - - start = 0; - end = DIV_ROUND_UP(job->common.len, job->cluster_size); - - job->done_bitmap = bitmap_new(end); - - if (target->blk) { - blk_set_on_error(target->blk, on_target_error, on_target_error); - blk_iostatus_enable(target->blk); - } - - bdrv_add_before_write_notifier(bs, &before_write); - - if (job->sync_mode == MIRROR_SYNC_MODE_NONE) { - while (!block_job_is_cancelled(&job->common)) { - /* Yield until the job is cancelled. We just let our before_write - * notify callback service CoW requests. */ - job->common.busy = false; - qemu_coroutine_yield(); - job->common.busy = true; - } - } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) { - ret = backup_run_incremental(job); - } else { - /* Both FULL and TOP SYNC_MODE's require copying.. */ - for (; start < end; start++) { - bool error_is_read; - if (yield_and_check(job)) { - break; - } - - if (job->sync_mode == MIRROR_SYNC_MODE_TOP) { - int i, n; - int alloced = 0; - - /* Check to see if these blocks are already in the - * backing file. 
*/ - - for (i = 0; i < sectors_per_cluster;) { - /* bdrv_is_allocated() only returns true/false based - * on the first set of sectors it comes across that - * are are all in the same state. - * For that reason we must verify each sector in the - * backup cluster length. We end up copying more than - * needed but at some point that is always the case. */ - alloced = - bdrv_is_allocated(bs, - start * sectors_per_cluster + i, - sectors_per_cluster - i, &n); - i += n; - - if (alloced == 1 || n == 0) { - break; - } - } - - /* If the above loop never found any sectors that are in - * the topmost image, skip this backup. */ - if (alloced == 0) { - continue; - } - } - /* FULL sync mode we copy the whole drive. */ - ret = backup_do_cow(bs, start * sectors_per_cluster, - sectors_per_cluster, &error_is_read, false); - if (ret < 0) { - /* Depending on error action, fail now or retry cluster */ - BlockErrorAction action = - backup_error_action(job, error_is_read, -ret); - if (action == BLOCK_ERROR_ACTION_REPORT) { - break; - } else { - start--; - continue; - } - } - } - } - - notifier_with_return_remove(&before_write); - - /* wait until pending backup_do_cow() calls have completed */ - qemu_co_rwlock_wrlock(&job->flush_rwlock); - qemu_co_rwlock_unlock(&job->flush_rwlock); - g_free(job->done_bitmap); - - if (target->blk) { - blk_iostatus_disable(target->blk); - } - bdrv_op_unblock_all(target, job->common.blocker); - - data = g_malloc(sizeof(*data)); - data->ret = ret; - block_job_defer_to_main_loop(&job->common, backup_complete, data); -} - -void backup_start(BlockDriverState *bs, BlockDriverState *target, - int64_t speed, MirrorSyncMode sync_mode, - BdrvDirtyBitmap *sync_bitmap, - BlockdevOnError on_source_error, - BlockdevOnError on_target_error, - BlockCompletionFunc *cb, void *opaque, - BlockJobTxn *txn, Error **errp) -{ - int64_t len; - BlockDriverInfo bdi; - int ret; - - assert(bs); - assert(target); - assert(cb); - - if (bs == target) { - error_setg(errp, "Source and 
target cannot be the same"); - return; - } - - if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || - on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && - (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { - error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); - return; - } - - if (!bdrv_is_inserted(bs)) { - error_setg(errp, "Device is not inserted: %s", - bdrv_get_device_name(bs)); - return; - } - - if (!bdrv_is_inserted(target)) { - error_setg(errp, "Device is not inserted: %s", - bdrv_get_device_name(target)); - return; - } - - if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) { - return; - } - - if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) { - return; - } - - if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) { - if (!sync_bitmap) { - error_setg(errp, "must provide a valid bitmap name for " - "\"incremental\" sync mode"); - return; - } - - /* Create a new bitmap, and freeze/disable this one. */ - if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) { - return; - } - } else if (sync_bitmap) { - error_setg(errp, - "a sync_bitmap was provided to backup_run, " - "but received an incompatible sync_mode (%s)", - MirrorSyncMode_lookup[sync_mode]); - return; - } - - len = bdrv_getlength(bs); - if (len < 0) { - error_setg_errno(errp, -len, "unable to get length for '%s'", - bdrv_get_device_name(bs)); - goto error; - } - - BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed, - cb, opaque, errp); - if (!job) { - goto error; - } - - job->on_source_error = on_source_error; - job->on_target_error = on_target_error; - job->target = target; - job->sync_mode = sync_mode; - job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ? - sync_bitmap : NULL; - - /* If there is no backing file on the target, we cannot rely on COW if our - * backup cluster size is smaller than the target cluster size. Even for - * targets with a backing file, try to avoid COW if possible. 
*/ - ret = bdrv_get_info(job->target, &bdi); - if (ret < 0 && !target->backing) { - error_setg_errno(errp, -ret, - "Couldn't determine the cluster size of the target image, " - "which has no backing file"); - error_append_hint(errp, - "Aborting, since this may create an unusable destination image\n"); - goto error; - } else if (ret < 0 && target->backing) { - /* Not fatal; just trudge on ahead. */ - job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT; - } else { - job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size); - } - - bdrv_op_block_all(target, job->common.blocker); - job->common.len = len; - job->common.co = qemu_coroutine_create(backup_run); - block_job_txn_add_job(txn, &job->common); - qemu_coroutine_enter(job->common.co, job); - return; - - error: - if (sync_bitmap) { - bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL); - } -} diff --git a/qemu/block/blkdebug.c b/qemu/block/blkdebug.c deleted file mode 100644 index 20d25bda6..000000000 --- a/qemu/block/blkdebug.c +++ /dev/null @@ -1,759 +0,0 @@ -/* - * Block protocol for I/O error injection - * - * Copyright (c) 2010 Kevin Wolf - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu/cutils.h" -#include "qemu/config-file.h" -#include "block/block_int.h" -#include "qemu/module.h" -#include "qapi/qmp/qbool.h" -#include "qapi/qmp/qdict.h" -#include "qapi/qmp/qint.h" -#include "qapi/qmp/qstring.h" -#include "sysemu/qtest.h" - -typedef struct BDRVBlkdebugState { - int state; - int new_state; - - QLIST_HEAD(, BlkdebugRule) rules[BLKDBG__MAX]; - QSIMPLEQ_HEAD(, BlkdebugRule) active_rules; - QLIST_HEAD(, BlkdebugSuspendedReq) suspended_reqs; -} BDRVBlkdebugState; - -typedef struct BlkdebugAIOCB { - BlockAIOCB common; - QEMUBH *bh; - int ret; -} BlkdebugAIOCB; - -typedef struct BlkdebugSuspendedReq { - Coroutine *co; - char *tag; - QLIST_ENTRY(BlkdebugSuspendedReq) next; -} BlkdebugSuspendedReq; - -static const AIOCBInfo blkdebug_aiocb_info = { - .aiocb_size = sizeof(BlkdebugAIOCB), -}; - -enum { - ACTION_INJECT_ERROR, - ACTION_SET_STATE, - ACTION_SUSPEND, -}; - -typedef struct BlkdebugRule { - BlkdebugEvent event; - int action; - int state; - union { - struct { - int error; - int immediately; - int once; - int64_t sector; - } inject; - struct { - int new_state; - } set_state; - struct { - char *tag; - } suspend; - } options; - QLIST_ENTRY(BlkdebugRule) next; - QSIMPLEQ_ENTRY(BlkdebugRule) active_next; -} BlkdebugRule; - -static QemuOptsList inject_error_opts = { - .name = "inject-error", - .head = QTAILQ_HEAD_INITIALIZER(inject_error_opts.head), - .desc = { - { - .name = "event", - .type = QEMU_OPT_STRING, - }, - { - .name = "state", - .type = QEMU_OPT_NUMBER, - }, - { - .name = "errno", - .type = QEMU_OPT_NUMBER, - }, - { - .name = "sector", - .type = QEMU_OPT_NUMBER, - }, - { - .name = "once", - .type = 
QEMU_OPT_BOOL, - }, - { - .name = "immediately", - .type = QEMU_OPT_BOOL, - }, - { /* end of list */ } - }, -}; - -static QemuOptsList set_state_opts = { - .name = "set-state", - .head = QTAILQ_HEAD_INITIALIZER(set_state_opts.head), - .desc = { - { - .name = "event", - .type = QEMU_OPT_STRING, - }, - { - .name = "state", - .type = QEMU_OPT_NUMBER, - }, - { - .name = "new_state", - .type = QEMU_OPT_NUMBER, - }, - { /* end of list */ } - }, -}; - -static QemuOptsList *config_groups[] = { - &inject_error_opts, - &set_state_opts, - NULL -}; - -static int get_event_by_name(const char *name, BlkdebugEvent *event) -{ - int i; - - for (i = 0; i < BLKDBG__MAX; i++) { - if (!strcmp(BlkdebugEvent_lookup[i], name)) { - *event = i; - return 0; - } - } - - return -1; -} - -struct add_rule_data { - BDRVBlkdebugState *s; - int action; -}; - -static int add_rule(void *opaque, QemuOpts *opts, Error **errp) -{ - struct add_rule_data *d = opaque; - BDRVBlkdebugState *s = d->s; - const char* event_name; - BlkdebugEvent event; - struct BlkdebugRule *rule; - - /* Find the right event for the rule */ - event_name = qemu_opt_get(opts, "event"); - if (!event_name) { - error_setg(errp, "Missing event name for rule"); - return -1; - } else if (get_event_by_name(event_name, &event) < 0) { - error_setg(errp, "Invalid event name \"%s\"", event_name); - return -1; - } - - /* Set attributes common for all actions */ - rule = g_malloc0(sizeof(*rule)); - *rule = (struct BlkdebugRule) { - .event = event, - .action = d->action, - .state = qemu_opt_get_number(opts, "state", 0), - }; - - /* Parse action-specific options */ - switch (d->action) { - case ACTION_INJECT_ERROR: - rule->options.inject.error = qemu_opt_get_number(opts, "errno", EIO); - rule->options.inject.once = qemu_opt_get_bool(opts, "once", 0); - rule->options.inject.immediately = - qemu_opt_get_bool(opts, "immediately", 0); - rule->options.inject.sector = qemu_opt_get_number(opts, "sector", -1); - break; - - case ACTION_SET_STATE: - 
rule->options.set_state.new_state = - qemu_opt_get_number(opts, "new_state", 0); - break; - - case ACTION_SUSPEND: - rule->options.suspend.tag = - g_strdup(qemu_opt_get(opts, "tag")); - break; - }; - - /* Add the rule */ - QLIST_INSERT_HEAD(&s->rules[event], rule, next); - - return 0; -} - -static void remove_rule(BlkdebugRule *rule) -{ - switch (rule->action) { - case ACTION_INJECT_ERROR: - case ACTION_SET_STATE: - break; - case ACTION_SUSPEND: - g_free(rule->options.suspend.tag); - break; - } - - QLIST_REMOVE(rule, next); - g_free(rule); -} - -static int read_config(BDRVBlkdebugState *s, const char *filename, - QDict *options, Error **errp) -{ - FILE *f = NULL; - int ret; - struct add_rule_data d; - Error *local_err = NULL; - - if (filename) { - f = fopen(filename, "r"); - if (f == NULL) { - error_setg_errno(errp, errno, "Could not read blkdebug config file"); - return -errno; - } - - ret = qemu_config_parse(f, config_groups, filename); - if (ret < 0) { - error_setg(errp, "Could not parse blkdebug config file"); - ret = -EINVAL; - goto fail; - } - } - - qemu_config_parse_qdict(options, config_groups, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; - } - - d.s = s; - d.action = ACTION_INJECT_ERROR; - qemu_opts_foreach(&inject_error_opts, add_rule, &d, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; - } - - d.action = ACTION_SET_STATE; - qemu_opts_foreach(&set_state_opts, add_rule, &d, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; - } - - ret = 0; -fail: - qemu_opts_reset(&inject_error_opts); - qemu_opts_reset(&set_state_opts); - if (f) { - fclose(f); - } - return ret; -} - -/* Valid blkdebug filenames look like blkdebug:path/to/config:path/to/image */ -static void blkdebug_parse_filename(const char *filename, QDict *options, - Error **errp) -{ - const char *c; - - /* Parse the blkdebug: prefix */ - if 
(!strstart(filename, "blkdebug:", &filename)) { - /* There was no prefix; therefore, all options have to be already - present in the QDict (except for the filename) */ - qdict_put(options, "x-image", qstring_from_str(filename)); - return; - } - - /* Parse config file path */ - c = strchr(filename, ':'); - if (c == NULL) { - error_setg(errp, "blkdebug requires both config file and image path"); - return; - } - - if (c != filename) { - QString *config_path; - config_path = qstring_from_substr(filename, 0, c - filename - 1); - qdict_put(options, "config", config_path); - } - - /* TODO Allow multi-level nesting and set file.filename here */ - filename = c + 1; - qdict_put(options, "x-image", qstring_from_str(filename)); -} - -static QemuOptsList runtime_opts = { - .name = "blkdebug", - .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), - .desc = { - { - .name = "config", - .type = QEMU_OPT_STRING, - .help = "Path to the configuration file", - }, - { - .name = "x-image", - .type = QEMU_OPT_STRING, - .help = "[internal use only, will be removed]", - }, - { - .name = "align", - .type = QEMU_OPT_SIZE, - .help = "Required alignment in bytes", - }, - { /* end of list */ } - }, -}; - -static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVBlkdebugState *s = bs->opaque; - QemuOpts *opts; - Error *local_err = NULL; - const char *config; - uint64_t align; - int ret; - - opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto out; - } - - /* Read rules from config file or command line options */ - config = qemu_opt_get(opts, "config"); - ret = read_config(s, config, options, errp); - if (ret) { - goto out; - } - - /* Set initial state */ - s->state = 1; - - /* Open the image file */ - bs->file = bdrv_open_child(qemu_opt_get(opts, "x-image"), options, "image", - bs, &child_file, false, &local_err); - 
if (local_err) { - ret = -EINVAL; - error_propagate(errp, local_err); - goto out; - } - - /* Set request alignment */ - align = qemu_opt_get_size(opts, "align", bs->request_alignment); - if (align > 0 && align < INT_MAX && !(align & (align - 1))) { - bs->request_alignment = align; - } else { - error_setg(errp, "Invalid alignment"); - ret = -EINVAL; - goto fail_unref; - } - - ret = 0; - goto out; - -fail_unref: - bdrv_unref_child(bs, bs->file); -out: - qemu_opts_del(opts); - return ret; -} - -static void error_callback_bh(void *opaque) -{ - struct BlkdebugAIOCB *acb = opaque; - qemu_bh_delete(acb->bh); - acb->common.cb(acb->common.opaque, acb->ret); - qemu_aio_unref(acb); -} - -static BlockAIOCB *inject_error(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque, BlkdebugRule *rule) -{ - BDRVBlkdebugState *s = bs->opaque; - int error = rule->options.inject.error; - struct BlkdebugAIOCB *acb; - QEMUBH *bh; - bool immediately = rule->options.inject.immediately; - - if (rule->options.inject.once) { - QSIMPLEQ_REMOVE(&s->active_rules, rule, BlkdebugRule, active_next); - remove_rule(rule); - } - - if (immediately) { - return NULL; - } - - acb = qemu_aio_get(&blkdebug_aiocb_info, bs, cb, opaque); - acb->ret = -error; - - bh = aio_bh_new(bdrv_get_aio_context(bs), error_callback_bh, acb); - acb->bh = bh; - qemu_bh_schedule(bh); - - return &acb->common; -} - -static BlockAIOCB *blkdebug_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - BDRVBlkdebugState *s = bs->opaque; - BlkdebugRule *rule = NULL; - - QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { - if (rule->options.inject.sector == -1 || - (rule->options.inject.sector >= sector_num && - rule->options.inject.sector < sector_num + nb_sectors)) { - break; - } - } - - if (rule && rule->options.inject.error) { - return inject_error(bs, cb, opaque, rule); - } - - return bdrv_aio_readv(bs->file->bs, sector_num, qiov, 
nb_sectors, - cb, opaque); -} - -static BlockAIOCB *blkdebug_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - BDRVBlkdebugState *s = bs->opaque; - BlkdebugRule *rule = NULL; - - QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { - if (rule->options.inject.sector == -1 || - (rule->options.inject.sector >= sector_num && - rule->options.inject.sector < sector_num + nb_sectors)) { - break; - } - } - - if (rule && rule->options.inject.error) { - return inject_error(bs, cb, opaque, rule); - } - - return bdrv_aio_writev(bs->file->bs, sector_num, qiov, nb_sectors, - cb, opaque); -} - -static BlockAIOCB *blkdebug_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) -{ - BDRVBlkdebugState *s = bs->opaque; - BlkdebugRule *rule = NULL; - - QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { - if (rule->options.inject.sector == -1) { - break; - } - } - - if (rule && rule->options.inject.error) { - return inject_error(bs, cb, opaque, rule); - } - - return bdrv_aio_flush(bs->file->bs, cb, opaque); -} - - -static void blkdebug_close(BlockDriverState *bs) -{ - BDRVBlkdebugState *s = bs->opaque; - BlkdebugRule *rule, *next; - int i; - - for (i = 0; i < BLKDBG__MAX; i++) { - QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) { - remove_rule(rule); - } - } -} - -static void suspend_request(BlockDriverState *bs, BlkdebugRule *rule) -{ - BDRVBlkdebugState *s = bs->opaque; - BlkdebugSuspendedReq r; - - r = (BlkdebugSuspendedReq) { - .co = qemu_coroutine_self(), - .tag = g_strdup(rule->options.suspend.tag), - }; - - remove_rule(rule); - QLIST_INSERT_HEAD(&s->suspended_reqs, &r, next); - - if (!qtest_enabled()) { - printf("blkdebug: Suspended request '%s'\n", r.tag); - } - qemu_coroutine_yield(); - if (!qtest_enabled()) { - printf("blkdebug: Resuming request '%s'\n", r.tag); - } - - QLIST_REMOVE(&r, next); - g_free(r.tag); -} - -static bool process_rule(BlockDriverState *bs, 
struct BlkdebugRule *rule, - bool injected) -{ - BDRVBlkdebugState *s = bs->opaque; - - /* Only process rules for the current state */ - if (rule->state && rule->state != s->state) { - return injected; - } - - /* Take the action */ - switch (rule->action) { - case ACTION_INJECT_ERROR: - if (!injected) { - QSIMPLEQ_INIT(&s->active_rules); - injected = true; - } - QSIMPLEQ_INSERT_HEAD(&s->active_rules, rule, active_next); - break; - - case ACTION_SET_STATE: - s->new_state = rule->options.set_state.new_state; - break; - - case ACTION_SUSPEND: - suspend_request(bs, rule); - break; - } - return injected; -} - -static void blkdebug_debug_event(BlockDriverState *bs, BlkdebugEvent event) -{ - BDRVBlkdebugState *s = bs->opaque; - struct BlkdebugRule *rule, *next; - bool injected; - - assert((int)event >= 0 && event < BLKDBG__MAX); - - injected = false; - s->new_state = s->state; - QLIST_FOREACH_SAFE(rule, &s->rules[event], next, next) { - injected = process_rule(bs, rule, injected); - } - s->state = s->new_state; -} - -static int blkdebug_debug_breakpoint(BlockDriverState *bs, const char *event, - const char *tag) -{ - BDRVBlkdebugState *s = bs->opaque; - struct BlkdebugRule *rule; - BlkdebugEvent blkdebug_event; - - if (get_event_by_name(event, &blkdebug_event) < 0) { - return -ENOENT; - } - - - rule = g_malloc(sizeof(*rule)); - *rule = (struct BlkdebugRule) { - .event = blkdebug_event, - .action = ACTION_SUSPEND, - .state = 0, - .options.suspend.tag = g_strdup(tag), - }; - - QLIST_INSERT_HEAD(&s->rules[blkdebug_event], rule, next); - - return 0; -} - -static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag) -{ - BDRVBlkdebugState *s = bs->opaque; - BlkdebugSuspendedReq *r, *next; - - QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, next) { - if (!strcmp(r->tag, tag)) { - qemu_coroutine_enter(r->co, NULL); - return 0; - } - } - return -ENOENT; -} - -static int blkdebug_debug_remove_breakpoint(BlockDriverState *bs, - const char *tag) -{ - BDRVBlkdebugState *s 
= bs->opaque; - BlkdebugSuspendedReq *r, *r_next; - BlkdebugRule *rule, *next; - int i, ret = -ENOENT; - - for (i = 0; i < BLKDBG__MAX; i++) { - QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) { - if (rule->action == ACTION_SUSPEND && - !strcmp(rule->options.suspend.tag, tag)) { - remove_rule(rule); - ret = 0; - } - } - } - QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, r_next) { - if (!strcmp(r->tag, tag)) { - qemu_coroutine_enter(r->co, NULL); - ret = 0; - } - } - return ret; -} - -static bool blkdebug_debug_is_suspended(BlockDriverState *bs, const char *tag) -{ - BDRVBlkdebugState *s = bs->opaque; - BlkdebugSuspendedReq *r; - - QLIST_FOREACH(r, &s->suspended_reqs, next) { - if (!strcmp(r->tag, tag)) { - return true; - } - } - return false; -} - -static int64_t blkdebug_getlength(BlockDriverState *bs) -{ - return bdrv_getlength(bs->file->bs); -} - -static int blkdebug_truncate(BlockDriverState *bs, int64_t offset) -{ - return bdrv_truncate(bs->file->bs, offset); -} - -static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options) -{ - QDict *opts; - const QDictEntry *e; - bool force_json = false; - - for (e = qdict_first(options); e; e = qdict_next(options, e)) { - if (strcmp(qdict_entry_key(e), "config") && - strcmp(qdict_entry_key(e), "x-image")) - { - force_json = true; - break; - } - } - - if (force_json && !bs->file->bs->full_open_options) { - /* The config file cannot be recreated, so creating a plain filename - * is impossible */ - return; - } - - if (!force_json && bs->file->bs->exact_filename[0]) { - snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "blkdebug:%s:%s", - qdict_get_try_str(options, "config") ?: "", - bs->file->bs->exact_filename); - } - - opts = qdict_new(); - qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("blkdebug"))); - - QINCREF(bs->file->bs->full_open_options); - qdict_put_obj(opts, "image", QOBJECT(bs->file->bs->full_open_options)); - - for (e = qdict_first(options); e; e = qdict_next(options, e)) { - 
if (strcmp(qdict_entry_key(e), "x-image")) { - qobject_incref(qdict_entry_value(e)); - qdict_put_obj(opts, qdict_entry_key(e), qdict_entry_value(e)); - } - } - - bs->full_open_options = opts; -} - -static int blkdebug_reopen_prepare(BDRVReopenState *reopen_state, - BlockReopenQueue *queue, Error **errp) -{ - return 0; -} - -static BlockDriver bdrv_blkdebug = { - .format_name = "blkdebug", - .protocol_name = "blkdebug", - .instance_size = sizeof(BDRVBlkdebugState), - - .bdrv_parse_filename = blkdebug_parse_filename, - .bdrv_file_open = blkdebug_open, - .bdrv_close = blkdebug_close, - .bdrv_reopen_prepare = blkdebug_reopen_prepare, - .bdrv_getlength = blkdebug_getlength, - .bdrv_truncate = blkdebug_truncate, - .bdrv_refresh_filename = blkdebug_refresh_filename, - - .bdrv_aio_readv = blkdebug_aio_readv, - .bdrv_aio_writev = blkdebug_aio_writev, - .bdrv_aio_flush = blkdebug_aio_flush, - - .bdrv_debug_event = blkdebug_debug_event, - .bdrv_debug_breakpoint = blkdebug_debug_breakpoint, - .bdrv_debug_remove_breakpoint - = blkdebug_debug_remove_breakpoint, - .bdrv_debug_resume = blkdebug_debug_resume, - .bdrv_debug_is_suspended = blkdebug_debug_is_suspended, -}; - -static void bdrv_blkdebug_init(void) -{ - bdrv_register(&bdrv_blkdebug); -} - -block_init(bdrv_blkdebug_init); diff --git a/qemu/block/blkreplay.c b/qemu/block/blkreplay.c deleted file mode 100755 index 42f1813af..000000000 --- a/qemu/block/blkreplay.c +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Block protocol for record/replay - * - * Copyright (c) 2010-2016 Institute for System Programming - * of the Russian Academy of Sciences. - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - * - */ - -#include "qemu/osdep.h" -#include "qemu-common.h" -#include "block/block_int.h" -#include "sysemu/replay.h" -#include "qapi/error.h" - -typedef struct Request { - Coroutine *co; - QEMUBH *bh; -} Request; - -/* Next request id. 
- This counter is global, because requests from different - block devices should not get overlapping ids. */ -static uint64_t request_id; - -static int blkreplay_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - Error *local_err = NULL; - int ret; - - /* Open the image file */ - bs->file = bdrv_open_child(NULL, options, "image", - bs, &child_file, false, &local_err); - if (local_err) { - ret = -EINVAL; - error_propagate(errp, local_err); - goto fail; - } - - ret = 0; -fail: - if (ret < 0) { - bdrv_unref_child(bs, bs->file); - } - return ret; -} - -static void blkreplay_close(BlockDriverState *bs) -{ -} - -static int64_t blkreplay_getlength(BlockDriverState *bs) -{ - return bdrv_getlength(bs->file->bs); -} - -/* This bh is used for synchronization of return from coroutines. - It continues yielded coroutine which then finishes its execution. - BH is called adjusted to some replay checkpoint, therefore - record and replay will always finish coroutines deterministically. 
-*/ -static void blkreplay_bh_cb(void *opaque) -{ - Request *req = opaque; - qemu_coroutine_enter(req->co, NULL); - qemu_bh_delete(req->bh); - g_free(req); -} - -static void block_request_create(uint64_t reqid, BlockDriverState *bs, - Coroutine *co) -{ - Request *req = g_new(Request, 1); - *req = (Request) { - .co = co, - .bh = aio_bh_new(bdrv_get_aio_context(bs), blkreplay_bh_cb, req), - }; - replay_block_event(req->bh, reqid); -} - -static int coroutine_fn blkreplay_co_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - uint64_t reqid = request_id++; - int ret = bdrv_co_readv(bs->file->bs, sector_num, nb_sectors, qiov); - block_request_create(reqid, bs, qemu_coroutine_self()); - qemu_coroutine_yield(); - - return ret; -} - -static int coroutine_fn blkreplay_co_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - uint64_t reqid = request_id++; - int ret = bdrv_co_writev(bs->file->bs, sector_num, nb_sectors, qiov); - block_request_create(reqid, bs, qemu_coroutine_self()); - qemu_coroutine_yield(); - - return ret; -} - -static int coroutine_fn blkreplay_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) -{ - uint64_t reqid = request_id++; - int ret = bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags); - block_request_create(reqid, bs, qemu_coroutine_self()); - qemu_coroutine_yield(); - - return ret; -} - -static int coroutine_fn blkreplay_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) -{ - uint64_t reqid = request_id++; - int ret = bdrv_co_discard(bs->file->bs, sector_num, nb_sectors); - block_request_create(reqid, bs, qemu_coroutine_self()); - qemu_coroutine_yield(); - - return ret; -} - -static int coroutine_fn blkreplay_co_flush(BlockDriverState *bs) -{ - uint64_t reqid = request_id++; - int ret = bdrv_co_flush(bs->file->bs); - block_request_create(reqid, bs, qemu_coroutine_self()); - 
qemu_coroutine_yield(); - - return ret; -} - -static BlockDriver bdrv_blkreplay = { - .format_name = "blkreplay", - .protocol_name = "blkreplay", - .instance_size = 0, - - .bdrv_file_open = blkreplay_open, - .bdrv_close = blkreplay_close, - .bdrv_getlength = blkreplay_getlength, - - .bdrv_co_readv = blkreplay_co_readv, - .bdrv_co_writev = blkreplay_co_writev, - - .bdrv_co_write_zeroes = blkreplay_co_write_zeroes, - .bdrv_co_discard = blkreplay_co_discard, - .bdrv_co_flush = blkreplay_co_flush, -}; - -static void bdrv_blkreplay_init(void) -{ - bdrv_register(&bdrv_blkreplay); -} - -block_init(bdrv_blkreplay_init); diff --git a/qemu/block/blkverify.c b/qemu/block/blkverify.c deleted file mode 100644 index 9414b7a84..000000000 --- a/qemu/block/blkverify.c +++ /dev/null @@ -1,371 +0,0 @@ -/* - * Block protocol for block driver correctness testing - * - * Copyright (C) 2010 IBM, Corp. - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. 
- */ - -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu/sockets.h" /* for EINPROGRESS on Windows */ -#include "block/block_int.h" -#include "qapi/qmp/qdict.h" -#include "qapi/qmp/qstring.h" -#include "qemu/cutils.h" - -typedef struct { - BdrvChild *test_file; -} BDRVBlkverifyState; - -typedef struct BlkverifyAIOCB BlkverifyAIOCB; -struct BlkverifyAIOCB { - BlockAIOCB common; - QEMUBH *bh; - - /* Request metadata */ - bool is_write; - int64_t sector_num; - int nb_sectors; - - int ret; /* first completed request's result */ - unsigned int done; /* completion counter */ - - QEMUIOVector *qiov; /* user I/O vector */ - QEMUIOVector raw_qiov; /* cloned I/O vector for raw file */ - void *buf; /* buffer for raw file I/O */ - - void (*verify)(BlkverifyAIOCB *acb); -}; - -static const AIOCBInfo blkverify_aiocb_info = { - .aiocb_size = sizeof(BlkverifyAIOCB), -}; - -static void GCC_FMT_ATTR(2, 3) blkverify_err(BlkverifyAIOCB *acb, - const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - fprintf(stderr, "blkverify: %s sector_num=%" PRId64 " nb_sectors=%d ", - acb->is_write ? 
"write" : "read", acb->sector_num, - acb->nb_sectors); - vfprintf(stderr, fmt, ap); - fprintf(stderr, "\n"); - va_end(ap); - exit(1); -} - -/* Valid blkverify filenames look like blkverify:path/to/raw_image:path/to/image */ -static void blkverify_parse_filename(const char *filename, QDict *options, - Error **errp) -{ - const char *c; - QString *raw_path; - - - /* Parse the blkverify: prefix */ - if (!strstart(filename, "blkverify:", &filename)) { - /* There was no prefix; therefore, all options have to be already - present in the QDict (except for the filename) */ - qdict_put(options, "x-image", qstring_from_str(filename)); - return; - } - - /* Parse the raw image filename */ - c = strchr(filename, ':'); - if (c == NULL) { - error_setg(errp, "blkverify requires raw copy and original image path"); - return; - } - - /* TODO Implement option pass-through and set raw.filename here */ - raw_path = qstring_from_substr(filename, 0, c - filename - 1); - qdict_put(options, "x-raw", raw_path); - - /* TODO Allow multi-level nesting and set file.filename here */ - filename = c + 1; - qdict_put(options, "x-image", qstring_from_str(filename)); -} - -static QemuOptsList runtime_opts = { - .name = "blkverify", - .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), - .desc = { - { - .name = "x-raw", - .type = QEMU_OPT_STRING, - .help = "[internal use only, will be removed]", - }, - { - .name = "x-image", - .type = QEMU_OPT_STRING, - .help = "[internal use only, will be removed]", - }, - { /* end of list */ } - }, -}; - -static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVBlkverifyState *s = bs->opaque; - QemuOpts *opts; - Error *local_err = NULL; - int ret; - - opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; - } - - /* Open the raw file */ - bs->file = 
bdrv_open_child(qemu_opt_get(opts, "x-raw"), options, "raw", - bs, &child_file, false, &local_err); - if (local_err) { - ret = -EINVAL; - error_propagate(errp, local_err); - goto fail; - } - - /* Open the test file */ - s->test_file = bdrv_open_child(qemu_opt_get(opts, "x-image"), options, - "test", bs, &child_format, false, - &local_err); - if (local_err) { - ret = -EINVAL; - error_propagate(errp, local_err); - goto fail; - } - - ret = 0; -fail: - if (ret < 0) { - bdrv_unref_child(bs, bs->file); - } - qemu_opts_del(opts); - return ret; -} - -static void blkverify_close(BlockDriverState *bs) -{ - BDRVBlkverifyState *s = bs->opaque; - - bdrv_unref_child(bs, s->test_file); - s->test_file = NULL; -} - -static int64_t blkverify_getlength(BlockDriverState *bs) -{ - BDRVBlkverifyState *s = bs->opaque; - - return bdrv_getlength(s->test_file->bs); -} - -static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write, - int64_t sector_num, QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) -{ - BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aiocb_info, bs, cb, opaque); - - acb->bh = NULL; - acb->is_write = is_write; - acb->sector_num = sector_num; - acb->nb_sectors = nb_sectors; - acb->ret = -EINPROGRESS; - acb->done = 0; - acb->qiov = qiov; - acb->buf = NULL; - acb->verify = NULL; - return acb; -} - -static void blkverify_aio_bh(void *opaque) -{ - BlkverifyAIOCB *acb = opaque; - - qemu_bh_delete(acb->bh); - if (acb->buf) { - qemu_iovec_destroy(&acb->raw_qiov); - qemu_vfree(acb->buf); - } - acb->common.cb(acb->common.opaque, acb->ret); - qemu_aio_unref(acb); -} - -static void blkverify_aio_cb(void *opaque, int ret) -{ - BlkverifyAIOCB *acb = opaque; - - switch (++acb->done) { - case 1: - acb->ret = ret; - break; - - case 2: - if (acb->ret != ret) { - blkverify_err(acb, "return value mismatch %d != %d", acb->ret, ret); - } - - if (acb->verify) { - acb->verify(acb); - } - - acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs), - 
blkverify_aio_bh, acb); - qemu_bh_schedule(acb->bh); - break; - } -} - -static void blkverify_verify_readv(BlkverifyAIOCB *acb) -{ - ssize_t offset = qemu_iovec_compare(acb->qiov, &acb->raw_qiov); - if (offset != -1) { - blkverify_err(acb, "contents mismatch in sector %" PRId64, - acb->sector_num + (int64_t)(offset / BDRV_SECTOR_SIZE)); - } -} - -static BlockAIOCB *blkverify_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - BDRVBlkverifyState *s = bs->opaque; - BlkverifyAIOCB *acb = blkverify_aio_get(bs, false, sector_num, qiov, - nb_sectors, cb, opaque); - - acb->verify = blkverify_verify_readv; - acb->buf = qemu_blockalign(bs->file->bs, qiov->size); - qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov); - qemu_iovec_clone(&acb->raw_qiov, qiov, acb->buf); - - bdrv_aio_readv(s->test_file->bs, sector_num, qiov, nb_sectors, - blkverify_aio_cb, acb); - bdrv_aio_readv(bs->file->bs, sector_num, &acb->raw_qiov, nb_sectors, - blkverify_aio_cb, acb); - return &acb->common; -} - -static BlockAIOCB *blkverify_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - BDRVBlkverifyState *s = bs->opaque; - BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov, - nb_sectors, cb, opaque); - - bdrv_aio_writev(s->test_file->bs, sector_num, qiov, nb_sectors, - blkverify_aio_cb, acb); - bdrv_aio_writev(bs->file->bs, sector_num, qiov, nb_sectors, - blkverify_aio_cb, acb); - return &acb->common; -} - -static BlockAIOCB *blkverify_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, - void *opaque) -{ - BDRVBlkverifyState *s = bs->opaque; - - /* Only flush test file, the raw file is not important */ - return bdrv_aio_flush(s->test_file->bs, cb, opaque); -} - -static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs, - BlockDriverState *candidate) -{ - BDRVBlkverifyState *s = bs->opaque; - - bool 
perm = bdrv_recurse_is_first_non_filter(bs->file->bs, candidate); - - if (perm) { - return true; - } - - return bdrv_recurse_is_first_non_filter(s->test_file->bs, candidate); -} - -/* Propagate AioContext changes to ->test_file */ -static void blkverify_detach_aio_context(BlockDriverState *bs) -{ - BDRVBlkverifyState *s = bs->opaque; - - bdrv_detach_aio_context(s->test_file->bs); -} - -static void blkverify_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - BDRVBlkverifyState *s = bs->opaque; - - bdrv_attach_aio_context(s->test_file->bs, new_context); -} - -static void blkverify_refresh_filename(BlockDriverState *bs, QDict *options) -{ - BDRVBlkverifyState *s = bs->opaque; - - /* bs->file->bs has already been refreshed */ - bdrv_refresh_filename(s->test_file->bs); - - if (bs->file->bs->full_open_options - && s->test_file->bs->full_open_options) - { - QDict *opts = qdict_new(); - qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("blkverify"))); - - QINCREF(bs->file->bs->full_open_options); - qdict_put_obj(opts, "raw", QOBJECT(bs->file->bs->full_open_options)); - QINCREF(s->test_file->bs->full_open_options); - qdict_put_obj(opts, "test", - QOBJECT(s->test_file->bs->full_open_options)); - - bs->full_open_options = opts; - } - - if (bs->file->bs->exact_filename[0] - && s->test_file->bs->exact_filename[0]) - { - snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "blkverify:%s:%s", - bs->file->bs->exact_filename, - s->test_file->bs->exact_filename); - } -} - -static BlockDriver bdrv_blkverify = { - .format_name = "blkverify", - .protocol_name = "blkverify", - .instance_size = sizeof(BDRVBlkverifyState), - - .bdrv_parse_filename = blkverify_parse_filename, - .bdrv_file_open = blkverify_open, - .bdrv_close = blkverify_close, - .bdrv_getlength = blkverify_getlength, - .bdrv_refresh_filename = blkverify_refresh_filename, - - .bdrv_aio_readv = blkverify_aio_readv, - .bdrv_aio_writev = blkverify_aio_writev, - .bdrv_aio_flush = 
blkverify_aio_flush, - - .bdrv_attach_aio_context = blkverify_attach_aio_context, - .bdrv_detach_aio_context = blkverify_detach_aio_context, - - .is_filter = true, - .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter, -}; - -static void bdrv_blkverify_init(void) -{ - bdrv_register(&bdrv_blkverify); -} - -block_init(bdrv_blkverify_init); diff --git a/qemu/block/block-backend.c b/qemu/block/block-backend.c deleted file mode 100644 index 16c9d5e0f..000000000 --- a/qemu/block/block-backend.c +++ /dev/null @@ -1,1635 +0,0 @@ -/* - * QEMU Block backends - * - * Copyright (C) 2014 Red Hat, Inc. - * - * Authors: - * Markus Armbruster , - * - * This work is licensed under the terms of the GNU LGPL, version 2.1 - * or later. See the COPYING.LIB file in the top-level directory. - */ - -#include "qemu/osdep.h" -#include "sysemu/block-backend.h" -#include "block/block_int.h" -#include "block/blockjob.h" -#include "block/throttle-groups.h" -#include "sysemu/blockdev.h" -#include "sysemu/sysemu.h" -#include "qapi-event.h" -#include "qemu/id.h" - -/* Number of coroutines to reserve per attached device model */ -#define COROUTINE_POOL_RESERVATION 64 - -#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ - -static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb); - -struct BlockBackend { - char *name; - int refcnt; - BdrvChild *root; - DriveInfo *legacy_dinfo; /* null unless created by drive_new() */ - QTAILQ_ENTRY(BlockBackend) link; /* for block_backends */ - QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */ - - void *dev; /* attached device model, if any */ - /* TODO change to DeviceState when all users are qdevified */ - const BlockDevOps *dev_ops; - void *dev_opaque; - - /* the block size for which the guest device expects atomicity */ - int guest_block_size; - - /* If the BDS tree is removed, some of its options are stored here (which - * can be used to restore those options in the new BDS on 
insert) */ - BlockBackendRootState root_state; - - bool enable_write_cache; - - /* I/O stats (display with "info blockstats"). */ - BlockAcctStats stats; - - BlockdevOnError on_read_error, on_write_error; - bool iostatus_enabled; - BlockDeviceIoStatus iostatus; - - bool allow_write_beyond_eof; - - NotifierList remove_bs_notifiers, insert_bs_notifiers; -}; - -typedef struct BlockBackendAIOCB { - BlockAIOCB common; - QEMUBH *bh; - BlockBackend *blk; - int ret; -} BlockBackendAIOCB; - -static const AIOCBInfo block_backend_aiocb_info = { - .get_aio_context = blk_aiocb_get_aio_context, - .aiocb_size = sizeof(BlockBackendAIOCB), -}; - -static void drive_info_del(DriveInfo *dinfo); - -/* All BlockBackends */ -static QTAILQ_HEAD(, BlockBackend) block_backends = - QTAILQ_HEAD_INITIALIZER(block_backends); - -/* All BlockBackends referenced by the monitor and which are iterated through by - * blk_next() */ -static QTAILQ_HEAD(, BlockBackend) monitor_block_backends = - QTAILQ_HEAD_INITIALIZER(monitor_block_backends); - -static void blk_root_inherit_options(int *child_flags, QDict *child_options, - int parent_flags, QDict *parent_options) -{ - /* We're not supposed to call this function for root nodes */ - abort(); -} - -static const BdrvChildRole child_root = { - .inherit_options = blk_root_inherit_options, -}; - -/* - * Create a new BlockBackend with a reference count of one. - * Store an error through @errp on failure, unless it's null. - * Return the new BlockBackend on success, null on failure. - */ -BlockBackend *blk_new(Error **errp) -{ - BlockBackend *blk; - - blk = g_new0(BlockBackend, 1); - blk->refcnt = 1; - notifier_list_init(&blk->remove_bs_notifiers); - notifier_list_init(&blk->insert_bs_notifiers); - QTAILQ_INSERT_TAIL(&block_backends, blk, link); - return blk; -} - -/* - * Create a new BlockBackend with a new BlockDriverState attached. - * Otherwise just like blk_new(), which see. 
- */ -BlockBackend *blk_new_with_bs(Error **errp) -{ - BlockBackend *blk; - BlockDriverState *bs; - - blk = blk_new(errp); - if (!blk) { - return NULL; - } - - bs = bdrv_new_root(); - blk->root = bdrv_root_attach_child(bs, "root", &child_root); - bs->blk = blk; - return blk; -} - -/* - * Calls blk_new_with_bs() and then calls bdrv_open() on the BlockDriverState. - * - * Just as with bdrv_open(), after having called this function the reference to - * @options belongs to the block layer (even on failure). - * - * TODO: Remove @filename and @flags; it should be possible to specify a whole - * BDS tree just by specifying the @options QDict (or @reference, - * alternatively). At the time of adding this function, this is not possible, - * though, so callers of this function have to be able to specify @filename and - * @flags. - */ -BlockBackend *blk_new_open(const char *filename, const char *reference, - QDict *options, int flags, Error **errp) -{ - BlockBackend *blk; - int ret; - - blk = blk_new_with_bs(errp); - if (!blk) { - QDECREF(options); - return NULL; - } - - ret = bdrv_open(&blk->root->bs, filename, reference, options, flags, errp); - if (ret < 0) { - blk_unref(blk); - return NULL; - } - - blk_set_enable_write_cache(blk, true); - - return blk; -} - -static void blk_delete(BlockBackend *blk) -{ - assert(!blk->refcnt); - assert(!blk->name); - assert(!blk->dev); - if (blk->root) { - blk_remove_bs(blk); - } - assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers)); - assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers)); - if (blk->root_state.throttle_state) { - g_free(blk->root_state.throttle_group); - throttle_group_unref(blk->root_state.throttle_state); - } - QTAILQ_REMOVE(&block_backends, blk, link); - drive_info_del(blk->legacy_dinfo); - block_acct_cleanup(&blk->stats); - g_free(blk); -} - -static void drive_info_del(DriveInfo *dinfo) -{ - if (!dinfo) { - return; - } - qemu_opts_del(dinfo->opts); - g_free(dinfo->serial); - g_free(dinfo); -} - -int 
blk_get_refcnt(BlockBackend *blk) -{ - return blk ? blk->refcnt : 0; -} - -/* - * Increment @blk's reference count. - * @blk must not be null. - */ -void blk_ref(BlockBackend *blk) -{ - blk->refcnt++; -} - -/* - * Decrement @blk's reference count. - * If this drops it to zero, destroy @blk. - * For convenience, do nothing if @blk is null. - */ -void blk_unref(BlockBackend *blk) -{ - if (blk) { - assert(blk->refcnt > 0); - if (!--blk->refcnt) { - blk_delete(blk); - } - } -} - -/* - * Behaves similarly to blk_next() but iterates over all BlockBackends, even the - * ones which are hidden (i.e. are not referenced by the monitor). - */ -static BlockBackend *blk_all_next(BlockBackend *blk) -{ - return blk ? QTAILQ_NEXT(blk, link) - : QTAILQ_FIRST(&block_backends); -} - -void blk_remove_all_bs(void) -{ - BlockBackend *blk = NULL; - - while ((blk = blk_all_next(blk)) != NULL) { - AioContext *ctx = blk_get_aio_context(blk); - - aio_context_acquire(ctx); - if (blk->root) { - blk_remove_bs(blk); - } - aio_context_release(ctx); - } -} - -/* - * Return the monitor-owned BlockBackend after @blk. - * If @blk is null, return the first one. - * Else, return @blk's next sibling, which may be null. - * - * To iterate over all BlockBackends, do - * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) { - * ... - * } - */ -BlockBackend *blk_next(BlockBackend *blk) -{ - return blk ? QTAILQ_NEXT(blk, monitor_link) - : QTAILQ_FIRST(&monitor_block_backends); -} - -/* - * Iterates over all BlockDriverStates which are attached to a BlockBackend. - * This function is for use by bdrv_next(). - * - * @bs must be NULL or a BDS that is attached to a BB. - */ -BlockDriverState *blk_next_root_bs(BlockDriverState *bs) -{ - BlockBackend *blk; - - if (bs) { - assert(bs->blk); - blk = bs->blk; - } else { - blk = NULL; - } - - do { - blk = blk_all_next(blk); - } while (blk && !blk->root); - - return blk ? 
blk->root->bs : NULL; -} - -/* - * Add a BlockBackend into the list of backends referenced by the monitor, with - * the given @name acting as the handle for the monitor. - * Strictly for use by blockdev.c. - * - * @name must not be null or empty. - * - * Returns true on success and false on failure. In the latter case, an Error - * object is returned through @errp. - */ -bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp) -{ - assert(!blk->name); - assert(name && name[0]); - - if (!id_wellformed(name)) { - error_setg(errp, "Invalid device name"); - return false; - } - if (blk_by_name(name)) { - error_setg(errp, "Device with id '%s' already exists", name); - return false; - } - if (bdrv_find_node(name)) { - error_setg(errp, - "Device name '%s' conflicts with an existing node name", - name); - return false; - } - - blk->name = g_strdup(name); - QTAILQ_INSERT_TAIL(&monitor_block_backends, blk, monitor_link); - return true; -} - -/* - * Remove a BlockBackend from the list of backends referenced by the monitor. - * Strictly for use by blockdev.c. - */ -void monitor_remove_blk(BlockBackend *blk) -{ - if (!blk->name) { - return; - } - - QTAILQ_REMOVE(&monitor_block_backends, blk, monitor_link); - g_free(blk->name); - blk->name = NULL; -} - -/* - * Return @blk's name, a non-null string. - * Returns an empty string iff @blk is not referenced by the monitor. - */ -const char *blk_name(BlockBackend *blk) -{ - return blk->name ?: ""; -} - -/* - * Return the BlockBackend with name @name if it exists, else null. - * @name must not be null. - */ -BlockBackend *blk_by_name(const char *name) -{ - BlockBackend *blk = NULL; - - assert(name); - while ((blk = blk_next(blk)) != NULL) { - if (!strcmp(name, blk->name)) { - return blk; - } - } - return NULL; -} - -/* - * Return the BlockDriverState attached to @blk if any, else null. - */ -BlockDriverState *blk_bs(BlockBackend *blk) -{ - return blk->root ? 
blk->root->bs : NULL; -} - -/* - * Return @blk's DriveInfo if any, else null. - */ -DriveInfo *blk_legacy_dinfo(BlockBackend *blk) -{ - return blk->legacy_dinfo; -} - -/* - * Set @blk's DriveInfo to @dinfo, and return it. - * @blk must not have a DriveInfo set already. - * No other BlockBackend may have the same DriveInfo set. - */ -DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo) -{ - assert(!blk->legacy_dinfo); - return blk->legacy_dinfo = dinfo; -} - -/* - * Return the BlockBackend with DriveInfo @dinfo. - * It must exist. - */ -BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo) -{ - BlockBackend *blk = NULL; - - while ((blk = blk_next(blk)) != NULL) { - if (blk->legacy_dinfo == dinfo) { - return blk; - } - } - abort(); -} - -/* - * Disassociates the currently associated BlockDriverState from @blk. - */ -void blk_remove_bs(BlockBackend *blk) -{ - assert(blk->root->bs->blk == blk); - - notifier_list_notify(&blk->remove_bs_notifiers, blk); - - blk_update_root_state(blk); - - blk->root->bs->blk = NULL; - bdrv_root_unref_child(blk->root); - blk->root = NULL; -} - -/* - * Associates a new BlockDriverState with @blk. - */ -void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs) -{ - assert(!blk->root && !bs->blk); - bdrv_ref(bs); - blk->root = bdrv_root_attach_child(bs, "root", &child_root); - bs->blk = blk; - - notifier_list_notify(&blk->insert_bs_notifiers, blk); -} - -/* - * Attach device model @dev to @blk. - * Return 0 on success, -EBUSY when a device model is attached already. - */ -int blk_attach_dev(BlockBackend *blk, void *dev) -/* TODO change to DeviceState *dev when all users are qdevified */ -{ - if (blk->dev) { - return -EBUSY; - } - blk_ref(blk); - blk->dev = dev; - blk_iostatus_reset(blk); - return 0; -} - -/* - * Attach device model @dev to @blk. - * @blk must not have a device model attached already. 
- * TODO qdevified devices don't use this, remove when devices are qdevified - */ -void blk_attach_dev_nofail(BlockBackend *blk, void *dev) -{ - if (blk_attach_dev(blk, dev) < 0) { - abort(); - } -} - -/* - * Detach device model @dev from @blk. - * @dev must be currently attached to @blk. - */ -void blk_detach_dev(BlockBackend *blk, void *dev) -/* TODO change to DeviceState *dev when all users are qdevified */ -{ - assert(blk->dev == dev); - blk->dev = NULL; - blk->dev_ops = NULL; - blk->dev_opaque = NULL; - blk->guest_block_size = 512; - blk_unref(blk); -} - -/* - * Return the device model attached to @blk if any, else null. - */ -void *blk_get_attached_dev(BlockBackend *blk) -/* TODO change to return DeviceState * when all users are qdevified */ -{ - return blk->dev; -} - -/* - * Set @blk's device model callbacks to @ops. - * @opaque is the opaque argument to pass to the callbacks. - * This is for use by device models. - */ -void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, - void *opaque) -{ - blk->dev_ops = ops; - blk->dev_opaque = opaque; -} - -/* - * Notify @blk's attached device model of media change. - * If @load is true, notify of media load. - * Else, notify of media eject. - * Also send DEVICE_TRAY_MOVED events as appropriate. - */ -void blk_dev_change_media_cb(BlockBackend *blk, bool load) -{ - if (blk->dev_ops && blk->dev_ops->change_media_cb) { - bool tray_was_open, tray_is_open; - - tray_was_open = blk_dev_is_tray_open(blk); - blk->dev_ops->change_media_cb(blk->dev_opaque, load); - tray_is_open = blk_dev_is_tray_open(blk); - - if (tray_was_open != tray_is_open) { - qapi_event_send_device_tray_moved(blk_name(blk), tray_is_open, - &error_abort); - } - } -} - -/* - * Does @blk's attached device model have removable media? - * %true if no device model is attached. 
- */ -bool blk_dev_has_removable_media(BlockBackend *blk) -{ - return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb); -} - -/* - * Does @blk's attached device model have a tray? - */ -bool blk_dev_has_tray(BlockBackend *blk) -{ - return blk->dev_ops && blk->dev_ops->is_tray_open; -} - -/* - * Notify @blk's attached device model of a media eject request. - * If @force is true, the medium is about to be yanked out forcefully. - */ -void blk_dev_eject_request(BlockBackend *blk, bool force) -{ - if (blk->dev_ops && blk->dev_ops->eject_request_cb) { - blk->dev_ops->eject_request_cb(blk->dev_opaque, force); - } -} - -/* - * Does @blk's attached device model have a tray, and is it open? - */ -bool blk_dev_is_tray_open(BlockBackend *blk) -{ - if (blk_dev_has_tray(blk)) { - return blk->dev_ops->is_tray_open(blk->dev_opaque); - } - return false; -} - -/* - * Does @blk's attached device model have the medium locked? - * %false if the device model has no such lock. - */ -bool blk_dev_is_medium_locked(BlockBackend *blk) -{ - if (blk->dev_ops && blk->dev_ops->is_medium_locked) { - return blk->dev_ops->is_medium_locked(blk->dev_opaque); - } - return false; -} - -/* - * Notify @blk's attached device model of a backend size change. 
- */ -void blk_dev_resize_cb(BlockBackend *blk) -{ - if (blk->dev_ops && blk->dev_ops->resize_cb) { - blk->dev_ops->resize_cb(blk->dev_opaque); - } -} - -void blk_iostatus_enable(BlockBackend *blk) -{ - blk->iostatus_enabled = true; - blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK; -} - -/* The I/O status is only enabled if the drive explicitly - * enables it _and_ the VM is configured to stop on errors */ -bool blk_iostatus_is_enabled(const BlockBackend *blk) -{ - return (blk->iostatus_enabled && - (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC || - blk->on_write_error == BLOCKDEV_ON_ERROR_STOP || - blk->on_read_error == BLOCKDEV_ON_ERROR_STOP)); -} - -BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk) -{ - return blk->iostatus; -} - -void blk_iostatus_disable(BlockBackend *blk) -{ - blk->iostatus_enabled = false; -} - -void blk_iostatus_reset(BlockBackend *blk) -{ - if (blk_iostatus_is_enabled(blk)) { - BlockDriverState *bs = blk_bs(blk); - blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK; - if (bs && bs->job) { - block_job_iostatus_reset(bs->job); - } - } -} - -void blk_iostatus_set_err(BlockBackend *blk, int error) -{ - assert(blk_iostatus_is_enabled(blk)); - if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) { - blk->iostatus = error == ENOSPC ? 
BLOCK_DEVICE_IO_STATUS_NOSPACE : - BLOCK_DEVICE_IO_STATUS_FAILED; - } -} - -void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow) -{ - blk->allow_write_beyond_eof = allow; -} - -static int blk_check_byte_request(BlockBackend *blk, int64_t offset, - size_t size) -{ - int64_t len; - - if (size > INT_MAX) { - return -EIO; - } - - if (!blk_is_available(blk)) { - return -ENOMEDIUM; - } - - if (offset < 0) { - return -EIO; - } - - if (!blk->allow_write_beyond_eof) { - len = blk_getlength(blk); - if (len < 0) { - return len; - } - - if (offset > len || len - offset < size) { - return -EIO; - } - } - - return 0; -} - -static int blk_check_request(BlockBackend *blk, int64_t sector_num, - int nb_sectors) -{ - if (sector_num < 0 || sector_num > INT64_MAX / BDRV_SECTOR_SIZE) { - return -EIO; - } - - if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { - return -EIO; - } - - return blk_check_byte_request(blk, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE); -} - -static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, - unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - int ret = blk_check_byte_request(blk, offset, bytes); - if (ret < 0) { - return ret; - } - - return bdrv_co_do_preadv(blk_bs(blk), offset, bytes, qiov, flags); -} - -static int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, - unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - int ret; - - ret = blk_check_byte_request(blk, offset, bytes); - if (ret < 0) { - return ret; - } - - if (!blk->enable_write_cache) { - flags |= BDRV_REQ_FUA; - } - - return bdrv_co_do_pwritev(blk_bs(blk), offset, bytes, qiov, flags); -} - -typedef struct BlkRwCo { - BlockBackend *blk; - int64_t offset; - QEMUIOVector *qiov; - int ret; - BdrvRequestFlags flags; -} BlkRwCo; - -static void blk_read_entry(void *opaque) -{ - BlkRwCo *rwco = opaque; - - rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, rwco->qiov->size, - 
rwco->qiov, rwco->flags); -} - -static void blk_write_entry(void *opaque) -{ - BlkRwCo *rwco = opaque; - - rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, rwco->qiov->size, - rwco->qiov, rwco->flags); -} - -static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, - int64_t bytes, CoroutineEntry co_entry, - BdrvRequestFlags flags) -{ - AioContext *aio_context; - QEMUIOVector qiov; - struct iovec iov; - Coroutine *co; - BlkRwCo rwco; - - iov = (struct iovec) { - .iov_base = buf, - .iov_len = bytes, - }; - qemu_iovec_init_external(&qiov, &iov, 1); - - rwco = (BlkRwCo) { - .blk = blk, - .offset = offset, - .qiov = &qiov, - .flags = flags, - .ret = NOT_DONE, - }; - - co = qemu_coroutine_create(co_entry); - qemu_coroutine_enter(co, &rwco); - - aio_context = blk_get_aio_context(blk); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - - return rwco.ret; -} - -static int blk_rw(BlockBackend *blk, int64_t sector_num, uint8_t *buf, - int nb_sectors, CoroutineEntry co_entry, - BdrvRequestFlags flags) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - return blk_prw(blk, sector_num << BDRV_SECTOR_BITS, buf, - nb_sectors << BDRV_SECTOR_BITS, co_entry, flags); -} - -int blk_read(BlockBackend *blk, int64_t sector_num, uint8_t *buf, - int nb_sectors) -{ - return blk_rw(blk, sector_num, buf, nb_sectors, blk_read_entry, 0); -} - -int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf, - int nb_sectors) -{ - BlockDriverState *bs = blk_bs(blk); - bool enabled; - int ret; - - ret = blk_check_request(blk, sector_num, nb_sectors); - if (ret < 0) { - return ret; - } - - enabled = bs->io_limits_enabled; - bs->io_limits_enabled = false; - ret = blk_read(blk, sector_num, buf, nb_sectors); - bs->io_limits_enabled = enabled; - return ret; -} - -int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, - int nb_sectors) -{ - return blk_rw(blk, sector_num, (uint8_t*) buf, nb_sectors, 
- blk_write_entry, 0); -} - -int blk_write_zeroes(BlockBackend *blk, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) -{ - return blk_rw(blk, sector_num, NULL, nb_sectors, blk_write_entry, - flags | BDRV_REQ_ZERO_WRITE); -} - -static void error_callback_bh(void *opaque) -{ - struct BlockBackendAIOCB *acb = opaque; - qemu_bh_delete(acb->bh); - acb->common.cb(acb->common.opaque, acb->ret); - qemu_aio_unref(acb); -} - -BlockAIOCB *blk_abort_aio_request(BlockBackend *blk, - BlockCompletionFunc *cb, - void *opaque, int ret) -{ - struct BlockBackendAIOCB *acb; - QEMUBH *bh; - - acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque); - acb->blk = blk; - acb->ret = ret; - - bh = aio_bh_new(blk_get_aio_context(blk), error_callback_bh, acb); - acb->bh = bh; - qemu_bh_schedule(bh); - - return &acb->common; -} - -typedef struct BlkAioEmAIOCB { - BlockAIOCB common; - BlkRwCo rwco; - int bytes; - bool has_returned; - QEMUBH* bh; -} BlkAioEmAIOCB; - -static const AIOCBInfo blk_aio_em_aiocb_info = { - .aiocb_size = sizeof(BlkAioEmAIOCB), -}; - -static void blk_aio_complete(BlkAioEmAIOCB *acb) -{ - if (acb->bh) { - assert(acb->has_returned); - qemu_bh_delete(acb->bh); - } - if (acb->has_returned) { - acb->common.cb(acb->common.opaque, acb->rwco.ret); - qemu_aio_unref(acb); - } -} - -static void blk_aio_complete_bh(void *opaque) -{ - blk_aio_complete(opaque); -} - -static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes, - QEMUIOVector *qiov, CoroutineEntry co_entry, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque) -{ - BlkAioEmAIOCB *acb; - Coroutine *co; - - acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); - acb->rwco = (BlkRwCo) { - .blk = blk, - .offset = offset, - .qiov = qiov, - .flags = flags, - .ret = NOT_DONE, - }; - acb->bytes = bytes; - acb->bh = NULL; - acb->has_returned = false; - - co = qemu_coroutine_create(co_entry); - qemu_coroutine_enter(co, acb); - - acb->has_returned = true; - if (acb->rwco.ret 
!= NOT_DONE) { - acb->bh = aio_bh_new(blk_get_aio_context(blk), blk_aio_complete_bh, acb); - qemu_bh_schedule(acb->bh); - } - - return &acb->common; -} - -static void blk_aio_read_entry(void *opaque) -{ - BlkAioEmAIOCB *acb = opaque; - BlkRwCo *rwco = &acb->rwco; - - assert(rwco->qiov->size == acb->bytes); - rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, acb->bytes, - rwco->qiov, rwco->flags); - blk_aio_complete(acb); -} - -static void blk_aio_write_entry(void *opaque) -{ - BlkAioEmAIOCB *acb = opaque; - BlkRwCo *rwco = &acb->rwco; - - assert(!rwco->qiov || rwco->qiov->size == acb->bytes); - rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, acb->bytes, - rwco->qiov, rwco->flags); - blk_aio_complete(acb); -} - -BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return blk_abort_aio_request(blk, cb, opaque, -EINVAL); - } - - return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, NULL, - blk_aio_write_entry, flags | BDRV_REQ_ZERO_WRITE, - cb, opaque); -} - -int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count) -{ - int ret = blk_prw(blk, offset, buf, count, blk_read_entry, 0); - if (ret < 0) { - return ret; - } - return count; -} - -int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count) -{ - int ret = blk_prw(blk, offset, (void*) buf, count, blk_write_entry, 0); - if (ret < 0) { - return ret; - } - return count; -} - -int64_t blk_getlength(BlockBackend *blk) -{ - if (!blk_is_available(blk)) { - return -ENOMEDIUM; - } - - return bdrv_getlength(blk_bs(blk)); -} - -void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr) -{ - if (!blk_bs(blk)) { - *nb_sectors_ptr = 0; - } else { - bdrv_get_geometry(blk_bs(blk), nb_sectors_ptr); - } -} - -int64_t blk_nb_sectors(BlockBackend *blk) -{ - if 
(!blk_is_available(blk)) { - return -ENOMEDIUM; - } - - return bdrv_nb_sectors(blk_bs(blk)); -} - -BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num, - QEMUIOVector *iov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return blk_abort_aio_request(blk, cb, opaque, -EINVAL); - } - - assert(nb_sectors << BDRV_SECTOR_BITS == iov->size); - return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov, - blk_aio_read_entry, 0, cb, opaque); -} - -BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num, - QEMUIOVector *iov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return blk_abort_aio_request(blk, cb, opaque, -EINVAL); - } - - assert(nb_sectors << BDRV_SECTOR_BITS == iov->size); - return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov, - blk_aio_write_entry, 0, cb, opaque); -} - -BlockAIOCB *blk_aio_flush(BlockBackend *blk, - BlockCompletionFunc *cb, void *opaque) -{ - if (!blk_is_available(blk)) { - return blk_abort_aio_request(blk, cb, opaque, -ENOMEDIUM); - } - - return bdrv_aio_flush(blk_bs(blk), cb, opaque); -} - -BlockAIOCB *blk_aio_discard(BlockBackend *blk, - int64_t sector_num, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - int ret = blk_check_request(blk, sector_num, nb_sectors); - if (ret < 0) { - return blk_abort_aio_request(blk, cb, opaque, ret); - } - - return bdrv_aio_discard(blk_bs(blk), sector_num, nb_sectors, cb, opaque); -} - -void blk_aio_cancel(BlockAIOCB *acb) -{ - bdrv_aio_cancel(acb); -} - -void blk_aio_cancel_async(BlockAIOCB *acb) -{ - bdrv_aio_cancel_async(acb); -} - -int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs) -{ - int i, ret; - - for (i = 0; i < num_reqs; i++) { - ret = blk_check_request(blk, reqs[i].sector, reqs[i].nb_sectors); - if (ret < 0) { - return ret; - } - } - - return 
bdrv_aio_multiwrite(blk_bs(blk), reqs, num_reqs); -} - -int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf) -{ - if (!blk_is_available(blk)) { - return -ENOMEDIUM; - } - - return bdrv_ioctl(blk_bs(blk), req, buf); -} - -BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, - BlockCompletionFunc *cb, void *opaque) -{ - if (!blk_is_available(blk)) { - return blk_abort_aio_request(blk, cb, opaque, -ENOMEDIUM); - } - - return bdrv_aio_ioctl(blk_bs(blk), req, buf, cb, opaque); -} - -int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors) -{ - int ret = blk_check_request(blk, sector_num, nb_sectors); - if (ret < 0) { - return ret; - } - - return bdrv_co_discard(blk_bs(blk), sector_num, nb_sectors); -} - -int blk_co_flush(BlockBackend *blk) -{ - if (!blk_is_available(blk)) { - return -ENOMEDIUM; - } - - return bdrv_co_flush(blk_bs(blk)); -} - -int blk_flush(BlockBackend *blk) -{ - if (!blk_is_available(blk)) { - return -ENOMEDIUM; - } - - return bdrv_flush(blk_bs(blk)); -} - -void blk_drain(BlockBackend *blk) -{ - if (blk_bs(blk)) { - bdrv_drain(blk_bs(blk)); - } -} - -void blk_drain_all(void) -{ - bdrv_drain_all(); -} - -void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, - BlockdevOnError on_write_error) -{ - blk->on_read_error = on_read_error; - blk->on_write_error = on_write_error; -} - -BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read) -{ - return is_read ? blk->on_read_error : blk->on_write_error; -} - -BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read, - int error) -{ - BlockdevOnError on_err = blk_get_on_error(blk, is_read); - - switch (on_err) { - case BLOCKDEV_ON_ERROR_ENOSPC: - return (error == ENOSPC) ? 
- BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT; - case BLOCKDEV_ON_ERROR_STOP: - return BLOCK_ERROR_ACTION_STOP; - case BLOCKDEV_ON_ERROR_REPORT: - return BLOCK_ERROR_ACTION_REPORT; - case BLOCKDEV_ON_ERROR_IGNORE: - return BLOCK_ERROR_ACTION_IGNORE; - default: - abort(); - } -} - -static void send_qmp_error_event(BlockBackend *blk, - BlockErrorAction action, - bool is_read, int error) -{ - IoOperationType optype; - - optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; - qapi_event_send_block_io_error(blk_name(blk), optype, action, - blk_iostatus_is_enabled(blk), - error == ENOSPC, strerror(error), - &error_abort); -} - -/* This is done by device models because, while the block layer knows - * about the error, it does not know whether an operation comes from - * the device or the block layer (from a job, for example). - */ -void blk_error_action(BlockBackend *blk, BlockErrorAction action, - bool is_read, int error) -{ - assert(error >= 0); - - if (action == BLOCK_ERROR_ACTION_STOP) { - /* First set the iostatus, so that "info block" returns an iostatus - * that matches the events raised so far (an additional error iostatus - * is fine, but not a lost one). - */ - blk_iostatus_set_err(blk, error); - - /* Then raise the request to stop the VM and the event. - * qemu_system_vmstop_request_prepare has two effects. First, - * it ensures that the STOP event always comes after the - * BLOCK_IO_ERROR event. Second, it ensures that even if management - * can observe the STOP event and do a "cont" before the STOP - * event is issued, the VM will not stop. In this case, vm_start() - * also ensures that the STOP/RESUME pair of events is emitted. 
- */ - qemu_system_vmstop_request_prepare(); - send_qmp_error_event(blk, action, is_read, error); - qemu_system_vmstop_request(RUN_STATE_IO_ERROR); - } else { - send_qmp_error_event(blk, action, is_read, error); - } -} - -int blk_is_read_only(BlockBackend *blk) -{ - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - return bdrv_is_read_only(bs); - } else { - return blk->root_state.read_only; - } -} - -int blk_is_sg(BlockBackend *blk) -{ - BlockDriverState *bs = blk_bs(blk); - - if (!bs) { - return 0; - } - - return bdrv_is_sg(bs); -} - -int blk_enable_write_cache(BlockBackend *blk) -{ - return blk->enable_write_cache; -} - -void blk_set_enable_write_cache(BlockBackend *blk, bool wce) -{ - blk->enable_write_cache = wce; -} - -void blk_invalidate_cache(BlockBackend *blk, Error **errp) -{ - BlockDriverState *bs = blk_bs(blk); - - if (!bs) { - error_setg(errp, "Device '%s' has no medium", blk->name); - return; - } - - bdrv_invalidate_cache(bs, errp); -} - -bool blk_is_inserted(BlockBackend *blk) -{ - BlockDriverState *bs = blk_bs(blk); - - return bs && bdrv_is_inserted(bs); -} - -bool blk_is_available(BlockBackend *blk) -{ - return blk_is_inserted(blk) && !blk_dev_is_tray_open(blk); -} - -void blk_lock_medium(BlockBackend *blk, bool locked) -{ - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - bdrv_lock_medium(bs, locked); - } -} - -void blk_eject(BlockBackend *blk, bool eject_flag) -{ - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - bdrv_eject(bs, eject_flag); - } -} - -int blk_get_flags(BlockBackend *blk) -{ - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - return bdrv_get_flags(bs); - } else { - return blk->root_state.open_flags; - } -} - -int blk_get_max_transfer_length(BlockBackend *blk) -{ - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - return bs->bl.max_transfer_length; - } else { - return 0; - } -} - -int blk_get_max_iov(BlockBackend *blk) -{ - return blk->root->bs->bl.max_iov; -} - -void blk_set_guest_block_size(BlockBackend *blk, int align) 
-{ - blk->guest_block_size = align; -} - -void *blk_try_blockalign(BlockBackend *blk, size_t size) -{ - return qemu_try_blockalign(blk ? blk_bs(blk) : NULL, size); -} - -void *blk_blockalign(BlockBackend *blk, size_t size) -{ - return qemu_blockalign(blk ? blk_bs(blk) : NULL, size); -} - -bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp) -{ - BlockDriverState *bs = blk_bs(blk); - - if (!bs) { - return false; - } - - return bdrv_op_is_blocked(bs, op, errp); -} - -void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason) -{ - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - bdrv_op_unblock(bs, op, reason); - } -} - -void blk_op_block_all(BlockBackend *blk, Error *reason) -{ - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - bdrv_op_block_all(bs, reason); - } -} - -void blk_op_unblock_all(BlockBackend *blk, Error *reason) -{ - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - bdrv_op_unblock_all(bs, reason); - } -} - -AioContext *blk_get_aio_context(BlockBackend *blk) -{ - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - return bdrv_get_aio_context(bs); - } else { - return qemu_get_aio_context(); - } -} - -static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb) -{ - BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb); - return blk_get_aio_context(blk_acb->blk); -} - -void blk_set_aio_context(BlockBackend *blk, AioContext *new_context) -{ - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - bdrv_set_aio_context(bs, new_context); - } -} - -void blk_add_aio_context_notifier(BlockBackend *blk, - void (*attached_aio_context)(AioContext *new_context, void *opaque), - void (*detach_aio_context)(void *opaque), void *opaque) -{ - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - bdrv_add_aio_context_notifier(bs, attached_aio_context, - detach_aio_context, opaque); - } -} - -void blk_remove_aio_context_notifier(BlockBackend *blk, - void (*attached_aio_context)(AioContext *, - void *), - void 
(*detach_aio_context)(void *), - void *opaque) -{ - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - bdrv_remove_aio_context_notifier(bs, attached_aio_context, - detach_aio_context, opaque); - } -} - -void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify) -{ - notifier_list_add(&blk->remove_bs_notifiers, notify); -} - -void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify) -{ - notifier_list_add(&blk->insert_bs_notifiers, notify); -} - -void blk_io_plug(BlockBackend *blk) -{ - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - bdrv_io_plug(bs); - } -} - -void blk_io_unplug(BlockBackend *blk) -{ - BlockDriverState *bs = blk_bs(blk); - - if (bs) { - bdrv_io_unplug(bs); - } -} - -BlockAcctStats *blk_get_stats(BlockBackend *blk) -{ - return &blk->stats; -} - -void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk, - BlockCompletionFunc *cb, void *opaque) -{ - return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque); -} - -int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - return blk_co_pwritev(blk, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, NULL, - flags | BDRV_REQ_ZERO_WRITE); -} - -int blk_write_compressed(BlockBackend *blk, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - int ret = blk_check_request(blk, sector_num, nb_sectors); - if (ret < 0) { - return ret; - } - - return bdrv_write_compressed(blk_bs(blk), sector_num, buf, nb_sectors); -} - -int blk_truncate(BlockBackend *blk, int64_t offset) -{ - if (!blk_is_available(blk)) { - return -ENOMEDIUM; - } - - return bdrv_truncate(blk_bs(blk), offset); -} - -int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors) -{ - int ret = blk_check_request(blk, sector_num, nb_sectors); - if (ret < 0) { - return ret; - } - - return bdrv_discard(blk_bs(blk), sector_num, 
nb_sectors); -} - -int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, - int64_t pos, int size) -{ - int ret; - - if (!blk_is_available(blk)) { - return -ENOMEDIUM; - } - - ret = bdrv_save_vmstate(blk_bs(blk), buf, pos, size); - if (ret < 0) { - return ret; - } - - if (ret == size && !blk->enable_write_cache) { - ret = bdrv_flush(blk_bs(blk)); - } - - return ret < 0 ? ret : size; -} - -int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size) -{ - if (!blk_is_available(blk)) { - return -ENOMEDIUM; - } - - return bdrv_load_vmstate(blk_bs(blk), buf, pos, size); -} - -int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz) -{ - if (!blk_is_available(blk)) { - return -ENOMEDIUM; - } - - return bdrv_probe_blocksizes(blk_bs(blk), bsz); -} - -int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo) -{ - if (!blk_is_available(blk)) { - return -ENOMEDIUM; - } - - return bdrv_probe_geometry(blk_bs(blk), geo); -} - -/* - * Updates the BlockBackendRootState object with data from the currently - * attached BlockDriverState. - */ -void blk_update_root_state(BlockBackend *blk) -{ - assert(blk->root); - - blk->root_state.open_flags = blk->root->bs->open_flags; - blk->root_state.read_only = blk->root->bs->read_only; - blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes; - - if (blk->root_state.throttle_group) { - g_free(blk->root_state.throttle_group); - throttle_group_unref(blk->root_state.throttle_state); - } - if (blk->root->bs->throttle_state) { - const char *name = throttle_group_get_name(blk->root->bs); - blk->root_state.throttle_group = g_strdup(name); - blk->root_state.throttle_state = throttle_group_incref(name); - } else { - blk->root_state.throttle_group = NULL; - blk->root_state.throttle_state = NULL; - } -} - -/* - * Applies the information in the root state to the given BlockDriverState. 
This - * does not include the flags which have to be specified for bdrv_open(), use - * blk_get_open_flags_from_root_state() to inquire them. - */ -void blk_apply_root_state(BlockBackend *blk, BlockDriverState *bs) -{ - bs->detect_zeroes = blk->root_state.detect_zeroes; - if (blk->root_state.throttle_group) { - bdrv_io_limits_enable(bs, blk->root_state.throttle_group); - } -} - -/* - * Returns the flags to be used for bdrv_open() of a BlockDriverState which is - * supposed to inherit the root state. - */ -int blk_get_open_flags_from_root_state(BlockBackend *blk) -{ - int bs_flags; - - bs_flags = blk->root_state.read_only ? 0 : BDRV_O_RDWR; - bs_flags |= blk->root_state.open_flags & ~BDRV_O_RDWR; - - return bs_flags; -} - -BlockBackendRootState *blk_get_root_state(BlockBackend *blk) -{ - return &blk->root_state; -} - -int blk_commit_all(void) -{ - BlockBackend *blk = NULL; - - while ((blk = blk_all_next(blk)) != NULL) { - AioContext *aio_context = blk_get_aio_context(blk); - - aio_context_acquire(aio_context); - if (blk_is_inserted(blk) && blk->root->bs->backing) { - int ret = bdrv_commit(blk->root->bs); - if (ret < 0) { - aio_context_release(aio_context); - return ret; - } - } - aio_context_release(aio_context); - } - return 0; -} - -int blk_flush_all(void) -{ - BlockBackend *blk = NULL; - int result = 0; - - while ((blk = blk_all_next(blk)) != NULL) { - AioContext *aio_context = blk_get_aio_context(blk); - int ret; - - aio_context_acquire(aio_context); - if (blk_is_inserted(blk)) { - ret = blk_flush(blk); - if (ret < 0 && !result) { - result = ret; - } - } - aio_context_release(aio_context); - } - - return result; -} diff --git a/qemu/block/bochs.c b/qemu/block/bochs.c deleted file mode 100644 index af8b7abdf..000000000 --- a/qemu/block/bochs.c +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Block driver for the various disk image formats used by Bochs - * Currently only for "growing" type in read-only mode - * - * Copyright (c) 2005 Alex Beregszaszi - * - * Permission is 
hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu-common.h" -#include "block/block_int.h" -#include "qemu/module.h" - -/**************************************************************/ - -#define HEADER_MAGIC "Bochs Virtual HD Image" -#define HEADER_VERSION 0x00020000 -#define HEADER_V1 0x00010000 -#define HEADER_SIZE 512 - -#define REDOLOG_TYPE "Redolog" -#define GROWING_TYPE "Growing" - -// not allocated: 0xffffffff - -// always little-endian -struct bochs_header { - char magic[32]; /* "Bochs Virtual HD Image" */ - char type[16]; /* "Redolog" */ - char subtype[16]; /* "Undoable" / "Volatile" / "Growing" */ - uint32_t version; - uint32_t header; /* size of header */ - - uint32_t catalog; /* num of entries */ - uint32_t bitmap; /* bitmap size */ - uint32_t extent; /* extent size */ - - union { - struct { - uint32_t reserved; /* for ??? 
*/ - uint64_t disk; /* disk size */ - char padding[HEADER_SIZE - 64 - 20 - 12]; - } QEMU_PACKED redolog; - struct { - uint64_t disk; /* disk size */ - char padding[HEADER_SIZE - 64 - 20 - 8]; - } QEMU_PACKED redolog_v1; - char padding[HEADER_SIZE - 64 - 20]; - } extra; -} QEMU_PACKED; - -typedef struct BDRVBochsState { - CoMutex lock; - uint32_t *catalog_bitmap; - uint32_t catalog_size; - - uint32_t data_offset; - - uint32_t bitmap_blocks; - uint32_t extent_blocks; - uint32_t extent_size; -} BDRVBochsState; - -static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename) -{ - const struct bochs_header *bochs = (const void *)buf; - - if (buf_size < HEADER_SIZE) - return 0; - - if (!strcmp(bochs->magic, HEADER_MAGIC) && - !strcmp(bochs->type, REDOLOG_TYPE) && - !strcmp(bochs->subtype, GROWING_TYPE) && - ((le32_to_cpu(bochs->version) == HEADER_VERSION) || - (le32_to_cpu(bochs->version) == HEADER_V1))) - return 100; - - return 0; -} - -static int bochs_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVBochsState *s = bs->opaque; - uint32_t i; - struct bochs_header bochs; - int ret; - - bs->read_only = 1; // no write support yet - - ret = bdrv_pread(bs->file->bs, 0, &bochs, sizeof(bochs)); - if (ret < 0) { - return ret; - } - - if (strcmp(bochs.magic, HEADER_MAGIC) || - strcmp(bochs.type, REDOLOG_TYPE) || - strcmp(bochs.subtype, GROWING_TYPE) || - ((le32_to_cpu(bochs.version) != HEADER_VERSION) && - (le32_to_cpu(bochs.version) != HEADER_V1))) { - error_setg(errp, "Image not in Bochs format"); - return -EINVAL; - } - - if (le32_to_cpu(bochs.version) == HEADER_V1) { - bs->total_sectors = le64_to_cpu(bochs.extra.redolog_v1.disk) / 512; - } else { - bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512; - } - - /* Limit to 1M entries to avoid unbounded allocation. This is what is - * needed for the largest image that bximage can create (~8 TB). 
*/ - s->catalog_size = le32_to_cpu(bochs.catalog); - if (s->catalog_size > 0x100000) { - error_setg(errp, "Catalog size is too large"); - return -EFBIG; - } - - s->catalog_bitmap = g_try_new(uint32_t, s->catalog_size); - if (s->catalog_size && s->catalog_bitmap == NULL) { - error_setg(errp, "Could not allocate memory for catalog"); - return -ENOMEM; - } - - ret = bdrv_pread(bs->file->bs, le32_to_cpu(bochs.header), s->catalog_bitmap, - s->catalog_size * 4); - if (ret < 0) { - goto fail; - } - - for (i = 0; i < s->catalog_size; i++) - le32_to_cpus(&s->catalog_bitmap[i]); - - s->data_offset = le32_to_cpu(bochs.header) + (s->catalog_size * 4); - - s->bitmap_blocks = 1 + (le32_to_cpu(bochs.bitmap) - 1) / 512; - s->extent_blocks = 1 + (le32_to_cpu(bochs.extent) - 1) / 512; - - s->extent_size = le32_to_cpu(bochs.extent); - if (s->extent_size < BDRV_SECTOR_SIZE) { - /* bximage actually never creates extents smaller than 4k */ - error_setg(errp, "Extent size must be at least 512"); - ret = -EINVAL; - goto fail; - } else if (!is_power_of_2(s->extent_size)) { - error_setg(errp, "Extent size %" PRIu32 " is not a power of two", - s->extent_size); - ret = -EINVAL; - goto fail; - } else if (s->extent_size > 0x800000) { - error_setg(errp, "Extent size %" PRIu32 " is too large", - s->extent_size); - ret = -EINVAL; - goto fail; - } - - if (s->catalog_size < DIV_ROUND_UP(bs->total_sectors, - s->extent_size / BDRV_SECTOR_SIZE)) - { - error_setg(errp, "Catalog size is too small for this disk size"); - ret = -EINVAL; - goto fail; - } - - qemu_co_mutex_init(&s->lock); - return 0; - -fail: - g_free(s->catalog_bitmap); - return ret; -} - -static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) -{ - BDRVBochsState *s = bs->opaque; - uint64_t offset = sector_num * 512; - uint64_t extent_index, extent_offset, bitmap_offset; - char bitmap_entry; - int ret; - - // seek to sector - extent_index = offset / s->extent_size; - extent_offset = (offset % s->extent_size) / 512; - - if 
(s->catalog_bitmap[extent_index] == 0xffffffff) { - return 0; /* not allocated */ - } - - bitmap_offset = s->data_offset + - (512 * (uint64_t) s->catalog_bitmap[extent_index] * - (s->extent_blocks + s->bitmap_blocks)); - - /* read in bitmap for current extent */ - ret = bdrv_pread(bs->file->bs, bitmap_offset + (extent_offset / 8), - &bitmap_entry, 1); - if (ret < 0) { - return ret; - } - - if (!((bitmap_entry >> (extent_offset % 8)) & 1)) { - return 0; /* not allocated */ - } - - return bitmap_offset + (512 * (s->bitmap_blocks + extent_offset)); -} - -static int bochs_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - - while (nb_sectors > 0) { - int64_t block_offset = seek_to_sector(bs, sector_num); - if (block_offset < 0) { - return block_offset; - } else if (block_offset > 0) { - ret = bdrv_pread(bs->file->bs, block_offset, buf, 512); - if (ret < 0) { - return ret; - } - } else { - memset(buf, 0, 512); - } - nb_sectors--; - sector_num++; - buf += 512; - } - return 0; -} - -static coroutine_fn int bochs_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVBochsState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = bochs_read(bs, sector_num, buf, nb_sectors); - qemu_co_mutex_unlock(&s->lock); - return ret; -} - -static void bochs_close(BlockDriverState *bs) -{ - BDRVBochsState *s = bs->opaque; - g_free(s->catalog_bitmap); -} - -static BlockDriver bdrv_bochs = { - .format_name = "bochs", - .instance_size = sizeof(BDRVBochsState), - .bdrv_probe = bochs_probe, - .bdrv_open = bochs_open, - .bdrv_read = bochs_co_read, - .bdrv_close = bochs_close, -}; - -static void bdrv_bochs_init(void) -{ - bdrv_register(&bdrv_bochs); -} - -block_init(bdrv_bochs_init); diff --git a/qemu/block/cloop.c b/qemu/block/cloop.c deleted file mode 100644 index a84f14019..000000000 --- a/qemu/block/cloop.c +++ /dev/null @@ -1,285 +0,0 @@ -/* - * QEMU Block driver for CLOOP images - * - * 
Copyright (c) 2004 Johannes E. Schindelin - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu-common.h" -#include "block/block_int.h" -#include "qemu/module.h" -#include - -/* Maximum compressed block size */ -#define MAX_BLOCK_SIZE (64 * 1024 * 1024) - -typedef struct BDRVCloopState { - CoMutex lock; - uint32_t block_size; - uint32_t n_blocks; - uint64_t *offsets; - uint32_t sectors_per_block; - uint32_t current_block; - uint8_t *compressed_block; - uint8_t *uncompressed_block; - z_stream zstream; -} BDRVCloopState; - -static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename) -{ - const char *magic_version_2_0 = "#!/bin/sh\n" - "#V2.0 Format\n" - "modprobe cloop file=$0 && mount -r -t iso9660 /dev/cloop $1\n"; - int length = strlen(magic_version_2_0); - if (length > buf_size) { - length = buf_size; - } - if (!memcmp(magic_version_2_0, buf, length)) { - return 2; - } - return 0; -} - -static int cloop_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVCloopState *s = bs->opaque; - uint32_t offsets_size, max_compressed_block_size = 1, i; - int ret; - - bs->read_only = 1; - - /* read header */ - ret = bdrv_pread(bs->file->bs, 128, &s->block_size, 4); - if (ret < 0) { - return ret; - } - s->block_size = be32_to_cpu(s->block_size); - if (s->block_size % 512) { - error_setg(errp, "block_size %" PRIu32 " must be a multiple of 512", - s->block_size); - return -EINVAL; - } - if (s->block_size == 0) { - error_setg(errp, "block_size cannot be zero"); - return -EINVAL; - } - - /* cloop's create_compressed_fs.c warns about block sizes beyond 256 KB but - * we can accept more. Prevent ridiculous values like 4 GB - 1 since we - * need a buffer this big. 
- */ - if (s->block_size > MAX_BLOCK_SIZE) { - error_setg(errp, "block_size %" PRIu32 " must be %u MB or less", - s->block_size, - MAX_BLOCK_SIZE / (1024 * 1024)); - return -EINVAL; - } - - ret = bdrv_pread(bs->file->bs, 128 + 4, &s->n_blocks, 4); - if (ret < 0) { - return ret; - } - s->n_blocks = be32_to_cpu(s->n_blocks); - - /* read offsets */ - if (s->n_blocks > (UINT32_MAX - 1) / sizeof(uint64_t)) { - /* Prevent integer overflow */ - error_setg(errp, "n_blocks %" PRIu32 " must be %zu or less", - s->n_blocks, - (UINT32_MAX - 1) / sizeof(uint64_t)); - return -EINVAL; - } - offsets_size = (s->n_blocks + 1) * sizeof(uint64_t); - if (offsets_size > 512 * 1024 * 1024) { - /* Prevent ridiculous offsets_size which causes memory allocation to - * fail or overflows bdrv_pread() size. In practice the 512 MB - * offsets[] limit supports 16 TB images at 256 KB block size. - */ - error_setg(errp, "image requires too many offsets, " - "try increasing block size"); - return -EINVAL; - } - - s->offsets = g_try_malloc(offsets_size); - if (s->offsets == NULL) { - error_setg(errp, "Could not allocate offsets table"); - return -ENOMEM; - } - - ret = bdrv_pread(bs->file->bs, 128 + 4 + 4, s->offsets, offsets_size); - if (ret < 0) { - goto fail; - } - - for (i = 0; i < s->n_blocks + 1; i++) { - uint64_t size; - - s->offsets[i] = be64_to_cpu(s->offsets[i]); - if (i == 0) { - continue; - } - - if (s->offsets[i] < s->offsets[i - 1]) { - error_setg(errp, "offsets not monotonically increasing at " - "index %" PRIu32 ", image file is corrupt", i); - ret = -EINVAL; - goto fail; - } - - size = s->offsets[i] - s->offsets[i - 1]; - - /* Compressed blocks should be smaller than the uncompressed block size - * but maybe compression performed poorly so the compressed block is - * actually bigger. Clamp down on unrealistic values to prevent - * ridiculous s->compressed_block allocation. 
- */ - if (size > 2 * MAX_BLOCK_SIZE) { - error_setg(errp, "invalid compressed block size at index %" PRIu32 - ", image file is corrupt", i); - ret = -EINVAL; - goto fail; - } - - if (size > max_compressed_block_size) { - max_compressed_block_size = size; - } - } - - /* initialize zlib engine */ - s->compressed_block = g_try_malloc(max_compressed_block_size + 1); - if (s->compressed_block == NULL) { - error_setg(errp, "Could not allocate compressed_block"); - ret = -ENOMEM; - goto fail; - } - - s->uncompressed_block = g_try_malloc(s->block_size); - if (s->uncompressed_block == NULL) { - error_setg(errp, "Could not allocate uncompressed_block"); - ret = -ENOMEM; - goto fail; - } - - if (inflateInit(&s->zstream) != Z_OK) { - ret = -EINVAL; - goto fail; - } - s->current_block = s->n_blocks; - - s->sectors_per_block = s->block_size/512; - bs->total_sectors = s->n_blocks * s->sectors_per_block; - qemu_co_mutex_init(&s->lock); - return 0; - -fail: - g_free(s->offsets); - g_free(s->compressed_block); - g_free(s->uncompressed_block); - return ret; -} - -static inline int cloop_read_block(BlockDriverState *bs, int block_num) -{ - BDRVCloopState *s = bs->opaque; - - if (s->current_block != block_num) { - int ret; - uint32_t bytes = s->offsets[block_num + 1] - s->offsets[block_num]; - - ret = bdrv_pread(bs->file->bs, s->offsets[block_num], - s->compressed_block, bytes); - if (ret != bytes) { - return -1; - } - - s->zstream.next_in = s->compressed_block; - s->zstream.avail_in = bytes; - s->zstream.next_out = s->uncompressed_block; - s->zstream.avail_out = s->block_size; - ret = inflateReset(&s->zstream); - if (ret != Z_OK) { - return -1; - } - ret = inflate(&s->zstream, Z_FINISH); - if (ret != Z_STREAM_END || s->zstream.total_out != s->block_size) { - return -1; - } - - s->current_block = block_num; - } - return 0; -} - -static int cloop_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - BDRVCloopState *s = bs->opaque; - int i; - - for (i = 0; 
i < nb_sectors; i++) { - uint32_t sector_offset_in_block = - ((sector_num + i) % s->sectors_per_block), - block_num = (sector_num + i) / s->sectors_per_block; - if (cloop_read_block(bs, block_num) != 0) { - return -1; - } - memcpy(buf + i * 512, - s->uncompressed_block + sector_offset_in_block * 512, 512); - } - return 0; -} - -static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVCloopState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = cloop_read(bs, sector_num, buf, nb_sectors); - qemu_co_mutex_unlock(&s->lock); - return ret; -} - -static void cloop_close(BlockDriverState *bs) -{ - BDRVCloopState *s = bs->opaque; - g_free(s->offsets); - g_free(s->compressed_block); - g_free(s->uncompressed_block); - inflateEnd(&s->zstream); -} - -static BlockDriver bdrv_cloop = { - .format_name = "cloop", - .instance_size = sizeof(BDRVCloopState), - .bdrv_probe = cloop_probe, - .bdrv_open = cloop_open, - .bdrv_read = cloop_co_read, - .bdrv_close = cloop_close, -}; - -static void bdrv_cloop_init(void) -{ - bdrv_register(&bdrv_cloop); -} - -block_init(bdrv_cloop_init); diff --git a/qemu/block/commit.c b/qemu/block/commit.c deleted file mode 100644 index cba0e8c1e..000000000 --- a/qemu/block/commit.c +++ /dev/null @@ -1,277 +0,0 @@ -/* - * Live block commit - * - * Copyright Red Hat, Inc. 2012 - * - * Authors: - * Jeff Cody - * Based on stream.c by Stefan Hajnoczi - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -#include "qemu/osdep.h" -#include "trace.h" -#include "block/block_int.h" -#include "block/blockjob.h" -#include "qapi/error.h" -#include "qapi/qmp/qerror.h" -#include "qemu/ratelimit.h" -#include "sysemu/block-backend.h" - -enum { - /* - * Size of data buffer for populating the image file. 
This should be large - * enough to process multiple clusters in a single call, so that populating - * contiguous regions of the image is efficient. - */ - COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */ -}; - -#define SLICE_TIME 100000000ULL /* ns */ - -typedef struct CommitBlockJob { - BlockJob common; - RateLimit limit; - BlockDriverState *active; - BlockDriverState *top; - BlockDriverState *base; - BlockdevOnError on_error; - int base_flags; - int orig_overlay_flags; - char *backing_file_str; -} CommitBlockJob; - -static int coroutine_fn commit_populate(BlockDriverState *bs, - BlockDriverState *base, - int64_t sector_num, int nb_sectors, - void *buf) -{ - int ret = 0; - - ret = bdrv_read(bs, sector_num, buf, nb_sectors); - if (ret) { - return ret; - } - - ret = bdrv_write(base, sector_num, buf, nb_sectors); - if (ret) { - return ret; - } - - return 0; -} - -typedef struct { - int ret; -} CommitCompleteData; - -static void commit_complete(BlockJob *job, void *opaque) -{ - CommitBlockJob *s = container_of(job, CommitBlockJob, common); - CommitCompleteData *data = opaque; - BlockDriverState *active = s->active; - BlockDriverState *top = s->top; - BlockDriverState *base = s->base; - BlockDriverState *overlay_bs; - int ret = data->ret; - - if (!block_job_is_cancelled(&s->common) && ret == 0) { - /* success */ - ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str); - } - - /* restore base open flags here if appropriate (e.g., change the base back - * to r/o). 
These reopens do not need to be atomic, since we won't abort - * even on failure here */ - if (s->base_flags != bdrv_get_flags(base)) { - bdrv_reopen(base, s->base_flags, NULL); - } - overlay_bs = bdrv_find_overlay(active, top); - if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) { - bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL); - } - g_free(s->backing_file_str); - block_job_completed(&s->common, ret); - g_free(data); -} - -static void coroutine_fn commit_run(void *opaque) -{ - CommitBlockJob *s = opaque; - CommitCompleteData *data; - BlockDriverState *top = s->top; - BlockDriverState *base = s->base; - int64_t sector_num, end; - int ret = 0; - int n = 0; - void *buf = NULL; - int bytes_written = 0; - int64_t base_len; - - ret = s->common.len = bdrv_getlength(top); - - - if (s->common.len < 0) { - goto out; - } - - ret = base_len = bdrv_getlength(base); - if (base_len < 0) { - goto out; - } - - if (base_len < s->common.len) { - ret = bdrv_truncate(base, s->common.len); - if (ret) { - goto out; - } - } - - end = s->common.len >> BDRV_SECTOR_BITS; - buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE); - - for (sector_num = 0; sector_num < end; sector_num += n) { - uint64_t delay_ns = 0; - bool copy; - -wait: - /* Note that even when no rate limit is applied we need to yield - * with no pending I/O here so that bdrv_drain_all() returns. 
- */ - block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); - if (block_job_is_cancelled(&s->common)) { - break; - } - /* Copy if allocated above the base */ - ret = bdrv_is_allocated_above(top, base, sector_num, - COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE, - &n); - copy = (ret == 1); - trace_commit_one_iteration(s, sector_num, n, ret); - if (copy) { - if (s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, n); - if (delay_ns > 0) { - goto wait; - } - } - ret = commit_populate(top, base, sector_num, n, buf); - bytes_written += n * BDRV_SECTOR_SIZE; - } - if (ret < 0) { - if (s->on_error == BLOCKDEV_ON_ERROR_STOP || - s->on_error == BLOCKDEV_ON_ERROR_REPORT|| - (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) { - goto out; - } else { - n = 0; - continue; - } - } - /* Publish progress */ - s->common.offset += n * BDRV_SECTOR_SIZE; - } - - ret = 0; - -out: - qemu_vfree(buf); - - data = g_malloc(sizeof(*data)); - data->ret = ret; - block_job_defer_to_main_loop(&s->common, commit_complete, data); -} - -static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp) -{ - CommitBlockJob *s = container_of(job, CommitBlockJob, common); - - if (speed < 0) { - error_setg(errp, QERR_INVALID_PARAMETER, "speed"); - return; - } - ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); -} - -static const BlockJobDriver commit_job_driver = { - .instance_size = sizeof(CommitBlockJob), - .job_type = BLOCK_JOB_TYPE_COMMIT, - .set_speed = commit_set_speed, -}; - -void commit_start(BlockDriverState *bs, BlockDriverState *base, - BlockDriverState *top, int64_t speed, - BlockdevOnError on_error, BlockCompletionFunc *cb, - void *opaque, const char *backing_file_str, Error **errp) -{ - CommitBlockJob *s; - BlockReopenQueue *reopen_queue = NULL; - int orig_overlay_flags; - int orig_base_flags; - BlockDriverState *overlay_bs; - Error *local_err = NULL; - - if ((on_error == BLOCKDEV_ON_ERROR_STOP || - on_error == 
BLOCKDEV_ON_ERROR_ENOSPC) && - (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { - error_setg(errp, "Invalid parameter combination"); - return; - } - - assert(top != bs); - if (top == base) { - error_setg(errp, "Invalid files for merge: top and base are the same"); - return; - } - - overlay_bs = bdrv_find_overlay(bs, top); - - if (overlay_bs == NULL) { - error_setg(errp, "Could not find overlay image for %s:", top->filename); - return; - } - - orig_base_flags = bdrv_get_flags(base); - orig_overlay_flags = bdrv_get_flags(overlay_bs); - - /* convert base & overlay_bs to r/w, if necessary */ - if (!(orig_overlay_flags & BDRV_O_RDWR)) { - reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs, NULL, - orig_overlay_flags | BDRV_O_RDWR); - } - if (!(orig_base_flags & BDRV_O_RDWR)) { - reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL, - orig_base_flags | BDRV_O_RDWR); - } - if (reopen_queue) { - bdrv_reopen_multiple(reopen_queue, &local_err); - if (local_err != NULL) { - error_propagate(errp, local_err); - return; - } - } - - - s = block_job_create(&commit_job_driver, bs, speed, cb, opaque, errp); - if (!s) { - return; - } - - s->base = base; - s->top = top; - s->active = bs; - - s->base_flags = orig_base_flags; - s->orig_overlay_flags = orig_overlay_flags; - - s->backing_file_str = g_strdup(backing_file_str); - - s->on_error = on_error; - s->common.co = qemu_coroutine_create(commit_run); - - trace_commit_start(bs, base, top, s, s->common.co, opaque); - qemu_coroutine_enter(s->common.co, s); -} diff --git a/qemu/block/crypto.c b/qemu/block/crypto.c deleted file mode 100644 index 1903e84fb..000000000 --- a/qemu/block/crypto.c +++ /dev/null @@ -1,586 +0,0 @@ -/* - * QEMU block full disk encryption - * - * Copyright (c) 2015-2016 Red Hat, Inc. 
- * - * This library is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with this library; if not, see . - * - */ - -#include "qemu/osdep.h" - -#include "block/block_int.h" -#include "sysemu/block-backend.h" -#include "crypto/block.h" -#include "qapi/opts-visitor.h" -#include "qapi-visit.h" -#include "qapi/error.h" - -#define BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET "key-secret" -#define BLOCK_CRYPTO_OPT_LUKS_CIPHER_ALG "cipher-alg" -#define BLOCK_CRYPTO_OPT_LUKS_CIPHER_MODE "cipher-mode" -#define BLOCK_CRYPTO_OPT_LUKS_IVGEN_ALG "ivgen-alg" -#define BLOCK_CRYPTO_OPT_LUKS_IVGEN_HASH_ALG "ivgen-hash-alg" -#define BLOCK_CRYPTO_OPT_LUKS_HASH_ALG "hash-alg" - -typedef struct BlockCrypto BlockCrypto; - -struct BlockCrypto { - QCryptoBlock *block; -}; - - -static int block_crypto_probe_generic(QCryptoBlockFormat format, - const uint8_t *buf, - int buf_size, - const char *filename) -{ - if (qcrypto_block_has_format(format, buf, buf_size)) { - return 100; - } else { - return 0; - } -} - - -static ssize_t block_crypto_read_func(QCryptoBlock *block, - size_t offset, - uint8_t *buf, - size_t buflen, - Error **errp, - void *opaque) -{ - BlockDriverState *bs = opaque; - ssize_t ret; - - ret = bdrv_pread(bs->file->bs, offset, buf, buflen); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not read encryption header"); - return ret; - } - return ret; -} - - -struct BlockCryptoCreateData { - const char *filename; - QemuOpts *opts; - BlockBackend 
*blk; - uint64_t size; -}; - - -static ssize_t block_crypto_write_func(QCryptoBlock *block, - size_t offset, - const uint8_t *buf, - size_t buflen, - Error **errp, - void *opaque) -{ - struct BlockCryptoCreateData *data = opaque; - ssize_t ret; - - ret = blk_pwrite(data->blk, offset, buf, buflen); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not write encryption header"); - return ret; - } - return ret; -} - - -static ssize_t block_crypto_init_func(QCryptoBlock *block, - size_t headerlen, - Error **errp, - void *opaque) -{ - struct BlockCryptoCreateData *data = opaque; - int ret; - - /* User provided size should reflect amount of space made - * available to the guest, so we must take account of that - * which will be used by the crypto header - */ - data->size += headerlen; - - qemu_opt_set_number(data->opts, BLOCK_OPT_SIZE, data->size, &error_abort); - ret = bdrv_create_file(data->filename, data->opts, errp); - if (ret < 0) { - return -1; - } - - data->blk = blk_new_open(data->filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, errp); - if (!data->blk) { - return -1; - } - - return 0; -} - - -static QemuOptsList block_crypto_runtime_opts_luks = { - .name = "crypto", - .head = QTAILQ_HEAD_INITIALIZER(block_crypto_runtime_opts_luks.head), - .desc = { - { - .name = BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET, - .type = QEMU_OPT_STRING, - .help = "ID of the secret that provides the encryption key", - }, - { /* end of list */ } - }, -}; - - -static QemuOptsList block_crypto_create_opts_luks = { - .name = "crypto", - .head = QTAILQ_HEAD_INITIALIZER(block_crypto_create_opts_luks.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET, - .type = QEMU_OPT_STRING, - .help = "ID of the secret that provides the encryption key", - }, - { - .name = BLOCK_CRYPTO_OPT_LUKS_CIPHER_ALG, - .type = QEMU_OPT_STRING, - .help = "Name of encryption cipher algorithm", - }, - { - .name = 
BLOCK_CRYPTO_OPT_LUKS_CIPHER_MODE, - .type = QEMU_OPT_STRING, - .help = "Name of encryption cipher mode", - }, - { - .name = BLOCK_CRYPTO_OPT_LUKS_IVGEN_ALG, - .type = QEMU_OPT_STRING, - .help = "Name of IV generator algorithm", - }, - { - .name = BLOCK_CRYPTO_OPT_LUKS_IVGEN_HASH_ALG, - .type = QEMU_OPT_STRING, - .help = "Name of IV generator hash algorithm", - }, - { - .name = BLOCK_CRYPTO_OPT_LUKS_HASH_ALG, - .type = QEMU_OPT_STRING, - .help = "Name of encryption hash algorithm", - }, - { /* end of list */ } - }, -}; - - -static QCryptoBlockOpenOptions * -block_crypto_open_opts_init(QCryptoBlockFormat format, - QemuOpts *opts, - Error **errp) -{ - OptsVisitor *ov; - QCryptoBlockOpenOptions *ret = NULL; - Error *local_err = NULL; - Error *end_err = NULL; - - ret = g_new0(QCryptoBlockOpenOptions, 1); - ret->format = format; - - ov = opts_visitor_new(opts); - - visit_start_struct(opts_get_visitor(ov), - NULL, NULL, 0, &local_err); - if (local_err) { - goto out; - } - - switch (format) { - case Q_CRYPTO_BLOCK_FORMAT_LUKS: - visit_type_QCryptoBlockOptionsLUKS_members( - opts_get_visitor(ov), &ret->u.luks, &local_err); - break; - - default: - error_setg(&local_err, "Unsupported block format %d", format); - break; - } - - visit_end_struct(opts_get_visitor(ov), &end_err); - error_propagate(&local_err, end_err); - - out: - if (local_err) { - error_propagate(errp, local_err); - qapi_free_QCryptoBlockOpenOptions(ret); - ret = NULL; - } - opts_visitor_cleanup(ov); - return ret; -} - - -static QCryptoBlockCreateOptions * -block_crypto_create_opts_init(QCryptoBlockFormat format, - QemuOpts *opts, - Error **errp) -{ - OptsVisitor *ov; - QCryptoBlockCreateOptions *ret = NULL; - Error *local_err = NULL; - Error *end_err = NULL; - - ret = g_new0(QCryptoBlockCreateOptions, 1); - ret->format = format; - - ov = opts_visitor_new(opts); - - visit_start_struct(opts_get_visitor(ov), - NULL, NULL, 0, &local_err); - if (local_err) { - goto out; - } - - switch (format) { - case 
Q_CRYPTO_BLOCK_FORMAT_LUKS: - visit_type_QCryptoBlockCreateOptionsLUKS_members( - opts_get_visitor(ov), &ret->u.luks, &local_err); - break; - - default: - error_setg(&local_err, "Unsupported block format %d", format); - break; - } - - visit_end_struct(opts_get_visitor(ov), &end_err); - error_propagate(&local_err, end_err); - - out: - if (local_err) { - error_propagate(errp, local_err); - qapi_free_QCryptoBlockCreateOptions(ret); - ret = NULL; - } - opts_visitor_cleanup(ov); - return ret; -} - - -static int block_crypto_open_generic(QCryptoBlockFormat format, - QemuOptsList *opts_spec, - BlockDriverState *bs, - QDict *options, - int flags, - Error **errp) -{ - BlockCrypto *crypto = bs->opaque; - QemuOpts *opts = NULL; - Error *local_err = NULL; - int ret = -EINVAL; - QCryptoBlockOpenOptions *open_opts = NULL; - unsigned int cflags = 0; - - opts = qemu_opts_create(opts_spec, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - goto cleanup; - } - - open_opts = block_crypto_open_opts_init(format, opts, errp); - if (!open_opts) { - goto cleanup; - } - - if (flags & BDRV_O_NO_IO) { - cflags |= QCRYPTO_BLOCK_OPEN_NO_IO; - } - crypto->block = qcrypto_block_open(open_opts, - block_crypto_read_func, - bs, - cflags, - errp); - - if (!crypto->block) { - ret = -EIO; - goto cleanup; - } - - bs->encrypted = 1; - bs->valid_key = 1; - - ret = 0; - cleanup: - qapi_free_QCryptoBlockOpenOptions(open_opts); - return ret; -} - - -static int block_crypto_create_generic(QCryptoBlockFormat format, - const char *filename, - QemuOpts *opts, - Error **errp) -{ - int ret = -EINVAL; - QCryptoBlockCreateOptions *create_opts = NULL; - QCryptoBlock *crypto = NULL; - struct BlockCryptoCreateData data = { - .size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE), - .opts = opts, - .filename = filename, - }; - - create_opts = block_crypto_create_opts_init(format, opts, errp); - if 
(!create_opts) { - return -1; - } - - crypto = qcrypto_block_create(create_opts, - block_crypto_init_func, - block_crypto_write_func, - &data, - errp); - - if (!crypto) { - ret = -EIO; - goto cleanup; - } - - ret = 0; - cleanup: - qcrypto_block_free(crypto); - blk_unref(data.blk); - qapi_free_QCryptoBlockCreateOptions(create_opts); - return ret; -} - -static int block_crypto_truncate(BlockDriverState *bs, int64_t offset) -{ - BlockCrypto *crypto = bs->opaque; - size_t payload_offset = - qcrypto_block_get_payload_offset(crypto->block); - - offset += payload_offset; - - return bdrv_truncate(bs->file->bs, offset); -} - -static void block_crypto_close(BlockDriverState *bs) -{ - BlockCrypto *crypto = bs->opaque; - qcrypto_block_free(crypto->block); -} - - -#define BLOCK_CRYPTO_MAX_SECTORS 32 - -static coroutine_fn int -block_crypto_co_readv(BlockDriverState *bs, int64_t sector_num, - int remaining_sectors, QEMUIOVector *qiov) -{ - BlockCrypto *crypto = bs->opaque; - int cur_nr_sectors; /* number of sectors in current iteration */ - uint64_t bytes_done = 0; - uint8_t *cipher_data = NULL; - QEMUIOVector hd_qiov; - int ret = 0; - size_t payload_offset = - qcrypto_block_get_payload_offset(crypto->block) / 512; - - qemu_iovec_init(&hd_qiov, qiov->niov); - - /* Bounce buffer so we have a linear mem region for - * entire sector. 
XXX optimize so we avoid bounce - * buffer in case that qiov->niov == 1 - */ - cipher_data = - qemu_try_blockalign(bs->file->bs, MIN(BLOCK_CRYPTO_MAX_SECTORS * 512, - qiov->size)); - if (cipher_data == NULL) { - ret = -ENOMEM; - goto cleanup; - } - - while (remaining_sectors) { - cur_nr_sectors = remaining_sectors; - - if (cur_nr_sectors > BLOCK_CRYPTO_MAX_SECTORS) { - cur_nr_sectors = BLOCK_CRYPTO_MAX_SECTORS; - } - - qemu_iovec_reset(&hd_qiov); - qemu_iovec_add(&hd_qiov, cipher_data, cur_nr_sectors * 512); - - ret = bdrv_co_readv(bs->file->bs, - payload_offset + sector_num, - cur_nr_sectors, &hd_qiov); - if (ret < 0) { - goto cleanup; - } - - if (qcrypto_block_decrypt(crypto->block, - sector_num, - cipher_data, cur_nr_sectors * 512, - NULL) < 0) { - ret = -EIO; - goto cleanup; - } - - qemu_iovec_from_buf(qiov, bytes_done, - cipher_data, cur_nr_sectors * 512); - - remaining_sectors -= cur_nr_sectors; - sector_num += cur_nr_sectors; - bytes_done += cur_nr_sectors * 512; - } - - cleanup: - qemu_iovec_destroy(&hd_qiov); - qemu_vfree(cipher_data); - - return ret; -} - - -static coroutine_fn int -block_crypto_co_writev(BlockDriverState *bs, int64_t sector_num, - int remaining_sectors, QEMUIOVector *qiov) -{ - BlockCrypto *crypto = bs->opaque; - int cur_nr_sectors; /* number of sectors in current iteration */ - uint64_t bytes_done = 0; - uint8_t *cipher_data = NULL; - QEMUIOVector hd_qiov; - int ret = 0; - size_t payload_offset = - qcrypto_block_get_payload_offset(crypto->block) / 512; - - qemu_iovec_init(&hd_qiov, qiov->niov); - - /* Bounce buffer so we have a linear mem region for - * entire sector. 
XXX optimize so we avoid bounce - * buffer in case that qiov->niov == 1 - */ - cipher_data = - qemu_try_blockalign(bs->file->bs, MIN(BLOCK_CRYPTO_MAX_SECTORS * 512, - qiov->size)); - if (cipher_data == NULL) { - ret = -ENOMEM; - goto cleanup; - } - - while (remaining_sectors) { - cur_nr_sectors = remaining_sectors; - - if (cur_nr_sectors > BLOCK_CRYPTO_MAX_SECTORS) { - cur_nr_sectors = BLOCK_CRYPTO_MAX_SECTORS; - } - - qemu_iovec_to_buf(qiov, bytes_done, - cipher_data, cur_nr_sectors * 512); - - if (qcrypto_block_encrypt(crypto->block, - sector_num, - cipher_data, cur_nr_sectors * 512, - NULL) < 0) { - ret = -EIO; - goto cleanup; - } - - qemu_iovec_reset(&hd_qiov); - qemu_iovec_add(&hd_qiov, cipher_data, cur_nr_sectors * 512); - - ret = bdrv_co_writev(bs->file->bs, - payload_offset + sector_num, - cur_nr_sectors, &hd_qiov); - if (ret < 0) { - goto cleanup; - } - - remaining_sectors -= cur_nr_sectors; - sector_num += cur_nr_sectors; - bytes_done += cur_nr_sectors * 512; - } - - cleanup: - qemu_iovec_destroy(&hd_qiov); - qemu_vfree(cipher_data); - - return ret; -} - - -static int64_t block_crypto_getlength(BlockDriverState *bs) -{ - BlockCrypto *crypto = bs->opaque; - int64_t len = bdrv_getlength(bs->file->bs); - - ssize_t offset = qcrypto_block_get_payload_offset(crypto->block); - - len -= offset; - - return len; -} - - -static int block_crypto_probe_luks(const uint8_t *buf, - int buf_size, - const char *filename) { - return block_crypto_probe_generic(Q_CRYPTO_BLOCK_FORMAT_LUKS, - buf, buf_size, filename); -} - -static int block_crypto_open_luks(BlockDriverState *bs, - QDict *options, - int flags, - Error **errp) -{ - return block_crypto_open_generic(Q_CRYPTO_BLOCK_FORMAT_LUKS, - &block_crypto_runtime_opts_luks, - bs, options, flags, errp); -} - -static int block_crypto_create_luks(const char *filename, - QemuOpts *opts, - Error **errp) -{ - return block_crypto_create_generic(Q_CRYPTO_BLOCK_FORMAT_LUKS, - filename, opts, errp); -} - -BlockDriver bdrv_crypto_luks = { 
- .format_name = "luks", - .instance_size = sizeof(BlockCrypto), - .bdrv_probe = block_crypto_probe_luks, - .bdrv_open = block_crypto_open_luks, - .bdrv_close = block_crypto_close, - .bdrv_create = block_crypto_create_luks, - .bdrv_truncate = block_crypto_truncate, - .create_opts = &block_crypto_create_opts_luks, - - .bdrv_co_readv = block_crypto_co_readv, - .bdrv_co_writev = block_crypto_co_writev, - .bdrv_getlength = block_crypto_getlength, -}; - -static void block_crypto_init(void) -{ - bdrv_register(&bdrv_crypto_luks); -} - -block_init(block_crypto_init); diff --git a/qemu/block/curl.c b/qemu/block/curl.c deleted file mode 100644 index 5a8f8b623..000000000 --- a/qemu/block/curl.c +++ /dev/null @@ -1,896 +0,0 @@ -/* - * QEMU Block driver for CURL images - * - * Copyright (c) 2009 Alexander Graf - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu-common.h" -#include "qemu/error-report.h" -#include "block/block_int.h" -#include "qapi/qmp/qbool.h" -#include "qapi/qmp/qstring.h" -#include "crypto/secret.h" -#include -#include "qemu/cutils.h" - -// #define DEBUG_CURL -// #define DEBUG_VERBOSE - -#ifdef DEBUG_CURL -#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0) -#else -#define DPRINTF(fmt, ...) do { } while (0) -#endif - -#if LIBCURL_VERSION_NUM >= 0x071000 -/* The multi interface timer callback was introduced in 7.16.0 */ -#define NEED_CURL_TIMER_CALLBACK -#define HAVE_SOCKET_ACTION -#endif - -#ifndef HAVE_SOCKET_ACTION -/* If curl_multi_socket_action isn't available, define it statically here in - * terms of curl_multi_socket. Note that ev_bitmask will be ignored, which is - * less efficient but still safe. */ -static CURLMcode __curl_multi_socket_action(CURLM *multi_handle, - curl_socket_t sockfd, - int ev_bitmask, - int *running_handles) -{ - return curl_multi_socket(multi_handle, sockfd, running_handles); -} -#define curl_multi_socket_action __curl_multi_socket_action -#endif - -#define PROTOCOLS (CURLPROTO_HTTP | CURLPROTO_HTTPS | \ - CURLPROTO_FTP | CURLPROTO_FTPS | \ - CURLPROTO_TFTP) - -#define CURL_NUM_STATES 8 -#define CURL_NUM_ACB 8 -#define SECTOR_SIZE 512 -#define READ_AHEAD_DEFAULT (256 * 1024) -#define CURL_TIMEOUT_DEFAULT 5 -#define CURL_TIMEOUT_MAX 10000 - -#define FIND_RET_NONE 0 -#define FIND_RET_OK 1 -#define FIND_RET_WAIT 2 - -#define CURL_BLOCK_OPT_URL "url" -#define CURL_BLOCK_OPT_READAHEAD "readahead" -#define CURL_BLOCK_OPT_SSLVERIFY "sslverify" -#define CURL_BLOCK_OPT_TIMEOUT "timeout" -#define CURL_BLOCK_OPT_COOKIE "cookie" -#define CURL_BLOCK_OPT_USERNAME "username" -#define CURL_BLOCK_OPT_PASSWORD_SECRET "password-secret" -#define CURL_BLOCK_OPT_PROXY_USERNAME "proxy-username" -#define CURL_BLOCK_OPT_PROXY_PASSWORD_SECRET "proxy-password-secret" - -struct BDRVCURLState; - -typedef struct 
CURLAIOCB { - BlockAIOCB common; - QEMUBH *bh; - QEMUIOVector *qiov; - - int64_t sector_num; - int nb_sectors; - - size_t start; - size_t end; -} CURLAIOCB; - -typedef struct CURLState -{ - struct BDRVCURLState *s; - CURLAIOCB *acb[CURL_NUM_ACB]; - CURL *curl; - curl_socket_t sock_fd; - char *orig_buf; - size_t buf_start; - size_t buf_off; - size_t buf_len; - char range[128]; - char errmsg[CURL_ERROR_SIZE]; - char in_use; -} CURLState; - -typedef struct BDRVCURLState { - CURLM *multi; - QEMUTimer timer; - size_t len; - CURLState states[CURL_NUM_STATES]; - char *url; - size_t readahead_size; - bool sslverify; - uint64_t timeout; - char *cookie; - bool accept_range; - AioContext *aio_context; - char *username; - char *password; - char *proxyusername; - char *proxypassword; -} BDRVCURLState; - -static void curl_clean_state(CURLState *s); -static void curl_multi_do(void *arg); -static void curl_multi_read(void *arg); - -#ifdef NEED_CURL_TIMER_CALLBACK -static int curl_timer_cb(CURLM *multi, long timeout_ms, void *opaque) -{ - BDRVCURLState *s = opaque; - - DPRINTF("CURL: timer callback timeout_ms %ld\n", timeout_ms); - if (timeout_ms == -1) { - timer_del(&s->timer); - } else { - int64_t timeout_ns = (int64_t)timeout_ms * 1000 * 1000; - timer_mod(&s->timer, - qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ns); - } - return 0; -} -#endif - -static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, - void *userp, void *sp) -{ - BDRVCURLState *s; - CURLState *state = NULL; - curl_easy_getinfo(curl, CURLINFO_PRIVATE, (char **)&state); - state->sock_fd = fd; - s = state->s; - - DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, fd); - switch (action) { - case CURL_POLL_IN: - aio_set_fd_handler(s->aio_context, fd, false, - curl_multi_read, NULL, state); - break; - case CURL_POLL_OUT: - aio_set_fd_handler(s->aio_context, fd, false, - NULL, curl_multi_do, state); - break; - case CURL_POLL_INOUT: - aio_set_fd_handler(s->aio_context, fd, false, - curl_multi_read, 
curl_multi_do, state); - break; - case CURL_POLL_REMOVE: - aio_set_fd_handler(s->aio_context, fd, false, - NULL, NULL, NULL); - break; - } - - return 0; -} - -static size_t curl_header_cb(void *ptr, size_t size, size_t nmemb, void *opaque) -{ - BDRVCURLState *s = opaque; - size_t realsize = size * nmemb; - const char *accept_line = "Accept-Ranges: bytes"; - - if (realsize >= strlen(accept_line) - && strncmp((char *)ptr, accept_line, strlen(accept_line)) == 0) { - s->accept_range = true; - } - - return realsize; -} - -static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque) -{ - CURLState *s = ((CURLState*)opaque); - size_t realsize = size * nmemb; - int i; - - DPRINTF("CURL: Just reading %zd bytes\n", realsize); - - if (!s || !s->orig_buf) - return 0; - - if (s->buf_off >= s->buf_len) { - /* buffer full, read nothing */ - return 0; - } - realsize = MIN(realsize, s->buf_len - s->buf_off); - memcpy(s->orig_buf + s->buf_off, ptr, realsize); - s->buf_off += realsize; - - for(i=0; iacb[i]; - - if (!acb) - continue; - - if ((s->buf_off >= acb->end)) { - qemu_iovec_from_buf(acb->qiov, 0, s->orig_buf + acb->start, - acb->end - acb->start); - acb->common.cb(acb->common.opaque, 0); - qemu_aio_unref(acb); - s->acb[i] = NULL; - } - } - - return realsize; -} - -static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len, - CURLAIOCB *acb) -{ - int i; - size_t end = start + len; - - for (i=0; istates[i]; - size_t buf_end = (state->buf_start + state->buf_off); - size_t buf_fend = (state->buf_start + state->buf_len); - - if (!state->orig_buf) - continue; - if (!state->buf_off) - continue; - - // Does the existing buffer cover our section? 
- if ((start >= state->buf_start) && - (start <= buf_end) && - (end >= state->buf_start) && - (end <= buf_end)) - { - char *buf = state->orig_buf + (start - state->buf_start); - - qemu_iovec_from_buf(acb->qiov, 0, buf, len); - acb->common.cb(acb->common.opaque, 0); - - return FIND_RET_OK; - } - - // Wait for unfinished chunks - if (state->in_use && - (start >= state->buf_start) && - (start <= buf_fend) && - (end >= state->buf_start) && - (end <= buf_fend)) - { - int j; - - acb->start = start - state->buf_start; - acb->end = acb->start + len; - - for (j=0; jacb[j]) { - state->acb[j] = acb; - return FIND_RET_WAIT; - } - } - } - } - - return FIND_RET_NONE; -} - -static void curl_multi_check_completion(BDRVCURLState *s) -{ - int msgs_in_queue; - - /* Try to find done transfers, so we can free the easy - * handle again. */ - for (;;) { - CURLMsg *msg; - msg = curl_multi_info_read(s->multi, &msgs_in_queue); - - /* Quit when there are no more completions */ - if (!msg) - break; - - if (msg->msg == CURLMSG_DONE) { - CURLState *state = NULL; - curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, - (char **)&state); - - /* ACBs for successful messages get completed in curl_read_cb */ - if (msg->data.result != CURLE_OK) { - int i; - static int errcount = 100; - - /* Don't lose the original error message from curl, since - * it contains extra data. 
- */ - if (errcount > 0) { - error_report("curl: %s", state->errmsg); - if (--errcount == 0) { - error_report("curl: further errors suppressed"); - } - } - - for (i = 0; i < CURL_NUM_ACB; i++) { - CURLAIOCB *acb = state->acb[i]; - - if (acb == NULL) { - continue; - } - - acb->common.cb(acb->common.opaque, -EPROTO); - qemu_aio_unref(acb); - state->acb[i] = NULL; - } - } - - curl_clean_state(state); - break; - } - } -} - -static void curl_multi_do(void *arg) -{ - CURLState *s = (CURLState *)arg; - int running; - int r; - - if (!s->s->multi) { - return; - } - - do { - r = curl_multi_socket_action(s->s->multi, s->sock_fd, 0, &running); - } while(r == CURLM_CALL_MULTI_PERFORM); - -} - -static void curl_multi_read(void *arg) -{ - CURLState *s = (CURLState *)arg; - - curl_multi_do(arg); - curl_multi_check_completion(s->s); -} - -static void curl_multi_timeout_do(void *arg) -{ -#ifdef NEED_CURL_TIMER_CALLBACK - BDRVCURLState *s = (BDRVCURLState *)arg; - int running; - - if (!s->multi) { - return; - } - - curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); - - curl_multi_check_completion(s); -#else - abort(); -#endif -} - -static CURLState *curl_init_state(BlockDriverState *bs, BDRVCURLState *s) -{ - CURLState *state = NULL; - int i, j; - - do { - for (i=0; istates[i].acb[j]) - continue; - if (s->states[i].in_use) - continue; - - state = &s->states[i]; - state->in_use = 1; - break; - } - if (!state) { - aio_poll(bdrv_get_aio_context(bs), true); - } - } while(!state); - - if (!state->curl) { - state->curl = curl_easy_init(); - if (!state->curl) { - return NULL; - } - curl_easy_setopt(state->curl, CURLOPT_URL, s->url); - curl_easy_setopt(state->curl, CURLOPT_SSL_VERIFYPEER, - (long) s->sslverify); - if (s->cookie) { - curl_easy_setopt(state->curl, CURLOPT_COOKIE, s->cookie); - } - curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, (long)s->timeout); - curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, - (void *)curl_read_cb); - curl_easy_setopt(state->curl, 
CURLOPT_WRITEDATA, (void *)state); - curl_easy_setopt(state->curl, CURLOPT_PRIVATE, (void *)state); - curl_easy_setopt(state->curl, CURLOPT_AUTOREFERER, 1); - curl_easy_setopt(state->curl, CURLOPT_FOLLOWLOCATION, 1); - curl_easy_setopt(state->curl, CURLOPT_NOSIGNAL, 1); - curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg); - curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1); - - if (s->username) { - curl_easy_setopt(state->curl, CURLOPT_USERNAME, s->username); - } - if (s->password) { - curl_easy_setopt(state->curl, CURLOPT_PASSWORD, s->password); - } - if (s->proxyusername) { - curl_easy_setopt(state->curl, - CURLOPT_PROXYUSERNAME, s->proxyusername); - } - if (s->proxypassword) { - curl_easy_setopt(state->curl, - CURLOPT_PROXYPASSWORD, s->proxypassword); - } - - /* Restrict supported protocols to avoid security issues in the more - * obscure protocols. For example, do not allow POP3/SMTP/IMAP see - * CVE-2013-0249. - * - * Restricting protocols is only supported from 7.19.4 upwards. 
- */ -#if LIBCURL_VERSION_NUM >= 0x071304 - curl_easy_setopt(state->curl, CURLOPT_PROTOCOLS, PROTOCOLS); - curl_easy_setopt(state->curl, CURLOPT_REDIR_PROTOCOLS, PROTOCOLS); -#endif - -#ifdef DEBUG_VERBOSE - curl_easy_setopt(state->curl, CURLOPT_VERBOSE, 1); -#endif - } - - state->s = s; - - return state; -} - -static void curl_clean_state(CURLState *s) -{ - if (s->s->multi) - curl_multi_remove_handle(s->s->multi, s->curl); - s->in_use = 0; -} - -static void curl_parse_filename(const char *filename, QDict *options, - Error **errp) -{ - qdict_put(options, CURL_BLOCK_OPT_URL, qstring_from_str(filename)); -} - -static void curl_detach_aio_context(BlockDriverState *bs) -{ - BDRVCURLState *s = bs->opaque; - int i; - - for (i = 0; i < CURL_NUM_STATES; i++) { - if (s->states[i].in_use) { - curl_clean_state(&s->states[i]); - } - if (s->states[i].curl) { - curl_easy_cleanup(s->states[i].curl); - s->states[i].curl = NULL; - } - g_free(s->states[i].orig_buf); - s->states[i].orig_buf = NULL; - } - if (s->multi) { - curl_multi_cleanup(s->multi); - s->multi = NULL; - } - - timer_del(&s->timer); -} - -static void curl_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - BDRVCURLState *s = bs->opaque; - - aio_timer_init(new_context, &s->timer, - QEMU_CLOCK_REALTIME, SCALE_NS, - curl_multi_timeout_do, s); - - assert(!s->multi); - s->multi = curl_multi_init(); - s->aio_context = new_context; - curl_multi_setopt(s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb); -#ifdef NEED_CURL_TIMER_CALLBACK - curl_multi_setopt(s->multi, CURLMOPT_TIMERDATA, s); - curl_multi_setopt(s->multi, CURLMOPT_TIMERFUNCTION, curl_timer_cb); -#endif -} - -static QemuOptsList runtime_opts = { - .name = "curl", - .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), - .desc = { - { - .name = CURL_BLOCK_OPT_URL, - .type = QEMU_OPT_STRING, - .help = "URL to open", - }, - { - .name = CURL_BLOCK_OPT_READAHEAD, - .type = QEMU_OPT_SIZE, - .help = "Readahead size", - }, - { - .name = 
CURL_BLOCK_OPT_SSLVERIFY, - .type = QEMU_OPT_BOOL, - .help = "Verify SSL certificate" - }, - { - .name = CURL_BLOCK_OPT_TIMEOUT, - .type = QEMU_OPT_NUMBER, - .help = "Curl timeout" - }, - { - .name = CURL_BLOCK_OPT_COOKIE, - .type = QEMU_OPT_STRING, - .help = "Pass the cookie or list of cookies with each request" - }, - { - .name = CURL_BLOCK_OPT_USERNAME, - .type = QEMU_OPT_STRING, - .help = "Username for HTTP auth" - }, - { - .name = CURL_BLOCK_OPT_PASSWORD_SECRET, - .type = QEMU_OPT_STRING, - .help = "ID of secret used as password for HTTP auth", - }, - { - .name = CURL_BLOCK_OPT_PROXY_USERNAME, - .type = QEMU_OPT_STRING, - .help = "Username for HTTP proxy auth" - }, - { - .name = CURL_BLOCK_OPT_PROXY_PASSWORD_SECRET, - .type = QEMU_OPT_STRING, - .help = "ID of secret used as password for HTTP proxy auth", - }, - { /* end of list */ } - }, -}; - - -static int curl_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVCURLState *s = bs->opaque; - CURLState *state = NULL; - QemuOpts *opts; - Error *local_err = NULL; - const char *file; - const char *cookie; - double d; - const char *secretid; - - static int inited = 0; - - if (flags & BDRV_O_RDWR) { - error_setg(errp, "curl block device does not support writes"); - return -EROFS; - } - - opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - goto out_noclean; - } - - s->readahead_size = qemu_opt_get_size(opts, CURL_BLOCK_OPT_READAHEAD, - READ_AHEAD_DEFAULT); - if ((s->readahead_size & 0x1ff) != 0) { - error_setg(errp, "HTTP_READAHEAD_SIZE %zd is not a multiple of 512", - s->readahead_size); - goto out_noclean; - } - - s->timeout = qemu_opt_get_number(opts, CURL_BLOCK_OPT_TIMEOUT, - CURL_TIMEOUT_DEFAULT); - if (s->timeout > CURL_TIMEOUT_MAX) { - error_setg(errp, "timeout parameter is too large or negative"); - goto out_noclean; - } - - s->sslverify = qemu_opt_get_bool(opts, 
CURL_BLOCK_OPT_SSLVERIFY, true); - - cookie = qemu_opt_get(opts, CURL_BLOCK_OPT_COOKIE); - s->cookie = g_strdup(cookie); - - file = qemu_opt_get(opts, CURL_BLOCK_OPT_URL); - if (file == NULL) { - error_setg(errp, "curl block driver requires an 'url' option"); - goto out_noclean; - } - - s->username = g_strdup(qemu_opt_get(opts, CURL_BLOCK_OPT_USERNAME)); - secretid = qemu_opt_get(opts, CURL_BLOCK_OPT_PASSWORD_SECRET); - - if (secretid) { - s->password = qcrypto_secret_lookup_as_utf8(secretid, errp); - if (!s->password) { - goto out_noclean; - } - } - - s->proxyusername = g_strdup( - qemu_opt_get(opts, CURL_BLOCK_OPT_PROXY_USERNAME)); - secretid = qemu_opt_get(opts, CURL_BLOCK_OPT_PROXY_PASSWORD_SECRET); - if (secretid) { - s->proxypassword = qcrypto_secret_lookup_as_utf8(secretid, errp); - if (!s->proxypassword) { - goto out_noclean; - } - } - - if (!inited) { - curl_global_init(CURL_GLOBAL_ALL); - inited = 1; - } - - DPRINTF("CURL: Opening %s\n", file); - s->aio_context = bdrv_get_aio_context(bs); - s->url = g_strdup(file); - state = curl_init_state(bs, s); - if (!state) - goto out_noclean; - - // Get file size - - s->accept_range = false; - curl_easy_setopt(state->curl, CURLOPT_NOBODY, 1); - curl_easy_setopt(state->curl, CURLOPT_HEADERFUNCTION, - curl_header_cb); - curl_easy_setopt(state->curl, CURLOPT_HEADERDATA, s); - if (curl_easy_perform(state->curl)) - goto out; - curl_easy_getinfo(state->curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &d); - if (d) - s->len = (size_t)d; - else if(!s->len) - goto out; - if ((!strncasecmp(s->url, "http://", strlen("http://")) - || !strncasecmp(s->url, "https://", strlen("https://"))) - && !s->accept_range) { - pstrcpy(state->errmsg, CURL_ERROR_SIZE, - "Server does not support 'range' (byte ranges)."); - goto out; - } - DPRINTF("CURL: Size = %zd\n", s->len); - - curl_clean_state(state); - curl_easy_cleanup(state->curl); - state->curl = NULL; - - curl_attach_aio_context(bs, bdrv_get_aio_context(bs)); - - qemu_opts_del(opts); - return 
0; - -out: - error_setg(errp, "CURL: Error opening file: %s", state->errmsg); - curl_easy_cleanup(state->curl); - state->curl = NULL; -out_noclean: - g_free(s->cookie); - g_free(s->url); - qemu_opts_del(opts); - return -EINVAL; -} - -static const AIOCBInfo curl_aiocb_info = { - .aiocb_size = sizeof(CURLAIOCB), -}; - - -static void curl_readv_bh_cb(void *p) -{ - CURLState *state; - int running; - - CURLAIOCB *acb = p; - BDRVCURLState *s = acb->common.bs->opaque; - - qemu_bh_delete(acb->bh); - acb->bh = NULL; - - size_t start = acb->sector_num * SECTOR_SIZE; - size_t end; - - // In case we have the requested data already (e.g. read-ahead), - // we can just call the callback and be done. - switch (curl_find_buf(s, start, acb->nb_sectors * SECTOR_SIZE, acb)) { - case FIND_RET_OK: - qemu_aio_unref(acb); - // fall through - case FIND_RET_WAIT: - return; - default: - break; - } - - // No cache found, so let's start a new request - state = curl_init_state(acb->common.bs, s); - if (!state) { - acb->common.cb(acb->common.opaque, -EIO); - qemu_aio_unref(acb); - return; - } - - acb->start = 0; - acb->end = (acb->nb_sectors * SECTOR_SIZE); - - state->buf_off = 0; - g_free(state->orig_buf); - state->buf_start = start; - state->buf_len = acb->end + s->readahead_size; - end = MIN(start + state->buf_len, s->len) - 1; - state->orig_buf = g_try_malloc(state->buf_len); - if (state->buf_len && state->orig_buf == NULL) { - curl_clean_state(state); - acb->common.cb(acb->common.opaque, -ENOMEM); - qemu_aio_unref(acb); - return; - } - state->acb[0] = acb; - - snprintf(state->range, 127, "%zd-%zd", start, end); - DPRINTF("CURL (AIO): Reading %d at %zd (%s)\n", - (acb->nb_sectors * SECTOR_SIZE), start, state->range); - curl_easy_setopt(state->curl, CURLOPT_RANGE, state->range); - - curl_multi_add_handle(s->multi, state->curl); - - /* Tell curl it needs to kick things off */ - curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); -} - -static BlockAIOCB 
*curl_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - CURLAIOCB *acb; - - acb = qemu_aio_get(&curl_aiocb_info, bs, cb, opaque); - - acb->qiov = qiov; - acb->sector_num = sector_num; - acb->nb_sectors = nb_sectors; - - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), curl_readv_bh_cb, acb); - qemu_bh_schedule(acb->bh); - return &acb->common; -} - -static void curl_close(BlockDriverState *bs) -{ - BDRVCURLState *s = bs->opaque; - - DPRINTF("CURL: Close\n"); - curl_detach_aio_context(bs); - - g_free(s->cookie); - g_free(s->url); -} - -static int64_t curl_getlength(BlockDriverState *bs) -{ - BDRVCURLState *s = bs->opaque; - return s->len; -} - -static BlockDriver bdrv_http = { - .format_name = "http", - .protocol_name = "http", - - .instance_size = sizeof(BDRVCURLState), - .bdrv_parse_filename = curl_parse_filename, - .bdrv_file_open = curl_open, - .bdrv_close = curl_close, - .bdrv_getlength = curl_getlength, - - .bdrv_aio_readv = curl_aio_readv, - - .bdrv_detach_aio_context = curl_detach_aio_context, - .bdrv_attach_aio_context = curl_attach_aio_context, -}; - -static BlockDriver bdrv_https = { - .format_name = "https", - .protocol_name = "https", - - .instance_size = sizeof(BDRVCURLState), - .bdrv_parse_filename = curl_parse_filename, - .bdrv_file_open = curl_open, - .bdrv_close = curl_close, - .bdrv_getlength = curl_getlength, - - .bdrv_aio_readv = curl_aio_readv, - - .bdrv_detach_aio_context = curl_detach_aio_context, - .bdrv_attach_aio_context = curl_attach_aio_context, -}; - -static BlockDriver bdrv_ftp = { - .format_name = "ftp", - .protocol_name = "ftp", - - .instance_size = sizeof(BDRVCURLState), - .bdrv_parse_filename = curl_parse_filename, - .bdrv_file_open = curl_open, - .bdrv_close = curl_close, - .bdrv_getlength = curl_getlength, - - .bdrv_aio_readv = curl_aio_readv, - - .bdrv_detach_aio_context = curl_detach_aio_context, - .bdrv_attach_aio_context = 
curl_attach_aio_context, -}; - -static BlockDriver bdrv_ftps = { - .format_name = "ftps", - .protocol_name = "ftps", - - .instance_size = sizeof(BDRVCURLState), - .bdrv_parse_filename = curl_parse_filename, - .bdrv_file_open = curl_open, - .bdrv_close = curl_close, - .bdrv_getlength = curl_getlength, - - .bdrv_aio_readv = curl_aio_readv, - - .bdrv_detach_aio_context = curl_detach_aio_context, - .bdrv_attach_aio_context = curl_attach_aio_context, -}; - -static BlockDriver bdrv_tftp = { - .format_name = "tftp", - .protocol_name = "tftp", - - .instance_size = sizeof(BDRVCURLState), - .bdrv_parse_filename = curl_parse_filename, - .bdrv_file_open = curl_open, - .bdrv_close = curl_close, - .bdrv_getlength = curl_getlength, - - .bdrv_aio_readv = curl_aio_readv, - - .bdrv_detach_aio_context = curl_detach_aio_context, - .bdrv_attach_aio_context = curl_attach_aio_context, -}; - -static void curl_block_init(void) -{ - bdrv_register(&bdrv_http); - bdrv_register(&bdrv_https); - bdrv_register(&bdrv_ftp); - bdrv_register(&bdrv_ftps); - bdrv_register(&bdrv_tftp); -} - -block_init(curl_block_init); diff --git a/qemu/block/dirty-bitmap.c b/qemu/block/dirty-bitmap.c deleted file mode 100644 index 4902ca557..000000000 --- a/qemu/block/dirty-bitmap.c +++ /dev/null @@ -1,387 +0,0 @@ -/* - * Block Dirty Bitmap - * - * Copyright (c) 2016 Red Hat. Inc - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu-common.h" -#include "trace.h" -#include "block/block_int.h" -#include "block/blockjob.h" - -/** - * A BdrvDirtyBitmap can be in three possible states: - * (1) successor is NULL and disabled is false: full r/w mode - * (2) successor is NULL and disabled is true: read only mode ("disabled") - * (3) successor is set: frozen mode. - * A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set, - * or enabled. A frozen bitmap can only abdicate() or reclaim(). 
- */ -struct BdrvDirtyBitmap { - HBitmap *bitmap; /* Dirty sector bitmap implementation */ - BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */ - char *name; /* Optional non-empty unique ID */ - int64_t size; /* Size of the bitmap (Number of sectors) */ - bool disabled; /* Bitmap is read-only */ - QLIST_ENTRY(BdrvDirtyBitmap) list; -}; - -BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) -{ - BdrvDirtyBitmap *bm; - - assert(name); - QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { - if (bm->name && !strcmp(name, bm->name)) { - return bm; - } - } - return NULL; -} - -void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) -{ - assert(!bdrv_dirty_bitmap_frozen(bitmap)); - g_free(bitmap->name); - bitmap->name = NULL; -} - -BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, - uint32_t granularity, - const char *name, - Error **errp) -{ - int64_t bitmap_size; - BdrvDirtyBitmap *bitmap; - uint32_t sector_granularity; - - assert((granularity & (granularity - 1)) == 0); - - if (name && bdrv_find_dirty_bitmap(bs, name)) { - error_setg(errp, "Bitmap already exists: %s", name); - return NULL; - } - sector_granularity = granularity >> BDRV_SECTOR_BITS; - assert(sector_granularity); - bitmap_size = bdrv_nb_sectors(bs); - if (bitmap_size < 0) { - error_setg_errno(errp, -bitmap_size, "could not get length of device"); - errno = -bitmap_size; - return NULL; - } - bitmap = g_new0(BdrvDirtyBitmap, 1); - bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity)); - bitmap->size = bitmap_size; - bitmap->name = g_strdup(name); - bitmap->disabled = false; - QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); - return bitmap; -} - -bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap) -{ - return bitmap->successor; -} - -bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap) -{ - return !(bitmap->disabled || bitmap->successor); -} - -DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap) -{ - if 
(bdrv_dirty_bitmap_frozen(bitmap)) { - return DIRTY_BITMAP_STATUS_FROZEN; - } else if (!bdrv_dirty_bitmap_enabled(bitmap)) { - return DIRTY_BITMAP_STATUS_DISABLED; - } else { - return DIRTY_BITMAP_STATUS_ACTIVE; - } -} - -/** - * Create a successor bitmap destined to replace this bitmap after an operation. - * Requires that the bitmap is not frozen and has no successor. - */ -int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, Error **errp) -{ - uint64_t granularity; - BdrvDirtyBitmap *child; - - if (bdrv_dirty_bitmap_frozen(bitmap)) { - error_setg(errp, "Cannot create a successor for a bitmap that is " - "currently frozen"); - return -1; - } - assert(!bitmap->successor); - - /* Create an anonymous successor */ - granularity = bdrv_dirty_bitmap_granularity(bitmap); - child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); - if (!child) { - return -1; - } - - /* Successor will be on or off based on our current state. */ - child->disabled = bitmap->disabled; - - /* Install the successor and freeze the parent */ - bitmap->successor = child; - return 0; -} - -/** - * For a bitmap with a successor, yield our name to the successor, - * delete the old bitmap, and return a handle to the new bitmap. - */ -BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, - Error **errp) -{ - char *name; - BdrvDirtyBitmap *successor = bitmap->successor; - - if (successor == NULL) { - error_setg(errp, "Cannot relinquish control if " - "there's no successor present"); - return NULL; - } - - name = bitmap->name; - bitmap->name = NULL; - successor->name = name; - bitmap->successor = NULL; - bdrv_release_dirty_bitmap(bs, bitmap); - - return successor; -} - -/** - * In cases of failure where we can no longer safely delete the parent, - * we may wish to re-join the parent and child/successor. - * The merged parent will be un-frozen, but not explicitly re-enabled. 
- */ -BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, - BdrvDirtyBitmap *parent, - Error **errp) -{ - BdrvDirtyBitmap *successor = parent->successor; - - if (!successor) { - error_setg(errp, "Cannot reclaim a successor when none is present"); - return NULL; - } - - if (!hbitmap_merge(parent->bitmap, successor->bitmap)) { - error_setg(errp, "Merging of parent and successor bitmap failed"); - return NULL; - } - bdrv_release_dirty_bitmap(bs, successor); - parent->successor = NULL; - - return parent; -} - -/** - * Truncates _all_ bitmaps attached to a BDS. - */ -void bdrv_dirty_bitmap_truncate(BlockDriverState *bs) -{ - BdrvDirtyBitmap *bitmap; - uint64_t size = bdrv_nb_sectors(bs); - - QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { - assert(!bdrv_dirty_bitmap_frozen(bitmap)); - hbitmap_truncate(bitmap->bitmap, size); - bitmap->size = size; - } -} - -static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, - BdrvDirtyBitmap *bitmap, - bool only_named) -{ - BdrvDirtyBitmap *bm, *next; - QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { - if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) { - assert(!bdrv_dirty_bitmap_frozen(bm)); - QLIST_REMOVE(bm, list); - hbitmap_free(bm->bitmap); - g_free(bm->name); - g_free(bm); - - if (bitmap) { - return; - } - } - } -} - -void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) -{ - bdrv_do_release_matching_dirty_bitmap(bs, bitmap, false); -} - -/** - * Release all named dirty bitmaps attached to a BDS (for use in bdrv_close()). - * There must not be any frozen bitmaps attached. 
- */ -void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs) -{ - bdrv_do_release_matching_dirty_bitmap(bs, NULL, true); -} - -void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap) -{ - assert(!bdrv_dirty_bitmap_frozen(bitmap)); - bitmap->disabled = true; -} - -void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap) -{ - assert(!bdrv_dirty_bitmap_frozen(bitmap)); - bitmap->disabled = false; -} - -BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) -{ - BdrvDirtyBitmap *bm; - BlockDirtyInfoList *list = NULL; - BlockDirtyInfoList **plist = &list; - - QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { - BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1); - BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); - info->count = bdrv_get_dirty_count(bm); - info->granularity = bdrv_dirty_bitmap_granularity(bm); - info->has_name = !!bm->name; - info->name = g_strdup(bm->name); - info->status = bdrv_dirty_bitmap_status(bm); - entry->value = info; - *plist = entry; - plist = &entry->next; - } - - return list; -} - -int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, - int64_t sector) -{ - if (bitmap) { - return hbitmap_get(bitmap->bitmap, sector); - } else { - return 0; - } -} - -/** - * Chooses a default granularity based on the existing cluster size, - * but clamped between [4K, 64K]. Defaults to 64K in the case that there - * is no cluster size information available. 
- */ -uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs) -{ - BlockDriverInfo bdi; - uint32_t granularity; - - if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) { - granularity = MAX(4096, bdi.cluster_size); - granularity = MIN(65536, granularity); - } else { - granularity = 65536; - } - - return granularity; -} - -uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap) -{ - return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap); -} - -void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) -{ - hbitmap_iter_init(hbi, bitmap->bitmap, 0); -} - -void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, - int64_t cur_sector, int nr_sectors) -{ - assert(bdrv_dirty_bitmap_enabled(bitmap)); - hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); -} - -void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, - int64_t cur_sector, int nr_sectors) -{ - assert(bdrv_dirty_bitmap_enabled(bitmap)); - hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); -} - -void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out) -{ - assert(bdrv_dirty_bitmap_enabled(bitmap)); - if (!out) { - hbitmap_reset_all(bitmap->bitmap); - } else { - HBitmap *backup = bitmap->bitmap; - bitmap->bitmap = hbitmap_alloc(bitmap->size, - hbitmap_granularity(backup)); - *out = backup; - } -} - -void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in) -{ - HBitmap *tmp = bitmap->bitmap; - assert(bdrv_dirty_bitmap_enabled(bitmap)); - bitmap->bitmap = in; - hbitmap_free(tmp); -} - -void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, - int nr_sectors) -{ - BdrvDirtyBitmap *bitmap; - QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { - if (!bdrv_dirty_bitmap_enabled(bitmap)) { - continue; - } - hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); - } -} - -/** - * Advance an HBitmapIter to an arbitrary offset. 
- */ -void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset) -{ - assert(hbi->hb); - hbitmap_iter_init(hbi, hbi->hb, offset); -} - -int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap) -{ - return hbitmap_count(bitmap->bitmap); -} diff --git a/qemu/block/dmg.c b/qemu/block/dmg.c deleted file mode 100644 index a496eb7c9..000000000 --- a/qemu/block/dmg.c +++ /dev/null @@ -1,727 +0,0 @@ -/* - * QEMU Block driver for DMG images - * - * Copyright (c) 2004 Johannes E. Schindelin - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu-common.h" -#include "block/block_int.h" -#include "qemu/bswap.h" -#include "qemu/error-report.h" -#include "qemu/module.h" -#include -#ifdef CONFIG_BZIP2 -#include -#endif -#include - -enum { - /* Limit chunk sizes to prevent unreasonable amounts of memory being used - * or truncating when converting to 32-bit types - */ - DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */ - DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512, -}; - -typedef struct BDRVDMGState { - CoMutex lock; - /* each chunk contains a certain number of sectors, - * offsets[i] is the offset in the .dmg file, - * lengths[i] is the length of the compressed chunk, - * sectors[i] is the sector beginning at offsets[i], - * sectorcounts[i] is the number of sectors in that chunk, - * the sectors array is ordered - * 0<=i 4 && !strcmp(filename + len - 4, ".dmg")) { - return 2; - } - return 0; -} - -static int read_uint64(BlockDriverState *bs, int64_t offset, uint64_t *result) -{ - uint64_t buffer; - int ret; - - ret = bdrv_pread(bs->file->bs, offset, &buffer, 8); - if (ret < 0) { - return ret; - } - - *result = be64_to_cpu(buffer); - return 0; -} - -static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result) -{ - uint32_t buffer; - int ret; - - ret = bdrv_pread(bs->file->bs, offset, &buffer, 4); - if (ret < 0) { - return ret; - } - - *result = be32_to_cpu(buffer); - return 0; -} - -static inline uint64_t buff_read_uint64(const uint8_t *buffer, int64_t offset) -{ - return be64_to_cpu(*(uint64_t *)&buffer[offset]); -} - -static inline uint32_t buff_read_uint32(const uint8_t *buffer, int64_t offset) -{ - return be32_to_cpu(*(uint32_t *)&buffer[offset]); -} - -/* Increase max chunk sizes, if necessary. This function is used to calculate - * the buffer sizes needed for compressed/uncompressed chunk I/O. 
- */ -static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, - uint32_t *max_compressed_size, - uint32_t *max_sectors_per_chunk) -{ - uint32_t compressed_size = 0; - uint32_t uncompressed_sectors = 0; - - switch (s->types[chunk]) { - case 0x80000005: /* zlib compressed */ - case 0x80000006: /* bzip2 compressed */ - compressed_size = s->lengths[chunk]; - uncompressed_sectors = s->sectorcounts[chunk]; - break; - case 1: /* copy */ - uncompressed_sectors = (s->lengths[chunk] + 511) / 512; - break; - case 2: /* zero */ - /* as the all-zeroes block may be large, it is treated specially: the - * sector is not copied from a large buffer, a simple memset is used - * instead. Therefore uncompressed_sectors does not need to be set. */ - break; - } - - if (compressed_size > *max_compressed_size) { - *max_compressed_size = compressed_size; - } - if (uncompressed_sectors > *max_sectors_per_chunk) { - *max_sectors_per_chunk = uncompressed_sectors; - } -} - -static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp) -{ - int64_t length; - int64_t offset = 0; - uint8_t buffer[515]; - int i, ret; - - /* bdrv_getlength returns a multiple of block size (512), rounded up. Since - * dmg images can have odd sizes, try to look for the "koly" magic which - * marks the begin of the UDIF trailer (512 bytes). This magic can be found - * in the last 511 bytes of the second-last sector or the first 4 bytes of - * the last sector (search space: 515 bytes) */ - length = bdrv_getlength(file_bs); - if (length < 0) { - error_setg_errno(errp, -length, - "Failed to get file size while reading UDIF trailer"); - return length; - } else if (length < 512) { - error_setg(errp, "dmg file must be at least 512 bytes long"); - return -EINVAL; - } - if (length > 511 + 512) { - offset = length - 511 - 512; - } - length = length < 515 ? 
length : 515; - ret = bdrv_pread(file_bs, offset, buffer, length); - if (ret < 0) { - error_setg_errno(errp, -ret, "Failed while reading UDIF trailer"); - return ret; - } - for (i = 0; i < length - 3; i++) { - if (buffer[i] == 'k' && buffer[i+1] == 'o' && - buffer[i+2] == 'l' && buffer[i+3] == 'y') { - return offset + i; - } - } - error_setg(errp, "Could not locate UDIF trailer in dmg file"); - return -EINVAL; -} - -/* used when building the sector table */ -typedef struct DmgHeaderState { - /* used internally by dmg_read_mish_block to remember offsets of blocks - * across calls */ - uint64_t data_fork_offset; - /* exported for dmg_open */ - uint32_t max_compressed_size; - uint32_t max_sectors_per_chunk; -} DmgHeaderState; - -static bool dmg_is_known_block_type(uint32_t entry_type) -{ - switch (entry_type) { - case 0x00000001: /* uncompressed */ - case 0x00000002: /* zeroes */ - case 0x80000005: /* zlib */ -#ifdef CONFIG_BZIP2 - case 0x80000006: /* bzip2 */ -#endif - return true; - default: - return false; - } -} - -static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, - uint8_t *buffer, uint32_t count) -{ - uint32_t type, i; - int ret; - size_t new_size; - uint32_t chunk_count; - int64_t offset = 0; - uint64_t data_offset; - uint64_t in_offset = ds->data_fork_offset; - uint64_t out_offset; - - type = buff_read_uint32(buffer, offset); - /* skip data that is not a valid MISH block (invalid magic or too small) */ - if (type != 0x6d697368 || count < 244) { - /* assume success for now */ - return 0; - } - - /* chunk offsets are relative to this sector number */ - out_offset = buff_read_uint64(buffer, offset + 8); - - /* location in data fork for (compressed) blob (in bytes) */ - data_offset = buff_read_uint64(buffer, offset + 0x18); - in_offset += data_offset; - - /* move to begin of chunk entries */ - offset += 204; - - chunk_count = (count - 204) / 40; - new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); - s->types = g_realloc(s->types, new_size 
/ 2); - s->offsets = g_realloc(s->offsets, new_size); - s->lengths = g_realloc(s->lengths, new_size); - s->sectors = g_realloc(s->sectors, new_size); - s->sectorcounts = g_realloc(s->sectorcounts, new_size); - - for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) { - s->types[i] = buff_read_uint32(buffer, offset); - if (!dmg_is_known_block_type(s->types[i])) { - chunk_count--; - i--; - offset += 40; - continue; - } - - /* sector number */ - s->sectors[i] = buff_read_uint64(buffer, offset + 8); - s->sectors[i] += out_offset; - - /* sector count */ - s->sectorcounts[i] = buff_read_uint64(buffer, offset + 0x10); - - /* all-zeroes sector (type 2) does not need to be "uncompressed" and can - * therefore be unbounded. */ - if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { - error_report("sector count %" PRIu64 " for chunk %" PRIu32 - " is larger than max (%u)", - s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); - ret = -EINVAL; - goto fail; - } - - /* offset in (compressed) data fork */ - s->offsets[i] = buff_read_uint64(buffer, offset + 0x18); - s->offsets[i] += in_offset; - - /* length in (compressed) data fork */ - s->lengths[i] = buff_read_uint64(buffer, offset + 0x20); - - if (s->lengths[i] > DMG_LENGTHS_MAX) { - error_report("length %" PRIu64 " for chunk %" PRIu32 - " is larger than max (%u)", - s->lengths[i], i, DMG_LENGTHS_MAX); - ret = -EINVAL; - goto fail; - } - - update_max_chunk_size(s, i, &ds->max_compressed_size, - &ds->max_sectors_per_chunk); - offset += 40; - } - s->n_chunks += chunk_count; - return 0; - -fail: - return ret; -} - -static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds, - uint64_t info_begin, uint64_t info_length) -{ - BDRVDMGState *s = bs->opaque; - int ret; - uint32_t count, rsrc_data_offset; - uint8_t *buffer = NULL; - uint64_t info_end; - uint64_t offset; - - /* read offset from begin of resource fork (info_begin) to resource data */ - ret = read_uint32(bs, info_begin, &rsrc_data_offset); - if 
(ret < 0) { - goto fail; - } else if (rsrc_data_offset > info_length) { - ret = -EINVAL; - goto fail; - } - - /* read length of resource data */ - ret = read_uint32(bs, info_begin + 8, &count); - if (ret < 0) { - goto fail; - } else if (count == 0 || rsrc_data_offset + count > info_length) { - ret = -EINVAL; - goto fail; - } - - /* begin of resource data (consisting of one or more resources) */ - offset = info_begin + rsrc_data_offset; - - /* end of resource data (there is possibly a following resource map - * which will be ignored). */ - info_end = offset + count; - - /* read offsets (mish blocks) from one or more resources in resource data */ - while (offset < info_end) { - /* size of following resource */ - ret = read_uint32(bs, offset, &count); - if (ret < 0) { - goto fail; - } else if (count == 0 || count > info_end - offset) { - ret = -EINVAL; - goto fail; - } - offset += 4; - - buffer = g_realloc(buffer, count); - ret = bdrv_pread(bs->file->bs, offset, buffer, count); - if (ret < 0) { - goto fail; - } - - ret = dmg_read_mish_block(s, ds, buffer, count); - if (ret < 0) { - goto fail; - } - /* advance offset by size of resource */ - offset += count; - } - ret = 0; - -fail: - g_free(buffer); - return ret; -} - -static int dmg_read_plist_xml(BlockDriverState *bs, DmgHeaderState *ds, - uint64_t info_begin, uint64_t info_length) -{ - BDRVDMGState *s = bs->opaque; - int ret; - uint8_t *buffer = NULL; - char *data_begin, *data_end; - - /* Have at least some length to avoid NULL for g_malloc. Attempt to set a - * safe upper cap on the data length. A test sample had a XML length of - * about 1 MiB. */ - if (info_length == 0 || info_length > 16 * 1024 * 1024) { - ret = -EINVAL; - goto fail; - } - - buffer = g_malloc(info_length + 1); - buffer[info_length] = '\0'; - ret = bdrv_pread(bs->file->bs, info_begin, buffer, info_length); - if (ret != info_length) { - ret = -EINVAL; - goto fail; - } - - /* look for .... The data is 284 (0x11c) bytes after base64 - * decode. 
The actual data element has 431 (0x1af) bytes which includes tabs - * and line feeds. */ - data_end = (char *)buffer; - while ((data_begin = strstr(data_end, "")) != NULL) { - guchar *mish; - gsize out_len = 0; - - data_begin += 6; - data_end = strstr(data_begin, ""); - /* malformed XML? */ - if (data_end == NULL) { - ret = -EINVAL; - goto fail; - } - *data_end++ = '\0'; - mish = g_base64_decode(data_begin, &out_len); - ret = dmg_read_mish_block(s, ds, mish, (uint32_t)out_len); - g_free(mish); - if (ret < 0) { - goto fail; - } - } - ret = 0; - -fail: - g_free(buffer); - return ret; -} - -static int dmg_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVDMGState *s = bs->opaque; - DmgHeaderState ds; - uint64_t rsrc_fork_offset, rsrc_fork_length; - uint64_t plist_xml_offset, plist_xml_length; - int64_t offset; - int ret; - - bs->read_only = 1; - s->n_chunks = 0; - s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; - /* used by dmg_read_mish_block to keep track of the current I/O position */ - ds.data_fork_offset = 0; - ds.max_compressed_size = 1; - ds.max_sectors_per_chunk = 1; - - /* locate the UDIF trailer */ - offset = dmg_find_koly_offset(bs->file->bs, errp); - if (offset < 0) { - ret = offset; - goto fail; - } - - /* offset of data fork (DataForkOffset) */ - ret = read_uint64(bs, offset + 0x18, &ds.data_fork_offset); - if (ret < 0) { - goto fail; - } else if (ds.data_fork_offset > offset) { - ret = -EINVAL; - goto fail; - } - - /* offset of resource fork (RsrcForkOffset) */ - ret = read_uint64(bs, offset + 0x28, &rsrc_fork_offset); - if (ret < 0) { - goto fail; - } - ret = read_uint64(bs, offset + 0x30, &rsrc_fork_length); - if (ret < 0) { - goto fail; - } - if (rsrc_fork_offset >= offset || - rsrc_fork_length > offset - rsrc_fork_offset) { - ret = -EINVAL; - goto fail; - } - /* offset of property list (XMLOffset) */ - ret = read_uint64(bs, offset + 0xd8, &plist_xml_offset); - if (ret < 0) { - goto fail; - } - ret = 
read_uint64(bs, offset + 0xe0, &plist_xml_length); - if (ret < 0) { - goto fail; - } - if (plist_xml_offset >= offset || - plist_xml_length > offset - plist_xml_offset) { - ret = -EINVAL; - goto fail; - } - ret = read_uint64(bs, offset + 0x1ec, (uint64_t *)&bs->total_sectors); - if (ret < 0) { - goto fail; - } - if (bs->total_sectors < 0) { - ret = -EINVAL; - goto fail; - } - if (rsrc_fork_length != 0) { - ret = dmg_read_resource_fork(bs, &ds, - rsrc_fork_offset, rsrc_fork_length); - if (ret < 0) { - goto fail; - } - } else if (plist_xml_length != 0) { - ret = dmg_read_plist_xml(bs, &ds, plist_xml_offset, plist_xml_length); - if (ret < 0) { - goto fail; - } - } else { - ret = -EINVAL; - goto fail; - } - - /* initialize zlib engine */ - s->compressed_chunk = qemu_try_blockalign(bs->file->bs, - ds.max_compressed_size + 1); - s->uncompressed_chunk = qemu_try_blockalign(bs->file->bs, - 512 * ds.max_sectors_per_chunk); - if (s->compressed_chunk == NULL || s->uncompressed_chunk == NULL) { - ret = -ENOMEM; - goto fail; - } - - if (inflateInit(&s->zstream) != Z_OK) { - ret = -EINVAL; - goto fail; - } - - s->current_chunk = s->n_chunks; - - qemu_co_mutex_init(&s->lock); - return 0; - -fail: - g_free(s->types); - g_free(s->offsets); - g_free(s->lengths); - g_free(s->sectors); - g_free(s->sectorcounts); - qemu_vfree(s->compressed_chunk); - qemu_vfree(s->uncompressed_chunk); - return ret; -} - -static inline int is_sector_in_chunk(BDRVDMGState* s, - uint32_t chunk_num, uint64_t sector_num) -{ - if (chunk_num >= s->n_chunks || s->sectors[chunk_num] > sector_num || - s->sectors[chunk_num] + s->sectorcounts[chunk_num] <= sector_num) { - return 0; - } else { - return -1; - } -} - -static inline uint32_t search_chunk(BDRVDMGState *s, uint64_t sector_num) -{ - /* binary search */ - uint32_t chunk1 = 0, chunk2 = s->n_chunks, chunk3; - while (chunk1 != chunk2) { - chunk3 = (chunk1 + chunk2) / 2; - if (s->sectors[chunk3] > sector_num) { - chunk2 = chunk3; - } else if 
(s->sectors[chunk3] + s->sectorcounts[chunk3] > sector_num) { - return chunk3; - } else { - chunk1 = chunk3; - } - } - return s->n_chunks; /* error */ -} - -static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) -{ - BDRVDMGState *s = bs->opaque; - - if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) { - int ret; - uint32_t chunk = search_chunk(s, sector_num); -#ifdef CONFIG_BZIP2 - uint64_t total_out; -#endif - - if (chunk >= s->n_chunks) { - return -1; - } - - s->current_chunk = s->n_chunks; - switch (s->types[chunk]) { /* block entry type */ - case 0x80000005: { /* zlib compressed */ - /* we need to buffer, because only the chunk as whole can be - * inflated. */ - ret = bdrv_pread(bs->file->bs, s->offsets[chunk], - s->compressed_chunk, s->lengths[chunk]); - if (ret != s->lengths[chunk]) { - return -1; - } - - s->zstream.next_in = s->compressed_chunk; - s->zstream.avail_in = s->lengths[chunk]; - s->zstream.next_out = s->uncompressed_chunk; - s->zstream.avail_out = 512 * s->sectorcounts[chunk]; - ret = inflateReset(&s->zstream); - if (ret != Z_OK) { - return -1; - } - ret = inflate(&s->zstream, Z_FINISH); - if (ret != Z_STREAM_END || - s->zstream.total_out != 512 * s->sectorcounts[chunk]) { - return -1; - } - break; } -#ifdef CONFIG_BZIP2 - case 0x80000006: /* bzip2 compressed */ - /* we need to buffer, because only the chunk as whole can be - * inflated. 
*/ - ret = bdrv_pread(bs->file->bs, s->offsets[chunk], - s->compressed_chunk, s->lengths[chunk]); - if (ret != s->lengths[chunk]) { - return -1; - } - - ret = BZ2_bzDecompressInit(&s->bzstream, 0, 0); - if (ret != BZ_OK) { - return -1; - } - s->bzstream.next_in = (char *)s->compressed_chunk; - s->bzstream.avail_in = (unsigned int) s->lengths[chunk]; - s->bzstream.next_out = (char *)s->uncompressed_chunk; - s->bzstream.avail_out = (unsigned int) 512 * s->sectorcounts[chunk]; - ret = BZ2_bzDecompress(&s->bzstream); - total_out = ((uint64_t)s->bzstream.total_out_hi32 << 32) + - s->bzstream.total_out_lo32; - BZ2_bzDecompressEnd(&s->bzstream); - if (ret != BZ_STREAM_END || - total_out != 512 * s->sectorcounts[chunk]) { - return -1; - } - break; -#endif /* CONFIG_BZIP2 */ - case 1: /* copy */ - ret = bdrv_pread(bs->file->bs, s->offsets[chunk], - s->uncompressed_chunk, s->lengths[chunk]); - if (ret != s->lengths[chunk]) { - return -1; - } - break; - case 2: /* zero */ - /* see dmg_read, it is treated specially. No buffer needs to be - * pre-filled, the zeroes can be set directly. */ - break; - } - s->current_chunk = chunk; - } - return 0; -} - -static int dmg_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - BDRVDMGState *s = bs->opaque; - int i; - - for (i = 0; i < nb_sectors; i++) { - uint32_t sector_offset_in_chunk; - if (dmg_read_chunk(bs, sector_num + i) != 0) { - return -1; - } - /* Special case: current chunk is all zeroes. Do not perform a memcpy as - * s->uncompressed_chunk may be too small to cover the large all-zeroes - * section. 
dmg_read_chunk is called to find s->current_chunk */ - if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */ - memset(buf + i * 512, 0, 512); - continue; - } - sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk]; - memcpy(buf + i * 512, - s->uncompressed_chunk + sector_offset_in_chunk * 512, 512); - } - return 0; -} - -static coroutine_fn int dmg_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVDMGState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = dmg_read(bs, sector_num, buf, nb_sectors); - qemu_co_mutex_unlock(&s->lock); - return ret; -} - -static void dmg_close(BlockDriverState *bs) -{ - BDRVDMGState *s = bs->opaque; - - g_free(s->types); - g_free(s->offsets); - g_free(s->lengths); - g_free(s->sectors); - g_free(s->sectorcounts); - qemu_vfree(s->compressed_chunk); - qemu_vfree(s->uncompressed_chunk); - - inflateEnd(&s->zstream); -} - -static BlockDriver bdrv_dmg = { - .format_name = "dmg", - .instance_size = sizeof(BDRVDMGState), - .bdrv_probe = dmg_probe, - .bdrv_open = dmg_open, - .bdrv_read = dmg_co_read, - .bdrv_close = dmg_close, -}; - -static void bdrv_dmg_init(void) -{ - bdrv_register(&bdrv_dmg); -} - -block_init(bdrv_dmg_init); diff --git a/qemu/block/gluster.c b/qemu/block/gluster.c deleted file mode 100644 index a8aaacf64..000000000 --- a/qemu/block/gluster.c +++ /dev/null @@ -1,866 +0,0 @@ -/* - * GlusterFS backend for QEMU - * - * Copyright (C) 2012 Bharata B Rao - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. 
- * - */ -#include "qemu/osdep.h" -#include -#include "block/block_int.h" -#include "qapi/error.h" -#include "qemu/uri.h" - -typedef struct GlusterAIOCB { - int64_t size; - int ret; - QEMUBH *bh; - Coroutine *coroutine; - AioContext *aio_context; -} GlusterAIOCB; - -typedef struct BDRVGlusterState { - struct glfs *glfs; - struct glfs_fd *fd; -} BDRVGlusterState; - -typedef struct GlusterConf { - char *server; - int port; - char *volname; - char *image; - char *transport; -} GlusterConf; - -static void qemu_gluster_gconf_free(GlusterConf *gconf) -{ - if (gconf) { - g_free(gconf->server); - g_free(gconf->volname); - g_free(gconf->image); - g_free(gconf->transport); - g_free(gconf); - } -} - -static int parse_volume_options(GlusterConf *gconf, char *path) -{ - char *p, *q; - - if (!path) { - return -EINVAL; - } - - /* volume */ - p = q = path + strspn(path, "/"); - p += strcspn(p, "/"); - if (*p == '\0') { - return -EINVAL; - } - gconf->volname = g_strndup(q, p - q); - - /* image */ - p += strspn(p, "/"); - if (*p == '\0') { - return -EINVAL; - } - gconf->image = g_strdup(p); - return 0; -} - -/* - * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...] - * - * 'gluster' is the protocol. - * - * 'transport' specifies the transport type used to connect to gluster - * management daemon (glusterd). Valid transport types are - * tcp, unix and rdma. If a transport type isn't specified, then tcp - * type is assumed. - * - * 'server' specifies the server where the volume file specification for - * the given volume resides. This can be either hostname, ipv4 address - * or ipv6 address. ipv6 address needs to be within square brackets [ ]. - * If transport type is 'unix', then 'server' field should not be specified. - * The 'socket' field needs to be populated with the path to unix domain - * socket. - * - * 'port' is the port number on which glusterd is listening. 
This is optional - * and if not specified, QEMU will send 0 which will make gluster to use the - * default port. If the transport type is unix, then 'port' should not be - * specified. - * - * 'volname' is the name of the gluster volume which contains the VM image. - * - * 'image' is the path to the actual VM image that resides on gluster volume. - * - * Examples: - * - * file=gluster://1.2.3.4/testvol/a.img - * file=gluster+tcp://1.2.3.4/testvol/a.img - * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img - * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img - * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img - * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img - * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket - * file=gluster+rdma://1.2.3.4:24007/testvol/a.img - */ -static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) -{ - URI *uri; - QueryParams *qp = NULL; - bool is_unix = false; - int ret = 0; - - uri = uri_parse(filename); - if (!uri) { - return -EINVAL; - } - - /* transport */ - if (!uri->scheme || !strcmp(uri->scheme, "gluster")) { - gconf->transport = g_strdup("tcp"); - } else if (!strcmp(uri->scheme, "gluster+tcp")) { - gconf->transport = g_strdup("tcp"); - } else if (!strcmp(uri->scheme, "gluster+unix")) { - gconf->transport = g_strdup("unix"); - is_unix = true; - } else if (!strcmp(uri->scheme, "gluster+rdma")) { - gconf->transport = g_strdup("rdma"); - } else { - ret = -EINVAL; - goto out; - } - - ret = parse_volume_options(gconf, uri->path); - if (ret < 0) { - goto out; - } - - qp = query_params_parse(uri->query); - if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) { - ret = -EINVAL; - goto out; - } - - if (is_unix) { - if (uri->server || uri->port) { - ret = -EINVAL; - goto out; - } - if (strcmp(qp->p[0].name, "socket")) { - ret = -EINVAL; - goto out; - } - gconf->server = g_strdup(qp->p[0].value); - } else { - gconf->server = g_strdup(uri->server ? 
uri->server : "localhost"); - gconf->port = uri->port; - } - -out: - if (qp) { - query_params_free(qp); - } - uri_free(uri); - return ret; -} - -static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename, - Error **errp) -{ - struct glfs *glfs = NULL; - int ret; - int old_errno; - - ret = qemu_gluster_parseuri(gconf, filename); - if (ret < 0) { - error_setg(errp, "Usage: file=gluster[+transport]://[server[:port]]/" - "volname/image[?socket=...]"); - errno = -ret; - goto out; - } - - glfs = glfs_new(gconf->volname); - if (!glfs) { - goto out; - } - - ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server, - gconf->port); - if (ret < 0) { - goto out; - } - - /* - * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when - * GlusterFS makes GF_LOG_* macros available to libgfapi users. - */ - ret = glfs_set_logging(glfs, "-", 4); - if (ret < 0) { - goto out; - } - - ret = glfs_init(glfs); - if (ret) { - error_setg_errno(errp, errno, - "Gluster connection failed for server=%s port=%d " - "volume=%s image=%s transport=%s", gconf->server, - gconf->port, gconf->volname, gconf->image, - gconf->transport); - - /* glfs_init sometimes doesn't set errno although docs suggest that */ - if (errno == 0) - errno = EINVAL; - - goto out; - } - return glfs; - -out: - if (glfs) { - old_errno = errno; - glfs_fini(glfs); - errno = old_errno; - } - return NULL; -} - -static void qemu_gluster_complete_aio(void *opaque) -{ - GlusterAIOCB *acb = (GlusterAIOCB *)opaque; - - qemu_bh_delete(acb->bh); - acb->bh = NULL; - qemu_coroutine_enter(acb->coroutine, NULL); -} - -/* - * AIO callback routine called from GlusterFS thread. 
- */ -static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) -{ - GlusterAIOCB *acb = (GlusterAIOCB *)arg; - - if (!ret || ret == acb->size) { - acb->ret = 0; /* Success */ - } else if (ret < 0) { - acb->ret = -errno; /* Read/Write failed */ - } else { - acb->ret = -EIO; /* Partial read/write - fail it */ - } - - acb->bh = aio_bh_new(acb->aio_context, qemu_gluster_complete_aio, acb); - qemu_bh_schedule(acb->bh); -} - -/* TODO Convert to fine grained options */ -static QemuOptsList runtime_opts = { - .name = "gluster", - .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), - .desc = { - { - .name = "filename", - .type = QEMU_OPT_STRING, - .help = "URL to the gluster image", - }, - { /* end of list */ } - }, -}; - -static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags) -{ - assert(open_flags != NULL); - - *open_flags |= O_BINARY; - - if (bdrv_flags & BDRV_O_RDWR) { - *open_flags |= O_RDWR; - } else { - *open_flags |= O_RDONLY; - } - - if ((bdrv_flags & BDRV_O_NOCACHE)) { - *open_flags |= O_DIRECT; - } -} - -static int qemu_gluster_open(BlockDriverState *bs, QDict *options, - int bdrv_flags, Error **errp) -{ - BDRVGlusterState *s = bs->opaque; - int open_flags = 0; - int ret = 0; - GlusterConf *gconf = g_new0(GlusterConf, 1); - QemuOpts *opts; - Error *local_err = NULL; - const char *filename; - - opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto out; - } - - filename = qemu_opt_get(opts, "filename"); - - s->glfs = qemu_gluster_init(gconf, filename, errp); - if (!s->glfs) { - ret = -errno; - goto out; - } - -#ifdef CONFIG_GLUSTERFS_XLATOR_OPT - /* Without this, if fsync fails for a recoverable reason (for instance, - * ENOSPC), gluster will dump its cache, preventing retries. This means - * almost certain data loss. 
Not all gluster versions support the - * 'resync-failed-syncs-after-fsync' key value, but there is no way to - * discover during runtime if it is supported (this api returns success for - * unknown key/value pairs) */ - ret = glfs_set_xlator_option(s->glfs, "*-write-behind", - "resync-failed-syncs-after-fsync", - "on"); - if (ret < 0) { - error_setg_errno(errp, errno, "Unable to set xlator key/value pair"); - ret = -errno; - goto out; - } -#endif - - qemu_gluster_parse_flags(bdrv_flags, &open_flags); - - s->fd = glfs_open(s->glfs, gconf->image, open_flags); - if (!s->fd) { - ret = -errno; - } - -out: - qemu_opts_del(opts); - qemu_gluster_gconf_free(gconf); - if (!ret) { - return ret; - } - if (s->fd) { - glfs_close(s->fd); - } - if (s->glfs) { - glfs_fini(s->glfs); - } - return ret; -} - -typedef struct BDRVGlusterReopenState { - struct glfs *glfs; - struct glfs_fd *fd; -} BDRVGlusterReopenState; - - -static int qemu_gluster_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) -{ - int ret = 0; - BDRVGlusterReopenState *reop_s; - GlusterConf *gconf = NULL; - int open_flags = 0; - - assert(state != NULL); - assert(state->bs != NULL); - - state->opaque = g_new0(BDRVGlusterReopenState, 1); - reop_s = state->opaque; - - qemu_gluster_parse_flags(state->flags, &open_flags); - - gconf = g_new0(GlusterConf, 1); - - reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp); - if (reop_s->glfs == NULL) { - ret = -errno; - goto exit; - } - -#ifdef CONFIG_GLUSTERFS_XLATOR_OPT - ret = glfs_set_xlator_option(reop_s->glfs, "*-write-behind", - "resync-failed-syncs-after-fsync", "on"); - if (ret < 0) { - error_setg_errno(errp, errno, "Unable to set xlator key/value pair"); - ret = -errno; - goto exit; - } -#endif - - reop_s->fd = glfs_open(reop_s->glfs, gconf->image, open_flags); - if (reop_s->fd == NULL) { - /* reops->glfs will be cleaned up in _abort */ - ret = -errno; - goto exit; - } - -exit: - /* state->opaque will be freed in either the 
_abort or _commit */ - qemu_gluster_gconf_free(gconf); - return ret; -} - -static void qemu_gluster_reopen_commit(BDRVReopenState *state) -{ - BDRVGlusterReopenState *reop_s = state->opaque; - BDRVGlusterState *s = state->bs->opaque; - - - /* close the old */ - if (s->fd) { - glfs_close(s->fd); - } - if (s->glfs) { - glfs_fini(s->glfs); - } - - /* use the newly opened image / connection */ - s->fd = reop_s->fd; - s->glfs = reop_s->glfs; - - g_free(state->opaque); - state->opaque = NULL; - - return; -} - - -static void qemu_gluster_reopen_abort(BDRVReopenState *state) -{ - BDRVGlusterReopenState *reop_s = state->opaque; - - if (reop_s == NULL) { - return; - } - - if (reop_s->fd) { - glfs_close(reop_s->fd); - } - - if (reop_s->glfs) { - glfs_fini(reop_s->glfs); - } - - g_free(state->opaque); - state->opaque = NULL; - - return; -} - -#ifdef CONFIG_GLUSTERFS_ZEROFILL -static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) -{ - int ret; - GlusterAIOCB acb; - BDRVGlusterState *s = bs->opaque; - off_t size = nb_sectors * BDRV_SECTOR_SIZE; - off_t offset = sector_num * BDRV_SECTOR_SIZE; - - acb.size = size; - acb.ret = 0; - acb.coroutine = qemu_coroutine_self(); - acb.aio_context = bdrv_get_aio_context(bs); - - ret = glfs_zerofill_async(s->fd, offset, size, gluster_finish_aiocb, &acb); - if (ret < 0) { - return -errno; - } - - qemu_coroutine_yield(); - return acb.ret; -} - -static inline bool gluster_supports_zerofill(void) -{ - return 1; -} - -static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, - int64_t size) -{ - return glfs_zerofill(fd, offset, size); -} - -#else -static inline bool gluster_supports_zerofill(void) -{ - return 0; -} - -static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, - int64_t size) -{ - return 0; -} -#endif - -static int qemu_gluster_create(const char *filename, - QemuOpts *opts, Error **errp) -{ - struct glfs *glfs; - 
struct glfs_fd *fd; - int ret = 0; - int prealloc = 0; - int64_t total_size = 0; - char *tmp = NULL; - GlusterConf *gconf = g_new0(GlusterConf, 1); - - glfs = qemu_gluster_init(gconf, filename, errp); - if (!glfs) { - ret = -errno; - goto out; - } - - total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - - tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); - if (!tmp || !strcmp(tmp, "off")) { - prealloc = 0; - } else if (!strcmp(tmp, "full") && - gluster_supports_zerofill()) { - prealloc = 1; - } else { - error_setg(errp, "Invalid preallocation mode: '%s'" - " or GlusterFS doesn't support zerofill API", - tmp); - ret = -EINVAL; - goto out; - } - - fd = glfs_creat(glfs, gconf->image, - O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR); - if (!fd) { - ret = -errno; - } else { - if (!glfs_ftruncate(fd, total_size)) { - if (prealloc && qemu_gluster_zerofill(fd, 0, total_size)) { - ret = -errno; - } - } else { - ret = -errno; - } - - if (glfs_close(fd) != 0) { - ret = -errno; - } - } -out: - g_free(tmp); - qemu_gluster_gconf_free(gconf); - if (glfs) { - glfs_fini(glfs); - } - return ret; -} - -static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write) -{ - int ret; - GlusterAIOCB acb; - BDRVGlusterState *s = bs->opaque; - size_t size = nb_sectors * BDRV_SECTOR_SIZE; - off_t offset = sector_num * BDRV_SECTOR_SIZE; - - acb.size = size; - acb.ret = 0; - acb.coroutine = qemu_coroutine_self(); - acb.aio_context = bdrv_get_aio_context(bs); - - if (write) { - ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0, - gluster_finish_aiocb, &acb); - } else { - ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0, - gluster_finish_aiocb, &acb); - } - - if (ret < 0) { - return -errno; - } - - qemu_coroutine_yield(); - return acb.ret; -} - -static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset) -{ - int ret; - 
BDRVGlusterState *s = bs->opaque; - - ret = glfs_ftruncate(s->fd, offset); - if (ret < 0) { - return -errno; - } - - return 0; -} - -static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0); -} - -static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1); -} - -static void qemu_gluster_close(BlockDriverState *bs) -{ - BDRVGlusterState *s = bs->opaque; - - if (s->fd) { - glfs_close(s->fd); - s->fd = NULL; - } - glfs_fini(s->glfs); -} - -static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs) -{ - int ret; - GlusterAIOCB acb; - BDRVGlusterState *s = bs->opaque; - - acb.size = 0; - acb.ret = 0; - acb.coroutine = qemu_coroutine_self(); - acb.aio_context = bdrv_get_aio_context(bs); - - ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb); - if (ret < 0) { - ret = -errno; - goto error; - } - - qemu_coroutine_yield(); - if (acb.ret < 0) { - ret = acb.ret; - goto error; - } - - return acb.ret; - -error: - /* Some versions of Gluster (3.5.6 -> 3.5.8?) will not retain its cache - * after a fsync failure, so we have no way of allowing the guest to safely - * continue. Gluster versions prior to 3.5.6 don't retain the cache - * either, but will invalidate the fd on error, so this is again our only - * option. - * - * The 'resync-failed-syncs-after-fsync' xlator option for the - * write-behind cache will cause later gluster versions to retain its - * cache after error, so long as the fd remains open. However, we - * currently have no way of knowing if this option is supported. - * - * TODO: Once gluster provides a way for us to determine if the option - * is supported, bypass the closure and setting drv to NULL. 
*/ - qemu_gluster_close(bs); - bs->drv = NULL; - return ret; -} - -#ifdef CONFIG_GLUSTERFS_DISCARD -static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) -{ - int ret; - GlusterAIOCB acb; - BDRVGlusterState *s = bs->opaque; - size_t size = nb_sectors * BDRV_SECTOR_SIZE; - off_t offset = sector_num * BDRV_SECTOR_SIZE; - - acb.size = 0; - acb.ret = 0; - acb.coroutine = qemu_coroutine_self(); - acb.aio_context = bdrv_get_aio_context(bs); - - ret = glfs_discard_async(s->fd, offset, size, gluster_finish_aiocb, &acb); - if (ret < 0) { - return -errno; - } - - qemu_coroutine_yield(); - return acb.ret; -} -#endif - -static int64_t qemu_gluster_getlength(BlockDriverState *bs) -{ - BDRVGlusterState *s = bs->opaque; - int64_t ret; - - ret = glfs_lseek(s->fd, 0, SEEK_END); - if (ret < 0) { - return -errno; - } else { - return ret; - } -} - -static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs) -{ - BDRVGlusterState *s = bs->opaque; - struct stat st; - int ret; - - ret = glfs_fstat(s->fd, &st); - if (ret < 0) { - return -errno; - } else { - return st.st_blocks * 512; - } -} - -static int qemu_gluster_has_zero_init(BlockDriverState *bs) -{ - /* GlusterFS volume could be backed by a block device */ - return 0; -} - -static QemuOptsList qemu_gluster_create_opts = { - .name = "qemu-gluster-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_PREALLOC, - .type = QEMU_OPT_STRING, - .help = "Preallocation mode (allowed values: off, full)" - }, - { /* end of list */ } - } -}; - -static BlockDriver bdrv_gluster = { - .format_name = "gluster", - .protocol_name = "gluster", - .instance_size = sizeof(BDRVGlusterState), - .bdrv_needs_filename = true, - .bdrv_file_open = qemu_gluster_open, - .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, - .bdrv_reopen_commit = 
qemu_gluster_reopen_commit, - .bdrv_reopen_abort = qemu_gluster_reopen_abort, - .bdrv_close = qemu_gluster_close, - .bdrv_create = qemu_gluster_create, - .bdrv_getlength = qemu_gluster_getlength, - .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, - .bdrv_truncate = qemu_gluster_truncate, - .bdrv_co_readv = qemu_gluster_co_readv, - .bdrv_co_writev = qemu_gluster_co_writev, - .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, - .bdrv_has_zero_init = qemu_gluster_has_zero_init, -#ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, -#endif -#ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, -#endif - .create_opts = &qemu_gluster_create_opts, -}; - -static BlockDriver bdrv_gluster_tcp = { - .format_name = "gluster", - .protocol_name = "gluster+tcp", - .instance_size = sizeof(BDRVGlusterState), - .bdrv_needs_filename = true, - .bdrv_file_open = qemu_gluster_open, - .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, - .bdrv_reopen_commit = qemu_gluster_reopen_commit, - .bdrv_reopen_abort = qemu_gluster_reopen_abort, - .bdrv_close = qemu_gluster_close, - .bdrv_create = qemu_gluster_create, - .bdrv_getlength = qemu_gluster_getlength, - .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, - .bdrv_truncate = qemu_gluster_truncate, - .bdrv_co_readv = qemu_gluster_co_readv, - .bdrv_co_writev = qemu_gluster_co_writev, - .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, - .bdrv_has_zero_init = qemu_gluster_has_zero_init, -#ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, -#endif -#ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, -#endif - .create_opts = &qemu_gluster_create_opts, -}; - -static BlockDriver bdrv_gluster_unix = { - .format_name = "gluster", - .protocol_name = "gluster+unix", - .instance_size = sizeof(BDRVGlusterState), - .bdrv_needs_filename = true, - .bdrv_file_open = qemu_gluster_open, - 
.bdrv_reopen_prepare = qemu_gluster_reopen_prepare, - .bdrv_reopen_commit = qemu_gluster_reopen_commit, - .bdrv_reopen_abort = qemu_gluster_reopen_abort, - .bdrv_close = qemu_gluster_close, - .bdrv_create = qemu_gluster_create, - .bdrv_getlength = qemu_gluster_getlength, - .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, - .bdrv_truncate = qemu_gluster_truncate, - .bdrv_co_readv = qemu_gluster_co_readv, - .bdrv_co_writev = qemu_gluster_co_writev, - .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, - .bdrv_has_zero_init = qemu_gluster_has_zero_init, -#ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, -#endif -#ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, -#endif - .create_opts = &qemu_gluster_create_opts, -}; - -static BlockDriver bdrv_gluster_rdma = { - .format_name = "gluster", - .protocol_name = "gluster+rdma", - .instance_size = sizeof(BDRVGlusterState), - .bdrv_needs_filename = true, - .bdrv_file_open = qemu_gluster_open, - .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, - .bdrv_reopen_commit = qemu_gluster_reopen_commit, - .bdrv_reopen_abort = qemu_gluster_reopen_abort, - .bdrv_close = qemu_gluster_close, - .bdrv_create = qemu_gluster_create, - .bdrv_getlength = qemu_gluster_getlength, - .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, - .bdrv_truncate = qemu_gluster_truncate, - .bdrv_co_readv = qemu_gluster_co_readv, - .bdrv_co_writev = qemu_gluster_co_writev, - .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, - .bdrv_has_zero_init = qemu_gluster_has_zero_init, -#ifdef CONFIG_GLUSTERFS_DISCARD - .bdrv_co_discard = qemu_gluster_co_discard, -#endif -#ifdef CONFIG_GLUSTERFS_ZEROFILL - .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, -#endif - .create_opts = &qemu_gluster_create_opts, -}; - -static void bdrv_gluster_init(void) -{ - bdrv_register(&bdrv_gluster_rdma); - bdrv_register(&bdrv_gluster_unix); - bdrv_register(&bdrv_gluster_tcp); - 
bdrv_register(&bdrv_gluster); -} - -block_init(bdrv_gluster_init); diff --git a/qemu/block/io.c b/qemu/block/io.c deleted file mode 100644 index a7dbf85b1..000000000 --- a/qemu/block/io.c +++ /dev/null @@ -1,2810 +0,0 @@ -/* - * Block layer I/O functions - * - * Copyright (c) 2003 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -#include "qemu/osdep.h" -#include "trace.h" -#include "sysemu/block-backend.h" -#include "block/blockjob.h" -#include "block/block_int.h" -#include "block/throttle-groups.h" -#include "qemu/cutils.h" -#include "qapi/error.h" -#include "qemu/error-report.h" - -#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ - -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque); -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov); -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write); -static void coroutine_fn bdrv_co_do_rw(void *opaque); -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); - -/* throttling disk I/O limits */ -void bdrv_set_io_limits(BlockDriverState *bs, - ThrottleConfig *cfg) -{ - int i; - - throttle_group_config(bs, cfg); - - for (i = 0; i < 2; i++) { - qemu_co_enter_next(&bs->throttled_reqs[i]); - } -} - -/* this function drain all the throttled IOs */ -static bool bdrv_start_throttled_reqs(BlockDriverState *bs) -{ - bool drained = false; - bool enabled = bs->io_limits_enabled; - int i; - - bs->io_limits_enabled = false; - - for (i = 0; i < 2; i++) { - while (qemu_co_enter_next(&bs->throttled_reqs[i])) { - drained = true; - } - } - - bs->io_limits_enabled = enabled; - - return drained; -} - -void 
bdrv_io_limits_disable(BlockDriverState *bs) -{ - bs->io_limits_enabled = false; - bdrv_start_throttled_reqs(bs); - throttle_group_unregister_bs(bs); -} - -/* should be called before bdrv_set_io_limits if a limit is set */ -void bdrv_io_limits_enable(BlockDriverState *bs, const char *group) -{ - assert(!bs->io_limits_enabled); - throttle_group_register_bs(bs, group); - bs->io_limits_enabled = true; -} - -void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group) -{ - /* this bs is not part of any group */ - if (!bs->throttle_state) { - return; - } - - /* this bs is a part of the same group than the one we want */ - if (!g_strcmp0(throttle_group_get_name(bs), group)) { - return; - } - - /* need to change the group this bs belong to */ - bdrv_io_limits_disable(bs); - bdrv_io_limits_enable(bs, group); -} - -void bdrv_setup_io_funcs(BlockDriver *bdrv) -{ - /* Block drivers without coroutine functions need emulation */ - if (!bdrv->bdrv_co_readv) { - bdrv->bdrv_co_readv = bdrv_co_readv_em; - bdrv->bdrv_co_writev = bdrv_co_writev_em; - - /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if - * the block driver lacks aio we need to emulate that too. 
- */ - if (!bdrv->bdrv_aio_readv) { - /* add AIO emulation layer */ - bdrv->bdrv_aio_readv = bdrv_aio_readv_em; - bdrv->bdrv_aio_writev = bdrv_aio_writev_em; - } - } -} - -void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) -{ - BlockDriver *drv = bs->drv; - Error *local_err = NULL; - - memset(&bs->bl, 0, sizeof(bs->bl)); - - if (!drv) { - return; - } - - /* Take some limits from the children as a default */ - if (bs->file) { - bdrv_refresh_limits(bs->file->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length; - bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length; - bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment; - bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment; - bs->bl.max_iov = bs->file->bs->bl.max_iov; - } else { - bs->bl.min_mem_alignment = 512; - bs->bl.opt_mem_alignment = getpagesize(); - - /* Safe default since most protocols use readv()/writev()/etc */ - bs->bl.max_iov = IOV_MAX; - } - - if (bs->backing) { - bdrv_refresh_limits(bs->backing->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - bs->bl.opt_transfer_length = - MAX(bs->bl.opt_transfer_length, - bs->backing->bs->bl.opt_transfer_length); - bs->bl.max_transfer_length = - MIN_NON_ZERO(bs->bl.max_transfer_length, - bs->backing->bs->bl.max_transfer_length); - bs->bl.opt_mem_alignment = - MAX(bs->bl.opt_mem_alignment, - bs->backing->bs->bl.opt_mem_alignment); - bs->bl.min_mem_alignment = - MAX(bs->bl.min_mem_alignment, - bs->backing->bs->bl.min_mem_alignment); - bs->bl.max_iov = - MIN(bs->bl.max_iov, - bs->backing->bs->bl.max_iov); - } - - /* Then let the driver override it */ - if (drv->bdrv_refresh_limits) { - drv->bdrv_refresh_limits(bs, errp); - } -} - -/** - * The copy-on-read flag is actually a reference count so multiple users may - * use the feature without worrying about clobbering its previous state. 
- * Copy-on-read stays enabled until all users have called to disable it. - */ -void bdrv_enable_copy_on_read(BlockDriverState *bs) -{ - bs->copy_on_read++; -} - -void bdrv_disable_copy_on_read(BlockDriverState *bs) -{ - assert(bs->copy_on_read > 0); - bs->copy_on_read--; -} - -/* Check if any requests are in-flight (including throttled requests) */ -bool bdrv_requests_pending(BlockDriverState *bs) -{ - BdrvChild *child; - - if (!QLIST_EMPTY(&bs->tracked_requests)) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { - return true; - } - if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { - return true; - } - - QLIST_FOREACH(child, &bs->children, next) { - if (bdrv_requests_pending(child->bs)) { - return true; - } - } - - return false; -} - -static void bdrv_drain_recurse(BlockDriverState *bs) -{ - BdrvChild *child; - - if (bs->drv && bs->drv->bdrv_drain) { - bs->drv->bdrv_drain(bs); - } - QLIST_FOREACH(child, &bs->children, next) { - bdrv_drain_recurse(child->bs); - } -} - -typedef struct { - Coroutine *co; - BlockDriverState *bs; - QEMUBH *bh; - bool done; -} BdrvCoDrainData; - -static void bdrv_co_drain_bh_cb(void *opaque) -{ - BdrvCoDrainData *data = opaque; - Coroutine *co = data->co; - - qemu_bh_delete(data->bh); - bdrv_drain(data->bs); - data->done = true; - qemu_coroutine_enter(co, NULL); -} - -void coroutine_fn bdrv_co_drain(BlockDriverState *bs) -{ - BdrvCoDrainData data; - - /* Calling bdrv_drain() from a BH ensures the current coroutine yields and - * other coroutines run if they were queued from - * qemu_co_queue_run_restart(). */ - - assert(qemu_in_coroutine()); - data = (BdrvCoDrainData) { - .co = qemu_coroutine_self(), - .bs = bs, - .done = false, - .bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_drain_bh_cb, &data), - }; - qemu_bh_schedule(data.bh); - - qemu_coroutine_yield(); - /* If we are resumed from some other event (such as an aio completion or a - * timer callback), it is a bug in the caller that should be fixed. 
*/ - assert(data.done); -} - -/* - * Wait for pending requests to complete on a single BlockDriverState subtree, - * and suspend block driver's internal I/O until next request arrives. - * - * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState - * AioContext. - * - * Only this BlockDriverState's AioContext is run, so in-flight requests must - * not depend on events in other AioContexts. In that case, use - * bdrv_drain_all() instead. - */ -void bdrv_drain(BlockDriverState *bs) -{ - bool busy = true; - - bdrv_drain_recurse(bs); - if (qemu_in_coroutine()) { - bdrv_co_drain(bs); - return; - } - while (busy) { - /* Keep iterating */ - bdrv_flush_io_queue(bs); - busy = bdrv_requests_pending(bs); - busy |= aio_poll(bdrv_get_aio_context(bs), busy); - } -} - -/* - * Wait for pending requests to complete across all BlockDriverStates - * - * This function does not flush data to disk, use bdrv_flush_all() for that - * after calling this function. - */ -void bdrv_drain_all(void) -{ - /* Always run first iteration so any pending completion BHs run */ - bool busy = true; - BlockDriverState *bs = NULL; - GSList *aio_ctxs = NULL, *ctx; - - while ((bs = bdrv_next(bs))) { - AioContext *aio_context = bdrv_get_aio_context(bs); - - aio_context_acquire(aio_context); - if (bs->job) { - block_job_pause(bs->job); - } - bdrv_drain_recurse(bs); - aio_context_release(aio_context); - - if (!g_slist_find(aio_ctxs, aio_context)) { - aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); - } - } - - /* Note that completion of an asynchronous I/O operation can trigger any - * number of other I/O operations on other devices---for example a - * coroutine can submit an I/O request to another device in response to - * request completion. Therefore we must keep looping until there was no - * more activity rather than simply draining each device independently. 
- */ - while (busy) { - busy = false; - - for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { - AioContext *aio_context = ctx->data; - bs = NULL; - - aio_context_acquire(aio_context); - while ((bs = bdrv_next(bs))) { - if (aio_context == bdrv_get_aio_context(bs)) { - bdrv_flush_io_queue(bs); - if (bdrv_requests_pending(bs)) { - busy = true; - aio_poll(aio_context, busy); - } - } - } - busy |= aio_poll(aio_context, false); - aio_context_release(aio_context); - } - } - - bs = NULL; - while ((bs = bdrv_next(bs))) { - AioContext *aio_context = bdrv_get_aio_context(bs); - - aio_context_acquire(aio_context); - if (bs->job) { - block_job_resume(bs->job); - } - aio_context_release(aio_context); - } - g_slist_free(aio_ctxs); -} - -/** - * Remove an active request from the tracked requests list - * - * This function should be called when a tracked request is completing. - */ -static void tracked_request_end(BdrvTrackedRequest *req) -{ - if (req->serialising) { - req->bs->serialising_in_flight--; - } - - QLIST_REMOVE(req, list); - qemu_co_queue_restart_all(&req->wait_queue); -} - -/** - * Add an active request to the tracked requests list - */ -static void tracked_request_begin(BdrvTrackedRequest *req, - BlockDriverState *bs, - int64_t offset, - unsigned int bytes, - enum BdrvTrackedRequestType type) -{ - *req = (BdrvTrackedRequest){ - .bs = bs, - .offset = offset, - .bytes = bytes, - .type = type, - .co = qemu_coroutine_self(), - .serialising = false, - .overlap_offset = offset, - .overlap_bytes = bytes, - }; - - qemu_co_queue_init(&req->wait_queue); - - QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); -} - -static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) -{ - int64_t overlap_offset = req->offset & ~(align - 1); - unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) - - overlap_offset; - - if (!req->serialising) { - req->bs->serialising_in_flight++; - req->serialising = true; - } - - req->overlap_offset = 
MIN(req->overlap_offset, overlap_offset); - req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); -} - -/** - * Round a region to cluster boundaries - */ -void bdrv_round_to_clusters(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - int64_t *cluster_sector_num, - int *cluster_nb_sectors) -{ - BlockDriverInfo bdi; - - if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { - *cluster_sector_num = sector_num; - *cluster_nb_sectors = nb_sectors; - } else { - int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; - *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); - *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + - nb_sectors, c); - } -} - -static int bdrv_get_cluster_size(BlockDriverState *bs) -{ - BlockDriverInfo bdi; - int ret; - - ret = bdrv_get_info(bs, &bdi); - if (ret < 0 || bdi.cluster_size == 0) { - return bs->request_alignment; - } else { - return bdi.cluster_size; - } -} - -static bool tracked_request_overlaps(BdrvTrackedRequest *req, - int64_t offset, unsigned int bytes) -{ - /* aaaa bbbb */ - if (offset >= req->overlap_offset + req->overlap_bytes) { - return false; - } - /* bbbb aaaa */ - if (req->overlap_offset >= offset + bytes) { - return false; - } - return true; -} - -static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) -{ - BlockDriverState *bs = self->bs; - BdrvTrackedRequest *req; - bool retry; - bool waited = false; - - if (!bs->serialising_in_flight) { - return false; - } - - do { - retry = false; - QLIST_FOREACH(req, &bs->tracked_requests, list) { - if (req == self || (!req->serialising && !self->serialising)) { - continue; - } - if (tracked_request_overlaps(req, self->overlap_offset, - self->overlap_bytes)) - { - /* Hitting this means there was a reentrant request, for - * example, a block driver issuing nested requests. This must - * never happen since it means deadlock. 
- */ - assert(qemu_coroutine_self() != req->co); - - /* If the request is already (indirectly) waiting for us, or - * will wait for us as soon as it wakes up, then just go on - * (instead of producing a deadlock in the former case). */ - if (!req->waiting_for) { - self->waiting_for = req; - qemu_co_queue_wait(&req->wait_queue); - self->waiting_for = NULL; - retry = true; - waited = true; - break; - } - } - } - } while (retry); - - return waited; -} - -static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, - size_t size) -{ - if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { - return -EIO; - } - - if (!bdrv_is_inserted(bs)) { - return -ENOMEDIUM; - } - - if (offset < 0) { - return -EIO; - } - - return 0; -} - -static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EIO; - } - - return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE); -} - -typedef struct RwCo { - BlockDriverState *bs; - int64_t offset; - QEMUIOVector *qiov; - bool is_write; - int ret; - BdrvRequestFlags flags; -} RwCo; - -static void coroutine_fn bdrv_rw_co_entry(void *opaque) -{ - RwCo *rwco = opaque; - - if (!rwco->is_write) { - rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); - } else { - rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, - rwco->qiov->size, rwco->qiov, - rwco->flags); - } -} - -/* - * Process a vectored synchronous request using coroutines - */ -static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, - QEMUIOVector *qiov, bool is_write, - BdrvRequestFlags flags) -{ - Coroutine *co; - RwCo rwco = { - .bs = bs, - .offset = offset, - .qiov = qiov, - .is_write = is_write, - .ret = NOT_DONE, - .flags = flags, - }; - - /** - * In sync call context, when the vcpu is blocked, this throttling timer - * will not fire; so the I/O throttling function 
has to be disabled here - * if it has been enabled. - */ - if (bs->io_limits_enabled) { - fprintf(stderr, "Disabling I/O throttling on '%s' due " - "to synchronous I/O.\n", bdrv_get_device_name(bs)); - bdrv_io_limits_disable(bs); - } - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_rw_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_rw_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - return rwco.ret; -} - -/* - * Process a synchronous request using coroutines - */ -static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, - int nb_sectors, bool is_write, BdrvRequestFlags flags) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *)buf, - .iov_len = nb_sectors * BDRV_SECTOR_SIZE, - }; - - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, - &qiov, is_write, flags); -} - -/* return < 0 if error. See bdrv_write() for the return codes */ -int bdrv_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); -} - -/* Return < 0 if error. Important errors are: - -EIO generic I/O error (may happen for all errors) - -ENOMEDIUM No media inserted. 
- -EINVAL Invalid sector number or nb_sectors - -EACCES Trying to write a read-only device -*/ -int bdrv_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); -} - -int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) -{ - return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, - BDRV_REQ_ZERO_WRITE | flags); -} - -/* - * Completely zero out a block device with the help of bdrv_write_zeroes. - * The operation is sped up by checking the block status and only writing - * zeroes to the device if they currently do not return zeroes. Optional - * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). - * - * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). - */ -int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) -{ - int64_t target_sectors, ret, nb_sectors, sector_num = 0; - BlockDriverState *file; - int n; - - target_sectors = bdrv_nb_sectors(bs); - if (target_sectors < 0) { - return target_sectors; - } - - for (;;) { - nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); - if (nb_sectors <= 0) { - return 0; - } - ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file); - if (ret < 0) { - error_report("error getting block status at sector %" PRId64 ": %s", - sector_num, strerror(-ret)); - return ret; - } - if (ret & BDRV_BLOCK_ZERO) { - sector_num += n; - continue; - } - ret = bdrv_write_zeroes(bs, sector_num, n, flags); - if (ret < 0) { - error_report("error writing zeroes at sector %" PRId64 ": %s", - sector_num, strerror(-ret)); - return ret; - } - sector_num += n; - } -} - -int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *)buf, - .iov_len = bytes, - }; - int ret; - - if (bytes < 0) { - return -EINVAL; - } - - 
qemu_iovec_init_external(&qiov, &iov, 1); - ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); - if (ret < 0) { - return ret; - } - - return bytes; -} - -int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) -{ - int ret; - - ret = bdrv_prwv_co(bs, offset, qiov, true, 0); - if (ret < 0) { - return ret; - } - - return qiov->size; -} - -int bdrv_pwrite(BlockDriverState *bs, int64_t offset, - const void *buf, int bytes) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *) buf, - .iov_len = bytes, - }; - - if (bytes < 0) { - return -EINVAL; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_pwritev(bs, offset, &qiov); -} - -/* - * Writes to the file and ensures that no writes are reordered across this - * request (acts as a barrier) - * - * Returns 0 on success, -errno in error cases. - */ -int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, - const void *buf, int count) -{ - int ret; - - ret = bdrv_pwrite(bs, offset, buf, count); - if (ret < 0) { - return ret; - } - - ret = bdrv_flush(bs); - if (ret < 0) { - return ret; - } - - return 0; -} - -static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - /* Perform I/O through a temporary buffer so that users who scribble over - * their read buffer while the operation is in progress do not end up - * modifying the image file. This is critical for zero-copy guest I/O - * where anything might happen inside guest memory. - */ - void *bounce_buffer; - - BlockDriver *drv = bs->drv; - struct iovec iov; - QEMUIOVector bounce_qiov; - int64_t cluster_sector_num; - int cluster_nb_sectors; - size_t skip_bytes; - int ret; - - /* Cover entire cluster so no additional backing file I/O is required when - * allocating cluster in the image file. 
- */ - bdrv_round_to_clusters(bs, sector_num, nb_sectors, - &cluster_sector_num, &cluster_nb_sectors); - - trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, - cluster_sector_num, cluster_nb_sectors); - - iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; - iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); - if (bounce_buffer == NULL) { - ret = -ENOMEM; - goto err; - } - - qemu_iovec_init_external(&bounce_qiov, &iov, 1); - - ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); - if (ret < 0) { - goto err; - } - - if (drv->bdrv_co_write_zeroes && - buffer_is_zero(bounce_buffer, iov.iov_len)) { - ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, - cluster_nb_sectors, 0); - } else { - /* This does not change the data on the disk, it is not necessary - * to flush even in cache=writethrough mode. - */ - ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, - &bounce_qiov); - } - - if (ret < 0) { - /* It might be okay to ignore write errors for guest requests. If this - * is a deliberate copy-on-read then we don't want to ignore the error. - * Simply report it in all cases. - */ - goto err; - } - - skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; - qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, - nb_sectors * BDRV_SECTOR_SIZE); - -err: - qemu_vfree(bounce_buffer); - return ret; -} - -/* - * Forwards an already correctly aligned request to the BlockDriver. This - * handles copy on read and zeroing after EOF; any other features must be - * implemented by the caller. 
- */ -static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, - BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - int64_t align, QEMUIOVector *qiov, int flags) -{ - BlockDriver *drv = bs->drv; - int ret; - - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - assert(!qiov || bytes == qiov->size); - assert((bs->open_flags & BDRV_O_NO_IO) == 0); - - /* Handle Copy on Read and associated serialisation */ - if (flags & BDRV_REQ_COPY_ON_READ) { - /* If we touch the same cluster it counts as an overlap. This - * guarantees that allocating writes will be serialized and not race - * with each other for the same cluster. For example, in copy-on-read - * it ensures that the CoR read and write operations are atomic and - * guest writes cannot interleave between them. */ - mark_request_serialising(req, bdrv_get_cluster_size(bs)); - } - - if (!(flags & BDRV_REQ_NO_SERIALISING)) { - wait_serialising_requests(req); - } - - if (flags & BDRV_REQ_COPY_ON_READ) { - int pnum; - - ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); - if (ret < 0) { - goto out; - } - - if (!ret || pnum != nb_sectors) { - ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); - goto out; - } - } - - /* Forward the request to the BlockDriver */ - if (!bs->zero_beyond_eof) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else { - /* Read zeros after EOF */ - int64_t total_sectors, max_nb_sectors; - - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors < 0) { - ret = total_sectors; - goto out; - } - - max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), - align >> BDRV_SECTOR_BITS); - if (nb_sectors < max_nb_sectors) { - ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); - } else if (max_nb_sectors > 0) { - QEMUIOVector local_qiov; - - qemu_iovec_init(&local_qiov, 
qiov->niov); - qemu_iovec_concat(&local_qiov, qiov, 0, - max_nb_sectors * BDRV_SECTOR_SIZE); - - ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, - &local_qiov); - - qemu_iovec_destroy(&local_qiov); - } else { - ret = 0; - } - - /* Reading beyond end of file is supposed to produce zeroes */ - if (ret == 0 && total_sectors < sector_num + nb_sectors) { - uint64_t offset = MAX(0, total_sectors - sector_num); - uint64_t bytes = (sector_num + nb_sectors - offset) * - BDRV_SECTOR_SIZE; - qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); - } - } - -out: - return ret; -} - -/* - * Handle a read request in coroutine context - */ -int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - BlockDriver *drv = bs->drv; - BdrvTrackedRequest req; - - /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ - uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); - uint8_t *head_buf = NULL; - uint8_t *tail_buf = NULL; - QEMUIOVector local_qiov; - bool use_local_qiov = false; - int ret; - - if (!drv) { - return -ENOMEDIUM; - } - - ret = bdrv_check_byte_request(bs, offset, bytes); - if (ret < 0) { - return ret; - } - - /* Don't do copy-on-read if we read data before write operation */ - if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) { - flags |= BDRV_REQ_COPY_ON_READ; - } - - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - throttle_group_co_io_limits_intercept(bs, bytes, false); - } - - /* Align read if necessary by padding qiov */ - if (offset & (align - 1)) { - head_buf = qemu_blockalign(bs, align); - qemu_iovec_init(&local_qiov, qiov->niov + 2); - qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - - bytes += offset & (align - 1); - offset = offset & ~(align - 1); - } - - if ((offset + bytes) & (align - 1)) { - if (!use_local_qiov) { - 
qemu_iovec_init(&local_qiov, qiov->niov + 1); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - } - tail_buf = qemu_blockalign(bs, align); - qemu_iovec_add(&local_qiov, tail_buf, - align - ((offset + bytes) & (align - 1))); - - bytes = ROUND_UP(bytes, align); - } - - tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); - ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, - use_local_qiov ? &local_qiov : qiov, - flags); - tracked_request_end(&req); - - if (use_local_qiov) { - qemu_iovec_destroy(&local_qiov); - qemu_vfree(head_buf); - qemu_vfree(tail_buf); - } - - return ret; -} - -static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); -} - -int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); -} - -int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, - BDRV_REQ_NO_SERIALISING); -} - -int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); - - return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, - BDRV_REQ_COPY_ON_READ); -} - -#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 - -static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) -{ - BlockDriver 
*drv = bs->drv; - QEMUIOVector qiov; - struct iovec iov = {0}; - int ret = 0; - - int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, - BDRV_REQUEST_MAX_SECTORS); - - while (nb_sectors > 0 && !ret) { - int num = nb_sectors; - - /* Align request. Block drivers can expect the "bulk" of the request - * to be aligned. - */ - if (bs->bl.write_zeroes_alignment - && num > bs->bl.write_zeroes_alignment) { - if (sector_num % bs->bl.write_zeroes_alignment != 0) { - /* Make a small request up to the first aligned sector. */ - num = bs->bl.write_zeroes_alignment; - num -= sector_num % bs->bl.write_zeroes_alignment; - } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { - /* Shorten the request to the last aligned sector. num cannot - * underflow because num > bs->bl.write_zeroes_alignment. - */ - num -= (sector_num + num) % bs->bl.write_zeroes_alignment; - } - } - - /* limit request size */ - if (num > max_write_zeroes) { - num = max_write_zeroes; - } - - ret = -ENOTSUP; - /* First try the efficient write zeroes operation */ - if (drv->bdrv_co_write_zeroes) { - ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); - } - - if (ret == -ENOTSUP) { - /* Fall back to bounce buffer if write zeroes is unsupported */ - int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, - MAX_WRITE_ZEROES_BOUNCE_BUFFER); - num = MIN(num, max_xfer_len); - iov.iov_len = num * BDRV_SECTOR_SIZE; - if (iov.iov_base == NULL) { - iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); - if (iov.iov_base == NULL) { - ret = -ENOMEM; - goto fail; - } - memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); - } - qemu_iovec_init_external(&qiov, &iov, 1); - - ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); - - /* Keep bounce buffer around if it is big enough for all - * all future requests. 
- */ - if (num < max_xfer_len) { - qemu_vfree(iov.iov_base); - iov.iov_base = NULL; - } - } - - sector_num += num; - nb_sectors -= num; - } - -fail: - qemu_vfree(iov.iov_base); - return ret; -} - -/* - * Forwards an already correctly aligned write request to the BlockDriver. - */ -static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, - BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, - QEMUIOVector *qiov, int flags) -{ - BlockDriver *drv = bs->drv; - bool waited; - int ret; - - int64_t sector_num = offset >> BDRV_SECTOR_BITS; - unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; - - assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); - assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - assert(!qiov || bytes == qiov->size); - assert((bs->open_flags & BDRV_O_NO_IO) == 0); - - waited = wait_serialising_requests(req); - assert(!waited || !req->serialising); - assert(req->overlap_offset <= offset); - assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); - - ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); - - if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && - !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && - qemu_iovec_is_zero(qiov)) { - flags |= BDRV_REQ_ZERO_WRITE; - if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { - flags |= BDRV_REQ_MAY_UNMAP; - } - } - - if (ret < 0) { - /* Do nothing, write notifier decided to fail this request */ - } else if (flags & BDRV_REQ_ZERO_WRITE) { - bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); - ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); - } else if (drv->bdrv_co_writev_flags) { - bdrv_debug_event(bs, BLKDBG_PWRITEV); - ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov, - flags); - } else { - assert(drv->supported_write_flags == 0); - bdrv_debug_event(bs, BLKDBG_PWRITEV); - ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); - } - bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); - - if 
(ret == 0 && (flags & BDRV_REQ_FUA) && - !(drv->supported_write_flags & BDRV_REQ_FUA)) - { - ret = bdrv_co_flush(bs); - } - - bdrv_set_dirty(bs, sector_num, nb_sectors); - - if (bs->wr_highest_offset < offset + bytes) { - bs->wr_highest_offset = offset + bytes; - } - - if (ret >= 0) { - bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); - } - - return ret; -} - -static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, - int64_t offset, - unsigned int bytes, - BdrvRequestFlags flags, - BdrvTrackedRequest *req) -{ - uint8_t *buf = NULL; - QEMUIOVector local_qiov; - struct iovec iov; - uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); - unsigned int head_padding_bytes, tail_padding_bytes; - int ret = 0; - - head_padding_bytes = offset & (align - 1); - tail_padding_bytes = align - ((offset + bytes) & (align - 1)); - - - assert(flags & BDRV_REQ_ZERO_WRITE); - if (head_padding_bytes || tail_padding_bytes) { - buf = qemu_blockalign(bs, align); - iov = (struct iovec) { - .iov_base = buf, - .iov_len = align, - }; - qemu_iovec_init_external(&local_qiov, &iov, 1); - } - if (head_padding_bytes) { - uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); - - /* RMW the unaligned part before head. */ - mark_request_serialising(req, align); - wait_serialising_requests(req); - bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); - ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, - align, &local_qiov, 0); - if (ret < 0) { - goto fail; - } - bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); - - memset(buf + head_padding_bytes, 0, zero_bytes); - ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, - &local_qiov, - flags & ~BDRV_REQ_ZERO_WRITE); - if (ret < 0) { - goto fail; - } - offset += zero_bytes; - bytes -= zero_bytes; - } - - assert(!bytes || (offset & (align - 1)) == 0); - if (bytes >= align) { - /* Write the aligned part in the middle. 
*/ - uint64_t aligned_bytes = bytes & ~(align - 1); - ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, - NULL, flags); - if (ret < 0) { - goto fail; - } - bytes -= aligned_bytes; - offset += aligned_bytes; - } - - assert(!bytes || (offset & (align - 1)) == 0); - if (bytes) { - assert(align == tail_padding_bytes + bytes); - /* RMW the unaligned part after tail. */ - mark_request_serialising(req, align); - wait_serialising_requests(req); - bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); - ret = bdrv_aligned_preadv(bs, req, offset, align, - align, &local_qiov, 0); - if (ret < 0) { - goto fail; - } - bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); - - memset(buf, 0, bytes); - ret = bdrv_aligned_pwritev(bs, req, offset, align, - &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); - } -fail: - qemu_vfree(buf); - return ret; - -} - -/* - * Handle a write request in coroutine context - */ -int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - BdrvTrackedRequest req; - /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ - uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); - uint8_t *head_buf = NULL; - uint8_t *tail_buf = NULL; - QEMUIOVector local_qiov; - bool use_local_qiov = false; - int ret; - - if (!bs->drv) { - return -ENOMEDIUM; - } - if (bs->read_only) { - return -EPERM; - } - assert(!(bs->open_flags & BDRV_O_INACTIVE)); - - ret = bdrv_check_byte_request(bs, offset, bytes); - if (ret < 0) { - return ret; - } - - /* throttling disk I/O */ - if (bs->io_limits_enabled) { - throttle_group_co_io_limits_intercept(bs, bytes, true); - } - - /* - * Align write if necessary by performing a read-modify-write cycle. - * Pad qiov with the read parts and be sure to have a tracked request not - * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. 
- */ - tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); - - if (!qiov) { - ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); - goto out; - } - - if (offset & (align - 1)) { - QEMUIOVector head_qiov; - struct iovec head_iov; - - mark_request_serialising(&req, align); - wait_serialising_requests(&req); - - head_buf = qemu_blockalign(bs, align); - head_iov = (struct iovec) { - .iov_base = head_buf, - .iov_len = align, - }; - qemu_iovec_init_external(&head_qiov, &head_iov, 1); - - bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); - ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, - align, &head_qiov, 0); - if (ret < 0) { - goto fail; - } - bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); - - qemu_iovec_init(&local_qiov, qiov->niov + 2); - qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - - bytes += offset & (align - 1); - offset = offset & ~(align - 1); - } - - if ((offset + bytes) & (align - 1)) { - QEMUIOVector tail_qiov; - struct iovec tail_iov; - size_t tail_bytes; - bool waited; - - mark_request_serialising(&req, align); - waited = wait_serialising_requests(&req); - assert(!waited || !use_local_qiov); - - tail_buf = qemu_blockalign(bs, align); - tail_iov = (struct iovec) { - .iov_base = tail_buf, - .iov_len = align, - }; - qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); - - bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); - ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, - align, &tail_qiov, 0); - if (ret < 0) { - goto fail; - } - bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); - - if (!use_local_qiov) { - qemu_iovec_init(&local_qiov, qiov->niov + 1); - qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); - use_local_qiov = true; - } - - tail_bytes = (offset + bytes) & (align - 1); - qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); - - bytes = ROUND_UP(bytes, align); 
- } - - ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, - use_local_qiov ? &local_qiov : qiov, - flags); - -fail: - - if (use_local_qiov) { - qemu_iovec_destroy(&local_qiov); - } - qemu_vfree(head_buf); - qemu_vfree(tail_buf); -out: - tracked_request_end(&req); - return ret; -} - -static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - BdrvRequestFlags flags) -{ - if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { - return -EINVAL; - } - - return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors << BDRV_SECTOR_BITS, qiov, flags); -} - -int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - trace_bdrv_co_writev(bs, sector_num, nb_sectors); - - return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); -} - -int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BdrvRequestFlags flags) -{ - trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); - - if (!(bs->open_flags & BDRV_O_UNMAP)) { - flags &= ~BDRV_REQ_MAY_UNMAP; - } - - return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, - BDRV_REQ_ZERO_WRITE | flags); -} - -typedef struct BdrvCoGetBlockStatusData { - BlockDriverState *bs; - BlockDriverState *base; - BlockDriverState **file; - int64_t sector_num; - int nb_sectors; - int *pnum; - int64_t ret; - bool done; -} BdrvCoGetBlockStatusData; - -/* - * Returns the allocation status of the specified sectors. - * Drivers not implementing the functionality are assumed to not support - * backing files, hence all their sectors are reported as allocated. - * - * If 'sector_num' is beyond the end of the disk image the return value is 0 - * and 'pnum' is set to 0. - * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. 
- * - * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes - * beyond the end of the disk image it will be clamped. - * - * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file' - * points to the BDS which the sector range is allocated in. - */ -static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum, - BlockDriverState **file) -{ - int64_t total_sectors; - int64_t n; - int64_t ret, ret2; - - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors < 0) { - return total_sectors; - } - - if (sector_num >= total_sectors) { - *pnum = 0; - return 0; - } - - n = total_sectors - sector_num; - if (n < nb_sectors) { - nb_sectors = n; - } - - if (!bs->drv->bdrv_co_get_block_status) { - *pnum = nb_sectors; - ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; - if (bs->drv->protocol_name) { - ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); - } - return ret; - } - - *file = NULL; - ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum, - file); - if (ret < 0) { - *pnum = 0; - return ret; - } - - if (ret & BDRV_BLOCK_RAW) { - assert(ret & BDRV_BLOCK_OFFSET_VALID); - return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, - *pnum, pnum, file); - } - - if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { - ret |= BDRV_BLOCK_ALLOCATED; - } else { - if (bdrv_unallocated_blocks_are_zero(bs)) { - ret |= BDRV_BLOCK_ZERO; - } else if (bs->backing) { - BlockDriverState *bs2 = bs->backing->bs; - int64_t nb_sectors2 = bdrv_nb_sectors(bs2); - if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { - ret |= BDRV_BLOCK_ZERO; - } - } - } - - if (*file && *file != bs && - (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && - (ret & BDRV_BLOCK_OFFSET_VALID)) { - BlockDriverState *file2; - int file_pnum; - - ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS, - *pnum, &file_pnum, &file2); - if (ret2 >= 0) { - /* Ignore errors. 
This is just providing extra information, it - * is useful but not necessary. - */ - if (!file_pnum) { - /* !file_pnum indicates an offset at or beyond the EOF; it is - * perfectly valid for the format block driver to point to such - * offsets, so catch it and mark everything as zero */ - ret |= BDRV_BLOCK_ZERO; - } else { - /* Limit request to the range reported by the protocol driver */ - *pnum = file_pnum; - ret |= (ret2 & BDRV_BLOCK_ZERO); - } - } - } - - return ret; -} - -static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, - BlockDriverState *base, - int64_t sector_num, - int nb_sectors, - int *pnum, - BlockDriverState **file) -{ - BlockDriverState *p; - int64_t ret = 0; - - assert(bs != base); - for (p = bs; p != base; p = backing_bs(p)) { - ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file); - if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { - break; - } - /* [sector_num, pnum] unallocated on this layer, which could be only - * the first part of [sector_num, nb_sectors]. */ - nb_sectors = MIN(nb_sectors, *pnum); - } - return ret; -} - -/* Coroutine wrapper for bdrv_get_block_status_above() */ -static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) -{ - BdrvCoGetBlockStatusData *data = opaque; - - data->ret = bdrv_co_get_block_status_above(data->bs, data->base, - data->sector_num, - data->nb_sectors, - data->pnum, - data->file); - data->done = true; -} - -/* - * Synchronous wrapper around bdrv_co_get_block_status_above(). - * - * See bdrv_co_get_block_status_above() for details. 
- */ -int64_t bdrv_get_block_status_above(BlockDriverState *bs, - BlockDriverState *base, - int64_t sector_num, - int nb_sectors, int *pnum, - BlockDriverState **file) -{ - Coroutine *co; - BdrvCoGetBlockStatusData data = { - .bs = bs, - .base = base, - .file = file, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .pnum = pnum, - .done = false, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_get_block_status_above_co_entry(&data); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); - qemu_coroutine_enter(co, &data); - while (!data.done) { - aio_poll(aio_context, true); - } - } - return data.ret; -} - -int64_t bdrv_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum, - BlockDriverState **file) -{ - return bdrv_get_block_status_above(bs, backing_bs(bs), - sector_num, nb_sectors, pnum, file); -} - -int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) -{ - BlockDriverState *file; - int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum, - &file); - if (ret < 0) { - return ret; - } - return !!(ret & BDRV_BLOCK_ALLOCATED); -} - -/* - * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] - * - * Return true if the given sector is allocated in any image between - * BASE and TOP (inclusive). BASE can be NULL to check if the given - * sector is allocated in any image of the chain. Return false otherwise. - * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. 
- * - */ -int bdrv_is_allocated_above(BlockDriverState *top, - BlockDriverState *base, - int64_t sector_num, - int nb_sectors, int *pnum) -{ - BlockDriverState *intermediate; - int ret, n = nb_sectors; - - intermediate = top; - while (intermediate && intermediate != base) { - int pnum_inter; - ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, - &pnum_inter); - if (ret < 0) { - return ret; - } else if (ret) { - *pnum = pnum_inter; - return 1; - } - - /* - * [sector_num, nb_sectors] is unallocated on top but intermediate - * might have - * - * [sector_num+x, nr_sectors] allocated. - */ - if (n > pnum_inter && - (intermediate == top || - sector_num + pnum_inter < intermediate->total_sectors)) { - n = pnum_inter; - } - - intermediate = backing_bs(intermediate); - } - - *pnum = n; - return 0; -} - -int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - BlockDriver *drv = bs->drv; - int ret; - - if (!drv) { - return -ENOMEDIUM; - } - if (!drv->bdrv_write_compressed) { - return -ENOTSUP; - } - ret = bdrv_check_request(bs, sector_num, nb_sectors); - if (ret < 0) { - return ret; - } - - assert(QLIST_EMPTY(&bs->dirty_bitmaps)); - - return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); -} - -int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, - int64_t pos, int size) -{ - QEMUIOVector qiov; - struct iovec iov = { - .iov_base = (void *) buf, - .iov_len = size, - }; - - qemu_iovec_init_external(&qiov, &iov, 1); - return bdrv_writev_vmstate(bs, &qiov, pos); -} - -int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) -{ - BlockDriver *drv = bs->drv; - - if (!drv) { - return -ENOMEDIUM; - } else if (drv->bdrv_save_vmstate) { - return drv->bdrv_save_vmstate(bs, qiov, pos); - } else if (bs->file) { - return bdrv_writev_vmstate(bs->file->bs, qiov, pos); - } - - return -ENOTSUP; -} - -int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, - int64_t pos, int size) 
-{ - BlockDriver *drv = bs->drv; - if (!drv) - return -ENOMEDIUM; - if (drv->bdrv_load_vmstate) - return drv->bdrv_load_vmstate(bs, buf, pos, size); - if (bs->file) - return bdrv_load_vmstate(bs->file->bs, buf, pos, size); - return -ENOTSUP; -} - -/**************************************************************/ -/* async I/Os */ - -BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, false); -} - -BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, - cb, opaque, true); -} - -BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); - - return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, - BDRV_REQ_ZERO_WRITE | flags, - cb, opaque, true); -} - - -typedef struct MultiwriteCB { - int error; - int num_requests; - int num_callbacks; - struct { - BlockCompletionFunc *cb; - void *opaque; - QEMUIOVector *free_qiov; - } callbacks[]; -} MultiwriteCB; - -static void multiwrite_user_cb(MultiwriteCB *mcb) -{ - int i; - - for (i = 0; i < mcb->num_callbacks; i++) { - mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); - if (mcb->callbacks[i].free_qiov) { - qemu_iovec_destroy(mcb->callbacks[i].free_qiov); - } - g_free(mcb->callbacks[i].free_qiov); - } -} - -static void multiwrite_cb(void *opaque, int ret) -{ - MultiwriteCB *mcb = opaque; - - trace_multiwrite_cb(mcb, ret); - - if (ret < 0 && !mcb->error) { - mcb->error = 
ret; - } - - mcb->num_requests--; - if (mcb->num_requests == 0) { - multiwrite_user_cb(mcb); - g_free(mcb); - } -} - -static int multiwrite_req_compare(const void *a, const void *b) -{ - const BlockRequest *req1 = a, *req2 = b; - - /* - * Note that we can't simply subtract req2->sector from req1->sector - * here as that could overflow the return value. - */ - if (req1->sector > req2->sector) { - return 1; - } else if (req1->sector < req2->sector) { - return -1; - } else { - return 0; - } -} - -/* - * Takes a bunch of requests and tries to merge them. Returns the number of - * requests that remain after merging. - */ -static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, - int num_reqs, MultiwriteCB *mcb) -{ - int i, outidx; - - // Sort requests by start sector - qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); - - // Check if adjacent requests touch the same clusters. If so, combine them, - // filling up gaps with zero sectors. - outidx = 0; - for (i = 1; i < num_reqs; i++) { - int merge = 0; - int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; - - // Handle exactly sequential writes and overlapping writes. - if (reqs[i].sector <= oldreq_last) { - merge = 1; - } - - if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > - bs->bl.max_iov) { - merge = 0; - } - - if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + - reqs[i].nb_sectors > bs->bl.max_transfer_length) { - merge = 0; - } - - if (merge) { - size_t size; - QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); - qemu_iovec_init(qiov, - reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); - - // Add the first request to the merged one. If the requests are - // overlapping, drop the last sectors of the first request. 
- size = (reqs[i].sector - reqs[outidx].sector) << 9; - qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); - - // We should need to add any zeros between the two requests - assert (reqs[i].sector <= oldreq_last); - - // Add the second request - qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); - - // Add tail of first request, if necessary - if (qiov->size < reqs[outidx].qiov->size) { - qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, - reqs[outidx].qiov->size - qiov->size); - } - - reqs[outidx].nb_sectors = qiov->size >> 9; - reqs[outidx].qiov = qiov; - - mcb->callbacks[i].free_qiov = reqs[outidx].qiov; - } else { - outidx++; - reqs[outidx].sector = reqs[i].sector; - reqs[outidx].nb_sectors = reqs[i].nb_sectors; - reqs[outidx].qiov = reqs[i].qiov; - } - } - - if (bs->blk) { - block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE, - num_reqs - outidx - 1); - } - - return outidx + 1; -} - -/* - * Submit multiple AIO write requests at once. - * - * On success, the function returns 0 and all requests in the reqs array have - * been submitted. In error case this function returns -1, and any of the - * requests may or may not be submitted yet. In particular, this means that the - * callback will be called for some of the requests, for others it won't. The - * caller must check the error field of the BlockRequest to wait for the right - * callbacks (if error != 0, no callback will be called). - * - * The implementation may modify the contents of the reqs array, e.g. to merge - * requests. However, the fields opaque and error are left unmodified as they - * are used to signal failure for a single request to the caller. 
- */ -int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) -{ - MultiwriteCB *mcb; - int i; - - /* don't submit writes if we don't have a medium */ - if (bs->drv == NULL) { - for (i = 0; i < num_reqs; i++) { - reqs[i].error = -ENOMEDIUM; - } - return -1; - } - - if (num_reqs == 0) { - return 0; - } - - // Create MultiwriteCB structure - mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); - mcb->num_requests = 0; - mcb->num_callbacks = num_reqs; - - for (i = 0; i < num_reqs; i++) { - mcb->callbacks[i].cb = reqs[i].cb; - mcb->callbacks[i].opaque = reqs[i].opaque; - } - - // Check for mergable requests - num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); - - trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); - - /* Run the aio requests. */ - mcb->num_requests = num_reqs; - for (i = 0; i < num_reqs; i++) { - bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, - reqs[i].nb_sectors, reqs[i].flags, - multiwrite_cb, mcb, - true); - } - - return 0; -} - -void bdrv_aio_cancel(BlockAIOCB *acb) -{ - qemu_aio_ref(acb); - bdrv_aio_cancel_async(acb); - while (acb->refcnt > 1) { - if (acb->aiocb_info->get_aio_context) { - aio_poll(acb->aiocb_info->get_aio_context(acb), true); - } else if (acb->bs) { - aio_poll(bdrv_get_aio_context(acb->bs), true); - } else { - abort(); - } - } - qemu_aio_unref(acb); -} - -/* Async version of aio cancel. The caller is not blocked if the acb implements - * cancel_async, otherwise we do nothing and let the request normally complete. - * In either case the completion callback must be called. 
*/ -void bdrv_aio_cancel_async(BlockAIOCB *acb) -{ - if (acb->aiocb_info->cancel_async) { - acb->aiocb_info->cancel_async(acb); - } -} - -/**************************************************************/ -/* async block device emulation */ - -typedef struct BlockAIOCBSync { - BlockAIOCB common; - QEMUBH *bh; - int ret; - /* vector translation state */ - QEMUIOVector *qiov; - uint8_t *bounce; - int is_write; -} BlockAIOCBSync; - -static const AIOCBInfo bdrv_em_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBSync), -}; - -static void bdrv_aio_bh_cb(void *opaque) -{ - BlockAIOCBSync *acb = opaque; - - if (!acb->is_write && acb->ret >= 0) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); - } - qemu_vfree(acb->bounce); - acb->common.cb(acb->common.opaque, acb->ret); - qemu_bh_delete(acb->bh); - acb->bh = NULL; - qemu_aio_unref(acb); -} - -static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque, - int is_write) - -{ - BlockAIOCBSync *acb; - - acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); - acb->is_write = is_write; - acb->qiov = qiov; - acb->bounce = qemu_try_blockalign(bs, qiov->size); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); - - if (acb->bounce == NULL) { - acb->ret = -ENOMEM; - } else if (is_write) { - qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); - acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); - } else { - acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); - } - - qemu_bh_schedule(acb->bh); - - return &acb->common; -} - -static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); -} - -static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, - int64_t sector_num, 
QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); -} - - -typedef struct BlockAIOCBCoroutine { - BlockAIOCB common; - BlockRequest req; - bool is_write; - bool need_bh; - bool *done; - QEMUBH* bh; -} BlockAIOCBCoroutine; - -static const AIOCBInfo bdrv_em_co_aiocb_info = { - .aiocb_size = sizeof(BlockAIOCBCoroutine), -}; - -static void bdrv_co_complete(BlockAIOCBCoroutine *acb) -{ - if (!acb->need_bh) { - acb->common.cb(acb->common.opaque, acb->req.error); - qemu_aio_unref(acb); - } -} - -static void bdrv_co_em_bh(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - - assert(!acb->need_bh); - qemu_bh_delete(acb->bh); - bdrv_co_complete(acb); -} - -static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) -{ - acb->need_bh = false; - if (acb->req.error != -EINPROGRESS) { - BlockDriverState *bs = acb->common.bs; - - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); - qemu_bh_schedule(acb->bh); - } -} - -/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ -static void coroutine_fn bdrv_co_do_rw(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; - - if (!acb->is_write) { - acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); - } else { - acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, - acb->req.nb_sectors, acb->req.qiov, acb->req.flags); - } - - bdrv_co_complete(acb); -} - -static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BdrvRequestFlags flags, - BlockCompletionFunc *cb, - void *opaque, - bool is_write) -{ - Coroutine *co; - BlockAIOCBCoroutine *acb; - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->need_bh = true; - acb->req.error = -EINPROGRESS; - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; - acb->req.qiov = 
qiov; - acb->req.flags = flags; - acb->is_write = is_write; - - co = qemu_coroutine_create(bdrv_co_do_rw); - qemu_coroutine_enter(co, acb); - - bdrv_co_maybe_schedule_bh(acb); - return &acb->common; -} - -static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; - - acb->req.error = bdrv_co_flush(bs); - bdrv_co_complete(acb); -} - -BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) -{ - trace_bdrv_aio_flush(bs, opaque); - - Coroutine *co; - BlockAIOCBCoroutine *acb; - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->need_bh = true; - acb->req.error = -EINPROGRESS; - - co = qemu_coroutine_create(bdrv_aio_flush_co_entry); - qemu_coroutine_enter(co, acb); - - bdrv_co_maybe_schedule_bh(acb); - return &acb->common; -} - -static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - BlockDriverState *bs = acb->common.bs; - - acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); - bdrv_co_complete(acb); -} - -BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - Coroutine *co; - BlockAIOCBCoroutine *acb; - - trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); - - acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); - acb->need_bh = true; - acb->req.error = -EINPROGRESS; - acb->req.sector = sector_num; - acb->req.nb_sectors = nb_sectors; - co = qemu_coroutine_create(bdrv_aio_discard_co_entry); - qemu_coroutine_enter(co, acb); - - bdrv_co_maybe_schedule_bh(acb); - return &acb->common; -} - -void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) -{ - BlockAIOCB *acb; - - acb = g_malloc(aiocb_info->aiocb_size); - acb->aiocb_info = aiocb_info; - acb->bs = bs; - acb->cb = cb; - acb->opaque = opaque; - acb->refcnt = 1; - 
return acb; -} - -void qemu_aio_ref(void *p) -{ - BlockAIOCB *acb = p; - acb->refcnt++; -} - -void qemu_aio_unref(void *p) -{ - BlockAIOCB *acb = p; - assert(acb->refcnt > 0); - if (--acb->refcnt == 0) { - g_free(acb); - } -} - -/**************************************************************/ -/* Coroutine block device emulation */ - -typedef struct CoroutineIOCompletion { - Coroutine *coroutine; - int ret; -} CoroutineIOCompletion; - -static void bdrv_co_io_em_complete(void *opaque, int ret) -{ - CoroutineIOCompletion *co = opaque; - - co->ret = ret; - qemu_coroutine_enter(co->coroutine, NULL); -} - -static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *iov, - bool is_write) -{ - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - BlockAIOCB *acb; - - if (is_write) { - acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); - } else { - acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, - bdrv_co_io_em_complete, &co); - } - - trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); - if (!acb) { - return -EIO; - } - qemu_coroutine_yield(); - - return co.ret; -} - -static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); -} - -static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); -} - -static void coroutine_fn bdrv_flush_co_entry(void *opaque) -{ - RwCo *rwco = opaque; - - rwco->ret = bdrv_co_flush(rwco->bs); -} - -int coroutine_fn bdrv_co_flush(BlockDriverState *bs) -{ - int ret; - BdrvTrackedRequest req; - - if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || - bdrv_is_sg(bs)) { - return 0; - } - - tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH); 
- - /* Write back all layers by calling one driver function */ - if (bs->drv->bdrv_co_flush) { - ret = bs->drv->bdrv_co_flush(bs); - goto out; - } - - /* Write back cached data to the OS even with cache=unsafe */ - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); - if (bs->drv->bdrv_co_flush_to_os) { - ret = bs->drv->bdrv_co_flush_to_os(bs); - if (ret < 0) { - goto out; - } - } - - /* But don't actually force it to the disk with cache=unsafe */ - if (bs->open_flags & BDRV_O_NO_FLUSH) { - goto flush_parent; - } - - BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); - if (bs->drv->bdrv_co_flush_to_disk) { - ret = bs->drv->bdrv_co_flush_to_disk(bs); - } else if (bs->drv->bdrv_aio_flush) { - BlockAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - - acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); - if (acb == NULL) { - ret = -EIO; - } else { - qemu_coroutine_yield(); - ret = co.ret; - } - } else { - /* - * Some block drivers always operate in either writethrough or unsafe - * mode and don't support bdrv_flush therefore. Usually qemu doesn't - * know how the server works (because the behaviour is hardcoded or - * depends on server-side configuration), so we can't ensure that - * everything is safe on disk. Returning an error doesn't work because - * that would break guests even if the server operates in writethrough - * mode. - * - * Let's hope the user knows what he's doing. - */ - ret = 0; - } - if (ret < 0) { - goto out; - } - - /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH - * in the case of cache=unsafe, so there are no useless flushes. - */ -flush_parent: - ret = bs->file ? 
bdrv_co_flush(bs->file->bs) : 0; -out: - tracked_request_end(&req); - return ret; -} - -int bdrv_flush(BlockDriverState *bs) -{ - Coroutine *co; - RwCo rwco = { - .bs = bs, - .ret = NOT_DONE, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_flush_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_flush_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - - return rwco.ret; -} - -typedef struct DiscardCo { - BlockDriverState *bs; - int64_t sector_num; - int nb_sectors; - int ret; -} DiscardCo; -static void coroutine_fn bdrv_discard_co_entry(void *opaque) -{ - DiscardCo *rwco = opaque; - - rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); -} - -int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - BdrvTrackedRequest req; - int max_discard, ret; - - if (!bs->drv) { - return -ENOMEDIUM; - } - - ret = bdrv_check_request(bs, sector_num, nb_sectors); - if (ret < 0) { - return ret; - } else if (bs->read_only) { - return -EPERM; - } - assert(!(bs->open_flags & BDRV_O_INACTIVE)); - - /* Do nothing if disabled. 
*/ - if (!(bs->open_flags & BDRV_O_UNMAP)) { - return 0; - } - - if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { - return 0; - } - - tracked_request_begin(&req, bs, sector_num, nb_sectors, - BDRV_TRACKED_DISCARD); - bdrv_set_dirty(bs, sector_num, nb_sectors); - - max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); - while (nb_sectors > 0) { - int ret; - int num = nb_sectors; - - /* align request */ - if (bs->bl.discard_alignment && - num >= bs->bl.discard_alignment && - sector_num % bs->bl.discard_alignment) { - if (num > bs->bl.discard_alignment) { - num = bs->bl.discard_alignment; - } - num -= sector_num % bs->bl.discard_alignment; - } - - /* limit request size */ - if (num > max_discard) { - num = max_discard; - } - - if (bs->drv->bdrv_co_discard) { - ret = bs->drv->bdrv_co_discard(bs, sector_num, num); - } else { - BlockAIOCB *acb; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - - acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, - bdrv_co_io_em_complete, &co); - if (acb == NULL) { - ret = -EIO; - goto out; - } else { - qemu_coroutine_yield(); - ret = co.ret; - } - } - if (ret && ret != -ENOTSUP) { - goto out; - } - - sector_num += num; - nb_sectors -= num; - } - ret = 0; -out: - tracked_request_end(&req); - return ret; -} - -int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) -{ - Coroutine *co; - DiscardCo rwco = { - .bs = bs, - .sector_num = sector_num, - .nb_sectors = nb_sectors, - .ret = NOT_DONE, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_discard_co_entry(&rwco); - } else { - AioContext *aio_context = bdrv_get_aio_context(bs); - - co = qemu_coroutine_create(bdrv_discard_co_entry); - qemu_coroutine_enter(co, &rwco); - while (rwco.ret == NOT_DONE) { - aio_poll(aio_context, true); - } - } - - return rwco.ret; -} - -typedef struct { - CoroutineIOCompletion *co; - QEMUBH *bh; -} BdrvIoctlCompletionData; - -static 
void bdrv_ioctl_bh_cb(void *opaque) -{ - BdrvIoctlCompletionData *data = opaque; - - bdrv_co_io_em_complete(data->co, -ENOTSUP); - qemu_bh_delete(data->bh); -} - -static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf) -{ - BlockDriver *drv = bs->drv; - BdrvTrackedRequest tracked_req; - CoroutineIOCompletion co = { - .coroutine = qemu_coroutine_self(), - }; - BlockAIOCB *acb; - - tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL); - if (!drv || !drv->bdrv_aio_ioctl) { - co.ret = -ENOTSUP; - goto out; - } - - acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); - if (!acb) { - BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1); - data->bh = aio_bh_new(bdrv_get_aio_context(bs), - bdrv_ioctl_bh_cb, data); - data->co = &co; - qemu_bh_schedule(data->bh); - } - qemu_coroutine_yield(); -out: - tracked_request_end(&tracked_req); - return co.ret; -} - -typedef struct { - BlockDriverState *bs; - int req; - void *buf; - int ret; -} BdrvIoctlCoData; - -static void coroutine_fn bdrv_co_ioctl_entry(void *opaque) -{ - BdrvIoctlCoData *data = opaque; - data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf); -} - -/* needed for generic scsi interface */ -int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) -{ - BdrvIoctlCoData data = { - .bs = bs, - .req = req, - .buf = buf, - .ret = -EINPROGRESS, - }; - - if (qemu_in_coroutine()) { - /* Fast-path if already in coroutine context */ - bdrv_co_ioctl_entry(&data); - } else { - Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry); - - qemu_coroutine_enter(co, &data); - while (data.ret == -EINPROGRESS) { - aio_poll(bdrv_get_aio_context(bs), true); - } - } - return data.ret; -} - -static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque) -{ - BlockAIOCBCoroutine *acb = opaque; - acb->req.error = bdrv_co_do_ioctl(acb->common.bs, - acb->req.req, acb->req.buf); - bdrv_co_complete(acb); -} - -BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, - 
unsigned long int req, void *buf, - BlockCompletionFunc *cb, void *opaque) -{ - BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info, - bs, cb, opaque); - Coroutine *co; - - acb->need_bh = true; - acb->req.error = -EINPROGRESS; - acb->req.req = req; - acb->req.buf = buf; - co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry); - qemu_coroutine_enter(co, acb); - - bdrv_co_maybe_schedule_bh(acb); - return &acb->common; -} - -void *qemu_blockalign(BlockDriverState *bs, size_t size) -{ - return qemu_memalign(bdrv_opt_mem_align(bs), size); -} - -void *qemu_blockalign0(BlockDriverState *bs, size_t size) -{ - return memset(qemu_blockalign(bs, size), 0, size); -} - -void *qemu_try_blockalign(BlockDriverState *bs, size_t size) -{ - size_t align = bdrv_opt_mem_align(bs); - - /* Ensure that NULL is never returned on success */ - assert(align > 0); - if (size == 0) { - size = align; - } - - return qemu_try_memalign(align, size); -} - -void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) -{ - void *mem = qemu_try_blockalign(bs, size); - - if (mem) { - memset(mem, 0, size); - } - - return mem; -} - -/* - * Check if all memory in this vector is sector aligned. 
- */ -bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) -{ - int i; - size_t alignment = bdrv_min_mem_align(bs); - - for (i = 0; i < qiov->niov; i++) { - if ((uintptr_t) qiov->iov[i].iov_base % alignment) { - return false; - } - if (qiov->iov[i].iov_len % alignment) { - return false; - } - } - - return true; -} - -void bdrv_add_before_write_notifier(BlockDriverState *bs, - NotifierWithReturn *notifier) -{ - notifier_with_return_list_add(&bs->before_write_notifiers, notifier); -} - -void bdrv_io_plug(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_plug) { - drv->bdrv_io_plug(bs); - } else if (bs->file) { - bdrv_io_plug(bs->file->bs); - } -} - -void bdrv_io_unplug(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_io_unplug) { - drv->bdrv_io_unplug(bs); - } else if (bs->file) { - bdrv_io_unplug(bs->file->bs); - } -} - -void bdrv_flush_io_queue(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (drv && drv->bdrv_flush_io_queue) { - drv->bdrv_flush_io_queue(bs); - } else if (bs->file) { - bdrv_flush_io_queue(bs->file->bs); - } - bdrv_start_throttled_reqs(bs); -} - -void bdrv_drained_begin(BlockDriverState *bs) -{ - if (!bs->quiesce_counter++) { - aio_disable_external(bdrv_get_aio_context(bs)); - } - bdrv_drain(bs); -} - -void bdrv_drained_end(BlockDriverState *bs) -{ - assert(bs->quiesce_counter > 0); - if (--bs->quiesce_counter > 0) { - return; - } - aio_enable_external(bdrv_get_aio_context(bs)); -} diff --git a/qemu/block/iscsi.c b/qemu/block/iscsi.c deleted file mode 100644 index 302baf84c..000000000 --- a/qemu/block/iscsi.c +++ /dev/null @@ -1,1904 +0,0 @@ -/* - * QEMU Block driver for iSCSI images - * - * Copyright (c) 2010-2011 Ronnie Sahlberg - * Copyright (c) 2012-2015 Peter Lieven - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without 
restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include "qemu/osdep.h" - -#include -#include -#include -#include "qemu-common.h" -#include "qemu/config-file.h" -#include "qemu/error-report.h" -#include "qemu/bitops.h" -#include "qemu/bitmap.h" -#include "block/block_int.h" -#include "block/scsi.h" -#include "qemu/iov.h" -#include "sysemu/sysemu.h" -#include "qmp-commands.h" -#include "qapi/qmp/qstring.h" -#include "crypto/secret.h" - -#include -#include - -#ifdef __linux__ -#include -#include -#endif - -typedef struct IscsiLun { - struct iscsi_context *iscsi; - AioContext *aio_context; - int lun; - enum scsi_inquiry_peripheral_device_type type; - int block_size; - uint64_t num_blocks; - int events; - QEMUTimer *nop_timer; - QEMUTimer *event_timer; - struct scsi_inquiry_logical_block_provisioning lbp; - struct scsi_inquiry_block_limits bl; - unsigned char *zeroblock; - unsigned long *allocationmap; - int cluster_sectors; - bool use_16_for_rw; - bool write_protected; - bool lbpme; - bool lbprz; - bool dpofua; - bool has_write_same; - bool request_timed_out; -} IscsiLun; - -typedef struct IscsiTask { - int status; - int 
complete; - int retries; - int do_retry; - struct scsi_task *task; - Coroutine *co; - QEMUBH *bh; - IscsiLun *iscsilun; - QEMUTimer retry_timer; - int err_code; -} IscsiTask; - -typedef struct IscsiAIOCB { - BlockAIOCB common; - QEMUIOVector *qiov; - QEMUBH *bh; - IscsiLun *iscsilun; - struct scsi_task *task; - uint8_t *buf; - int status; - int64_t sector_num; - int nb_sectors; - int ret; -#ifdef __linux__ - sg_io_hdr_t *ioh; -#endif -} IscsiAIOCB; - -/* libiscsi uses time_t so its enough to process events every second */ -#define EVENT_INTERVAL 1000 -#define NOP_INTERVAL 5000 -#define MAX_NOP_FAILURES 3 -#define ISCSI_CMD_RETRIES ARRAY_SIZE(iscsi_retry_times) -static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048, 8192, 32768}; - -/* this threshold is a trade-off knob to choose between - * the potential additional overhead of an extra GET_LBA_STATUS request - * vs. unnecessarily reading a lot of zero sectors over the wire. - * If a read request is greater or equal than ISCSI_CHECKALLOC_THRES - * sectors we check the allocation status of the area covered by the - * request first if the allocationmap indicates that the area might be - * unallocated. 
*/ -#define ISCSI_CHECKALLOC_THRES 64 - -static void -iscsi_bh_cb(void *p) -{ - IscsiAIOCB *acb = p; - - qemu_bh_delete(acb->bh); - - g_free(acb->buf); - acb->buf = NULL; - - acb->common.cb(acb->common.opaque, acb->status); - - if (acb->task != NULL) { - scsi_free_scsi_task(acb->task); - acb->task = NULL; - } - - qemu_aio_unref(acb); -} - -static void -iscsi_schedule_bh(IscsiAIOCB *acb) -{ - if (acb->bh) { - return; - } - acb->bh = aio_bh_new(acb->iscsilun->aio_context, iscsi_bh_cb, acb); - qemu_bh_schedule(acb->bh); -} - -static void iscsi_co_generic_bh_cb(void *opaque) -{ - struct IscsiTask *iTask = opaque; - iTask->complete = 1; - qemu_bh_delete(iTask->bh); - qemu_coroutine_enter(iTask->co, NULL); -} - -static void iscsi_retry_timer_expired(void *opaque) -{ - struct IscsiTask *iTask = opaque; - iTask->complete = 1; - if (iTask->co) { - qemu_coroutine_enter(iTask->co, NULL); - } -} - -static inline unsigned exp_random(double mean) -{ - return -mean * log((double)rand() / RAND_MAX); -} - -/* SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST was introduced in - * libiscsi 1.10.0, together with other constants we need. Use it as - * a hint that we have to define them ourselves if needed, to keep the - * minimum required libiscsi version at 1.9.0. We use an ASCQ macro for - * the test because SCSI_STATUS_* is an enum. - * - * To guard against future changes where SCSI_SENSE_ASCQ_* also becomes - * an enum, check against the LIBISCSI_API_VERSION macro, which was - * introduced in 1.11.0. If it is present, there is no need to define - * anything. 
- */ -#if !defined(SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST) && \ - !defined(LIBISCSI_API_VERSION) -#define SCSI_STATUS_TASK_SET_FULL 0x28 -#define SCSI_STATUS_TIMEOUT 0x0f000002 -#define SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST 0x2600 -#define SCSI_SENSE_ASCQ_PARAMETER_LIST_LENGTH_ERROR 0x1a00 -#endif - -static int iscsi_translate_sense(struct scsi_sense *sense) -{ - int ret; - - switch (sense->key) { - case SCSI_SENSE_NOT_READY: - return -EBUSY; - case SCSI_SENSE_DATA_PROTECTION: - return -EACCES; - case SCSI_SENSE_COMMAND_ABORTED: - return -ECANCELED; - case SCSI_SENSE_ILLEGAL_REQUEST: - /* Parse ASCQ */ - break; - default: - return -EIO; - } - switch (sense->ascq) { - case SCSI_SENSE_ASCQ_PARAMETER_LIST_LENGTH_ERROR: - case SCSI_SENSE_ASCQ_INVALID_OPERATION_CODE: - case SCSI_SENSE_ASCQ_INVALID_FIELD_IN_CDB: - case SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST: - ret = -EINVAL; - break; - case SCSI_SENSE_ASCQ_LBA_OUT_OF_RANGE: - ret = -ENOSPC; - break; - case SCSI_SENSE_ASCQ_LOGICAL_UNIT_NOT_SUPPORTED: - ret = -ENOTSUP; - break; - case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT: - case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT_TRAY_CLOSED: - case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT_TRAY_OPEN: - ret = -ENOMEDIUM; - break; - case SCSI_SENSE_ASCQ_WRITE_PROTECTED: - ret = -EACCES; - break; - default: - ret = -EIO; - break; - } - return ret; -} - -static void -iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, - void *command_data, void *opaque) -{ - struct IscsiTask *iTask = opaque; - struct scsi_task *task = command_data; - - iTask->status = status; - iTask->do_retry = 0; - iTask->task = task; - - if (status != SCSI_STATUS_GOOD) { - if (iTask->retries++ < ISCSI_CMD_RETRIES) { - if (status == SCSI_STATUS_CHECK_CONDITION - && task->sense.key == SCSI_SENSE_UNIT_ATTENTION) { - error_report("iSCSI CheckCondition: %s", - iscsi_get_error(iscsi)); - iTask->do_retry = 1; - goto out; - } - if (status == SCSI_STATUS_BUSY || - status == SCSI_STATUS_TIMEOUT || - 
status == SCSI_STATUS_TASK_SET_FULL) { - unsigned retry_time = - exp_random(iscsi_retry_times[iTask->retries - 1]); - if (status == SCSI_STATUS_TIMEOUT) { - /* make sure the request is rescheduled AFTER the - * reconnect is initiated */ - retry_time = EVENT_INTERVAL * 2; - iTask->iscsilun->request_timed_out = true; - } - error_report("iSCSI Busy/TaskSetFull/TimeOut" - " (retry #%u in %u ms): %s", - iTask->retries, retry_time, - iscsi_get_error(iscsi)); - aio_timer_init(iTask->iscsilun->aio_context, - &iTask->retry_timer, QEMU_CLOCK_REALTIME, - SCALE_MS, iscsi_retry_timer_expired, iTask); - timer_mod(&iTask->retry_timer, - qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + retry_time); - iTask->do_retry = 1; - return; - } - } - iTask->err_code = iscsi_translate_sense(&task->sense); - error_report("iSCSI Failure: %s", iscsi_get_error(iscsi)); - } - -out: - if (iTask->co) { - iTask->bh = aio_bh_new(iTask->iscsilun->aio_context, - iscsi_co_generic_bh_cb, iTask); - qemu_bh_schedule(iTask->bh); - } else { - iTask->complete = 1; - } -} - -static void iscsi_co_init_iscsitask(IscsiLun *iscsilun, struct IscsiTask *iTask) -{ - *iTask = (struct IscsiTask) { - .co = qemu_coroutine_self(), - .iscsilun = iscsilun, - }; -} - -static void -iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data, - void *private_data) -{ - IscsiAIOCB *acb = private_data; - - acb->status = -ECANCELED; - iscsi_schedule_bh(acb); -} - -static void -iscsi_aio_cancel(BlockAIOCB *blockacb) -{ - IscsiAIOCB *acb = (IscsiAIOCB *)blockacb; - IscsiLun *iscsilun = acb->iscsilun; - - if (acb->status != -EINPROGRESS) { - return; - } - - /* send a task mgmt call to the target to cancel the task on the target */ - iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task, - iscsi_abort_task_cb, acb); - -} - -static const AIOCBInfo iscsi_aiocb_info = { - .aiocb_size = sizeof(IscsiAIOCB), - .cancel_async = iscsi_aio_cancel, -}; - - -static void iscsi_process_read(void *arg); -static void 
iscsi_process_write(void *arg); - -static void -iscsi_set_events(IscsiLun *iscsilun) -{ - struct iscsi_context *iscsi = iscsilun->iscsi; - int ev = iscsi_which_events(iscsi); - - if (ev != iscsilun->events) { - aio_set_fd_handler(iscsilun->aio_context, iscsi_get_fd(iscsi), - false, - (ev & POLLIN) ? iscsi_process_read : NULL, - (ev & POLLOUT) ? iscsi_process_write : NULL, - iscsilun); - iscsilun->events = ev; - } -} - -static void iscsi_timed_check_events(void *opaque) -{ - IscsiLun *iscsilun = opaque; - - /* check for timed out requests */ - iscsi_service(iscsilun->iscsi, 0); - - if (iscsilun->request_timed_out) { - iscsilun->request_timed_out = false; - iscsi_reconnect(iscsilun->iscsi); - } - - /* newer versions of libiscsi may return zero events. Ensure we are able - * to return to service once this situation changes. */ - iscsi_set_events(iscsilun); - - timer_mod(iscsilun->event_timer, - qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL); -} - -static void -iscsi_process_read(void *arg) -{ - IscsiLun *iscsilun = arg; - struct iscsi_context *iscsi = iscsilun->iscsi; - - iscsi_service(iscsi, POLLIN); - iscsi_set_events(iscsilun); -} - -static void -iscsi_process_write(void *arg) -{ - IscsiLun *iscsilun = arg; - struct iscsi_context *iscsi = iscsilun->iscsi; - - iscsi_service(iscsi, POLLOUT); - iscsi_set_events(iscsilun); -} - -static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun) -{ - return sector * iscsilun->block_size / BDRV_SECTOR_SIZE; -} - -static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun) -{ - return sector * BDRV_SECTOR_SIZE / iscsilun->block_size; -} - -static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors, - IscsiLun *iscsilun) -{ - if ((sector_num * BDRV_SECTOR_SIZE) % iscsilun->block_size || - (nb_sectors * BDRV_SECTOR_SIZE) % iscsilun->block_size) { - error_report("iSCSI misaligned request: " - "iscsilun->block_size %u, sector_num %" PRIi64 - ", nb_sectors %d", - iscsilun->block_size, sector_num, 
nb_sectors); - return 0; - } - return 1; -} - -static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun) -{ - return bitmap_try_new(DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, - iscsilun), - iscsilun->cluster_sectors)); -} - -static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num, - int nb_sectors) -{ - if (iscsilun->allocationmap == NULL) { - return; - } - bitmap_set(iscsilun->allocationmap, - sector_num / iscsilun->cluster_sectors, - DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors)); -} - -static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num, - int nb_sectors) -{ - int64_t cluster_num, nb_clusters; - if (iscsilun->allocationmap == NULL) { - return; - } - cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors); - nb_clusters = (sector_num + nb_sectors) / iscsilun->cluster_sectors - - cluster_num; - if (nb_clusters > 0) { - bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters); - } -} - -static int coroutine_fn -iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *iov, int flags) -{ - IscsiLun *iscsilun = bs->opaque; - struct IscsiTask iTask; - uint64_t lba; - uint32_t num_sectors; - bool fua; - - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return -EINVAL; - } - - if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) { - error_report("iSCSI Error: Write of %d sectors exceeds max_xfer_len " - "of %d sectors", nb_sectors, bs->bl.max_transfer_length); - return -EINVAL; - } - - lba = sector_qemu2lun(sector_num, iscsilun); - num_sectors = sector_qemu2lun(nb_sectors, iscsilun); - iscsi_co_init_iscsitask(iscsilun, &iTask); -retry: - fua = iscsilun->dpofua && (flags & BDRV_REQ_FUA); - if (iscsilun->use_16_for_rw) { - iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, - NULL, num_sectors * iscsilun->block_size, - iscsilun->block_size, 0, 0, fua, 0, 0, - iscsi_co_generic_cb, &iTask); - } 
else { - iTask.task = iscsi_write10_task(iscsilun->iscsi, iscsilun->lun, lba, - NULL, num_sectors * iscsilun->block_size, - iscsilun->block_size, 0, 0, fua, 0, 0, - iscsi_co_generic_cb, &iTask); - } - if (iTask.task == NULL) { - return -ENOMEM; - } - scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov, - iov->niov); - while (!iTask.complete) { - iscsi_set_events(iscsilun); - qemu_coroutine_yield(); - } - - if (iTask.task != NULL) { - scsi_free_scsi_task(iTask.task); - iTask.task = NULL; - } - - if (iTask.do_retry) { - iTask.complete = 0; - goto retry; - } - - if (iTask.status != SCSI_STATUS_GOOD) { - return iTask.err_code; - } - - iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); - - return 0; -} - -static int coroutine_fn -iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - return iscsi_co_writev_flags(bs, sector_num, nb_sectors, iov, 0); -} - - -static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun, - int64_t sector_num, int nb_sectors) -{ - unsigned long size; - if (iscsilun->allocationmap == NULL) { - return true; - } - size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors); - return !(find_next_bit(iscsilun->allocationmap, size, - sector_num / iscsilun->cluster_sectors) == size); -} - -static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum, - BlockDriverState **file) -{ - IscsiLun *iscsilun = bs->opaque; - struct scsi_get_lba_status *lbas = NULL; - struct scsi_lba_status_descriptor *lbasd = NULL; - struct IscsiTask iTask; - int64_t ret; - - iscsi_co_init_iscsitask(iscsilun, &iTask); - - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - ret = -EINVAL; - goto out; - } - - /* default to all sectors allocated */ - ret = BDRV_BLOCK_DATA; - ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID; - *pnum = nb_sectors; - - /* LUN does not support logical block provisioning */ 
- if (!iscsilun->lbpme) { - goto out; - } - -retry: - if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun, - sector_qemu2lun(sector_num, iscsilun), - 8 + 16, iscsi_co_generic_cb, - &iTask) == NULL) { - ret = -ENOMEM; - goto out; - } - - while (!iTask.complete) { - iscsi_set_events(iscsilun); - qemu_coroutine_yield(); - } - - if (iTask.do_retry) { - if (iTask.task != NULL) { - scsi_free_scsi_task(iTask.task); - iTask.task = NULL; - } - iTask.complete = 0; - goto retry; - } - - if (iTask.status != SCSI_STATUS_GOOD) { - /* in case the get_lba_status_callout fails (i.e. - * because the device is busy or the cmd is not - * supported) we pretend all blocks are allocated - * for backwards compatibility */ - goto out; - } - - lbas = scsi_datain_unmarshall(iTask.task); - if (lbas == NULL) { - ret = -EIO; - goto out; - } - - lbasd = &lbas->descriptors[0]; - - if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) { - ret = -EIO; - goto out; - } - - *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun); - - if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || - lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { - ret &= ~BDRV_BLOCK_DATA; - if (iscsilun->lbprz) { - ret |= BDRV_BLOCK_ZERO; - } - } - - if (ret & BDRV_BLOCK_ZERO) { - iscsi_allocationmap_clear(iscsilun, sector_num, *pnum); - } else { - iscsi_allocationmap_set(iscsilun, sector_num, *pnum); - } - - if (*pnum > nb_sectors) { - *pnum = nb_sectors; - } -out: - if (iTask.task != NULL) { - scsi_free_scsi_task(iTask.task); - } - if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) { - *file = bs; - } - return ret; -} - -static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - IscsiLun *iscsilun = bs->opaque; - struct IscsiTask iTask; - uint64_t lba; - uint32_t num_sectors; - - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return -EINVAL; - } - - if (bs->bl.max_transfer_length && nb_sectors > 
bs->bl.max_transfer_length) { - error_report("iSCSI Error: Read of %d sectors exceeds max_xfer_len " - "of %d sectors", nb_sectors, bs->bl.max_transfer_length); - return -EINVAL; - } - - if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES && - !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) { - int64_t ret; - int pnum; - BlockDriverState *file; - ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum, &file); - if (ret < 0) { - return ret; - } - if (ret & BDRV_BLOCK_ZERO && pnum >= nb_sectors) { - qemu_iovec_memset(iov, 0, 0x00, iov->size); - return 0; - } - } - - lba = sector_qemu2lun(sector_num, iscsilun); - num_sectors = sector_qemu2lun(nb_sectors, iscsilun); - - iscsi_co_init_iscsitask(iscsilun, &iTask); -retry: - if (iscsilun->use_16_for_rw) { - iTask.task = iscsi_read16_task(iscsilun->iscsi, iscsilun->lun, lba, - num_sectors * iscsilun->block_size, - iscsilun->block_size, 0, 0, 0, 0, 0, - iscsi_co_generic_cb, &iTask); - } else { - iTask.task = iscsi_read10_task(iscsilun->iscsi, iscsilun->lun, lba, - num_sectors * iscsilun->block_size, - iscsilun->block_size, - 0, 0, 0, 0, 0, - iscsi_co_generic_cb, &iTask); - } - if (iTask.task == NULL) { - return -ENOMEM; - } - scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov); - - while (!iTask.complete) { - iscsi_set_events(iscsilun); - qemu_coroutine_yield(); - } - - if (iTask.task != NULL) { - scsi_free_scsi_task(iTask.task); - iTask.task = NULL; - } - - if (iTask.do_retry) { - iTask.complete = 0; - goto retry; - } - - if (iTask.status != SCSI_STATUS_GOOD) { - return iTask.err_code; - } - - return 0; -} - -static int coroutine_fn iscsi_co_flush(BlockDriverState *bs) -{ - IscsiLun *iscsilun = bs->opaque; - struct IscsiTask iTask; - - iscsi_co_init_iscsitask(iscsilun, &iTask); -retry: - if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0, - 0, iscsi_co_generic_cb, &iTask) == NULL) { - return -ENOMEM; - } - - while (!iTask.complete) { - 
iscsi_set_events(iscsilun); - qemu_coroutine_yield(); - } - - if (iTask.task != NULL) { - scsi_free_scsi_task(iTask.task); - iTask.task = NULL; - } - - if (iTask.do_retry) { - iTask.complete = 0; - goto retry; - } - - if (iTask.status != SCSI_STATUS_GOOD) { - return iTask.err_code; - } - - return 0; -} - -#ifdef __linux__ -static void -iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status, - void *command_data, void *opaque) -{ - IscsiAIOCB *acb = opaque; - - g_free(acb->buf); - acb->buf = NULL; - - acb->status = 0; - if (status < 0) { - error_report("Failed to ioctl(SG_IO) to iSCSI lun. %s", - iscsi_get_error(iscsi)); - acb->status = iscsi_translate_sense(&acb->task->sense); - } - - acb->ioh->driver_status = 0; - acb->ioh->host_status = 0; - acb->ioh->resid = 0; - -#define SG_ERR_DRIVER_SENSE 0x08 - - if (status == SCSI_STATUS_CHECK_CONDITION && acb->task->datain.size >= 2) { - int ss; - - acb->ioh->driver_status |= SG_ERR_DRIVER_SENSE; - - acb->ioh->sb_len_wr = acb->task->datain.size - 2; - ss = (acb->ioh->mx_sb_len >= acb->ioh->sb_len_wr) ? 
- acb->ioh->mx_sb_len : acb->ioh->sb_len_wr; - memcpy(acb->ioh->sbp, &acb->task->datain.data[2], ss); - } - - iscsi_schedule_bh(acb); -} - -static void iscsi_ioctl_bh_completion(void *opaque) -{ - IscsiAIOCB *acb = opaque; - - qemu_bh_delete(acb->bh); - acb->common.cb(acb->common.opaque, acb->ret); - qemu_aio_unref(acb); -} - -static void iscsi_ioctl_handle_emulated(IscsiAIOCB *acb, int req, void *buf) -{ - BlockDriverState *bs = acb->common.bs; - IscsiLun *iscsilun = bs->opaque; - int ret = 0; - - switch (req) { - case SG_GET_VERSION_NUM: - *(int *)buf = 30000; - break; - case SG_GET_SCSI_ID: - ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type; - break; - default: - ret = -EINVAL; - } - assert(!acb->bh); - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), - iscsi_ioctl_bh_completion, acb); - acb->ret = ret; - qemu_bh_schedule(acb->bh); -} - -static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, - unsigned long int req, void *buf, - BlockCompletionFunc *cb, void *opaque) -{ - IscsiLun *iscsilun = bs->opaque; - struct iscsi_context *iscsi = iscsilun->iscsi; - struct iscsi_data data; - IscsiAIOCB *acb; - - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); - - acb->iscsilun = iscsilun; - acb->bh = NULL; - acb->status = -EINPROGRESS; - acb->buf = NULL; - acb->ioh = buf; - - if (req != SG_IO) { - iscsi_ioctl_handle_emulated(acb, req, buf); - return &acb->common; - } - - acb->task = malloc(sizeof(struct scsi_task)); - if (acb->task == NULL) { - error_report("iSCSI: Failed to allocate task for scsi command. 
%s", - iscsi_get_error(iscsi)); - qemu_aio_unref(acb); - return NULL; - } - memset(acb->task, 0, sizeof(struct scsi_task)); - - switch (acb->ioh->dxfer_direction) { - case SG_DXFER_TO_DEV: - acb->task->xfer_dir = SCSI_XFER_WRITE; - break; - case SG_DXFER_FROM_DEV: - acb->task->xfer_dir = SCSI_XFER_READ; - break; - default: - acb->task->xfer_dir = SCSI_XFER_NONE; - break; - } - - acb->task->cdb_size = acb->ioh->cmd_len; - memcpy(&acb->task->cdb[0], acb->ioh->cmdp, acb->ioh->cmd_len); - acb->task->expxferlen = acb->ioh->dxfer_len; - - data.size = 0; - if (acb->task->xfer_dir == SCSI_XFER_WRITE) { - if (acb->ioh->iovec_count == 0) { - data.data = acb->ioh->dxferp; - data.size = acb->ioh->dxfer_len; - } else { - scsi_task_set_iov_out(acb->task, - (struct scsi_iovec *) acb->ioh->dxferp, - acb->ioh->iovec_count); - } - } - - if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task, - iscsi_aio_ioctl_cb, - (data.size > 0) ? &data : NULL, - acb) != 0) { - scsi_free_scsi_task(acb->task); - qemu_aio_unref(acb); - return NULL; - } - - /* tell libiscsi to read straight into the buffer we got from ioctl */ - if (acb->task->xfer_dir == SCSI_XFER_READ) { - if (acb->ioh->iovec_count == 0) { - scsi_task_add_data_in_buffer(acb->task, - acb->ioh->dxfer_len, - acb->ioh->dxferp); - } else { - scsi_task_set_iov_in(acb->task, - (struct scsi_iovec *) acb->ioh->dxferp, - acb->ioh->iovec_count); - } - } - - iscsi_set_events(iscsilun); - - return &acb->common; -} - -#endif - -static int64_t -iscsi_getlength(BlockDriverState *bs) -{ - IscsiLun *iscsilun = bs->opaque; - int64_t len; - - len = iscsilun->num_blocks; - len *= iscsilun->block_size; - - return len; -} - -static int -coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - IscsiLun *iscsilun = bs->opaque; - struct IscsiTask iTask; - struct unmap_list list; - - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return -EINVAL; - } - - if (!iscsilun->lbp.lbpu) { - /* UNMAP is 
not supported by the target */ - return 0; - } - - list.lba = sector_qemu2lun(sector_num, iscsilun); - list.num = sector_qemu2lun(nb_sectors, iscsilun); - - iscsi_co_init_iscsitask(iscsilun, &iTask); -retry: - if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1, - iscsi_co_generic_cb, &iTask) == NULL) { - return -ENOMEM; - } - - while (!iTask.complete) { - iscsi_set_events(iscsilun); - qemu_coroutine_yield(); - } - - if (iTask.task != NULL) { - scsi_free_scsi_task(iTask.task); - iTask.task = NULL; - } - - if (iTask.do_retry) { - iTask.complete = 0; - goto retry; - } - - if (iTask.status == SCSI_STATUS_CHECK_CONDITION) { - /* the target might fail with a check condition if it - is not happy with the alignment of the UNMAP request - we silently fail in this case */ - return 0; - } - - if (iTask.status != SCSI_STATUS_GOOD) { - return iTask.err_code; - } - - iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); - - return 0; -} - -static int -coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) -{ - IscsiLun *iscsilun = bs->opaque; - struct IscsiTask iTask; - uint64_t lba; - uint32_t nb_blocks; - bool use_16_for_ws = iscsilun->use_16_for_rw; - - if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { - return -EINVAL; - } - - if (flags & BDRV_REQ_MAY_UNMAP) { - if (!use_16_for_ws && !iscsilun->lbp.lbpws10) { - /* WRITESAME10 with UNMAP is unsupported try WRITESAME16 */ - use_16_for_ws = true; - } - if (use_16_for_ws && !iscsilun->lbp.lbpws) { - /* WRITESAME16 with UNMAP is not supported by the target, - * fall back and try WRITESAME10/16 without UNMAP */ - flags &= ~BDRV_REQ_MAY_UNMAP; - use_16_for_ws = iscsilun->use_16_for_rw; - } - } - - if (!(flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->has_write_same) { - /* WRITESAME without UNMAP is not supported by the target */ - return -ENOTSUP; - } - - lba = sector_qemu2lun(sector_num, iscsilun); - nb_blocks = 
sector_qemu2lun(nb_sectors, iscsilun); - - if (iscsilun->zeroblock == NULL) { - iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size); - if (iscsilun->zeroblock == NULL) { - return -ENOMEM; - } - } - - iscsi_co_init_iscsitask(iscsilun, &iTask); -retry: - if (use_16_for_ws) { - iTask.task = iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba, - iscsilun->zeroblock, iscsilun->block_size, - nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP), - 0, 0, iscsi_co_generic_cb, &iTask); - } else { - iTask.task = iscsi_writesame10_task(iscsilun->iscsi, iscsilun->lun, lba, - iscsilun->zeroblock, iscsilun->block_size, - nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP), - 0, 0, iscsi_co_generic_cb, &iTask); - } - if (iTask.task == NULL) { - return -ENOMEM; - } - - while (!iTask.complete) { - iscsi_set_events(iscsilun); - qemu_coroutine_yield(); - } - - if (iTask.status == SCSI_STATUS_CHECK_CONDITION && - iTask.task->sense.key == SCSI_SENSE_ILLEGAL_REQUEST && - (iTask.task->sense.ascq == SCSI_SENSE_ASCQ_INVALID_OPERATION_CODE || - iTask.task->sense.ascq == SCSI_SENSE_ASCQ_INVALID_FIELD_IN_CDB)) { - /* WRITE SAME is not supported by the target */ - iscsilun->has_write_same = false; - scsi_free_scsi_task(iTask.task); - return -ENOTSUP; - } - - if (iTask.task != NULL) { - scsi_free_scsi_task(iTask.task); - iTask.task = NULL; - } - - if (iTask.do_retry) { - iTask.complete = 0; - goto retry; - } - - if (iTask.status != SCSI_STATUS_GOOD) { - return iTask.err_code; - } - - if (flags & BDRV_REQ_MAY_UNMAP) { - iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); - } else { - iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); - } - - return 0; -} - -static void parse_chap(struct iscsi_context *iscsi, const char *target, - Error **errp) -{ - QemuOptsList *list; - QemuOpts *opts; - const char *user = NULL; - const char *password = NULL; - const char *secretid; - char *secret = NULL; - - list = qemu_find_opts("iscsi"); - if (!list) { - return; - } - - opts = 
qemu_opts_find(list, target); - if (opts == NULL) { - opts = QTAILQ_FIRST(&list->head); - if (!opts) { - return; - } - } - - user = qemu_opt_get(opts, "user"); - if (!user) { - return; - } - - secretid = qemu_opt_get(opts, "password-secret"); - password = qemu_opt_get(opts, "password"); - if (secretid && password) { - error_setg(errp, "'password' and 'password-secret' properties are " - "mutually exclusive"); - return; - } - if (secretid) { - secret = qcrypto_secret_lookup_as_utf8(secretid, errp); - if (!secret) { - return; - } - password = secret; - } else if (!password) { - error_setg(errp, "CHAP username specified but no password was given"); - return; - } - - if (iscsi_set_initiator_username_pwd(iscsi, user, password)) { - error_setg(errp, "Failed to set initiator username and password"); - } - - g_free(secret); -} - -static void parse_header_digest(struct iscsi_context *iscsi, const char *target, - Error **errp) -{ - QemuOptsList *list; - QemuOpts *opts; - const char *digest = NULL; - - list = qemu_find_opts("iscsi"); - if (!list) { - return; - } - - opts = qemu_opts_find(list, target); - if (opts == NULL) { - opts = QTAILQ_FIRST(&list->head); - if (!opts) { - return; - } - } - - digest = qemu_opt_get(opts, "header-digest"); - if (!digest) { - return; - } - - if (!strcmp(digest, "CRC32C")) { - iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_CRC32C); - } else if (!strcmp(digest, "NONE")) { - iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE); - } else if (!strcmp(digest, "CRC32C-NONE")) { - iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_CRC32C_NONE); - } else if (!strcmp(digest, "NONE-CRC32C")) { - iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); - } else { - error_setg(errp, "Invalid header-digest setting : %s", digest); - } -} - -static char *parse_initiator_name(const char *target) -{ - QemuOptsList *list; - QemuOpts *opts; - const char *name; - char *iscsi_name; - UuidInfo *uuid_info; - - list = qemu_find_opts("iscsi"); - if 
(list) { - opts = qemu_opts_find(list, target); - if (!opts) { - opts = QTAILQ_FIRST(&list->head); - } - if (opts) { - name = qemu_opt_get(opts, "initiator-name"); - if (name) { - return g_strdup(name); - } - } - } - - uuid_info = qmp_query_uuid(NULL); - if (strcmp(uuid_info->UUID, UUID_NONE) == 0) { - name = qemu_get_vm_name(); - } else { - name = uuid_info->UUID; - } - iscsi_name = g_strdup_printf("iqn.2008-11.org.linux-kvm%s%s", - name ? ":" : "", name ? name : ""); - qapi_free_UuidInfo(uuid_info); - return iscsi_name; -} - -static int parse_timeout(const char *target) -{ - QemuOptsList *list; - QemuOpts *opts; - const char *timeout; - - list = qemu_find_opts("iscsi"); - if (list) { - opts = qemu_opts_find(list, target); - if (!opts) { - opts = QTAILQ_FIRST(&list->head); - } - if (opts) { - timeout = qemu_opt_get(opts, "timeout"); - if (timeout) { - return atoi(timeout); - } - } - } - - return 0; -} - -static void iscsi_nop_timed_event(void *opaque) -{ - IscsiLun *iscsilun = opaque; - - if (iscsi_get_nops_in_flight(iscsilun->iscsi) >= MAX_NOP_FAILURES) { - error_report("iSCSI: NOP timeout. Reconnecting..."); - iscsilun->request_timed_out = true; - } else if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) { - error_report("iSCSI: failed to sent NOP-Out. 
Disabling NOP messages."); - return; - } - - timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); - iscsi_set_events(iscsilun); -} - -static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) -{ - struct scsi_task *task = NULL; - struct scsi_readcapacity10 *rc10 = NULL; - struct scsi_readcapacity16 *rc16 = NULL; - int retries = ISCSI_CMD_RETRIES; - - do { - if (task != NULL) { - scsi_free_scsi_task(task); - task = NULL; - } - - switch (iscsilun->type) { - case TYPE_DISK: - task = iscsi_readcapacity16_sync(iscsilun->iscsi, iscsilun->lun); - if (task != NULL && task->status == SCSI_STATUS_GOOD) { - rc16 = scsi_datain_unmarshall(task); - if (rc16 == NULL) { - error_setg(errp, "iSCSI: Failed to unmarshall readcapacity16 data."); - } else { - iscsilun->block_size = rc16->block_length; - iscsilun->num_blocks = rc16->returned_lba + 1; - iscsilun->lbpme = !!rc16->lbpme; - iscsilun->lbprz = !!rc16->lbprz; - iscsilun->use_16_for_rw = (rc16->returned_lba > 0xffffffff); - } - break; - } - if (task != NULL && task->status == SCSI_STATUS_CHECK_CONDITION - && task->sense.key == SCSI_SENSE_UNIT_ATTENTION) { - break; - } - /* Fall through and try READ CAPACITY(10) instead. 
*/ - case TYPE_ROM: - task = iscsi_readcapacity10_sync(iscsilun->iscsi, iscsilun->lun, 0, 0); - if (task != NULL && task->status == SCSI_STATUS_GOOD) { - rc10 = scsi_datain_unmarshall(task); - if (rc10 == NULL) { - error_setg(errp, "iSCSI: Failed to unmarshall readcapacity10 data."); - } else { - iscsilun->block_size = rc10->block_size; - if (rc10->lba == 0) { - /* blank disk loaded */ - iscsilun->num_blocks = 0; - } else { - iscsilun->num_blocks = rc10->lba + 1; - } - } - } - break; - default: - return; - } - } while (task != NULL && task->status == SCSI_STATUS_CHECK_CONDITION - && task->sense.key == SCSI_SENSE_UNIT_ATTENTION - && retries-- > 0); - - if (task == NULL || task->status != SCSI_STATUS_GOOD) { - error_setg(errp, "iSCSI: failed to send readcapacity10/16 command"); - } else if (!iscsilun->block_size || - iscsilun->block_size % BDRV_SECTOR_SIZE) { - error_setg(errp, "iSCSI: the target returned an invalid " - "block size of %d.", iscsilun->block_size); - } - if (task) { - scsi_free_scsi_task(task); - } -} - -/* TODO Convert to fine grained options */ -static QemuOptsList runtime_opts = { - .name = "iscsi", - .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), - .desc = { - { - .name = "filename", - .type = QEMU_OPT_STRING, - .help = "URL to the iscsi image", - }, - { /* end of list */ } - }, -}; - -static struct scsi_task *iscsi_do_inquiry(struct iscsi_context *iscsi, int lun, - int evpd, int pc, void **inq, Error **errp) -{ - int full_size; - struct scsi_task *task = NULL; - task = iscsi_inquiry_sync(iscsi, lun, evpd, pc, 64); - if (task == NULL || task->status != SCSI_STATUS_GOOD) { - goto fail; - } - full_size = scsi_datain_getfullsize(task); - if (full_size > task->datain.size) { - scsi_free_scsi_task(task); - - /* we need more data for the full list */ - task = iscsi_inquiry_sync(iscsi, lun, evpd, pc, full_size); - if (task == NULL || task->status != SCSI_STATUS_GOOD) { - goto fail; - } - } - - *inq = scsi_datain_unmarshall(task); - if (*inq == NULL) 
{ - error_setg(errp, "iSCSI: failed to unmarshall inquiry datain blob"); - goto fail_with_err; - } - - return task; - -fail: - error_setg(errp, "iSCSI: Inquiry command failed : %s", - iscsi_get_error(iscsi)); -fail_with_err: - if (task != NULL) { - scsi_free_scsi_task(task); - } - return NULL; -} - -static void iscsi_detach_aio_context(BlockDriverState *bs) -{ - IscsiLun *iscsilun = bs->opaque; - - aio_set_fd_handler(iscsilun->aio_context, iscsi_get_fd(iscsilun->iscsi), - false, NULL, NULL, NULL); - iscsilun->events = 0; - - if (iscsilun->nop_timer) { - timer_del(iscsilun->nop_timer); - timer_free(iscsilun->nop_timer); - iscsilun->nop_timer = NULL; - } - if (iscsilun->event_timer) { - timer_del(iscsilun->event_timer); - timer_free(iscsilun->event_timer); - iscsilun->event_timer = NULL; - } -} - -static void iscsi_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - IscsiLun *iscsilun = bs->opaque; - - iscsilun->aio_context = new_context; - iscsi_set_events(iscsilun); - - /* Set up a timer for sending out iSCSI NOPs */ - iscsilun->nop_timer = aio_timer_new(iscsilun->aio_context, - QEMU_CLOCK_REALTIME, SCALE_MS, - iscsi_nop_timed_event, iscsilun); - timer_mod(iscsilun->nop_timer, - qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); - - /* Set up a timer for periodic calls to iscsi_set_events and to - * scan for command timeout */ - iscsilun->event_timer = aio_timer_new(iscsilun->aio_context, - QEMU_CLOCK_REALTIME, SCALE_MS, - iscsi_timed_check_events, iscsilun); - timer_mod(iscsilun->event_timer, - qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL); -} - -static void iscsi_modesense_sync(IscsiLun *iscsilun) -{ - struct scsi_task *task; - struct scsi_mode_sense *ms = NULL; - iscsilun->write_protected = false; - iscsilun->dpofua = false; - - task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun, - 1, SCSI_MODESENSE_PC_CURRENT, - 0x3F, 0, 255); - if (task == NULL) { - error_report("iSCSI: Failed to send MODE_SENSE(6) command: %s", - 
iscsi_get_error(iscsilun->iscsi)); - goto out; - } - - if (task->status != SCSI_STATUS_GOOD) { - error_report("iSCSI: Failed MODE_SENSE(6), LUN assumed writable"); - goto out; - } - ms = scsi_datain_unmarshall(task); - if (!ms) { - error_report("iSCSI: Failed to unmarshall MODE_SENSE(6) data: %s", - iscsi_get_error(iscsilun->iscsi)); - goto out; - } - iscsilun->write_protected = ms->device_specific_parameter & 0x80; - iscsilun->dpofua = ms->device_specific_parameter & 0x10; - -out: - if (task) { - scsi_free_scsi_task(task); - } -} - -/* - * We support iscsi url's on the form - * iscsi://[%@][:]// - */ -static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - IscsiLun *iscsilun = bs->opaque; - struct iscsi_context *iscsi = NULL; - struct iscsi_url *iscsi_url = NULL; - struct scsi_task *task = NULL; - struct scsi_inquiry_standard *inq = NULL; - struct scsi_inquiry_supported_pages *inq_vpd; - char *initiator_name = NULL; - QemuOpts *opts; - Error *local_err = NULL; - const char *filename; - int i, ret = 0, timeout = 0; - - opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto out; - } - - filename = qemu_opt_get(opts, "filename"); - - iscsi_url = iscsi_parse_full_url(iscsi, filename); - if (iscsi_url == NULL) { - error_setg(errp, "Failed to parse URL : %s", filename); - ret = -EINVAL; - goto out; - } - - memset(iscsilun, 0, sizeof(IscsiLun)); - - initiator_name = parse_initiator_name(iscsi_url->target); - - iscsi = iscsi_create_context(initiator_name); - if (iscsi == NULL) { - error_setg(errp, "iSCSI: Failed to create iSCSI context."); - ret = -ENOMEM; - goto out; - } - - if (iscsi_set_targetname(iscsi, iscsi_url->target)) { - error_setg(errp, "iSCSI: Failed to set target name."); - ret = -EINVAL; - goto out; - } - - if (iscsi_url->user[0] != '\0') { - ret = 
iscsi_set_initiator_username_pwd(iscsi, iscsi_url->user, - iscsi_url->passwd); - if (ret != 0) { - error_setg(errp, "Failed to set initiator username and password"); - ret = -EINVAL; - goto out; - } - } - - /* check if we got CHAP username/password via the options */ - parse_chap(iscsi, iscsi_url->target, &local_err); - if (local_err != NULL) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto out; - } - - if (iscsi_set_session_type(iscsi, ISCSI_SESSION_NORMAL) != 0) { - error_setg(errp, "iSCSI: Failed to set session type to normal."); - ret = -EINVAL; - goto out; - } - - iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); - - /* check if we got HEADER_DIGEST via the options */ - parse_header_digest(iscsi, iscsi_url->target, &local_err); - if (local_err != NULL) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto out; - } - - /* timeout handling is broken in libiscsi before 1.15.0 */ - timeout = parse_timeout(iscsi_url->target); -#if defined(LIBISCSI_API_VERSION) && LIBISCSI_API_VERSION >= 20150621 - iscsi_set_timeout(iscsi, timeout); -#else - if (timeout) { - error_report("iSCSI: ignoring timeout value for libiscsi <1.15.0"); - } -#endif - - if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun) != 0) { - error_setg(errp, "iSCSI: Failed to connect to LUN : %s", - iscsi_get_error(iscsi)); - ret = -EINVAL; - goto out; - } - - iscsilun->iscsi = iscsi; - iscsilun->aio_context = bdrv_get_aio_context(bs); - iscsilun->lun = iscsi_url->lun; - iscsilun->has_write_same = true; - - task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 0, 0, - (void **) &inq, errp); - if (task == NULL) { - ret = -EINVAL; - goto out; - } - iscsilun->type = inq->periperal_device_type; - scsi_free_scsi_task(task); - task = NULL; - - iscsi_modesense_sync(iscsilun); - - /* Check the write protect flag of the LUN if we want to write */ - if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) && - iscsilun->write_protected) { - error_setg(errp, 
"Cannot open a write protected LUN as read-write"); - ret = -EACCES; - goto out; - } - - iscsi_readcapacity_sync(iscsilun, &local_err); - if (local_err != NULL) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto out; - } - bs->total_sectors = sector_lun2qemu(iscsilun->num_blocks, iscsilun); - bs->request_alignment = iscsilun->block_size; - - /* We don't have any emulation for devices other than disks and CD-ROMs, so - * this must be sg ioctl compatible. We force it to be sg, otherwise qemu - * will try to read from the device to guess the image format. - */ - if (iscsilun->type != TYPE_DISK && iscsilun->type != TYPE_ROM) { - bs->sg = 1; - } - - task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, - SCSI_INQUIRY_PAGECODE_SUPPORTED_VPD_PAGES, - (void **) &inq_vpd, errp); - if (task == NULL) { - ret = -EINVAL; - goto out; - } - for (i = 0; i < inq_vpd->num_pages; i++) { - struct scsi_task *inq_task; - struct scsi_inquiry_logical_block_provisioning *inq_lbp; - struct scsi_inquiry_block_limits *inq_bl; - switch (inq_vpd->pages[i]) { - case SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING: - inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, - SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING, - (void **) &inq_lbp, errp); - if (inq_task == NULL) { - ret = -EINVAL; - goto out; - } - memcpy(&iscsilun->lbp, inq_lbp, - sizeof(struct scsi_inquiry_logical_block_provisioning)); - scsi_free_scsi_task(inq_task); - break; - case SCSI_INQUIRY_PAGECODE_BLOCK_LIMITS: - inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, - SCSI_INQUIRY_PAGECODE_BLOCK_LIMITS, - (void **) &inq_bl, errp); - if (inq_task == NULL) { - ret = -EINVAL; - goto out; - } - memcpy(&iscsilun->bl, inq_bl, - sizeof(struct scsi_inquiry_block_limits)); - scsi_free_scsi_task(inq_task); - break; - default: - break; - } - } - scsi_free_scsi_task(task); - task = NULL; - - iscsi_attach_aio_context(bs, iscsilun->aio_context); - - /* Guess the internal cluster (page) size of the iscsi 
target by the means - * of opt_unmap_gran. Transfer the unmap granularity only if it has a - * reasonable size */ - if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 4 * 1024 && - iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { - iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * - iscsilun->block_size) >> BDRV_SECTOR_BITS; - if (iscsilun->lbprz) { - iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun); - if (iscsilun->allocationmap == NULL) { - ret = -ENOMEM; - } - } - } - -out: - qemu_opts_del(opts); - g_free(initiator_name); - if (iscsi_url != NULL) { - iscsi_destroy_url(iscsi_url); - } - if (task != NULL) { - scsi_free_scsi_task(task); - } - - if (ret) { - if (iscsi != NULL) { - if (iscsi_is_logged_in(iscsi)) { - iscsi_logout_sync(iscsi); - } - iscsi_destroy_context(iscsi); - } - memset(iscsilun, 0, sizeof(IscsiLun)); - } - return ret; -} - -static void iscsi_close(BlockDriverState *bs) -{ - IscsiLun *iscsilun = bs->opaque; - struct iscsi_context *iscsi = iscsilun->iscsi; - - iscsi_detach_aio_context(bs); - if (iscsi_is_logged_in(iscsi)) { - iscsi_logout_sync(iscsi); - } - iscsi_destroy_context(iscsi); - g_free(iscsilun->zeroblock); - g_free(iscsilun->allocationmap); - memset(iscsilun, 0, sizeof(IscsiLun)); -} - -static int sector_limits_lun2qemu(int64_t sector, IscsiLun *iscsilun) -{ - return MIN(sector_lun2qemu(sector, iscsilun), INT_MAX / 2 + 1); -} - -static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp) -{ - /* We don't actually refresh here, but just return data queried in - * iscsi_open(): iscsi targets don't change their limits. */ - - IscsiLun *iscsilun = bs->opaque; - uint32_t max_xfer_len = iscsilun->use_16_for_rw ? 
0xffffffff : 0xffff; - - if (iscsilun->bl.max_xfer_len) { - max_xfer_len = MIN(max_xfer_len, iscsilun->bl.max_xfer_len); - } - - bs->bl.max_transfer_length = sector_limits_lun2qemu(max_xfer_len, iscsilun); - - if (iscsilun->lbp.lbpu) { - if (iscsilun->bl.max_unmap < 0xffffffff) { - bs->bl.max_discard = - sector_limits_lun2qemu(iscsilun->bl.max_unmap, iscsilun); - } - bs->bl.discard_alignment = - sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); - } - - if (iscsilun->bl.max_ws_len < 0xffffffff) { - bs->bl.max_write_zeroes = - sector_limits_lun2qemu(iscsilun->bl.max_ws_len, iscsilun); - } - if (iscsilun->lbp.lbpws) { - bs->bl.write_zeroes_alignment = - sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); - } - bs->bl.opt_transfer_length = - sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun); -} - -/* Note that this will not re-establish a connection with an iSCSI target - it - * is effectively a NOP. */ -static int iscsi_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) -{ - IscsiLun *iscsilun = state->bs->opaque; - - if (state->flags & BDRV_O_RDWR && iscsilun->write_protected) { - error_setg(errp, "Cannot open a write protected LUN as read-write"); - return -EACCES; - } - return 0; -} - -static int iscsi_truncate(BlockDriverState *bs, int64_t offset) -{ - IscsiLun *iscsilun = bs->opaque; - Error *local_err = NULL; - - if (iscsilun->type != TYPE_DISK) { - return -ENOTSUP; - } - - iscsi_readcapacity_sync(iscsilun, &local_err); - if (local_err != NULL) { - error_free(local_err); - return -EIO; - } - - if (offset > iscsi_getlength(bs)) { - return -EINVAL; - } - - if (iscsilun->allocationmap != NULL) { - g_free(iscsilun->allocationmap); - iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun); - } - - return 0; -} - -static int iscsi_create(const char *filename, QemuOpts *opts, Error **errp) -{ - int ret = 0; - int64_t total_size = 0; - BlockDriverState *bs; - IscsiLun *iscsilun = NULL; - QDict 
*bs_options; - - bs = bdrv_new(); - - /* Read out options */ - total_size = DIV_ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - bs->opaque = g_new0(struct IscsiLun, 1); - iscsilun = bs->opaque; - - bs_options = qdict_new(); - qdict_put(bs_options, "filename", qstring_from_str(filename)); - ret = iscsi_open(bs, bs_options, 0, NULL); - QDECREF(bs_options); - - if (ret != 0) { - goto out; - } - iscsi_detach_aio_context(bs); - if (iscsilun->type != TYPE_DISK) { - ret = -ENODEV; - goto out; - } - if (bs->total_sectors < total_size) { - ret = -ENOSPC; - goto out; - } - - ret = 0; -out: - if (iscsilun->iscsi != NULL) { - iscsi_destroy_context(iscsilun->iscsi); - } - g_free(bs->opaque); - bs->opaque = NULL; - bdrv_unref(bs); - return ret; -} - -static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - IscsiLun *iscsilun = bs->opaque; - bdi->unallocated_blocks_are_zero = iscsilun->lbprz; - bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws; - bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE; - return 0; -} - -static QemuOptsList iscsi_create_opts = { - .name = "iscsi-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(iscsi_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { /* end of list */ } - } -}; - -static BlockDriver bdrv_iscsi = { - .format_name = "iscsi", - .protocol_name = "iscsi", - - .instance_size = sizeof(IscsiLun), - .bdrv_needs_filename = true, - .bdrv_file_open = iscsi_open, - .bdrv_close = iscsi_close, - .bdrv_create = iscsi_create, - .create_opts = &iscsi_create_opts, - .bdrv_reopen_prepare = iscsi_reopen_prepare, - - .bdrv_getlength = iscsi_getlength, - .bdrv_get_info = iscsi_get_info, - .bdrv_truncate = iscsi_truncate, - .bdrv_refresh_limits = iscsi_refresh_limits, - - .bdrv_co_get_block_status = iscsi_co_get_block_status, - .bdrv_co_discard = iscsi_co_discard, - .bdrv_co_write_zeroes = 
iscsi_co_write_zeroes, - .bdrv_co_readv = iscsi_co_readv, - .bdrv_co_writev = iscsi_co_writev, - .bdrv_co_writev_flags = iscsi_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, - .bdrv_co_flush_to_disk = iscsi_co_flush, - -#ifdef __linux__ - .bdrv_aio_ioctl = iscsi_aio_ioctl, -#endif - - .bdrv_detach_aio_context = iscsi_detach_aio_context, - .bdrv_attach_aio_context = iscsi_attach_aio_context, -}; - -static QemuOptsList qemu_iscsi_opts = { - .name = "iscsi", - .head = QTAILQ_HEAD_INITIALIZER(qemu_iscsi_opts.head), - .desc = { - { - .name = "user", - .type = QEMU_OPT_STRING, - .help = "username for CHAP authentication to target", - },{ - .name = "password", - .type = QEMU_OPT_STRING, - .help = "password for CHAP authentication to target", - },{ - .name = "password-secret", - .type = QEMU_OPT_STRING, - .help = "ID of the secret providing password for CHAP " - "authentication to target", - },{ - .name = "header-digest", - .type = QEMU_OPT_STRING, - .help = "HeaderDigest setting. " - "{CRC32C|CRC32C-NONE|NONE-CRC32C|NONE}", - },{ - .name = "initiator-name", - .type = QEMU_OPT_STRING, - .help = "Initiator iqn name to use when connecting", - },{ - .name = "timeout", - .type = QEMU_OPT_NUMBER, - .help = "Request timeout in seconds (default 0 = no timeout)", - }, - { /* end of list */ } - }, -}; - -static void iscsi_block_init(void) -{ - bdrv_register(&bdrv_iscsi); - qemu_add_opts(&qemu_iscsi_opts); -} - -block_init(iscsi_block_init); diff --git a/qemu/block/linux-aio.c b/qemu/block/linux-aio.c deleted file mode 100644 index 805757e02..000000000 --- a/qemu/block/linux-aio.c +++ /dev/null @@ -1,339 +0,0 @@ -/* - * Linux native AIO support. - * - * Copyright (C) 2009 IBM, Corp. - * Copyright (C) 2009 Red Hat, Inc. - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. 
- */ -#include "qemu/osdep.h" -#include "qemu-common.h" -#include "block/aio.h" -#include "qemu/queue.h" -#include "block/raw-aio.h" -#include "qemu/event_notifier.h" - -#include - -/* - * Queue size (per-device). - * - * XXX: eventually we need to communicate this to the guest and/or make it - * tunable by the guest. If we get more outstanding requests at a time - * than this we will get EAGAIN from io_submit which is communicated to - * the guest as an I/O error. - */ -#define MAX_EVENTS 128 - -#define MAX_QUEUED_IO 128 - -struct qemu_laiocb { - BlockAIOCB common; - struct qemu_laio_state *ctx; - struct iocb iocb; - ssize_t ret; - size_t nbytes; - QEMUIOVector *qiov; - bool is_read; - QSIMPLEQ_ENTRY(qemu_laiocb) next; -}; - -typedef struct { - int plugged; - unsigned int n; - bool blocked; - QSIMPLEQ_HEAD(, qemu_laiocb) pending; -} LaioQueue; - -struct qemu_laio_state { - io_context_t ctx; - EventNotifier e; - - /* io queue for submit at batch */ - LaioQueue io_q; - - /* I/O completion processing */ - QEMUBH *completion_bh; - struct io_event events[MAX_EVENTS]; - int event_idx; - int event_max; -}; - -static void ioq_submit(struct qemu_laio_state *s); - -static inline ssize_t io_event_ret(struct io_event *ev) -{ - return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res); -} - -/* - * Completes an AIO request (calls the callback and frees the ACB). - */ -static void qemu_laio_process_completion(struct qemu_laio_state *s, - struct qemu_laiocb *laiocb) -{ - int ret; - - ret = laiocb->ret; - if (ret != -ECANCELED) { - if (ret == laiocb->nbytes) { - ret = 0; - } else if (ret >= 0) { - /* Short reads mean EOF, pad with zeros. */ - if (laiocb->is_read) { - qemu_iovec_memset(laiocb->qiov, ret, 0, - laiocb->qiov->size - ret); - } else { - ret = -EINVAL; - } - } - } - laiocb->common.cb(laiocb->common.opaque, ret); - - qemu_aio_unref(laiocb); -} - -/* The completion BH fetches completed I/O requests and invokes their - * callbacks. 
- * - * The function is somewhat tricky because it supports nested event loops, for - * example when a request callback invokes aio_poll(). In order to do this, - * the completion events array and index are kept in qemu_laio_state. The BH - * reschedules itself as long as there are completions pending so it will - * either be called again in a nested event loop or will be called after all - * events have been completed. When there are no events left to complete, the - * BH returns without rescheduling. - */ -static void qemu_laio_completion_bh(void *opaque) -{ - struct qemu_laio_state *s = opaque; - - /* Fetch more completion events when empty */ - if (s->event_idx == s->event_max) { - do { - struct timespec ts = { 0 }; - s->event_max = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS, - s->events, &ts); - } while (s->event_max == -EINTR); - - s->event_idx = 0; - if (s->event_max <= 0) { - s->event_max = 0; - return; /* no more events */ - } - } - - /* Reschedule so nested event loops see currently pending completions */ - qemu_bh_schedule(s->completion_bh); - - /* Process completion events */ - while (s->event_idx < s->event_max) { - struct iocb *iocb = s->events[s->event_idx].obj; - struct qemu_laiocb *laiocb = - container_of(iocb, struct qemu_laiocb, iocb); - - laiocb->ret = io_event_ret(&s->events[s->event_idx]); - s->event_idx++; - - qemu_laio_process_completion(s, laiocb); - } - - if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { - ioq_submit(s); - } -} - -static void qemu_laio_completion_cb(EventNotifier *e) -{ - struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e); - - if (event_notifier_test_and_clear(&s->e)) { - qemu_bh_schedule(s->completion_bh); - } -} - -static void laio_cancel(BlockAIOCB *blockacb) -{ - struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb; - struct io_event event; - int ret; - - if (laiocb->ret != -EINPROGRESS) { - return; - } - ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event); - laiocb->ret = 
-ECANCELED; - if (ret != 0) { - /* iocb is not cancelled, cb will be called by the event loop later */ - return; - } - - laiocb->common.cb(laiocb->common.opaque, laiocb->ret); -} - -static const AIOCBInfo laio_aiocb_info = { - .aiocb_size = sizeof(struct qemu_laiocb), - .cancel_async = laio_cancel, -}; - -static void ioq_init(LaioQueue *io_q) -{ - QSIMPLEQ_INIT(&io_q->pending); - io_q->plugged = 0; - io_q->n = 0; - io_q->blocked = false; -} - -static void ioq_submit(struct qemu_laio_state *s) -{ - int ret, len; - struct qemu_laiocb *aiocb; - struct iocb *iocbs[MAX_QUEUED_IO]; - QSIMPLEQ_HEAD(, qemu_laiocb) completed; - - do { - len = 0; - QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) { - iocbs[len++] = &aiocb->iocb; - if (len == MAX_QUEUED_IO) { - break; - } - } - - ret = io_submit(s->ctx, len, iocbs); - if (ret == -EAGAIN) { - break; - } - if (ret < 0) { - abort(); - } - - s->io_q.n -= ret; - aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb); - QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed); - } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending)); - s->io_q.blocked = (s->io_q.n > 0); -} - -void laio_io_plug(BlockDriverState *bs, void *aio_ctx) -{ - struct qemu_laio_state *s = aio_ctx; - - s->io_q.plugged++; -} - -void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug) -{ - struct qemu_laio_state *s = aio_ctx; - - assert(s->io_q.plugged > 0 || !unplug); - - if (unplug && --s->io_q.plugged > 0) { - return; - } - - if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { - ioq_submit(s); - } -} - -BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque, int type) -{ - struct qemu_laio_state *s = aio_ctx; - struct qemu_laiocb *laiocb; - struct iocb *iocbs; - off_t offset = sector_num * 512; - - laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque); - laiocb->nbytes = nb_sectors * 512; - laiocb->ctx = s; 
- laiocb->ret = -EINPROGRESS; - laiocb->is_read = (type == QEMU_AIO_READ); - laiocb->qiov = qiov; - - iocbs = &laiocb->iocb; - - switch (type) { - case QEMU_AIO_WRITE: - io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset); - break; - case QEMU_AIO_READ: - io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset); - break; - /* Currently Linux kernel does not support other operations */ - default: - fprintf(stderr, "%s: invalid AIO request type 0x%x.\n", - __func__, type); - goto out_free_aiocb; - } - io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e)); - - QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next); - s->io_q.n++; - if (!s->io_q.blocked && - (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) { - ioq_submit(s); - } - return &laiocb->common; - -out_free_aiocb: - qemu_aio_unref(laiocb); - return NULL; -} - -void laio_detach_aio_context(void *s_, AioContext *old_context) -{ - struct qemu_laio_state *s = s_; - - aio_set_event_notifier(old_context, &s->e, false, NULL); - qemu_bh_delete(s->completion_bh); -} - -void laio_attach_aio_context(void *s_, AioContext *new_context) -{ - struct qemu_laio_state *s = s_; - - s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s); - aio_set_event_notifier(new_context, &s->e, false, - qemu_laio_completion_cb); -} - -void *laio_init(void) -{ - struct qemu_laio_state *s; - - s = g_malloc0(sizeof(*s)); - if (event_notifier_init(&s->e, false) < 0) { - goto out_free_state; - } - - if (io_setup(MAX_EVENTS, &s->ctx) != 0) { - goto out_close_efd; - } - - ioq_init(&s->io_q); - - return s; - -out_close_efd: - event_notifier_cleanup(&s->e); -out_free_state: - g_free(s); - return NULL; -} - -void laio_cleanup(void *s_) -{ - struct qemu_laio_state *s = s_; - - event_notifier_cleanup(&s->e); - - if (io_destroy(s->ctx) != 0) { - fprintf(stderr, "%s: destroy AIO context %p failed\n", - __func__, &s->ctx); - } - g_free(s); -} diff --git a/qemu/block/mirror.c b/qemu/block/mirror.c deleted file mode 100644 index 
039f48125..000000000 --- a/qemu/block/mirror.c +++ /dev/null @@ -1,976 +0,0 @@ -/* - * Image mirroring - * - * Copyright Red Hat, Inc. 2012 - * - * Authors: - * Paolo Bonzini - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -#include "qemu/osdep.h" -#include "trace.h" -#include "block/blockjob.h" -#include "block/block_int.h" -#include "sysemu/block-backend.h" -#include "qapi/error.h" -#include "qapi/qmp/qerror.h" -#include "qemu/ratelimit.h" -#include "qemu/bitmap.h" -#include "qemu/error-report.h" - -#define SLICE_TIME 100000000ULL /* ns */ -#define MAX_IN_FLIGHT 16 -#define DEFAULT_MIRROR_BUF_SIZE (10 << 20) - -/* The mirroring buffer is a list of granularity-sized chunks. - * Free chunks are organized in a list. - */ -typedef struct MirrorBuffer { - QSIMPLEQ_ENTRY(MirrorBuffer) next; -} MirrorBuffer; - -typedef struct MirrorBlockJob { - BlockJob common; - RateLimit limit; - BlockDriverState *target; - BlockDriverState *base; - /* The name of the graph node to replace */ - char *replaces; - /* The BDS to replace */ - BlockDriverState *to_replace; - /* Used to block operations on the drive-mirror-replace target */ - Error *replace_blocker; - bool is_none_mode; - BlockdevOnError on_source_error, on_target_error; - bool synced; - bool should_complete; - int64_t granularity; - size_t buf_size; - int64_t bdev_length; - unsigned long *cow_bitmap; - BdrvDirtyBitmap *dirty_bitmap; - HBitmapIter hbi; - uint8_t *buf; - QSIMPLEQ_HEAD(, MirrorBuffer) buf_free; - int buf_free_count; - - unsigned long *in_flight_bitmap; - int in_flight; - int sectors_in_flight; - int ret; - bool unmap; - bool waiting_for_io; - int target_cluster_sectors; - int max_iov; -} MirrorBlockJob; - -typedef struct MirrorOp { - MirrorBlockJob *s; - QEMUIOVector qiov; - int64_t sector_num; - int nb_sectors; -} MirrorOp; - -static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read, - int 
error) -{ - s->synced = false; - if (read) { - return block_job_error_action(&s->common, s->common.bs, - s->on_source_error, true, error); - } else { - return block_job_error_action(&s->common, s->target, - s->on_target_error, false, error); - } -} - -static void mirror_iteration_done(MirrorOp *op, int ret) -{ - MirrorBlockJob *s = op->s; - struct iovec *iov; - int64_t chunk_num; - int i, nb_chunks, sectors_per_chunk; - - trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret); - - s->in_flight--; - s->sectors_in_flight -= op->nb_sectors; - iov = op->qiov.iov; - for (i = 0; i < op->qiov.niov; i++) { - MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base; - QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next); - s->buf_free_count++; - } - - sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; - chunk_num = op->sector_num / sectors_per_chunk; - nb_chunks = DIV_ROUND_UP(op->nb_sectors, sectors_per_chunk); - bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks); - if (ret >= 0) { - if (s->cow_bitmap) { - bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); - } - s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE; - } - - qemu_iovec_destroy(&op->qiov); - g_free(op); - - if (s->waiting_for_io) { - qemu_coroutine_enter(s->common.co, NULL); - } -} - -static void mirror_write_complete(void *opaque, int ret) -{ - MirrorOp *op = opaque; - MirrorBlockJob *s = op->s; - if (ret < 0) { - BlockErrorAction action; - - bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); - action = mirror_error_action(s, false, -ret); - if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { - s->ret = ret; - } - } - mirror_iteration_done(op, ret); -} - -static void mirror_read_complete(void *opaque, int ret) -{ - MirrorOp *op = opaque; - MirrorBlockJob *s = op->s; - if (ret < 0) { - BlockErrorAction action; - - bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); - action = mirror_error_action(s, true, -ret); - if (action == 
BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { - s->ret = ret; - } - - mirror_iteration_done(op, ret); - return; - } - bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors, - mirror_write_complete, op); -} - -static inline void mirror_clip_sectors(MirrorBlockJob *s, - int64_t sector_num, - int *nb_sectors) -{ - *nb_sectors = MIN(*nb_sectors, - s->bdev_length / BDRV_SECTOR_SIZE - sector_num); -} - -/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and - * return the offset of the adjusted tail sector against original. */ -static int mirror_cow_align(MirrorBlockJob *s, - int64_t *sector_num, - int *nb_sectors) -{ - bool need_cow; - int ret = 0; - int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS; - int64_t align_sector_num = *sector_num; - int align_nb_sectors = *nb_sectors; - int max_sectors = chunk_sectors * s->max_iov; - - need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap); - need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors, - s->cow_bitmap); - if (need_cow) { - bdrv_round_to_clusters(s->target, *sector_num, *nb_sectors, - &align_sector_num, &align_nb_sectors); - } - - if (align_nb_sectors > max_sectors) { - align_nb_sectors = max_sectors; - if (need_cow) { - align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors, - s->target_cluster_sectors); - } - } - /* Clipping may result in align_nb_sectors unaligned to chunk boundary, but - * that doesn't matter because it's already the end of source image. */ - mirror_clip_sectors(s, align_sector_num, &align_nb_sectors); - - ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors); - *sector_num = align_sector_num; - *nb_sectors = align_nb_sectors; - assert(ret >= 0); - return ret; -} - -static inline void mirror_wait_for_io(MirrorBlockJob *s) -{ - assert(!s->waiting_for_io); - s->waiting_for_io = true; - qemu_coroutine_yield(); - s->waiting_for_io = false; -} - -/* Submit async read while handling COW. 
- * Returns: nb_sectors if no alignment is necessary, or - * (new_end - sector_num) if tail is rounded up or down due to - * alignment or buffer limit. - */ -static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num, - int nb_sectors) -{ - BlockDriverState *source = s->common.bs; - int sectors_per_chunk, nb_chunks; - int ret = nb_sectors; - MirrorOp *op; - - sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; - - /* We can only handle as much as buf_size at a time. */ - nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors); - assert(nb_sectors); - - if (s->cow_bitmap) { - ret += mirror_cow_align(s, §or_num, &nb_sectors); - } - assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size); - /* The sector range must meet granularity because: - * 1) Caller passes in aligned values; - * 2) mirror_cow_align is used only when target cluster is larger. */ - assert(!(sector_num % sectors_per_chunk)); - nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk); - - while (s->buf_free_count < nb_chunks) { - trace_mirror_yield_in_flight(s, sector_num, s->in_flight); - mirror_wait_for_io(s); - } - - /* Allocate a MirrorOp that is used as an AIO callback. */ - op = g_new(MirrorOp, 1); - op->s = s; - op->sector_num = sector_num; - op->nb_sectors = nb_sectors; - - /* Now make a QEMUIOVector taking enough granularity-sized chunks - * from s->buf_free. - */ - qemu_iovec_init(&op->qiov, nb_chunks); - while (nb_chunks-- > 0) { - MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free); - size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size; - - QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next); - s->buf_free_count--; - qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining)); - } - - /* Copy the dirty cluster. 
*/ - s->in_flight++; - s->sectors_in_flight += nb_sectors; - trace_mirror_one_iteration(s, sector_num, nb_sectors); - - bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, - mirror_read_complete, op); - return ret; -} - -static void mirror_do_zero_or_discard(MirrorBlockJob *s, - int64_t sector_num, - int nb_sectors, - bool is_discard) -{ - MirrorOp *op; - - /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed - * so the freeing in mirror_iteration_done is nop. */ - op = g_new0(MirrorOp, 1); - op->s = s; - op->sector_num = sector_num; - op->nb_sectors = nb_sectors; - - s->in_flight++; - s->sectors_in_flight += nb_sectors; - if (is_discard) { - bdrv_aio_discard(s->target, sector_num, op->nb_sectors, - mirror_write_complete, op); - } else { - bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, - s->unmap ? BDRV_REQ_MAY_UNMAP : 0, - mirror_write_complete, op); - } -} - -static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) -{ - BlockDriverState *source = s->common.bs; - int64_t sector_num, first_chunk; - uint64_t delay_ns = 0; - /* At least the first dirty chunk is mirrored in one iteration. */ - int nb_chunks = 1; - int64_t end = s->bdev_length / BDRV_SECTOR_SIZE; - int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; - - sector_num = hbitmap_iter_next(&s->hbi); - if (sector_num < 0) { - bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); - sector_num = hbitmap_iter_next(&s->hbi); - trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); - assert(sector_num >= 0); - } - - first_chunk = sector_num / sectors_per_chunk; - while (test_bit(first_chunk, s->in_flight_bitmap)) { - trace_mirror_yield_in_flight(s, first_chunk, s->in_flight); - mirror_wait_for_io(s); - } - - /* Find the number of consective dirty chunks following the first dirty - * one, and wait for in flight requests in them. 
*/ - while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) { - int64_t hbitmap_next; - int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk; - int64_t next_chunk = next_sector / sectors_per_chunk; - if (next_sector >= end || - !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) { - break; - } - if (test_bit(next_chunk, s->in_flight_bitmap)) { - break; - } - - hbitmap_next = hbitmap_iter_next(&s->hbi); - if (hbitmap_next > next_sector || hbitmap_next < 0) { - /* The bitmap iterator's cache is stale, refresh it */ - bdrv_set_dirty_iter(&s->hbi, next_sector); - hbitmap_next = hbitmap_iter_next(&s->hbi); - } - assert(hbitmap_next == next_sector); - nb_chunks++; - } - - /* Clear dirty bits before querying the block status, because - * calling bdrv_get_block_status_above could yield - if some blocks are - * marked dirty in this window, we need to know. - */ - bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, - nb_chunks * sectors_per_chunk); - bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks); - while (nb_chunks > 0 && sector_num < end) { - int ret; - int io_sectors; - BlockDriverState *file; - enum MirrorMethod { - MIRROR_METHOD_COPY, - MIRROR_METHOD_ZERO, - MIRROR_METHOD_DISCARD - } mirror_method = MIRROR_METHOD_COPY; - - assert(!(sector_num % sectors_per_chunk)); - ret = bdrv_get_block_status_above(source, NULL, sector_num, - nb_chunks * sectors_per_chunk, - &io_sectors, &file); - if (ret < 0) { - io_sectors = nb_chunks * sectors_per_chunk; - } - - io_sectors -= io_sectors % sectors_per_chunk; - if (io_sectors < sectors_per_chunk) { - io_sectors = sectors_per_chunk; - } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) { - int64_t target_sector_num; - int target_nb_sectors; - bdrv_round_to_clusters(s->target, sector_num, io_sectors, - &target_sector_num, &target_nb_sectors); - if (target_sector_num == sector_num && - target_nb_sectors == io_sectors) { - mirror_method = ret & BDRV_BLOCK_ZERO ? 
- MIRROR_METHOD_ZERO : - MIRROR_METHOD_DISCARD; - } - } - - mirror_clip_sectors(s, sector_num, &io_sectors); - switch (mirror_method) { - case MIRROR_METHOD_COPY: - io_sectors = mirror_do_read(s, sector_num, io_sectors); - break; - case MIRROR_METHOD_ZERO: - mirror_do_zero_or_discard(s, sector_num, io_sectors, false); - break; - case MIRROR_METHOD_DISCARD: - mirror_do_zero_or_discard(s, sector_num, io_sectors, true); - break; - default: - abort(); - } - assert(io_sectors); - sector_num += io_sectors; - nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk); - delay_ns += ratelimit_calculate_delay(&s->limit, io_sectors); - } - return delay_ns; -} - -static void mirror_free_init(MirrorBlockJob *s) -{ - int granularity = s->granularity; - size_t buf_size = s->buf_size; - uint8_t *buf = s->buf; - - assert(s->buf_free_count == 0); - QSIMPLEQ_INIT(&s->buf_free); - while (buf_size != 0) { - MirrorBuffer *cur = (MirrorBuffer *)buf; - QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next); - s->buf_free_count++; - buf_size -= granularity; - buf += granularity; - } -} - -static void mirror_drain(MirrorBlockJob *s) -{ - while (s->in_flight > 0) { - mirror_wait_for_io(s); - } -} - -typedef struct { - int ret; -} MirrorExitData; - -static void mirror_exit(BlockJob *job, void *opaque) -{ - MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); - MirrorExitData *data = opaque; - AioContext *replace_aio_context = NULL; - BlockDriverState *src = s->common.bs; - - /* Make sure that the source BDS doesn't go away before we called - * block_job_completed(). 
*/ - bdrv_ref(src); - - if (s->to_replace) { - replace_aio_context = bdrv_get_aio_context(s->to_replace); - aio_context_acquire(replace_aio_context); - } - - if (s->should_complete && data->ret == 0) { - BlockDriverState *to_replace = s->common.bs; - if (s->to_replace) { - to_replace = s->to_replace; - } - - /* This was checked in mirror_start_job(), but meanwhile one of the - * nodes could have been newly attached to a BlockBackend. */ - if (to_replace->blk && s->target->blk) { - error_report("block job: Can't create node with two BlockBackends"); - data->ret = -EINVAL; - goto out; - } - - if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) { - bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL); - } - bdrv_replace_in_backing_chain(to_replace, s->target); - } - -out: - if (s->to_replace) { - bdrv_op_unblock_all(s->to_replace, s->replace_blocker); - error_free(s->replace_blocker); - bdrv_unref(s->to_replace); - } - if (replace_aio_context) { - aio_context_release(replace_aio_context); - } - g_free(s->replaces); - bdrv_op_unblock_all(s->target, s->common.blocker); - bdrv_unref(s->target); - block_job_completed(&s->common, data->ret); - g_free(data); - bdrv_drained_end(src); - if (qemu_get_aio_context() == bdrv_get_aio_context(src)) { - aio_enable_external(iohandler_get_aio_context()); - } - bdrv_unref(src); -} - -static void coroutine_fn mirror_run(void *opaque) -{ - MirrorBlockJob *s = opaque; - MirrorExitData *data; - BlockDriverState *bs = s->common.bs; - int64_t sector_num, end, length; - uint64_t last_pause_ns; - BlockDriverInfo bdi; - char backing_filename[2]; /* we only need 2 characters because we are only - checking for a NULL string */ - int ret = 0; - int n; - int target_cluster_size = BDRV_SECTOR_SIZE; - - if (block_job_is_cancelled(&s->common)) { - goto immediate_exit; - } - - s->bdev_length = bdrv_getlength(bs); - if (s->bdev_length < 0) { - ret = s->bdev_length; - goto immediate_exit; - } else if (s->bdev_length == 0) { - /* Report 
BLOCK_JOB_READY and wait for complete. */ - block_job_event_ready(&s->common); - s->synced = true; - while (!block_job_is_cancelled(&s->common) && !s->should_complete) { - block_job_yield(&s->common); - } - s->common.cancelled = false; - goto immediate_exit; - } - - length = DIV_ROUND_UP(s->bdev_length, s->granularity); - s->in_flight_bitmap = bitmap_new(length); - - /* If we have no backing file yet in the destination, we cannot let - * the destination do COW. Instead, we copy sectors around the - * dirty data if needed. We need a bitmap to do that. - */ - bdrv_get_backing_filename(s->target, backing_filename, - sizeof(backing_filename)); - if (!bdrv_get_info(s->target, &bdi) && bdi.cluster_size) { - target_cluster_size = bdi.cluster_size; - } - if (backing_filename[0] && !s->target->backing - && s->granularity < target_cluster_size) { - s->buf_size = MAX(s->buf_size, target_cluster_size); - s->cow_bitmap = bitmap_new(length); - } - s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS; - s->max_iov = MIN(s->common.bs->bl.max_iov, s->target->bl.max_iov); - - end = s->bdev_length / BDRV_SECTOR_SIZE; - s->buf = qemu_try_blockalign(bs, s->buf_size); - if (s->buf == NULL) { - ret = -ENOMEM; - goto immediate_exit; - } - - mirror_free_init(s); - - last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); - if (!s->is_none_mode) { - /* First part, loop on the sectors and initialize the dirty bitmap. */ - BlockDriverState *base = s->base; - bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(s->target); - - for (sector_num = 0; sector_num < end; ) { - /* Just to make sure we are not exceeding int limit. 
*/ - int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS, - end - sector_num); - int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); - - if (now - last_pause_ns > SLICE_TIME) { - last_pause_ns = now; - block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0); - } - - if (block_job_is_cancelled(&s->common)) { - goto immediate_exit; - } - - ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n); - - if (ret < 0) { - goto immediate_exit; - } - - assert(n > 0); - if (ret == 1 || mark_all_dirty) { - bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); - } - sector_num += n; - } - } - - bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); - for (;;) { - uint64_t delay_ns = 0; - int64_t cnt; - bool should_complete; - - if (s->ret < 0) { - ret = s->ret; - goto immediate_exit; - } - - cnt = bdrv_get_dirty_count(s->dirty_bitmap); - /* s->common.offset contains the number of bytes already processed so - * far, cnt is the number of dirty sectors remaining and - * s->sectors_in_flight is the number of sectors currently being - * processed; together those are the current total operation length */ - s->common.len = s->common.offset + - (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE; - - /* Note that even when no rate limit is applied we need to yield - * periodically with no pending I/O so that bdrv_drain_all() returns. - * We do so every SLICE_TIME nanoseconds, or when there is an error, - * or when the source is clean, whichever comes first. 
- */ - if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < SLICE_TIME && - s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) { - if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || - (cnt == 0 && s->in_flight > 0)) { - trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt); - mirror_wait_for_io(s); - continue; - } else if (cnt != 0) { - delay_ns = mirror_iteration(s); - } - } - - should_complete = false; - if (s->in_flight == 0 && cnt == 0) { - trace_mirror_before_flush(s); - ret = bdrv_flush(s->target); - if (ret < 0) { - if (mirror_error_action(s, false, -ret) == - BLOCK_ERROR_ACTION_REPORT) { - goto immediate_exit; - } - } else { - /* We're out of the streaming phase. From now on, if the job - * is cancelled we will actually complete all pending I/O and - * report completion. This way, block-job-cancel will leave - * the target in a consistent state. - */ - if (!s->synced) { - block_job_event_ready(&s->common); - s->synced = true; - } - - should_complete = s->should_complete || - block_job_is_cancelled(&s->common); - cnt = bdrv_get_dirty_count(s->dirty_bitmap); - } - } - - if (cnt == 0 && should_complete) { - /* The dirty bitmap is not updated while operations are pending. - * If we're about to exit, wait for pending operations before - * calling bdrv_get_dirty_count(bs), or we may exit while the - * source has dirty data to copy! - * - * Note that I/O can be submitted by the guest while - * mirror_populate runs. - */ - trace_mirror_before_drain(s, cnt); - bdrv_co_drain(bs); - cnt = bdrv_get_dirty_count(s->dirty_bitmap); - } - - ret = 0; - trace_mirror_before_sleep(s, cnt, s->synced, delay_ns); - if (!s->synced) { - block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); - if (block_job_is_cancelled(&s->common)) { - break; - } - } else if (!should_complete) { - delay_ns = (s->in_flight == 0 && cnt == 0 ? 
SLICE_TIME : 0); - block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); - } else if (cnt == 0) { - /* The two disks are in sync. Exit and report successful - * completion. - */ - assert(QLIST_EMPTY(&bs->tracked_requests)); - s->common.cancelled = false; - break; - } - last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); - } - -immediate_exit: - if (s->in_flight > 0) { - /* We get here only if something went wrong. Either the job failed, - * or it was cancelled prematurely so that we do not guarantee that - * the target is a copy of the source. - */ - assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common))); - mirror_drain(s); - } - - assert(s->in_flight == 0); - qemu_vfree(s->buf); - g_free(s->cow_bitmap); - g_free(s->in_flight_bitmap); - bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); - if (s->target->blk) { - blk_iostatus_disable(s->target->blk); - } - - data = g_malloc(sizeof(*data)); - data->ret = ret; - /* Before we switch to target in mirror_exit, make sure data doesn't - * change. */ - bdrv_drained_begin(s->common.bs); - if (qemu_get_aio_context() == bdrv_get_aio_context(bs)) { - /* FIXME: virtio host notifiers run on iohandler_ctx, therefore the - * above bdrv_drained_end isn't enough to quiesce it. This is ugly, we - * need a block layer API change to achieve this. 
*/ - aio_disable_external(iohandler_get_aio_context()); - } - block_job_defer_to_main_loop(&s->common, mirror_exit, data); -} - -static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp) -{ - MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); - - if (speed < 0) { - error_setg(errp, QERR_INVALID_PARAMETER, "speed"); - return; - } - ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); -} - -static void mirror_iostatus_reset(BlockJob *job) -{ - MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); - - if (s->target->blk) { - blk_iostatus_reset(s->target->blk); - } -} - -static void mirror_complete(BlockJob *job, Error **errp) -{ - MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); - Error *local_err = NULL; - int ret; - - ret = bdrv_open_backing_file(s->target, NULL, "backing", &local_err); - if (ret < 0) { - error_propagate(errp, local_err); - return; - } - if (!s->synced) { - error_setg(errp, QERR_BLOCK_JOB_NOT_READY, job->id); - return; - } - - /* check the target bs is not blocked and block all operations on it */ - if (s->replaces) { - AioContext *replace_aio_context; - - s->to_replace = bdrv_find_node(s->replaces); - if (!s->to_replace) { - error_setg(errp, "Node name '%s' not found", s->replaces); - return; - } - - replace_aio_context = bdrv_get_aio_context(s->to_replace); - aio_context_acquire(replace_aio_context); - - error_setg(&s->replace_blocker, - "block device is in use by block-job-complete"); - bdrv_op_block_all(s->to_replace, s->replace_blocker); - bdrv_ref(s->to_replace); - - aio_context_release(replace_aio_context); - } - - s->should_complete = true; - block_job_enter(&s->common); -} - -static const BlockJobDriver mirror_job_driver = { - .instance_size = sizeof(MirrorBlockJob), - .job_type = BLOCK_JOB_TYPE_MIRROR, - .set_speed = mirror_set_speed, - .iostatus_reset= mirror_iostatus_reset, - .complete = mirror_complete, -}; - -static const BlockJobDriver commit_active_job_driver = { - 
.instance_size = sizeof(MirrorBlockJob), - .job_type = BLOCK_JOB_TYPE_COMMIT, - .set_speed = mirror_set_speed, - .iostatus_reset - = mirror_iostatus_reset, - .complete = mirror_complete, -}; - -static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, - const char *replaces, - int64_t speed, uint32_t granularity, - int64_t buf_size, - BlockdevOnError on_source_error, - BlockdevOnError on_target_error, - bool unmap, - BlockCompletionFunc *cb, - void *opaque, Error **errp, - const BlockJobDriver *driver, - bool is_none_mode, BlockDriverState *base) -{ - MirrorBlockJob *s; - BlockDriverState *replaced_bs; - - if (granularity == 0) { - granularity = bdrv_get_default_bitmap_granularity(target); - } - - assert ((granularity & (granularity - 1)) == 0); - - if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || - on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && - (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { - error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); - return; - } - - if (buf_size < 0) { - error_setg(errp, "Invalid parameter 'buf-size'"); - return; - } - - if (buf_size == 0) { - buf_size = DEFAULT_MIRROR_BUF_SIZE; - } - - /* We can't support this case as long as the block layer can't handle - * multiple BlockBackends per BlockDriverState. 
*/ - if (replaces) { - replaced_bs = bdrv_lookup_bs(replaces, replaces, errp); - if (replaced_bs == NULL) { - return; - } - } else { - replaced_bs = bs; - } - if (replaced_bs->blk && target->blk) { - error_setg(errp, "Can't create node with two BlockBackends"); - return; - } - - s = block_job_create(driver, bs, speed, cb, opaque, errp); - if (!s) { - return; - } - - s->replaces = g_strdup(replaces); - s->on_source_error = on_source_error; - s->on_target_error = on_target_error; - s->target = target; - s->is_none_mode = is_none_mode; - s->base = base; - s->granularity = granularity; - s->buf_size = ROUND_UP(buf_size, granularity); - s->unmap = unmap; - - s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); - if (!s->dirty_bitmap) { - g_free(s->replaces); - block_job_unref(&s->common); - return; - } - - bdrv_op_block_all(s->target, s->common.blocker); - - if (s->target->blk) { - blk_set_on_error(s->target->blk, on_target_error, on_target_error); - blk_iostatus_enable(s->target->blk); - } - s->common.co = qemu_coroutine_create(mirror_run); - trace_mirror_start(bs, s, s->common.co, opaque); - qemu_coroutine_enter(s->common.co, s); -} - -void mirror_start(BlockDriverState *bs, BlockDriverState *target, - const char *replaces, - int64_t speed, uint32_t granularity, int64_t buf_size, - MirrorSyncMode mode, BlockdevOnError on_source_error, - BlockdevOnError on_target_error, - bool unmap, - BlockCompletionFunc *cb, - void *opaque, Error **errp) -{ - bool is_none_mode; - BlockDriverState *base; - - if (mode == MIRROR_SYNC_MODE_INCREMENTAL) { - error_setg(errp, "Sync mode 'incremental' not supported"); - return; - } - is_none_mode = mode == MIRROR_SYNC_MODE_NONE; - base = mode == MIRROR_SYNC_MODE_TOP ? 
backing_bs(bs) : NULL; - mirror_start_job(bs, target, replaces, - speed, granularity, buf_size, - on_source_error, on_target_error, unmap, cb, opaque, errp, - &mirror_job_driver, is_none_mode, base); -} - -void commit_active_start(BlockDriverState *bs, BlockDriverState *base, - int64_t speed, - BlockdevOnError on_error, - BlockCompletionFunc *cb, - void *opaque, Error **errp) -{ - int64_t length, base_length; - int orig_base_flags; - int ret; - Error *local_err = NULL; - - orig_base_flags = bdrv_get_flags(base); - - if (bdrv_reopen(base, bs->open_flags, errp)) { - return; - } - - length = bdrv_getlength(bs); - if (length < 0) { - error_setg_errno(errp, -length, - "Unable to determine length of %s", bs->filename); - goto error_restore_flags; - } - - base_length = bdrv_getlength(base); - if (base_length < 0) { - error_setg_errno(errp, -base_length, - "Unable to determine length of %s", base->filename); - goto error_restore_flags; - } - - if (length > base_length) { - ret = bdrv_truncate(base, length); - if (ret < 0) { - error_setg_errno(errp, -ret, - "Top image %s is larger than base image %s, and " - "resize of base image failed", - bs->filename, base->filename); - goto error_restore_flags; - } - } - - bdrv_ref(base); - mirror_start_job(bs, base, NULL, speed, 0, 0, - on_error, on_error, false, cb, opaque, &local_err, - &commit_active_job_driver, false, base); - if (local_err) { - error_propagate(errp, local_err); - goto error_restore_flags; - } - - return; - -error_restore_flags: - /* ignore error and errp for bdrv_reopen, because we want to propagate - * the original error */ - bdrv_reopen(base, orig_base_flags, NULL); - return; -} diff --git a/qemu/block/nbd-client.c b/qemu/block/nbd-client.c deleted file mode 100644 index 878e879ac..000000000 --- a/qemu/block/nbd-client.c +++ /dev/null @@ -1,436 +0,0 @@ -/* - * QEMU Block driver for NBD - * - * Copyright (C) 2008 Bull S.A.S. 
- * Author: Laurent Vivier - * - * Some parts: - * Copyright (C) 2007 Anthony Liguori - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -#include "qemu/osdep.h" -#include "nbd-client.h" - -#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs)) -#define INDEX_TO_HANDLE(bs, index) ((index) ^ ((uint64_t)(intptr_t)bs)) - -static void nbd_recv_coroutines_enter_all(NbdClientSession *s) -{ - int i; - - for (i = 0; i < MAX_NBD_REQUESTS; i++) { - if (s->recv_coroutine[i]) { - qemu_coroutine_enter(s->recv_coroutine[i], NULL); - } - } -} - -static void nbd_teardown_connection(BlockDriverState *bs) -{ - NbdClientSession *client = nbd_get_client_session(bs); - - if (!client->ioc) { /* Already closed */ - return; - } - - /* finish any pending coroutines */ - qio_channel_shutdown(client->ioc, - QIO_CHANNEL_SHUTDOWN_BOTH, - NULL); - nbd_recv_coroutines_enter_all(client); - - nbd_client_detach_aio_context(bs); - object_unref(OBJECT(client->sioc)); - client->sioc = NULL; - object_unref(OBJECT(client->ioc)); - client->ioc = NULL; -} - -static void nbd_reply_ready(void *opaque) -{ - BlockDriverState *bs = opaque; - NbdClientSession *s = nbd_get_client_session(bs); - uint64_t i; - int ret; - - if (!s->ioc) { /* Already closed */ - return; - } - - if (s->reply.handle == 0) { - /* No reply already in flight. Fetch a header. It is possible - * that another thread has done the same thing in parallel, so - * the socket is not readable anymore. - */ - ret = nbd_receive_reply(s->ioc, &s->reply); - if (ret == -EAGAIN) { - return; - } - if (ret < 0) { - s->reply.handle = 0; - goto fail; - } - } - - /* There's no need for a mutex on the receive side, because the - * handler acts as a synchronization point and ensures that only - * one coroutine is called until the reply finishes. 
*/ - i = HANDLE_TO_INDEX(s, s->reply.handle); - if (i >= MAX_NBD_REQUESTS) { - goto fail; - } - - if (s->recv_coroutine[i]) { - qemu_coroutine_enter(s->recv_coroutine[i], NULL); - return; - } - -fail: - nbd_teardown_connection(bs); -} - -static void nbd_restart_write(void *opaque) -{ - BlockDriverState *bs = opaque; - - qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine, NULL); -} - -static int nbd_co_send_request(BlockDriverState *bs, - struct nbd_request *request, - QEMUIOVector *qiov, int offset) -{ - NbdClientSession *s = nbd_get_client_session(bs); - AioContext *aio_context; - int rc, ret, i; - - qemu_co_mutex_lock(&s->send_mutex); - - for (i = 0; i < MAX_NBD_REQUESTS; i++) { - if (s->recv_coroutine[i] == NULL) { - s->recv_coroutine[i] = qemu_coroutine_self(); - break; - } - } - - g_assert(qemu_in_coroutine()); - assert(i < MAX_NBD_REQUESTS); - request->handle = INDEX_TO_HANDLE(s, i); - - if (!s->ioc) { - qemu_co_mutex_unlock(&s->send_mutex); - return -EPIPE; - } - - s->send_coroutine = qemu_coroutine_self(); - aio_context = bdrv_get_aio_context(bs); - - aio_set_fd_handler(aio_context, s->sioc->fd, false, - nbd_reply_ready, nbd_restart_write, bs); - if (qiov) { - qio_channel_set_cork(s->ioc, true); - rc = nbd_send_request(s->ioc, request); - if (rc >= 0) { - ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, - offset, request->len, 0); - if (ret != request->len) { - rc = -EIO; - } - } - qio_channel_set_cork(s->ioc, false); - } else { - rc = nbd_send_request(s->ioc, request); - } - aio_set_fd_handler(aio_context, s->sioc->fd, false, - nbd_reply_ready, NULL, bs); - s->send_coroutine = NULL; - qemu_co_mutex_unlock(&s->send_mutex); - return rc; -} - -static void nbd_co_receive_reply(NbdClientSession *s, - struct nbd_request *request, struct nbd_reply *reply, - QEMUIOVector *qiov, int offset) -{ - int ret; - - /* Wait until we're woken up by the read handler. TODO: perhaps - * peek at the next reply and avoid yielding if it's ours? 
*/ - qemu_coroutine_yield(); - *reply = s->reply; - if (reply->handle != request->handle || - !s->ioc) { - reply->error = EIO; - } else { - if (qiov && reply->error == 0) { - ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, - offset, request->len, 1); - if (ret != request->len) { - reply->error = EIO; - } - } - - /* Tell the read handler to read another header. */ - s->reply.handle = 0; - } -} - -static void nbd_coroutine_start(NbdClientSession *s, - struct nbd_request *request) -{ - /* Poor man semaphore. The free_sema is locked when no other request - * can be accepted, and unlocked after receiving one reply. */ - if (s->in_flight >= MAX_NBD_REQUESTS - 1) { - qemu_co_mutex_lock(&s->free_sema); - assert(s->in_flight < MAX_NBD_REQUESTS); - } - s->in_flight++; - - /* s->recv_coroutine[i] is set as soon as we get the send_lock. */ -} - -static void nbd_coroutine_end(NbdClientSession *s, - struct nbd_request *request) -{ - int i = HANDLE_TO_INDEX(s, request->handle); - s->recv_coroutine[i] = NULL; - if (s->in_flight-- == MAX_NBD_REQUESTS) { - qemu_co_mutex_unlock(&s->free_sema); - } -} - -static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, - int offset) -{ - NbdClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { .type = NBD_CMD_READ }; - struct nbd_reply reply; - ssize_t ret; - - request.from = sector_num * 512; - request.len = nb_sectors * 512; - - nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, NULL, 0); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(client, &request, &reply, qiov, offset); - } - nbd_coroutine_end(client, &request); - return -reply.error; - -} - -static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, - int offset, int *flags) -{ - NbdClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { .type = NBD_CMD_WRITE }; - struct 
nbd_reply reply; - ssize_t ret; - - if ((*flags & BDRV_REQ_FUA) && (client->nbdflags & NBD_FLAG_SEND_FUA)) { - *flags &= ~BDRV_REQ_FUA; - request.type |= NBD_CMD_FLAG_FUA; - } - - request.from = sector_num * 512; - request.len = nb_sectors * 512; - - nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, qiov, offset); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(client, &request, &reply, NULL, 0); - } - nbd_coroutine_end(client, &request); - return -reply.error; -} - -/* qemu-nbd has a limit of slightly less than 1M per request. Try to - * remain aligned to 4K. */ -#define NBD_MAX_SECTORS 2040 - -int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - int offset = 0; - int ret; - while (nb_sectors > NBD_MAX_SECTORS) { - ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); - if (ret < 0) { - return ret; - } - offset += NBD_MAX_SECTORS * 512; - sector_num += NBD_MAX_SECTORS; - nb_sectors -= NBD_MAX_SECTORS; - } - return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset); -} - -int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, int *flags) -{ - int offset = 0; - int ret; - while (nb_sectors > NBD_MAX_SECTORS) { - ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset, - flags); - if (ret < 0) { - return ret; - } - offset += NBD_MAX_SECTORS * 512; - sector_num += NBD_MAX_SECTORS; - nb_sectors -= NBD_MAX_SECTORS; - } - return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset, flags); -} - -int nbd_client_co_flush(BlockDriverState *bs) -{ - NbdClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { .type = NBD_CMD_FLUSH }; - struct nbd_reply reply; - ssize_t ret; - - if (!(client->nbdflags & NBD_FLAG_SEND_FLUSH)) { - return 0; - } - - request.from = 0; - request.len = 0; - - nbd_coroutine_start(client, &request); - ret = 
nbd_co_send_request(bs, &request, NULL, 0); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(client, &request, &reply, NULL, 0); - } - nbd_coroutine_end(client, &request); - return -reply.error; -} - -int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - NbdClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { .type = NBD_CMD_TRIM }; - struct nbd_reply reply; - ssize_t ret; - - if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) { - return 0; - } - request.from = sector_num * 512; - request.len = nb_sectors * 512; - - nbd_coroutine_start(client, &request); - ret = nbd_co_send_request(bs, &request, NULL, 0); - if (ret < 0) { - reply.error = -ret; - } else { - nbd_co_receive_reply(client, &request, &reply, NULL, 0); - } - nbd_coroutine_end(client, &request); - return -reply.error; - -} - -void nbd_client_detach_aio_context(BlockDriverState *bs) -{ - aio_set_fd_handler(bdrv_get_aio_context(bs), - nbd_get_client_session(bs)->sioc->fd, - false, NULL, NULL, NULL); -} - -void nbd_client_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sioc->fd, - false, nbd_reply_ready, NULL, bs); -} - -void nbd_client_close(BlockDriverState *bs) -{ - NbdClientSession *client = nbd_get_client_session(bs); - struct nbd_request request = { - .type = NBD_CMD_DISC, - .from = 0, - .len = 0 - }; - - if (client->ioc == NULL) { - return; - } - - nbd_send_request(client->ioc, &request); - - nbd_teardown_connection(bs); -} - -int nbd_client_init(BlockDriverState *bs, - QIOChannelSocket *sioc, - const char *export, - QCryptoTLSCreds *tlscreds, - const char *hostname, - Error **errp) -{ - NbdClientSession *client = nbd_get_client_session(bs); - int ret; - - /* NBD handshake */ - logout("session init %s\n", export); - qio_channel_set_blocking(QIO_CHANNEL(sioc), true, NULL); - - ret = nbd_receive_negotiate(QIO_CHANNEL(sioc), export, - 
&client->nbdflags, - tlscreds, hostname, - &client->ioc, - &client->size, errp); - if (ret < 0) { - logout("Failed to negotiate with the NBD server\n"); - return ret; - } - - qemu_co_mutex_init(&client->send_mutex); - qemu_co_mutex_init(&client->free_sema); - client->sioc = sioc; - object_ref(OBJECT(client->sioc)); - - if (!client->ioc) { - client->ioc = QIO_CHANNEL(sioc); - object_ref(OBJECT(client->ioc)); - } - - /* Now that we're connected, set the socket to be non-blocking and - * kick the reply mechanism. */ - qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL); - - nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs)); - - logout("Established connection with NBD server\n"); - return 0; -} diff --git a/qemu/block/nbd-client.h b/qemu/block/nbd-client.h deleted file mode 100644 index bc7aec079..000000000 --- a/qemu/block/nbd-client.h +++ /dev/null @@ -1,59 +0,0 @@ -#ifndef NBD_CLIENT_H -#define NBD_CLIENT_H - -#include "qemu-common.h" -#include "block/nbd.h" -#include "block/block_int.h" -#include "io/channel-socket.h" - -/* #define DEBUG_NBD */ - -#if defined(DEBUG_NBD) -#define logout(fmt, ...) \ - fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__) -#else -#define logout(fmt, ...) 
((void)0) -#endif - -#define MAX_NBD_REQUESTS 16 - -typedef struct NbdClientSession { - QIOChannelSocket *sioc; /* The master data channel */ - QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ - uint32_t nbdflags; - off_t size; - - CoMutex send_mutex; - CoMutex free_sema; - Coroutine *send_coroutine; - int in_flight; - - Coroutine *recv_coroutine[MAX_NBD_REQUESTS]; - struct nbd_reply reply; - - bool is_unix; -} NbdClientSession; - -NbdClientSession *nbd_get_client_session(BlockDriverState *bs); - -int nbd_client_init(BlockDriverState *bs, - QIOChannelSocket *sock, - const char *export_name, - QCryptoTLSCreds *tlscreds, - const char *hostname, - Error **errp); -void nbd_client_close(BlockDriverState *bs); - -int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors); -int nbd_client_co_flush(BlockDriverState *bs); -int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, int *flags); -int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov); - -void nbd_client_detach_aio_context(BlockDriverState *bs); -void nbd_client_attach_aio_context(BlockDriverState *bs, - AioContext *new_context); - -#endif /* NBD_CLIENT_H */ diff --git a/qemu/block/nbd.c b/qemu/block/nbd.c deleted file mode 100644 index f7ea3b360..000000000 --- a/qemu/block/nbd.c +++ /dev/null @@ -1,539 +0,0 @@ -/* - * QEMU Block driver for NBD - * - * Copyright (C) 2008 Bull S.A.S. 
- * Author: Laurent Vivier - * - * Some parts: - * Copyright (C) 2007 Anthony Liguori - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -#include "qemu/osdep.h" -#include "block/nbd-client.h" -#include "qapi/error.h" -#include "qemu/uri.h" -#include "block/block_int.h" -#include "qemu/module.h" -#include "qapi/qmp/qdict.h" -#include "qapi/qmp/qjson.h" -#include "qapi/qmp/qint.h" -#include "qapi/qmp/qstring.h" -#include "qemu/cutils.h" - -#define EN_OPTSTR ":exportname=" - -typedef struct BDRVNBDState { - NbdClientSession client; -} BDRVNBDState; - -static int nbd_parse_uri(const char *filename, QDict *options) -{ - URI *uri; - const char *p; - QueryParams *qp = NULL; - int ret = 0; - bool is_unix; - - uri = uri_parse(filename); - if (!uri) { - return -EINVAL; - } - - /* transport */ - if (!strcmp(uri->scheme, "nbd")) { - is_unix = false; - } else if (!strcmp(uri->scheme, "nbd+tcp")) { - is_unix = false; - } else if (!strcmp(uri->scheme, "nbd+unix")) { - is_unix = true; - } else { - ret = -EINVAL; - goto out; - } - - p = uri->path ? uri->path : "/"; - p += strspn(p, "/"); - if (p[0]) { - qdict_put(options, "export", qstring_from_str(p)); - } - - qp = query_params_parse(uri->query); - if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) { - ret = -EINVAL; - goto out; - } - - if (is_unix) { - /* nbd+unix:///export?socket=path */ - if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) { - ret = -EINVAL; - goto out; - } - qdict_put(options, "path", qstring_from_str(qp->p[0].value)); - } else { - QString *host; - /* nbd[+tcp]://host[:port]/export */ - if (!uri->server) { - ret = -EINVAL; - goto out; - } - - /* strip braces from literal IPv6 address */ - if (uri->server[0] == '[') { - host = qstring_from_substr(uri->server, 1, - strlen(uri->server) - 2); - } else { - host = qstring_from_str(uri->server); - } - - qdict_put(options, "host", host); - if (uri->port) { - char* port_str = g_strdup_printf("%d", uri->port); - qdict_put(options, "port", qstring_from_str(port_str)); - g_free(port_str); - } - } - -out: - if (qp) { - query_params_free(qp); - } - uri_free(uri); - return ret; 
-} - -static void nbd_parse_filename(const char *filename, QDict *options, - Error **errp) -{ - char *file; - char *export_name; - const char *host_spec; - const char *unixpath; - - if (qdict_haskey(options, "host") - || qdict_haskey(options, "port") - || qdict_haskey(options, "path")) - { - error_setg(errp, "host/port/path and a file name may not be specified " - "at the same time"); - return; - } - - if (strstr(filename, "://")) { - int ret = nbd_parse_uri(filename, options); - if (ret < 0) { - error_setg(errp, "No valid URL specified"); - } - return; - } - - file = g_strdup(filename); - - export_name = strstr(file, EN_OPTSTR); - if (export_name) { - if (export_name[strlen(EN_OPTSTR)] == 0) { - goto out; - } - export_name[0] = 0; /* truncate 'file' */ - export_name += strlen(EN_OPTSTR); - - qdict_put(options, "export", qstring_from_str(export_name)); - } - - /* extract the host_spec - fail if it's not nbd:... */ - if (!strstart(file, "nbd:", &host_spec)) { - error_setg(errp, "File name string for NBD must start with 'nbd:'"); - goto out; - } - - if (!*host_spec) { - goto out; - } - - /* are we a UNIX or TCP socket? 
*/ - if (strstart(host_spec, "unix:", &unixpath)) { - qdict_put(options, "path", qstring_from_str(unixpath)); - } else { - InetSocketAddress *addr = NULL; - - addr = inet_parse(host_spec, errp); - if (!addr) { - goto out; - } - - qdict_put(options, "host", qstring_from_str(addr->host)); - qdict_put(options, "port", qstring_from_str(addr->port)); - qapi_free_InetSocketAddress(addr); - } - -out: - g_free(file); -} - -static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options, char **export, - Error **errp) -{ - SocketAddress *saddr; - - if (qdict_haskey(options, "path") == qdict_haskey(options, "host")) { - if (qdict_haskey(options, "path")) { - error_setg(errp, "path and host may not be used at the same time."); - } else { - error_setg(errp, "one of path and host must be specified."); - } - return NULL; - } - - saddr = g_new0(SocketAddress, 1); - - if (qdict_haskey(options, "path")) { - UnixSocketAddress *q_unix; - saddr->type = SOCKET_ADDRESS_KIND_UNIX; - q_unix = saddr->u.q_unix.data = g_new0(UnixSocketAddress, 1); - q_unix->path = g_strdup(qdict_get_str(options, "path")); - qdict_del(options, "path"); - } else { - InetSocketAddress *inet; - saddr->type = SOCKET_ADDRESS_KIND_INET; - inet = saddr->u.inet.data = g_new0(InetSocketAddress, 1); - inet->host = g_strdup(qdict_get_str(options, "host")); - if (!qdict_get_try_str(options, "port")) { - inet->port = g_strdup_printf("%d", NBD_DEFAULT_PORT); - } else { - inet->port = g_strdup(qdict_get_str(options, "port")); - } - qdict_del(options, "host"); - qdict_del(options, "port"); - } - - s->client.is_unix = saddr->type == SOCKET_ADDRESS_KIND_UNIX; - - *export = g_strdup(qdict_get_try_str(options, "export")); - if (*export) { - qdict_del(options, "export"); - } - - return saddr; -} - -NbdClientSession *nbd_get_client_session(BlockDriverState *bs) -{ - BDRVNBDState *s = bs->opaque; - return &s->client; -} - -static QIOChannelSocket *nbd_establish_connection(SocketAddress *saddr, - Error **errp) -{ - QIOChannelSocket 
*sioc; - Error *local_err = NULL; - - sioc = qio_channel_socket_new(); - - qio_channel_socket_connect_sync(sioc, - saddr, - &local_err); - if (local_err) { - error_propagate(errp, local_err); - return NULL; - } - - qio_channel_set_delay(QIO_CHANNEL(sioc), false); - - return sioc; -} - - -static QCryptoTLSCreds *nbd_get_tls_creds(const char *id, Error **errp) -{ - Object *obj; - QCryptoTLSCreds *creds; - - obj = object_resolve_path_component( - object_get_objects_root(), id); - if (!obj) { - error_setg(errp, "No TLS credentials with id '%s'", - id); - return NULL; - } - creds = (QCryptoTLSCreds *) - object_dynamic_cast(obj, TYPE_QCRYPTO_TLS_CREDS); - if (!creds) { - error_setg(errp, "Object with id '%s' is not TLS credentials", - id); - return NULL; - } - - if (creds->endpoint != QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT) { - error_setg(errp, - "Expecting TLS credentials with a client endpoint"); - return NULL; - } - object_ref(obj); - return creds; -} - - -static int nbd_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVNBDState *s = bs->opaque; - char *export = NULL; - QIOChannelSocket *sioc = NULL; - SocketAddress *saddr; - const char *tlscredsid; - QCryptoTLSCreds *tlscreds = NULL; - const char *hostname = NULL; - int ret = -EINVAL; - - /* Pop the config into our state object. Exit if invalid. */ - saddr = nbd_config(s, options, &export, errp); - if (!saddr) { - goto error; - } - - tlscredsid = g_strdup(qdict_get_try_str(options, "tls-creds")); - if (tlscredsid) { - qdict_del(options, "tls-creds"); - tlscreds = nbd_get_tls_creds(tlscredsid, errp); - if (!tlscreds) { - goto error; - } - - if (saddr->type != SOCKET_ADDRESS_KIND_INET) { - error_setg(errp, "TLS only supported over IP sockets"); - goto error; - } - hostname = saddr->u.inet.data->host; - } - - /* establish TCP connection, return error if it fails - * TODO: Configurable retry-until-timeout behaviour. 
- */ - sioc = nbd_establish_connection(saddr, errp); - if (!sioc) { - ret = -ECONNREFUSED; - goto error; - } - - /* NBD handshake */ - ret = nbd_client_init(bs, sioc, export, - tlscreds, hostname, errp); - error: - if (sioc) { - object_unref(OBJECT(sioc)); - } - if (tlscreds) { - object_unref(OBJECT(tlscreds)); - } - qapi_free_SocketAddress(saddr); - g_free(export); - return ret; -} - -static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov); -} - -static int nbd_co_writev_flags(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov, int flags) -{ - int ret; - - ret = nbd_client_co_writev(bs, sector_num, nb_sectors, qiov, &flags); - if (ret < 0) { - return ret; - } - - /* The flag wasn't sent to the server, so we need to emulate it with an - * explicit flush */ - if (flags & BDRV_REQ_FUA) { - ret = nbd_client_co_flush(bs); - } - - return ret; -} - -static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - return nbd_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0); -} - -static int nbd_co_flush(BlockDriverState *bs) -{ - return nbd_client_co_flush(bs); -} - -static void nbd_refresh_limits(BlockDriverState *bs, Error **errp) -{ - bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS; - bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS; -} - -static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - return nbd_client_co_discard(bs, sector_num, nb_sectors); -} - -static void nbd_close(BlockDriverState *bs) -{ - nbd_client_close(bs); -} - -static int64_t nbd_getlength(BlockDriverState *bs) -{ - BDRVNBDState *s = bs->opaque; - - return s->client.size; -} - -static void nbd_detach_aio_context(BlockDriverState *bs) -{ - nbd_client_detach_aio_context(bs); -} - -static void nbd_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) 
-{ - nbd_client_attach_aio_context(bs, new_context); -} - -static void nbd_refresh_filename(BlockDriverState *bs, QDict *options) -{ - QDict *opts = qdict_new(); - const char *path = qdict_get_try_str(options, "path"); - const char *host = qdict_get_try_str(options, "host"); - const char *port = qdict_get_try_str(options, "port"); - const char *export = qdict_get_try_str(options, "export"); - const char *tlscreds = qdict_get_try_str(options, "tls-creds"); - - qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("nbd"))); - - if (path && export) { - snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd+unix:///%s?socket=%s", export, path); - } else if (path && !export) { - snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd+unix://?socket=%s", path); - } else if (!path && export && port) { - snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd://%s:%s/%s", host, port, export); - } else if (!path && export && !port) { - snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd://%s/%s", host, export); - } else if (!path && !export && port) { - snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd://%s:%s", host, port); - } else if (!path && !export && !port) { - snprintf(bs->exact_filename, sizeof(bs->exact_filename), - "nbd://%s", host); - } - - if (path) { - qdict_put_obj(opts, "path", QOBJECT(qstring_from_str(path))); - } else if (port) { - qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host))); - qdict_put_obj(opts, "port", QOBJECT(qstring_from_str(port))); - } else { - qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host))); - } - if (export) { - qdict_put_obj(opts, "export", QOBJECT(qstring_from_str(export))); - } - if (tlscreds) { - qdict_put_obj(opts, "tls-creds", QOBJECT(qstring_from_str(tlscreds))); - } - - bs->full_open_options = opts; -} - -static BlockDriver bdrv_nbd = { - .format_name = "nbd", - .protocol_name = "nbd", - .instance_size = sizeof(BDRVNBDState), - .bdrv_parse_filename = 
nbd_parse_filename, - .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_co_readv, - .bdrv_co_writev = nbd_co_writev, - .bdrv_co_writev_flags = nbd_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, - .bdrv_close = nbd_close, - .bdrv_co_flush_to_os = nbd_co_flush, - .bdrv_co_discard = nbd_co_discard, - .bdrv_refresh_limits = nbd_refresh_limits, - .bdrv_getlength = nbd_getlength, - .bdrv_detach_aio_context = nbd_detach_aio_context, - .bdrv_attach_aio_context = nbd_attach_aio_context, - .bdrv_refresh_filename = nbd_refresh_filename, -}; - -static BlockDriver bdrv_nbd_tcp = { - .format_name = "nbd", - .protocol_name = "nbd+tcp", - .instance_size = sizeof(BDRVNBDState), - .bdrv_parse_filename = nbd_parse_filename, - .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_co_readv, - .bdrv_co_writev = nbd_co_writev, - .bdrv_co_writev_flags = nbd_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, - .bdrv_close = nbd_close, - .bdrv_co_flush_to_os = nbd_co_flush, - .bdrv_co_discard = nbd_co_discard, - .bdrv_refresh_limits = nbd_refresh_limits, - .bdrv_getlength = nbd_getlength, - .bdrv_detach_aio_context = nbd_detach_aio_context, - .bdrv_attach_aio_context = nbd_attach_aio_context, - .bdrv_refresh_filename = nbd_refresh_filename, -}; - -static BlockDriver bdrv_nbd_unix = { - .format_name = "nbd", - .protocol_name = "nbd+unix", - .instance_size = sizeof(BDRVNBDState), - .bdrv_parse_filename = nbd_parse_filename, - .bdrv_file_open = nbd_open, - .bdrv_co_readv = nbd_co_readv, - .bdrv_co_writev = nbd_co_writev, - .bdrv_co_writev_flags = nbd_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, - .bdrv_close = nbd_close, - .bdrv_co_flush_to_os = nbd_co_flush, - .bdrv_co_discard = nbd_co_discard, - .bdrv_refresh_limits = nbd_refresh_limits, - .bdrv_getlength = nbd_getlength, - .bdrv_detach_aio_context = nbd_detach_aio_context, - .bdrv_attach_aio_context = nbd_attach_aio_context, - .bdrv_refresh_filename = nbd_refresh_filename, -}; - -static void bdrv_nbd_init(void) 
-{ - bdrv_register(&bdrv_nbd); - bdrv_register(&bdrv_nbd_tcp); - bdrv_register(&bdrv_nbd_unix); -} - -block_init(bdrv_nbd_init); diff --git a/qemu/block/nfs.c b/qemu/block/nfs.c deleted file mode 100644 index 9f51cc3f1..000000000 --- a/qemu/block/nfs.c +++ /dev/null @@ -1,563 +0,0 @@ -/* - * QEMU Block driver for native access to files on NFS shares - * - * Copyright (c) 2014 Peter Lieven - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -#include "qemu/osdep.h" - -#include <poll.h> -#include "qemu-common.h" -#include "qemu/config-file.h" -#include "qemu/error-report.h" -#include "qapi/error.h" -#include "block/block_int.h" -#include "trace.h" -#include "qemu/iov.h" -#include "qemu/uri.h" -#include "qemu/cutils.h" -#include "sysemu/sysemu.h" -#include <nfsc/libnfs.h> - -#define QEMU_NFS_MAX_READAHEAD_SIZE 1048576 -#define QEMU_NFS_MAX_DEBUG_LEVEL 2 - -typedef struct NFSClient { - struct nfs_context *context; - struct nfsfh *fh; - int events; - bool has_zero_init; - AioContext *aio_context; - blkcnt_t st_blocks; -} NFSClient; - -typedef struct NFSRPC { - int ret; - int complete; - QEMUIOVector *iov; - struct stat *st; - Coroutine *co; - QEMUBH *bh; - NFSClient *client; -} NFSRPC; - -static void nfs_process_read(void *arg); -static void nfs_process_write(void *arg); - -static void nfs_set_events(NFSClient *client) -{ - int ev = nfs_which_events(client->context); - if (ev != client->events) { - aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context), - false, - (ev & POLLIN) ? nfs_process_read : NULL, - (ev & POLLOUT) ?
nfs_process_write : NULL, client); - - } - client->events = ev; -} - -static void nfs_process_read(void *arg) -{ - NFSClient *client = arg; - nfs_service(client->context, POLLIN); - nfs_set_events(client); -} - -static void nfs_process_write(void *arg) -{ - NFSClient *client = arg; - nfs_service(client->context, POLLOUT); - nfs_set_events(client); -} - -static void nfs_co_init_task(NFSClient *client, NFSRPC *task) -{ - *task = (NFSRPC) { - .co = qemu_coroutine_self(), - .client = client, - }; -} - -static void nfs_co_generic_bh_cb(void *opaque) -{ - NFSRPC *task = opaque; - task->complete = 1; - qemu_bh_delete(task->bh); - qemu_coroutine_enter(task->co, NULL); -} - -static void -nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data, - void *private_data) -{ - NFSRPC *task = private_data; - task->ret = ret; - if (task->ret > 0 && task->iov) { - if (task->ret <= task->iov->size) { - qemu_iovec_from_buf(task->iov, 0, data, task->ret); - } else { - task->ret = -EIO; - } - } - if (task->ret == 0 && task->st) { - memcpy(task->st, data, sizeof(struct stat)); - } - if (task->ret < 0) { - error_report("NFS Error: %s", nfs_get_error(nfs)); - } - if (task->co) { - task->bh = aio_bh_new(task->client->aio_context, - nfs_co_generic_bh_cb, task); - qemu_bh_schedule(task->bh); - } else { - task->complete = 1; - } -} - -static int coroutine_fn nfs_co_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - NFSClient *client = bs->opaque; - NFSRPC task; - - nfs_co_init_task(client, &task); - task.iov = iov; - - if (nfs_pread_async(client->context, client->fh, - sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE, - nfs_co_generic_cb, &task) != 0) { - return -ENOMEM; - } - - while (!task.complete) { - nfs_set_events(client); - qemu_coroutine_yield(); - } - - if (task.ret < 0) { - return task.ret; - } - - /* zero pad short reads */ - if (task.ret < iov->size) { - qemu_iovec_memset(iov, task.ret, 0, iov->size - task.ret); - } - - 
return 0; -} - -static int coroutine_fn nfs_co_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) -{ - NFSClient *client = bs->opaque; - NFSRPC task; - char *buf = NULL; - - nfs_co_init_task(client, &task); - - buf = g_try_malloc(nb_sectors * BDRV_SECTOR_SIZE); - if (nb_sectors && buf == NULL) { - return -ENOMEM; - } - - qemu_iovec_to_buf(iov, 0, buf, nb_sectors * BDRV_SECTOR_SIZE); - - if (nfs_pwrite_async(client->context, client->fh, - sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE, - buf, nfs_co_generic_cb, &task) != 0) { - g_free(buf); - return -ENOMEM; - } - - while (!task.complete) { - nfs_set_events(client); - qemu_coroutine_yield(); - } - - g_free(buf); - - if (task.ret != nb_sectors * BDRV_SECTOR_SIZE) { - return task.ret < 0 ? task.ret : -EIO; - } - - return 0; -} - -static int coroutine_fn nfs_co_flush(BlockDriverState *bs) -{ - NFSClient *client = bs->opaque; - NFSRPC task; - - nfs_co_init_task(client, &task); - - if (nfs_fsync_async(client->context, client->fh, nfs_co_generic_cb, - &task) != 0) { - return -ENOMEM; - } - - while (!task.complete) { - nfs_set_events(client); - qemu_coroutine_yield(); - } - - return task.ret; -} - -/* TODO Convert to fine grained options */ -static QemuOptsList runtime_opts = { - .name = "nfs", - .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), - .desc = { - { - .name = "filename", - .type = QEMU_OPT_STRING, - .help = "URL to the NFS file", - }, - { /* end of list */ } - }, -}; - -static void nfs_detach_aio_context(BlockDriverState *bs) -{ - NFSClient *client = bs->opaque; - - aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context), - false, NULL, NULL, NULL); - client->events = 0; -} - -static void nfs_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - NFSClient *client = bs->opaque; - - client->aio_context = new_context; - nfs_set_events(client); -} - -static void nfs_client_close(NFSClient *client) -{ - if (client->context) { - if 
(client->fh) { - nfs_close(client->context, client->fh); - } - aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context), - false, NULL, NULL, NULL); - nfs_destroy_context(client->context); - } - memset(client, 0, sizeof(NFSClient)); -} - -static void nfs_file_close(BlockDriverState *bs) -{ - NFSClient *client = bs->opaque; - nfs_client_close(client); -} - -static int64_t nfs_client_open(NFSClient *client, const char *filename, - int flags, Error **errp) -{ - int ret = -EINVAL, i; - struct stat st; - URI *uri; - QueryParams *qp = NULL; - char *file = NULL, *strp = NULL; - - uri = uri_parse(filename); - if (!uri) { - error_setg(errp, "Invalid URL specified"); - goto fail; - } - if (!uri->server) { - error_setg(errp, "Invalid URL specified"); - goto fail; - } - strp = strrchr(uri->path, '/'); - if (strp == NULL) { - error_setg(errp, "Invalid URL specified"); - goto fail; - } - file = g_strdup(strp); - *strp = 0; - - client->context = nfs_init_context(); - if (client->context == NULL) { - error_setg(errp, "Failed to init NFS context"); - goto fail; - } - - qp = query_params_parse(uri->query); - for (i = 0; i < qp->n; i++) { - unsigned long long val; - if (!qp->p[i].value) { - error_setg(errp, "Value for NFS parameter expected: %s", - qp->p[i].name); - goto fail; - } - if (parse_uint_full(qp->p[i].value, &val, 0)) { - error_setg(errp, "Illegal value for NFS parameter: %s", - qp->p[i].name); - goto fail; - } - if (!strcmp(qp->p[i].name, "uid")) { - nfs_set_uid(client->context, val); - } else if (!strcmp(qp->p[i].name, "gid")) { - nfs_set_gid(client->context, val); - } else if (!strcmp(qp->p[i].name, "tcp-syncnt")) { - nfs_set_tcp_syncnt(client->context, val); -#ifdef LIBNFS_FEATURE_READAHEAD - } else if (!strcmp(qp->p[i].name, "readahead")) { - if (val > QEMU_NFS_MAX_READAHEAD_SIZE) { - error_report("NFS Warning: Truncating NFS readahead" - " size to %d", QEMU_NFS_MAX_READAHEAD_SIZE); - val = QEMU_NFS_MAX_READAHEAD_SIZE; - } - 
nfs_set_readahead(client->context, val); -#endif -#ifdef LIBNFS_FEATURE_DEBUG - } else if (!strcmp(qp->p[i].name, "debug")) { - /* limit the maximum debug level to avoid potential flooding - * of our log files. */ - if (val > QEMU_NFS_MAX_DEBUG_LEVEL) { - error_report("NFS Warning: Limiting NFS debug level" - " to %d", QEMU_NFS_MAX_DEBUG_LEVEL); - val = QEMU_NFS_MAX_DEBUG_LEVEL; - } - nfs_set_debug(client->context, val); -#endif - } else { - error_setg(errp, "Unknown NFS parameter name: %s", - qp->p[i].name); - goto fail; - } - } - - ret = nfs_mount(client->context, uri->server, uri->path); - if (ret < 0) { - error_setg(errp, "Failed to mount nfs share: %s", - nfs_get_error(client->context)); - goto fail; - } - - if (flags & O_CREAT) { - ret = nfs_creat(client->context, file, 0600, &client->fh); - if (ret < 0) { - error_setg(errp, "Failed to create file: %s", - nfs_get_error(client->context)); - goto fail; - } - } else { - ret = nfs_open(client->context, file, flags, &client->fh); - if (ret < 0) { - error_setg(errp, "Failed to open file : %s", - nfs_get_error(client->context)); - goto fail; - } - } - - ret = nfs_fstat(client->context, client->fh, &st); - if (ret < 0) { - error_setg(errp, "Failed to fstat file: %s", - nfs_get_error(client->context)); - goto fail; - } - - ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE); - client->st_blocks = st.st_blocks; - client->has_zero_init = S_ISREG(st.st_mode); - goto out; -fail: - nfs_client_close(client); -out: - if (qp) { - query_params_free(qp); - } - uri_free(uri); - g_free(file); - return ret; -} - -static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) { - NFSClient *client = bs->opaque; - int64_t ret; - QemuOpts *opts; - Error *local_err = NULL; - - client->aio_context = bdrv_get_aio_context(bs); - - opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = 
-EINVAL; - goto out; - } - ret = nfs_client_open(client, qemu_opt_get(opts, "filename"), - (flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY, - errp); - if (ret < 0) { - goto out; - } - bs->total_sectors = ret; - ret = 0; -out: - qemu_opts_del(opts); - return ret; -} - -static QemuOptsList nfs_create_opts = { - .name = "nfs-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(nfs_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { /* end of list */ } - } -}; - -static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp) -{ - int ret = 0; - int64_t total_size = 0; - NFSClient *client = g_new0(NFSClient, 1); - - client->aio_context = qemu_get_aio_context(); - - /* Read out options */ - total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - - ret = nfs_client_open(client, url, O_CREAT, errp); - if (ret < 0) { - goto out; - } - ret = nfs_ftruncate(client->context, client->fh, total_size); - nfs_client_close(client); -out: - g_free(client); - return ret; -} - -static int nfs_has_zero_init(BlockDriverState *bs) -{ - NFSClient *client = bs->opaque; - return client->has_zero_init; -} - -static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) -{ - NFSClient *client = bs->opaque; - NFSRPC task = {0}; - struct stat st; - - if (bdrv_is_read_only(bs) && - !(bs->open_flags & BDRV_O_NOCACHE)) { - return client->st_blocks * 512; - } - - task.st = &st; - if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb, - &task) != 0) { - return -ENOMEM; - } - - while (!task.complete) { - nfs_set_events(client); - aio_poll(client->aio_context, true); - } - - return (task.ret < 0 ? 
task.ret : st.st_blocks * 512); -} - -static int nfs_file_truncate(BlockDriverState *bs, int64_t offset) -{ - NFSClient *client = bs->opaque; - return nfs_ftruncate(client->context, client->fh, offset); -} - -/* Note that this will not re-establish a connection with the NFS server - * - it is effectively a NOP. */ -static int nfs_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) -{ - NFSClient *client = state->bs->opaque; - struct stat st; - int ret = 0; - - if (state->flags & BDRV_O_RDWR && bdrv_is_read_only(state->bs)) { - error_setg(errp, "Cannot open a read-only mount as read-write"); - return -EACCES; - } - - /* Update cache for read-only reopens */ - if (!(state->flags & BDRV_O_RDWR)) { - ret = nfs_fstat(client->context, client->fh, &st); - if (ret < 0) { - error_setg(errp, "Failed to fstat file: %s", - nfs_get_error(client->context)); - return ret; - } - client->st_blocks = st.st_blocks; - } - - return 0; -} - -static BlockDriver bdrv_nfs = { - .format_name = "nfs", - .protocol_name = "nfs", - - .instance_size = sizeof(NFSClient), - .bdrv_needs_filename = true, - .create_opts = &nfs_create_opts, - - .bdrv_has_zero_init = nfs_has_zero_init, - .bdrv_get_allocated_file_size = nfs_get_allocated_file_size, - .bdrv_truncate = nfs_file_truncate, - - .bdrv_file_open = nfs_file_open, - .bdrv_close = nfs_file_close, - .bdrv_create = nfs_file_create, - .bdrv_reopen_prepare = nfs_reopen_prepare, - - .bdrv_co_readv = nfs_co_readv, - .bdrv_co_writev = nfs_co_writev, - .bdrv_co_flush_to_disk = nfs_co_flush, - - .bdrv_detach_aio_context = nfs_detach_aio_context, - .bdrv_attach_aio_context = nfs_attach_aio_context, -}; - -static void nfs_block_init(void) -{ - bdrv_register(&bdrv_nfs); -} - -block_init(nfs_block_init); diff --git a/qemu/block/null.c b/qemu/block/null.c deleted file mode 100644 index 396500bab..000000000 --- a/qemu/block/null.c +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Null block driver - * - * Authors: - * Fam Zheng - * - * 
Copyright (C) 2014 Red Hat, Inc. - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - */ - -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "block/block_int.h" - -#define NULL_OPT_LATENCY "latency-ns" -#define NULL_OPT_ZEROES "read-zeroes" - -typedef struct { - int64_t length; - int64_t latency_ns; - bool read_zeroes; -} BDRVNullState; - -static QemuOptsList runtime_opts = { - .name = "null", - .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), - .desc = { - { - .name = "filename", - .type = QEMU_OPT_STRING, - .help = "", - }, - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "size of the null block", - }, - { - .name = NULL_OPT_LATENCY, - .type = QEMU_OPT_NUMBER, - .help = "nanoseconds (approximated) to wait " - "before completing request", - }, - { - .name = NULL_OPT_ZEROES, - .type = QEMU_OPT_BOOL, - .help = "return zeroes when read", - }, - { /* end of list */ } - }, -}; - -static int null_file_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - QemuOpts *opts; - BDRVNullState *s = bs->opaque; - int ret = 0; - - opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &error_abort); - s->length = - qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 1 << 30); - s->latency_ns = - qemu_opt_get_number(opts, NULL_OPT_LATENCY, 0); - if (s->latency_ns < 0) { - error_setg(errp, "latency-ns is invalid"); - ret = -EINVAL; - } - s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, false); - qemu_opts_del(opts); - return ret; -} - -static void null_close(BlockDriverState *bs) -{ -} - -static int64_t null_getlength(BlockDriverState *bs) -{ - BDRVNullState *s = bs->opaque; - return s->length; -} - -static coroutine_fn int null_co_common(BlockDriverState *bs) -{ - BDRVNullState *s = bs->opaque; - - if (s->latency_ns) { - co_aio_sleep_ns(bdrv_get_aio_context(bs), QEMU_CLOCK_REALTIME, - s->latency_ns); - } 
- return 0; -} - -static coroutine_fn int null_co_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov) -{ - BDRVNullState *s = bs->opaque; - - if (s->read_zeroes) { - qemu_iovec_memset(qiov, 0, 0, nb_sectors * BDRV_SECTOR_SIZE); - } - - return null_co_common(bs); -} - -static coroutine_fn int null_co_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov) -{ - return null_co_common(bs); -} - -static coroutine_fn int null_co_flush(BlockDriverState *bs) -{ - return null_co_common(bs); -} - -typedef struct { - BlockAIOCB common; - QEMUBH *bh; - QEMUTimer timer; -} NullAIOCB; - -static const AIOCBInfo null_aiocb_info = { - .aiocb_size = sizeof(NullAIOCB), -}; - -static void null_bh_cb(void *opaque) -{ - NullAIOCB *acb = opaque; - acb->common.cb(acb->common.opaque, 0); - qemu_bh_delete(acb->bh); - qemu_aio_unref(acb); -} - -static void null_timer_cb(void *opaque) -{ - NullAIOCB *acb = opaque; - acb->common.cb(acb->common.opaque, 0); - timer_deinit(&acb->timer); - qemu_aio_unref(acb); -} - -static inline BlockAIOCB *null_aio_common(BlockDriverState *bs, - BlockCompletionFunc *cb, - void *opaque) -{ - NullAIOCB *acb; - BDRVNullState *s = bs->opaque; - - acb = qemu_aio_get(&null_aiocb_info, bs, cb, opaque); - /* Only emulate latency after vcpu is running. 
*/ - if (s->latency_ns) { - aio_timer_init(bdrv_get_aio_context(bs), &acb->timer, - QEMU_CLOCK_REALTIME, SCALE_NS, - null_timer_cb, acb); - timer_mod_ns(&acb->timer, - qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + s->latency_ns); - } else { - acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb); - qemu_bh_schedule(acb->bh); - } - return &acb->common; -} - -static BlockAIOCB *null_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) -{ - BDRVNullState *s = bs->opaque; - - if (s->read_zeroes) { - qemu_iovec_memset(qiov, 0, 0, nb_sectors * BDRV_SECTOR_SIZE); - } - - return null_aio_common(bs, cb, opaque); -} - -static BlockAIOCB *null_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) -{ - return null_aio_common(bs, cb, opaque); -} - -static BlockAIOCB *null_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, - void *opaque) -{ - return null_aio_common(bs, cb, opaque); -} - -static int null_reopen_prepare(BDRVReopenState *reopen_state, - BlockReopenQueue *queue, Error **errp) -{ - return 0; -} - -static int64_t coroutine_fn null_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum, - BlockDriverState **file) -{ - BDRVNullState *s = bs->opaque; - off_t start = sector_num * BDRV_SECTOR_SIZE; - - *pnum = nb_sectors; - *file = bs; - - if (s->read_zeroes) { - return BDRV_BLOCK_OFFSET_VALID | start | BDRV_BLOCK_ZERO; - } else { - return BDRV_BLOCK_OFFSET_VALID | start; - } -} - -static BlockDriver bdrv_null_co = { - .format_name = "null-co", - .protocol_name = "null-co", - .instance_size = sizeof(BDRVNullState), - - .bdrv_file_open = null_file_open, - .bdrv_close = null_close, - .bdrv_getlength = null_getlength, - - .bdrv_co_readv = null_co_readv, - .bdrv_co_writev = null_co_writev, - .bdrv_co_flush_to_disk = null_co_flush, - .bdrv_reopen_prepare = 
null_reopen_prepare, - - .bdrv_co_get_block_status = null_co_get_block_status, -}; - -static BlockDriver bdrv_null_aio = { - .format_name = "null-aio", - .protocol_name = "null-aio", - .instance_size = sizeof(BDRVNullState), - - .bdrv_file_open = null_file_open, - .bdrv_close = null_close, - .bdrv_getlength = null_getlength, - - .bdrv_aio_readv = null_aio_readv, - .bdrv_aio_writev = null_aio_writev, - .bdrv_aio_flush = null_aio_flush, - .bdrv_reopen_prepare = null_reopen_prepare, - - .bdrv_co_get_block_status = null_co_get_block_status, -}; - -static void bdrv_null_init(void) -{ - bdrv_register(&bdrv_null_co); - bdrv_register(&bdrv_null_aio); -} - -block_init(bdrv_null_init); diff --git a/qemu/block/parallels.c b/qemu/block/parallels.c deleted file mode 100644 index 324ed43ac..000000000 --- a/qemu/block/parallels.c +++ /dev/null @@ -1,766 +0,0 @@ -/* - * Block driver for Parallels disk image format - * - * Copyright (c) 2007 Alex Beregszaszi - * Copyright (c) 2015 Denis V. Lunev - * - * This code was originally based on comparing different disk images created - * by Parallels. Currently it is based on opened OpenVZ sources - * available at - * http://git.openvz.org/?p=ploop;a=summary - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu-common.h" -#include "block/block_int.h" -#include "sysemu/block-backend.h" -#include "qemu/module.h" -#include "qemu/bitmap.h" -#include "qapi/util.h" - -/**************************************************************/ - -#define HEADER_MAGIC "WithoutFreeSpace" -#define HEADER_MAGIC2 "WithouFreSpacExt" -#define HEADER_VERSION 2 -#define HEADER_INUSE_MAGIC (0x746F6E59) - -#define DEFAULT_CLUSTER_SIZE 1048576 /* 1 MiB */ - - -// always little-endian -typedef struct ParallelsHeader { - char magic[16]; // "WithoutFreeSpace" - uint32_t version; - uint32_t heads; - uint32_t cylinders; - uint32_t tracks; - uint32_t bat_entries; - uint64_t nb_sectors; - uint32_t inuse; - uint32_t data_off; - char padding[12]; -} QEMU_PACKED ParallelsHeader; - - -typedef enum ParallelsPreallocMode { - PRL_PREALLOC_MODE_FALLOCATE = 0, - PRL_PREALLOC_MODE_TRUNCATE = 1, - PRL_PREALLOC_MODE__MAX = 2, -} ParallelsPreallocMode; - -static const char *prealloc_mode_lookup[] = { - "falloc", - "truncate", - NULL, -}; - - -typedef struct BDRVParallelsState { - /** Locking is conservative, the lock protects - * - image file extending (truncate, fallocate) - * - any access to block allocation table - */ - CoMutex lock; - - ParallelsHeader *header; - uint32_t header_size; - bool header_unclean; - - unsigned long *bat_dirty_bmap; - unsigned int bat_dirty_block; - - uint32_t *bat_bitmap; - unsigned int bat_size; - - int64_t data_end; - uint64_t 
prealloc_size; - ParallelsPreallocMode prealloc_mode; - - unsigned int tracks; - - unsigned int off_multiplier; -} BDRVParallelsState; - - -#define PARALLELS_OPT_PREALLOC_MODE "prealloc-mode" -#define PARALLELS_OPT_PREALLOC_SIZE "prealloc-size" - -static QemuOptsList parallels_runtime_opts = { - .name = "parallels", - .head = QTAILQ_HEAD_INITIALIZER(parallels_runtime_opts.head), - .desc = { - { - .name = PARALLELS_OPT_PREALLOC_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Preallocation size on image expansion", - .def_value_str = "128MiB", - }, - { - .name = PARALLELS_OPT_PREALLOC_MODE, - .type = QEMU_OPT_STRING, - .help = "Preallocation mode on image expansion " - "(allowed values: falloc, truncate)", - .def_value_str = "falloc", - }, - { /* end of list */ }, - }, -}; - - -static int64_t bat2sect(BDRVParallelsState *s, uint32_t idx) -{ - return (uint64_t)le32_to_cpu(s->bat_bitmap[idx]) * s->off_multiplier; -} - -static uint32_t bat_entry_off(uint32_t idx) -{ - return sizeof(ParallelsHeader) + sizeof(uint32_t) * idx; -} - -static int64_t seek_to_sector(BDRVParallelsState *s, int64_t sector_num) -{ - uint32_t index, offset; - - index = sector_num / s->tracks; - offset = sector_num % s->tracks; - - /* not allocated */ - if ((index >= s->bat_size) || (s->bat_bitmap[index] == 0)) { - return -1; - } - return bat2sect(s, index) + offset; -} - -static int cluster_remainder(BDRVParallelsState *s, int64_t sector_num, - int nb_sectors) -{ - int ret = s->tracks - sector_num % s->tracks; - return MIN(nb_sectors, ret); -} - -static int64_t block_status(BDRVParallelsState *s, int64_t sector_num, - int nb_sectors, int *pnum) -{ - int64_t start_off = -2, prev_end_off = -2; - - *pnum = 0; - while (nb_sectors > 0 || start_off == -2) { - int64_t offset = seek_to_sector(s, sector_num); - int to_end; - - if (start_off == -2) { - start_off = offset; - prev_end_off = offset; - } else if (offset != prev_end_off) { - break; - } - - to_end = cluster_remainder(s, sector_num, nb_sectors); - 
nb_sectors -= to_end; - sector_num += to_end; - *pnum += to_end; - - if (offset > 0) { - prev_end_off += to_end; - } - } - return start_off; -} - -static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) -{ - BDRVParallelsState *s = bs->opaque; - uint32_t idx, to_allocate, i; - int64_t pos, space; - - pos = block_status(s, sector_num, nb_sectors, pnum); - if (pos > 0) { - return pos; - } - - idx = sector_num / s->tracks; - if (idx >= s->bat_size) { - return -EINVAL; - } - - to_allocate = (sector_num + *pnum + s->tracks - 1) / s->tracks - idx; - space = to_allocate * s->tracks; - if (s->data_end + space > bdrv_getlength(bs->file->bs) >> BDRV_SECTOR_BITS) { - int ret; - space += s->prealloc_size; - if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) { - ret = bdrv_write_zeroes(bs->file->bs, s->data_end, space, 0); - } else { - ret = bdrv_truncate(bs->file->bs, - (s->data_end + space) << BDRV_SECTOR_BITS); - } - if (ret < 0) { - return ret; - } - } - - for (i = 0; i < to_allocate; i++) { - s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier); - s->data_end += s->tracks; - bitmap_set(s->bat_dirty_bmap, - bat_entry_off(idx + i) / s->bat_dirty_block, 1); - } - - return bat2sect(s, idx) + sector_num % s->tracks; -} - - -static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs) -{ - BDRVParallelsState *s = bs->opaque; - unsigned long size = DIV_ROUND_UP(s->header_size, s->bat_dirty_block); - unsigned long bit; - - qemu_co_mutex_lock(&s->lock); - - bit = find_first_bit(s->bat_dirty_bmap, size); - while (bit < size) { - uint32_t off = bit * s->bat_dirty_block; - uint32_t to_write = s->bat_dirty_block; - int ret; - - if (off + to_write > s->header_size) { - to_write = s->header_size - off; - } - ret = bdrv_pwrite(bs->file->bs, off, (uint8_t *)s->header + off, - to_write); - if (ret < 0) { - qemu_co_mutex_unlock(&s->lock); - return ret; - } - bit = find_next_bit(s->bat_dirty_bmap, size, bit + 1); - } 
- bitmap_zero(s->bat_dirty_bmap, size); - - qemu_co_mutex_unlock(&s->lock); - return 0; -} - - -static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) -{ - BDRVParallelsState *s = bs->opaque; - int64_t offset; - - qemu_co_mutex_lock(&s->lock); - offset = block_status(s, sector_num, nb_sectors, pnum); - qemu_co_mutex_unlock(&s->lock); - - if (offset < 0) { - return 0; - } - - *file = bs->file->bs; - return (offset << BDRV_SECTOR_BITS) | - BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; -} - -static coroutine_fn int parallels_co_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - BDRVParallelsState *s = bs->opaque; - uint64_t bytes_done = 0; - QEMUIOVector hd_qiov; - int ret = 0; - - qemu_iovec_init(&hd_qiov, qiov->niov); - - while (nb_sectors > 0) { - int64_t position; - int n, nbytes; - - qemu_co_mutex_lock(&s->lock); - position = allocate_clusters(bs, sector_num, nb_sectors, &n); - qemu_co_mutex_unlock(&s->lock); - if (position < 0) { - ret = (int)position; - break; - } - - nbytes = n << BDRV_SECTOR_BITS; - - qemu_iovec_reset(&hd_qiov); - qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); - - ret = bdrv_co_writev(bs->file->bs, position, n, &hd_qiov); - if (ret < 0) { - break; - } - - nb_sectors -= n; - sector_num += n; - bytes_done += nbytes; - } - - qemu_iovec_destroy(&hd_qiov); - return ret; -} - -static coroutine_fn int parallels_co_readv(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) -{ - BDRVParallelsState *s = bs->opaque; - uint64_t bytes_done = 0; - QEMUIOVector hd_qiov; - int ret = 0; - - qemu_iovec_init(&hd_qiov, qiov->niov); - - while (nb_sectors > 0) { - int64_t position; - int n, nbytes; - - qemu_co_mutex_lock(&s->lock); - position = block_status(s, sector_num, nb_sectors, &n); - qemu_co_mutex_unlock(&s->lock); - - nbytes = n << BDRV_SECTOR_BITS; - - if (position < 0) { - 
qemu_iovec_memset(qiov, bytes_done, 0, nbytes); - } else { - qemu_iovec_reset(&hd_qiov); - qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); - - ret = bdrv_co_readv(bs->file->bs, position, n, &hd_qiov); - if (ret < 0) { - break; - } - } - - nb_sectors -= n; - sector_num += n; - bytes_done += nbytes; - } - - qemu_iovec_destroy(&hd_qiov); - return ret; -} - - -static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res, - BdrvCheckMode fix) -{ - BDRVParallelsState *s = bs->opaque; - int64_t size, prev_off, high_off; - int ret; - uint32_t i; - bool flush_bat = false; - int cluster_size = s->tracks << BDRV_SECTOR_BITS; - - size = bdrv_getlength(bs->file->bs); - if (size < 0) { - res->check_errors++; - return size; - } - - if (s->header_unclean) { - fprintf(stderr, "%s image was not closed correctly\n", - fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR"); - res->corruptions++; - if (fix & BDRV_FIX_ERRORS) { - /* parallels_close will do the job right */ - res->corruptions_fixed++; - s->header_unclean = false; - } - } - - res->bfi.total_clusters = s->bat_size; - res->bfi.compressed_clusters = 0; /* compression is not supported */ - - high_off = 0; - prev_off = 0; - for (i = 0; i < s->bat_size; i++) { - int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS; - if (off == 0) { - prev_off = 0; - continue; - } - - /* cluster outside the image */ - if (off > size) { - fprintf(stderr, "%s cluster %u is outside image\n", - fix & BDRV_FIX_ERRORS ? 
"Repairing" : "ERROR", i); - res->corruptions++; - if (fix & BDRV_FIX_ERRORS) { - prev_off = 0; - s->bat_bitmap[i] = 0; - res->corruptions_fixed++; - flush_bat = true; - continue; - } - } - - res->bfi.allocated_clusters++; - if (off > high_off) { - high_off = off; - } - - if (prev_off != 0 && (prev_off + cluster_size) != off) { - res->bfi.fragmented_clusters++; - } - prev_off = off; - } - - if (flush_bat) { - ret = bdrv_pwrite_sync(bs->file->bs, 0, s->header, s->header_size); - if (ret < 0) { - res->check_errors++; - return ret; - } - } - - res->image_end_offset = high_off + cluster_size; - if (size > res->image_end_offset) { - int64_t count; - count = DIV_ROUND_UP(size - res->image_end_offset, cluster_size); - fprintf(stderr, "%s space leaked at the end of the image %" PRId64 "\n", - fix & BDRV_FIX_LEAKS ? "Repairing" : "ERROR", - size - res->image_end_offset); - res->leaks += count; - if (fix & BDRV_FIX_LEAKS) { - ret = bdrv_truncate(bs->file->bs, res->image_end_offset); - if (ret < 0) { - res->check_errors++; - return ret; - } - res->leaks_fixed += count; - } - } - - return 0; -} - - -static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) -{ - int64_t total_size, cl_size; - uint8_t tmp[BDRV_SECTOR_SIZE]; - Error *local_err = NULL; - BlockBackend *file; - uint32_t bat_entries, bat_sectors; - ParallelsHeader header; - int ret; - - total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - cl_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, - DEFAULT_CLUSTER_SIZE), BDRV_SECTOR_SIZE); - - ret = bdrv_create_file(filename, opts, &local_err); - if (ret < 0) { - error_propagate(errp, local_err); - return ret; - } - - file = blk_new_open(filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); - if (file == NULL) { - error_propagate(errp, local_err); - return -EIO; - } - - blk_set_allow_write_beyond_eof(file, true); - - ret = blk_truncate(file, 0); - if (ret < 0) { - goto exit; - } - 
- bat_entries = DIV_ROUND_UP(total_size, cl_size); - bat_sectors = DIV_ROUND_UP(bat_entry_off(bat_entries), cl_size); - bat_sectors = (bat_sectors * cl_size) >> BDRV_SECTOR_BITS; - - memset(&header, 0, sizeof(header)); - memcpy(header.magic, HEADER_MAGIC2, sizeof(header.magic)); - header.version = cpu_to_le32(HEADER_VERSION); - /* don't care much about geometry, it is not used on image level */ - header.heads = cpu_to_le32(16); - header.cylinders = cpu_to_le32(total_size / BDRV_SECTOR_SIZE / 16 / 32); - header.tracks = cpu_to_le32(cl_size >> BDRV_SECTOR_BITS); - header.bat_entries = cpu_to_le32(bat_entries); - header.nb_sectors = cpu_to_le64(DIV_ROUND_UP(total_size, BDRV_SECTOR_SIZE)); - header.data_off = cpu_to_le32(bat_sectors); - - /* write all the data */ - memset(tmp, 0, sizeof(tmp)); - memcpy(tmp, &header, sizeof(header)); - - ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE); - if (ret < 0) { - goto exit; - } - ret = blk_write_zeroes(file, 1, bat_sectors - 1, 0); - if (ret < 0) { - goto exit; - } - ret = 0; - -done: - blk_unref(file); - return ret; - -exit: - error_setg_errno(errp, -ret, "Failed to create Parallels image"); - goto done; -} - - -static int parallels_probe(const uint8_t *buf, int buf_size, - const char *filename) -{ - const ParallelsHeader *ph = (const void *)buf; - - if (buf_size < sizeof(ParallelsHeader)) { - return 0; - } - - if ((!memcmp(ph->magic, HEADER_MAGIC, 16) || - !memcmp(ph->magic, HEADER_MAGIC2, 16)) && - (le32_to_cpu(ph->version) == HEADER_VERSION)) { - return 100; - } - - return 0; -} - -static int parallels_update_header(BlockDriverState *bs) -{ - BDRVParallelsState *s = bs->opaque; - unsigned size = MAX(bdrv_opt_mem_align(bs->file->bs), - sizeof(ParallelsHeader)); - - if (size > s->header_size) { - size = s->header_size; - } - return bdrv_pwrite_sync(bs->file->bs, 0, s->header, size); -} - -static int parallels_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVParallelsState *s = bs->opaque; - 
ParallelsHeader ph; - int ret, size, i; - QemuOpts *opts = NULL; - Error *local_err = NULL; - char *buf; - - ret = bdrv_pread(bs->file->bs, 0, &ph, sizeof(ph)); - if (ret < 0) { - goto fail; - } - - bs->total_sectors = le64_to_cpu(ph.nb_sectors); - - if (le32_to_cpu(ph.version) != HEADER_VERSION) { - goto fail_format; - } - if (!memcmp(ph.magic, HEADER_MAGIC, 16)) { - s->off_multiplier = 1; - bs->total_sectors = 0xffffffff & bs->total_sectors; - } else if (!memcmp(ph.magic, HEADER_MAGIC2, 16)) { - s->off_multiplier = le32_to_cpu(ph.tracks); - } else { - goto fail_format; - } - - s->tracks = le32_to_cpu(ph.tracks); - if (s->tracks == 0) { - error_setg(errp, "Invalid image: Zero sectors per track"); - ret = -EINVAL; - goto fail; - } - if (s->tracks > INT32_MAX/513) { - error_setg(errp, "Invalid image: Too big cluster"); - ret = -EFBIG; - goto fail; - } - - s->bat_size = le32_to_cpu(ph.bat_entries); - if (s->bat_size > INT_MAX / sizeof(uint32_t)) { - error_setg(errp, "Catalog too large"); - ret = -EFBIG; - goto fail; - } - - size = bat_entry_off(s->bat_size); - s->header_size = ROUND_UP(size, bdrv_opt_mem_align(bs->file->bs)); - s->header = qemu_try_blockalign(bs->file->bs, s->header_size); - if (s->header == NULL) { - ret = -ENOMEM; - goto fail; - } - s->data_end = le32_to_cpu(ph.data_off); - if (s->data_end == 0) { - s->data_end = ROUND_UP(bat_entry_off(s->bat_size), BDRV_SECTOR_SIZE); - } - if (s->data_end < s->header_size) { - /* there is not enough unused space to fit to block align between BAT - and actual data. We can't avoid read-modify-write... */ - s->header_size = size; - } - - ret = bdrv_pread(bs->file->bs, 0, s->header, s->header_size); - if (ret < 0) { - goto fail; - } - s->bat_bitmap = (uint32_t *)(s->header + 1); - - for (i = 0; i < s->bat_size; i++) { - int64_t off = bat2sect(s, i); - if (off >= s->data_end) { - s->data_end = off + s->tracks; - } - } - - if (le32_to_cpu(ph.inuse) == HEADER_INUSE_MAGIC) { - /* Image was not closed correctly. 
The check is mandatory */ - s->header_unclean = true; - if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) { - error_setg(errp, "parallels: Image was not closed correctly; " - "cannot be opened read/write"); - ret = -EACCES; - goto fail; - } - } - - opts = qemu_opts_create(¶llels_runtime_opts, NULL, 0, &local_err); - if (local_err != NULL) { - goto fail_options; - } - - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err != NULL) { - goto fail_options; - } - - s->prealloc_size = - qemu_opt_get_size_del(opts, PARALLELS_OPT_PREALLOC_SIZE, 0); - s->prealloc_size = MAX(s->tracks, s->prealloc_size >> BDRV_SECTOR_BITS); - buf = qemu_opt_get_del(opts, PARALLELS_OPT_PREALLOC_MODE); - s->prealloc_mode = qapi_enum_parse(prealloc_mode_lookup, buf, - PRL_PREALLOC_MODE__MAX, PRL_PREALLOC_MODE_FALLOCATE, &local_err); - g_free(buf); - if (local_err != NULL) { - goto fail_options; - } - if (!bdrv_has_zero_init(bs->file->bs) || - bdrv_truncate(bs->file->bs, bdrv_getlength(bs->file->bs)) != 0) { - s->prealloc_mode = PRL_PREALLOC_MODE_FALLOCATE; - } - - if (flags & BDRV_O_RDWR) { - s->header->inuse = cpu_to_le32(HEADER_INUSE_MAGIC); - ret = parallels_update_header(bs); - if (ret < 0) { - goto fail; - } - } - - s->bat_dirty_block = 4 * getpagesize(); - s->bat_dirty_bmap = - bitmap_new(DIV_ROUND_UP(s->header_size, s->bat_dirty_block)); - - qemu_co_mutex_init(&s->lock); - return 0; - -fail_format: - error_setg(errp, "Image not in Parallels format"); - ret = -EINVAL; -fail: - qemu_vfree(s->header); - return ret; - -fail_options: - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; -} - - -static void parallels_close(BlockDriverState *bs) -{ - BDRVParallelsState *s = bs->opaque; - - if (bs->open_flags & BDRV_O_RDWR) { - s->header->inuse = 0; - parallels_update_header(bs); - } - - if (bs->open_flags & BDRV_O_RDWR) { - bdrv_truncate(bs->file->bs, s->data_end << BDRV_SECTOR_BITS); - } - - g_free(s->bat_dirty_bmap); - qemu_vfree(s->header); -} - -static 
QemuOptsList parallels_create_opts = { - .name = "parallels-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(parallels_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size", - }, - { - .name = BLOCK_OPT_CLUSTER_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Parallels image cluster size", - .def_value_str = stringify(DEFAULT_CLUSTER_SIZE), - }, - { /* end of list */ } - } -}; - -static BlockDriver bdrv_parallels = { - .format_name = "parallels", - .instance_size = sizeof(BDRVParallelsState), - .bdrv_probe = parallels_probe, - .bdrv_open = parallels_open, - .bdrv_close = parallels_close, - .bdrv_co_get_block_status = parallels_co_get_block_status, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_flush_to_os = parallels_co_flush_to_os, - .bdrv_co_readv = parallels_co_readv, - .bdrv_co_writev = parallels_co_writev, - - .bdrv_create = parallels_create, - .bdrv_check = parallels_check, - .create_opts = ¶llels_create_opts, -}; - -static void bdrv_parallels_init(void) -{ - bdrv_register(&bdrv_parallels); -} - -block_init(bdrv_parallels_init); diff --git a/qemu/block/qapi.c b/qemu/block/qapi.c deleted file mode 100644 index c5f6ba643..000000000 --- a/qemu/block/qapi.c +++ /dev/null @@ -1,783 +0,0 @@ -/* - * Block layer qmp and info dump related functions - * - * Copyright (c) 2003-2008 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include "qemu/osdep.h" -#include "block/qapi.h" -#include "block/block_int.h" -#include "block/throttle-groups.h" -#include "block/write-threshold.h" -#include "qmp-commands.h" -#include "qapi-visit.h" -#include "qapi/qmp-output-visitor.h" -#include "qapi/qmp/types.h" -#include "sysemu/block-backend.h" -#include "qemu/cutils.h" - -BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, - BlockDriverState *bs, Error **errp) -{ - ImageInfo **p_image_info; - BlockDriverState *bs0; - BlockDeviceInfo *info = g_malloc0(sizeof(*info)); - - info->file = g_strdup(bs->filename); - info->ro = bs->read_only; - info->drv = g_strdup(bs->drv->format_name); - info->encrypted = bs->encrypted; - info->encryption_key_missing = bdrv_key_required(bs); - - info->cache = g_new(BlockdevCacheInfo, 1); - *info->cache = (BlockdevCacheInfo) { - .writeback = blk ? 
blk_enable_write_cache(blk) : true, - .direct = !!(bs->open_flags & BDRV_O_NOCACHE), - .no_flush = !!(bs->open_flags & BDRV_O_NO_FLUSH), - }; - - if (bs->node_name[0]) { - info->has_node_name = true; - info->node_name = g_strdup(bs->node_name); - } - - if (bs->backing_file[0]) { - info->has_backing_file = true; - info->backing_file = g_strdup(bs->backing_file); - } - - info->backing_file_depth = bdrv_get_backing_file_depth(bs); - info->detect_zeroes = bs->detect_zeroes; - - if (bs->throttle_state) { - ThrottleConfig cfg; - - throttle_group_get_config(bs, &cfg); - - info->bps = cfg.buckets[THROTTLE_BPS_TOTAL].avg; - info->bps_rd = cfg.buckets[THROTTLE_BPS_READ].avg; - info->bps_wr = cfg.buckets[THROTTLE_BPS_WRITE].avg; - - info->iops = cfg.buckets[THROTTLE_OPS_TOTAL].avg; - info->iops_rd = cfg.buckets[THROTTLE_OPS_READ].avg; - info->iops_wr = cfg.buckets[THROTTLE_OPS_WRITE].avg; - - info->has_bps_max = cfg.buckets[THROTTLE_BPS_TOTAL].max; - info->bps_max = cfg.buckets[THROTTLE_BPS_TOTAL].max; - info->has_bps_rd_max = cfg.buckets[THROTTLE_BPS_READ].max; - info->bps_rd_max = cfg.buckets[THROTTLE_BPS_READ].max; - info->has_bps_wr_max = cfg.buckets[THROTTLE_BPS_WRITE].max; - info->bps_wr_max = cfg.buckets[THROTTLE_BPS_WRITE].max; - - info->has_iops_max = cfg.buckets[THROTTLE_OPS_TOTAL].max; - info->iops_max = cfg.buckets[THROTTLE_OPS_TOTAL].max; - info->has_iops_rd_max = cfg.buckets[THROTTLE_OPS_READ].max; - info->iops_rd_max = cfg.buckets[THROTTLE_OPS_READ].max; - info->has_iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max; - info->iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max; - - info->has_bps_max_length = info->has_bps_max; - info->bps_max_length = - cfg.buckets[THROTTLE_BPS_TOTAL].burst_length; - info->has_bps_rd_max_length = info->has_bps_rd_max; - info->bps_rd_max_length = - cfg.buckets[THROTTLE_BPS_READ].burst_length; - info->has_bps_wr_max_length = info->has_bps_wr_max; - info->bps_wr_max_length = - cfg.buckets[THROTTLE_BPS_WRITE].burst_length; - - 
info->has_iops_max_length = info->has_iops_max; - info->iops_max_length = - cfg.buckets[THROTTLE_OPS_TOTAL].burst_length; - info->has_iops_rd_max_length = info->has_iops_rd_max; - info->iops_rd_max_length = - cfg.buckets[THROTTLE_OPS_READ].burst_length; - info->has_iops_wr_max_length = info->has_iops_wr_max; - info->iops_wr_max_length = - cfg.buckets[THROTTLE_OPS_WRITE].burst_length; - - info->has_iops_size = cfg.op_size; - info->iops_size = cfg.op_size; - - info->has_group = true; - info->group = g_strdup(throttle_group_get_name(bs)); - } - - info->write_threshold = bdrv_write_threshold_get(bs); - - bs0 = bs; - p_image_info = &info->image; - while (1) { - Error *local_err = NULL; - bdrv_query_image_info(bs0, p_image_info, &local_err); - if (local_err) { - error_propagate(errp, local_err); - qapi_free_BlockDeviceInfo(info); - return NULL; - } - if (bs0->drv && bs0->backing) { - bs0 = bs0->backing->bs; - (*p_image_info)->has_backing_image = true; - p_image_info = &((*p_image_info)->backing_image); - } else { - break; - } - } - - return info; -} - -/* - * Returns 0 on success, with *p_list either set to describe snapshot - * information, or NULL because there are no snapshots. Returns -errno on - * error, with *p_list untouched. 
- */ -int bdrv_query_snapshot_info_list(BlockDriverState *bs, - SnapshotInfoList **p_list, - Error **errp) -{ - int i, sn_count; - QEMUSnapshotInfo *sn_tab = NULL; - SnapshotInfoList *info_list, *cur_item = NULL, *head = NULL; - SnapshotInfo *info; - - sn_count = bdrv_snapshot_list(bs, &sn_tab); - if (sn_count < 0) { - const char *dev = bdrv_get_device_name(bs); - switch (sn_count) { - case -ENOMEDIUM: - error_setg(errp, "Device '%s' is not inserted", dev); - break; - case -ENOTSUP: - error_setg(errp, - "Device '%s' does not support internal snapshots", - dev); - break; - default: - error_setg_errno(errp, -sn_count, - "Can't list snapshots of device '%s'", dev); - break; - } - return sn_count; - } - - for (i = 0; i < sn_count; i++) { - info = g_new0(SnapshotInfo, 1); - info->id = g_strdup(sn_tab[i].id_str); - info->name = g_strdup(sn_tab[i].name); - info->vm_state_size = sn_tab[i].vm_state_size; - info->date_sec = sn_tab[i].date_sec; - info->date_nsec = sn_tab[i].date_nsec; - info->vm_clock_sec = sn_tab[i].vm_clock_nsec / 1000000000; - info->vm_clock_nsec = sn_tab[i].vm_clock_nsec % 1000000000; - - info_list = g_new0(SnapshotInfoList, 1); - info_list->value = info; - - /* XXX: waiting for the qapi to support qemu-queue.h types */ - if (!cur_item) { - head = cur_item = info_list; - } else { - cur_item->next = info_list; - cur_item = info_list; - } - - } - - g_free(sn_tab); - *p_list = head; - return 0; -} - -/** - * bdrv_query_image_info: - * @bs: block device to examine - * @p_info: location to store image information - * @errp: location to store error information - * - * Store "flat" image information in @p_info. - * - * "Flat" means it does *not* query backing image information, - * i.e. (*pinfo)->has_backing_image will be set to false and - * (*pinfo)->backing_image to NULL even when the image does in fact have - * a backing image. - * - * @p_info will be set only on success. On error, store error in @errp. 
- */ -void bdrv_query_image_info(BlockDriverState *bs, - ImageInfo **p_info, - Error **errp) -{ - int64_t size; - const char *backing_filename; - BlockDriverInfo bdi; - int ret; - Error *err = NULL; - ImageInfo *info; - - aio_context_acquire(bdrv_get_aio_context(bs)); - - size = bdrv_getlength(bs); - if (size < 0) { - error_setg_errno(errp, -size, "Can't get size of device '%s'", - bdrv_get_device_name(bs)); - goto out; - } - - info = g_new0(ImageInfo, 1); - info->filename = g_strdup(bs->filename); - info->format = g_strdup(bdrv_get_format_name(bs)); - info->virtual_size = size; - info->actual_size = bdrv_get_allocated_file_size(bs); - info->has_actual_size = info->actual_size >= 0; - if (bdrv_is_encrypted(bs)) { - info->encrypted = true; - info->has_encrypted = true; - } - if (bdrv_get_info(bs, &bdi) >= 0) { - if (bdi.cluster_size != 0) { - info->cluster_size = bdi.cluster_size; - info->has_cluster_size = true; - } - info->dirty_flag = bdi.is_dirty; - info->has_dirty_flag = true; - } - info->format_specific = bdrv_get_specific_info(bs); - info->has_format_specific = info->format_specific != NULL; - - backing_filename = bs->backing_file; - if (backing_filename[0] != '\0') { - char *backing_filename2 = g_malloc0(PATH_MAX); - info->backing_filename = g_strdup(backing_filename); - info->has_backing_filename = true; - bdrv_get_full_backing_filename(bs, backing_filename2, PATH_MAX, &err); - if (err) { - /* Can't reconstruct the full backing filename, so we must omit - * this field and apply a Best Effort to this query. */ - g_free(backing_filename2); - backing_filename2 = NULL; - error_free(err); - err = NULL; - } - - /* Always report the full_backing_filename if present, even if it's the - * same as backing_filename. That they are same is useful info. 
*/ - if (backing_filename2) { - info->full_backing_filename = g_strdup(backing_filename2); - info->has_full_backing_filename = true; - } - - if (bs->backing_format[0]) { - info->backing_filename_format = g_strdup(bs->backing_format); - info->has_backing_filename_format = true; - } - g_free(backing_filename2); - } - - ret = bdrv_query_snapshot_info_list(bs, &info->snapshots, &err); - switch (ret) { - case 0: - if (info->snapshots) { - info->has_snapshots = true; - } - break; - /* recoverable error */ - case -ENOMEDIUM: - case -ENOTSUP: - error_free(err); - break; - default: - error_propagate(errp, err); - qapi_free_ImageInfo(info); - goto out; - } - - *p_info = info; - -out: - aio_context_release(bdrv_get_aio_context(bs)); -} - -/* @p_info will be set only on success. */ -static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, - Error **errp) -{ - BlockInfo *info = g_malloc0(sizeof(*info)); - BlockDriverState *bs = blk_bs(blk); - info->device = g_strdup(blk_name(blk)); - info->type = g_strdup("unknown"); - info->locked = blk_dev_is_medium_locked(blk); - info->removable = blk_dev_has_removable_media(blk); - - if (blk_dev_has_tray(blk)) { - info->has_tray_open = true; - info->tray_open = blk_dev_is_tray_open(blk); - } - - if (blk_iostatus_is_enabled(blk)) { - info->has_io_status = true; - info->io_status = blk_iostatus(blk); - } - - if (bs && !QLIST_EMPTY(&bs->dirty_bitmaps)) { - info->has_dirty_bitmaps = true; - info->dirty_bitmaps = bdrv_query_dirty_bitmaps(bs); - } - - if (bs && bs->drv) { - info->has_inserted = true; - info->inserted = bdrv_block_device_info(blk, bs, errp); - if (info->inserted == NULL) { - goto err; - } - } - - *p_info = info; - return; - - err: - qapi_free_BlockInfo(info); -} - -static BlockStats *bdrv_query_stats(BlockBackend *blk, - const BlockDriverState *bs, - bool query_backing); - -static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk) -{ - BlockAcctStats *stats = blk_get_stats(blk); - BlockAcctTimedStats *ts 
= NULL; - - ds->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ]; - ds->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE]; - ds->rd_operations = stats->nr_ops[BLOCK_ACCT_READ]; - ds->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE]; - - ds->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ]; - ds->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE]; - ds->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH]; - - ds->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ]; - ds->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE]; - ds->invalid_flush_operations = - stats->invalid_ops[BLOCK_ACCT_FLUSH]; - - ds->rd_merged = stats->merged[BLOCK_ACCT_READ]; - ds->wr_merged = stats->merged[BLOCK_ACCT_WRITE]; - ds->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH]; - ds->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE]; - ds->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ]; - ds->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH]; - - ds->has_idle_time_ns = stats->last_access_time_ns > 0; - if (ds->has_idle_time_ns) { - ds->idle_time_ns = block_acct_idle_time_ns(stats); - } - - ds->account_invalid = stats->account_invalid; - ds->account_failed = stats->account_failed; - - while ((ts = block_acct_interval_next(stats, ts))) { - BlockDeviceTimedStatsList *timed_stats = - g_malloc0(sizeof(*timed_stats)); - BlockDeviceTimedStats *dev_stats = g_malloc0(sizeof(*dev_stats)); - timed_stats->next = ds->timed_stats; - timed_stats->value = dev_stats; - ds->timed_stats = timed_stats; - - TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ]; - TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE]; - TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH]; - - dev_stats->interval_length = ts->interval_length; - - dev_stats->min_rd_latency_ns = timed_average_min(rd); - dev_stats->max_rd_latency_ns = timed_average_max(rd); - dev_stats->avg_rd_latency_ns = timed_average_avg(rd); - - dev_stats->min_wr_latency_ns = timed_average_min(wr); - 
dev_stats->max_wr_latency_ns = timed_average_max(wr); - dev_stats->avg_wr_latency_ns = timed_average_avg(wr); - - dev_stats->min_flush_latency_ns = timed_average_min(fl); - dev_stats->max_flush_latency_ns = timed_average_max(fl); - dev_stats->avg_flush_latency_ns = timed_average_avg(fl); - - dev_stats->avg_rd_queue_depth = - block_acct_queue_depth(ts, BLOCK_ACCT_READ); - dev_stats->avg_wr_queue_depth = - block_acct_queue_depth(ts, BLOCK_ACCT_WRITE); - } -} - -static void bdrv_query_bds_stats(BlockStats *s, const BlockDriverState *bs, - bool query_backing) -{ - if (bdrv_get_node_name(bs)[0]) { - s->has_node_name = true; - s->node_name = g_strdup(bdrv_get_node_name(bs)); - } - - s->stats->wr_highest_offset = bs->wr_highest_offset; - - if (bs->file) { - s->has_parent = true; - s->parent = bdrv_query_stats(NULL, bs->file->bs, query_backing); - } - - if (query_backing && bs->backing) { - s->has_backing = true; - s->backing = bdrv_query_stats(NULL, bs->backing->bs, query_backing); - } - -} - -static BlockStats *bdrv_query_stats(BlockBackend *blk, - const BlockDriverState *bs, - bool query_backing) -{ - BlockStats *s; - - s = g_malloc0(sizeof(*s)); - s->stats = g_malloc0(sizeof(*s->stats)); - - if (blk) { - s->has_device = true; - s->device = g_strdup(blk_name(blk)); - bdrv_query_blk_stats(s->stats, blk); - } - if (bs) { - bdrv_query_bds_stats(s, bs, query_backing); - } - - return s; -} - -BlockInfoList *qmp_query_block(Error **errp) -{ - BlockInfoList *head = NULL, **p_next = &head; - BlockBackend *blk; - Error *local_err = NULL; - - for (blk = blk_next(NULL); blk; blk = blk_next(blk)) { - BlockInfoList *info = g_malloc0(sizeof(*info)); - bdrv_query_info(blk, &info->value, &local_err); - if (local_err) { - error_propagate(errp, local_err); - g_free(info); - qapi_free_BlockInfoList(head); - return NULL; - } - - *p_next = info; - p_next = &info->next; - } - - return head; -} - -static bool next_query_bds(BlockBackend **blk, BlockDriverState **bs, - bool query_nodes) -{ - 
if (query_nodes) { - *bs = bdrv_next_node(*bs); - return !!*bs; - } - - *blk = blk_next(*blk); - *bs = *blk ? blk_bs(*blk) : NULL; - - return !!*blk; -} - -BlockStatsList *qmp_query_blockstats(bool has_query_nodes, - bool query_nodes, - Error **errp) -{ - BlockStatsList *head = NULL, **p_next = &head; - BlockBackend *blk = NULL; - BlockDriverState *bs = NULL; - - /* Just to be safe if query_nodes is not always initialized */ - query_nodes = has_query_nodes && query_nodes; - - while (next_query_bds(&blk, &bs, query_nodes)) { - BlockStatsList *info = g_malloc0(sizeof(*info)); - AioContext *ctx = blk ? blk_get_aio_context(blk) - : bdrv_get_aio_context(bs); - - aio_context_acquire(ctx); - info->value = bdrv_query_stats(blk, bs, !query_nodes); - aio_context_release(ctx); - - *p_next = info; - p_next = &info->next; - } - - return head; -} - -#define NB_SUFFIXES 4 - -static char *get_human_readable_size(char *buf, int buf_size, int64_t size) -{ - static const char suffixes[NB_SUFFIXES] = {'K', 'M', 'G', 'T'}; - int64_t base; - int i; - - if (size <= 999) { - snprintf(buf, buf_size, "%" PRId64, size); - } else { - base = 1024; - for (i = 0; i < NB_SUFFIXES; i++) { - if (size < (10 * base)) { - snprintf(buf, buf_size, "%0.1f%c", - (double)size / base, - suffixes[i]); - break; - } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) { - snprintf(buf, buf_size, "%" PRId64 "%c", - ((size + (base >> 1)) / base), - suffixes[i]); - break; - } - base = base * 1024; - } - } - return buf; -} - -void bdrv_snapshot_dump(fprintf_function func_fprintf, void *f, - QEMUSnapshotInfo *sn) -{ - char buf1[128], date_buf[128], clock_buf[128]; - struct tm tm; - time_t ti; - int64_t secs; - - if (!sn) { - func_fprintf(f, - "%-10s%-20s%7s%20s%15s", - "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK"); - } else { - ti = sn->date_sec; - localtime_r(&ti, &tm); - strftime(date_buf, sizeof(date_buf), - "%Y-%m-%d %H:%M:%S", &tm); - secs = sn->vm_clock_nsec / 1000000000; - snprintf(clock_buf, 
sizeof(clock_buf), - "%02d:%02d:%02d.%03d", - (int)(secs / 3600), - (int)((secs / 60) % 60), - (int)(secs % 60), - (int)((sn->vm_clock_nsec / 1000000) % 1000)); - func_fprintf(f, - "%-10s%-20s%7s%20s%15s", - sn->id_str, sn->name, - get_human_readable_size(buf1, sizeof(buf1), - sn->vm_state_size), - date_buf, - clock_buf); - } -} - -static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, - QDict *dict); -static void dump_qlist(fprintf_function func_fprintf, void *f, int indentation, - QList *list); - -static void dump_qobject(fprintf_function func_fprintf, void *f, - int comp_indent, QObject *obj) -{ - switch (qobject_type(obj)) { - case QTYPE_QINT: { - QInt *value = qobject_to_qint(obj); - func_fprintf(f, "%" PRId64, qint_get_int(value)); - break; - } - case QTYPE_QSTRING: { - QString *value = qobject_to_qstring(obj); - func_fprintf(f, "%s", qstring_get_str(value)); - break; - } - case QTYPE_QDICT: { - QDict *value = qobject_to_qdict(obj); - dump_qdict(func_fprintf, f, comp_indent, value); - break; - } - case QTYPE_QLIST: { - QList *value = qobject_to_qlist(obj); - dump_qlist(func_fprintf, f, comp_indent, value); - break; - } - case QTYPE_QFLOAT: { - QFloat *value = qobject_to_qfloat(obj); - func_fprintf(f, "%g", qfloat_get_double(value)); - break; - } - case QTYPE_QBOOL: { - QBool *value = qobject_to_qbool(obj); - func_fprintf(f, "%s", qbool_get_bool(value) ? "true" : "false"); - break; - } - default: - abort(); - } -} - -static void dump_qlist(fprintf_function func_fprintf, void *f, int indentation, - QList *list) -{ - const QListEntry *entry; - int i = 0; - - for (entry = qlist_first(list); entry; entry = qlist_next(entry), i++) { - QType type = qobject_type(entry->value); - bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST); - func_fprintf(f, "%*s[%i]:%c", indentation * 4, "", i, - composite ? 
'\n' : ' '); - dump_qobject(func_fprintf, f, indentation + 1, entry->value); - if (!composite) { - func_fprintf(f, "\n"); - } - } -} - -static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, - QDict *dict) -{ - const QDictEntry *entry; - - for (entry = qdict_first(dict); entry; entry = qdict_next(dict, entry)) { - QType type = qobject_type(entry->value); - bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST); - char *key = g_malloc(strlen(entry->key) + 1); - int i; - - /* replace dashes with spaces in key (variable) names */ - for (i = 0; entry->key[i]; i++) { - key[i] = entry->key[i] == '-' ? ' ' : entry->key[i]; - } - key[i] = 0; - func_fprintf(f, "%*s%s:%c", indentation * 4, "", key, - composite ? '\n' : ' '); - dump_qobject(func_fprintf, f, indentation + 1, entry->value); - if (!composite) { - func_fprintf(f, "\n"); - } - g_free(key); - } -} - -void bdrv_image_info_specific_dump(fprintf_function func_fprintf, void *f, - ImageInfoSpecific *info_spec) -{ - QmpOutputVisitor *ov = qmp_output_visitor_new(); - QObject *obj, *data; - - visit_type_ImageInfoSpecific(qmp_output_get_visitor(ov), NULL, &info_spec, - &error_abort); - obj = qmp_output_get_qobject(ov); - assert(qobject_type(obj) == QTYPE_QDICT); - data = qdict_get(qobject_to_qdict(obj), "data"); - dump_qobject(func_fprintf, f, 1, data); - qmp_output_visitor_cleanup(ov); -} - -void bdrv_image_info_dump(fprintf_function func_fprintf, void *f, - ImageInfo *info) -{ - char size_buf[128], dsize_buf[128]; - if (!info->has_actual_size) { - snprintf(dsize_buf, sizeof(dsize_buf), "unavailable"); - } else { - get_human_readable_size(dsize_buf, sizeof(dsize_buf), - info->actual_size); - } - get_human_readable_size(size_buf, sizeof(size_buf), info->virtual_size); - func_fprintf(f, - "image: %s\n" - "file format: %s\n" - "virtual size: %s (%" PRId64 " bytes)\n" - "disk size: %s\n", - info->filename, info->format, size_buf, - info->virtual_size, - dsize_buf); - - if (info->has_encrypted && 
info->encrypted) { - func_fprintf(f, "encrypted: yes\n"); - } - - if (info->has_cluster_size) { - func_fprintf(f, "cluster_size: %" PRId64 "\n", - info->cluster_size); - } - - if (info->has_dirty_flag && info->dirty_flag) { - func_fprintf(f, "cleanly shut down: no\n"); - } - - if (info->has_backing_filename) { - func_fprintf(f, "backing file: %s", info->backing_filename); - if (!info->has_full_backing_filename) { - func_fprintf(f, " (cannot determine actual path)"); - } else if (strcmp(info->backing_filename, - info->full_backing_filename) != 0) { - func_fprintf(f, " (actual path: %s)", info->full_backing_filename); - } - func_fprintf(f, "\n"); - if (info->has_backing_filename_format) { - func_fprintf(f, "backing file format: %s\n", - info->backing_filename_format); - } - } - - if (info->has_snapshots) { - SnapshotInfoList *elem; - - func_fprintf(f, "Snapshot list:\n"); - bdrv_snapshot_dump(func_fprintf, f, NULL); - func_fprintf(f, "\n"); - - /* Ideally bdrv_snapshot_dump() would operate on SnapshotInfoList but - * we convert to the block layer's native QEMUSnapshotInfo for now. 
- */ - for (elem = info->snapshots; elem; elem = elem->next) { - QEMUSnapshotInfo sn = { - .vm_state_size = elem->value->vm_state_size, - .date_sec = elem->value->date_sec, - .date_nsec = elem->value->date_nsec, - .vm_clock_nsec = elem->value->vm_clock_sec * 1000000000ULL + - elem->value->vm_clock_nsec, - }; - - pstrcpy(sn.id_str, sizeof(sn.id_str), elem->value->id); - pstrcpy(sn.name, sizeof(sn.name), elem->value->name); - bdrv_snapshot_dump(func_fprintf, f, &sn); - func_fprintf(f, "\n"); - } - } - - if (info->has_format_specific) { - func_fprintf(f, "Format specific information:\n"); - bdrv_image_info_specific_dump(func_fprintf, f, info->format_specific); - } -} diff --git a/qemu/block/qcow.c b/qemu/block/qcow.c deleted file mode 100644 index 60ddb12ec..000000000 --- a/qemu/block/qcow.c +++ /dev/null @@ -1,1050 +0,0 @@ -/* - * Block driver for the QCOW format - * - * Copyright (c) 2004-2006 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu-common.h" -#include "qemu/error-report.h" -#include "block/block_int.h" -#include "sysemu/block-backend.h" -#include "qemu/module.h" -#include -#include "qapi/qmp/qerror.h" -#include "crypto/cipher.h" -#include "migration/migration.h" - -/**************************************************************/ -/* QEMU COW block driver with compression and encryption support */ - -#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) -#define QCOW_VERSION 1 - -#define QCOW_CRYPT_NONE 0 -#define QCOW_CRYPT_AES 1 - -#define QCOW_OFLAG_COMPRESSED (1LL << 63) - -typedef struct QCowHeader { - uint32_t magic; - uint32_t version; - uint64_t backing_file_offset; - uint32_t backing_file_size; - uint32_t mtime; - uint64_t size; /* in bytes */ - uint8_t cluster_bits; - uint8_t l2_bits; - uint16_t padding; - uint32_t crypt_method; - uint64_t l1_table_offset; -} QEMU_PACKED QCowHeader; - -#define L2_CACHE_SIZE 16 - -typedef struct BDRVQcowState { - int cluster_bits; - int cluster_size; - int cluster_sectors; - int l2_bits; - int l2_size; - unsigned int l1_size; - uint64_t cluster_offset_mask; - uint64_t l1_table_offset; - uint64_t *l1_table; - uint64_t *l2_cache; - uint64_t l2_cache_offsets[L2_CACHE_SIZE]; - uint32_t l2_cache_counts[L2_CACHE_SIZE]; - uint8_t *cluster_cache; - uint8_t *cluster_data; - uint64_t cluster_cache_offset; - QCryptoCipher *cipher; /* NULL if no key yet */ - uint32_t crypt_method_header; - CoMutex lock; - Error *migration_blocker; -} BDRVQcowState; - -static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); - -static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) -{ 
- const QCowHeader *cow_header = (const void *)buf; - - if (buf_size >= sizeof(QCowHeader) && - be32_to_cpu(cow_header->magic) == QCOW_MAGIC && - be32_to_cpu(cow_header->version) == QCOW_VERSION) - return 100; - else - return 0; -} - -static int qcow_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVQcowState *s = bs->opaque; - unsigned int len, i, shift; - int ret; - QCowHeader header; - - ret = bdrv_pread(bs->file->bs, 0, &header, sizeof(header)); - if (ret < 0) { - goto fail; - } - be32_to_cpus(&header.magic); - be32_to_cpus(&header.version); - be64_to_cpus(&header.backing_file_offset); - be32_to_cpus(&header.backing_file_size); - be32_to_cpus(&header.mtime); - be64_to_cpus(&header.size); - be32_to_cpus(&header.crypt_method); - be64_to_cpus(&header.l1_table_offset); - - if (header.magic != QCOW_MAGIC) { - error_setg(errp, "Image not in qcow format"); - ret = -EINVAL; - goto fail; - } - if (header.version != QCOW_VERSION) { - error_setg(errp, "Unsupported qcow version %" PRIu32, header.version); - ret = -ENOTSUP; - goto fail; - } - - if (header.size <= 1) { - error_setg(errp, "Image size is too small (must be at least 2 bytes)"); - ret = -EINVAL; - goto fail; - } - if (header.cluster_bits < 9 || header.cluster_bits > 16) { - error_setg(errp, "Cluster size must be between 512 and 64k"); - ret = -EINVAL; - goto fail; - } - - /* l2_bits specifies number of entries; storing a uint64_t in each entry, - * so bytes = num_entries << 3. 
*/ - if (header.l2_bits < 9 - 3 || header.l2_bits > 16 - 3) { - error_setg(errp, "L2 table size must be between 512 and 64k"); - ret = -EINVAL; - goto fail; - } - - if (header.crypt_method > QCOW_CRYPT_AES) { - error_setg(errp, "invalid encryption method in qcow header"); - ret = -EINVAL; - goto fail; - } - if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128)) { - error_setg(errp, "AES cipher not available"); - ret = -EINVAL; - goto fail; - } - s->crypt_method_header = header.crypt_method; - if (s->crypt_method_header) { - if (bdrv_uses_whitelist() && - s->crypt_method_header == QCOW_CRYPT_AES) { - error_report("qcow built-in AES encryption is deprecated"); - error_printf("Support for it will be removed in a future release.\n" - "You can use 'qemu-img convert' to switch to an\n" - "unencrypted qcow image, or a LUKS raw image.\n"); - } - - bs->encrypted = 1; - } - s->cluster_bits = header.cluster_bits; - s->cluster_size = 1 << s->cluster_bits; - s->cluster_sectors = 1 << (s->cluster_bits - 9); - s->l2_bits = header.l2_bits; - s->l2_size = 1 << s->l2_bits; - bs->total_sectors = header.size / 512; - s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1; - - /* read the level 1 table */ - shift = s->cluster_bits + s->l2_bits; - if (header.size > UINT64_MAX - (1LL << shift)) { - error_setg(errp, "Image too large"); - ret = -EINVAL; - goto fail; - } else { - uint64_t l1_size = (header.size + (1LL << shift) - 1) >> shift; - if (l1_size > INT_MAX / sizeof(uint64_t)) { - error_setg(errp, "Image too large"); - ret = -EINVAL; - goto fail; - } - s->l1_size = l1_size; - } - - s->l1_table_offset = header.l1_table_offset; - s->l1_table = g_try_new(uint64_t, s->l1_size); - if (s->l1_table == NULL) { - error_setg(errp, "Could not allocate memory for L1 table"); - ret = -ENOMEM; - goto fail; - } - - ret = bdrv_pread(bs->file->bs, s->l1_table_offset, s->l1_table, - s->l1_size * sizeof(uint64_t)); - if (ret < 0) { - goto fail; - } - - for(i = 0;i < s->l1_size; i++) { - 
be64_to_cpus(&s->l1_table[i]); - } - - /* alloc L2 cache (max. 64k * 16 * 8 = 8 MB) */ - s->l2_cache = - qemu_try_blockalign(bs->file->bs, - s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); - if (s->l2_cache == NULL) { - error_setg(errp, "Could not allocate L2 table cache"); - ret = -ENOMEM; - goto fail; - } - s->cluster_cache = g_malloc(s->cluster_size); - s->cluster_data = g_malloc(s->cluster_size); - s->cluster_cache_offset = -1; - - /* read the backing file name */ - if (header.backing_file_offset != 0) { - len = header.backing_file_size; - if (len > 1023 || len >= sizeof(bs->backing_file)) { - error_setg(errp, "Backing file name too long"); - ret = -EINVAL; - goto fail; - } - ret = bdrv_pread(bs->file->bs, header.backing_file_offset, - bs->backing_file, len); - if (ret < 0) { - goto fail; - } - bs->backing_file[len] = '\0'; - } - - /* Disable migration when qcow images are used */ - error_setg(&s->migration_blocker, "The qcow format used by node '%s' " - "does not support live migration", - bdrv_get_device_or_node_name(bs)); - migrate_add_blocker(s->migration_blocker); - - qemu_co_mutex_init(&s->lock); - return 0; - - fail: - g_free(s->l1_table); - qemu_vfree(s->l2_cache); - g_free(s->cluster_cache); - g_free(s->cluster_data); - return ret; -} - - -/* We have nothing to do for QCOW reopen, stubs just return - * success */ -static int qcow_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) -{ - return 0; -} - -static int qcow_set_key(BlockDriverState *bs, const char *key) -{ - BDRVQcowState *s = bs->opaque; - uint8_t keybuf[16]; - int len, i; - Error *err; - - memset(keybuf, 0, 16); - len = strlen(key); - if (len > 16) - len = 16; - /* XXX: we could compress the chars to 7 bits to increase - entropy */ - for(i = 0;i < len;i++) { - keybuf[i] = key[i]; - } - assert(bs->encrypted); - - qcrypto_cipher_free(s->cipher); - s->cipher = qcrypto_cipher_new( - QCRYPTO_CIPHER_ALG_AES_128, - QCRYPTO_CIPHER_MODE_CBC, - keybuf, 
G_N_ELEMENTS(keybuf), - &err); - - if (!s->cipher) { - /* XXX would be nice if errors in this method could - * be properly propagate to the caller. Would need - * the bdrv_set_key() API signature to be fixed. */ - error_free(err); - return -1; - } - return 0; -} - -/* The crypt function is compatible with the linux cryptoloop - algorithm for < 4 GB images. NOTE: out_buf == in_buf is - supported */ -static int encrypt_sectors(BDRVQcowState *s, int64_t sector_num, - uint8_t *out_buf, const uint8_t *in_buf, - int nb_sectors, bool enc, Error **errp) -{ - union { - uint64_t ll[2]; - uint8_t b[16]; - } ivec; - int i; - int ret; - - for(i = 0; i < nb_sectors; i++) { - ivec.ll[0] = cpu_to_le64(sector_num); - ivec.ll[1] = 0; - if (qcrypto_cipher_setiv(s->cipher, - ivec.b, G_N_ELEMENTS(ivec.b), - errp) < 0) { - return -1; - } - if (enc) { - ret = qcrypto_cipher_encrypt(s->cipher, - in_buf, - out_buf, - 512, - errp); - } else { - ret = qcrypto_cipher_decrypt(s->cipher, - in_buf, - out_buf, - 512, - errp); - } - if (ret < 0) { - return -1; - } - sector_num++; - in_buf += 512; - out_buf += 512; - } - return 0; -} - -/* 'allocate' is: - * - * 0 to not allocate. - * - * 1 to allocate a normal cluster (for sector indexes 'n_start' to - * 'n_end') - * - * 2 to allocate a compressed cluster of size - * 'compressed_size'. 'compressed_size' must be > 0 and < - * cluster_size - * - * return 0 if not allocated. 
- */ -static uint64_t get_cluster_offset(BlockDriverState *bs, - uint64_t offset, int allocate, - int compressed_size, - int n_start, int n_end) -{ - BDRVQcowState *s = bs->opaque; - int min_index, i, j, l1_index, l2_index; - uint64_t l2_offset, *l2_table, cluster_offset, tmp; - uint32_t min_count; - int new_l2_table; - - l1_index = offset >> (s->l2_bits + s->cluster_bits); - l2_offset = s->l1_table[l1_index]; - new_l2_table = 0; - if (!l2_offset) { - if (!allocate) - return 0; - /* allocate a new l2 entry */ - l2_offset = bdrv_getlength(bs->file->bs); - /* round to cluster size */ - l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); - /* update the L1 entry */ - s->l1_table[l1_index] = l2_offset; - tmp = cpu_to_be64(l2_offset); - if (bdrv_pwrite_sync(bs->file->bs, - s->l1_table_offset + l1_index * sizeof(tmp), - &tmp, sizeof(tmp)) < 0) - return 0; - new_l2_table = 1; - } - for(i = 0; i < L2_CACHE_SIZE; i++) { - if (l2_offset == s->l2_cache_offsets[i]) { - /* increment the hit count */ - if (++s->l2_cache_counts[i] == 0xffffffff) { - for(j = 0; j < L2_CACHE_SIZE; j++) { - s->l2_cache_counts[j] >>= 1; - } - } - l2_table = s->l2_cache + (i << s->l2_bits); - goto found; - } - } - /* not found: load a new entry in the least used one */ - min_index = 0; - min_count = 0xffffffff; - for(i = 0; i < L2_CACHE_SIZE; i++) { - if (s->l2_cache_counts[i] < min_count) { - min_count = s->l2_cache_counts[i]; - min_index = i; - } - } - l2_table = s->l2_cache + (min_index << s->l2_bits); - if (new_l2_table) { - memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); - if (bdrv_pwrite_sync(bs->file->bs, l2_offset, l2_table, - s->l2_size * sizeof(uint64_t)) < 0) - return 0; - } else { - if (bdrv_pread(bs->file->bs, l2_offset, l2_table, - s->l2_size * sizeof(uint64_t)) != - s->l2_size * sizeof(uint64_t)) - return 0; - } - s->l2_cache_offsets[min_index] = l2_offset; - s->l2_cache_counts[min_index] = 1; - found: - l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); 
- cluster_offset = be64_to_cpu(l2_table[l2_index]); - if (!cluster_offset || - ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) { - if (!allocate) - return 0; - /* allocate a new cluster */ - if ((cluster_offset & QCOW_OFLAG_COMPRESSED) && - (n_end - n_start) < s->cluster_sectors) { - /* if the cluster is already compressed, we must - decompress it in the case it is not completely - overwritten */ - if (decompress_cluster(bs, cluster_offset) < 0) - return 0; - cluster_offset = bdrv_getlength(bs->file->bs); - cluster_offset = (cluster_offset + s->cluster_size - 1) & - ~(s->cluster_size - 1); - /* write the cluster content */ - if (bdrv_pwrite(bs->file->bs, cluster_offset, s->cluster_cache, - s->cluster_size) != - s->cluster_size) - return -1; - } else { - cluster_offset = bdrv_getlength(bs->file->bs); - if (allocate == 1) { - /* round to cluster size */ - cluster_offset = (cluster_offset + s->cluster_size - 1) & - ~(s->cluster_size - 1); - bdrv_truncate(bs->file->bs, cluster_offset + s->cluster_size); - /* if encrypted, we must initialize the cluster - content which won't be written */ - if (bs->encrypted && - (n_end - n_start) < s->cluster_sectors) { - uint64_t start_sect; - assert(s->cipher); - start_sect = (offset & ~(s->cluster_size - 1)) >> 9; - memset(s->cluster_data + 512, 0x00, 512); - for(i = 0; i < s->cluster_sectors; i++) { - if (i < n_start || i >= n_end) { - Error *err = NULL; - if (encrypt_sectors(s, start_sect + i, - s->cluster_data, - s->cluster_data + 512, 1, - true, &err) < 0) { - error_free(err); - errno = EIO; - return -1; - } - if (bdrv_pwrite(bs->file->bs, - cluster_offset + i * 512, - s->cluster_data, 512) != 512) - return -1; - } - } - } - } else if (allocate == 2) { - cluster_offset |= QCOW_OFLAG_COMPRESSED | - (uint64_t)compressed_size << (63 - s->cluster_bits); - } - } - /* update L2 table */ - tmp = cpu_to_be64(cluster_offset); - l2_table[l2_index] = tmp; - if (bdrv_pwrite_sync(bs->file->bs, l2_offset + l2_index * sizeof(tmp), 
- &tmp, sizeof(tmp)) < 0) - return 0; - } - return cluster_offset; -} - -static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) -{ - BDRVQcowState *s = bs->opaque; - int index_in_cluster, n; - uint64_t cluster_offset; - - qemu_co_mutex_lock(&s->lock); - cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0); - qemu_co_mutex_unlock(&s->lock); - index_in_cluster = sector_num & (s->cluster_sectors - 1); - n = s->cluster_sectors - index_in_cluster; - if (n > nb_sectors) - n = nb_sectors; - *pnum = n; - if (!cluster_offset) { - return 0; - } - if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->cipher) { - return BDRV_BLOCK_DATA; - } - cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); - *file = bs->file->bs; - return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | cluster_offset; -} - -static int decompress_buffer(uint8_t *out_buf, int out_buf_size, - const uint8_t *buf, int buf_size) -{ - z_stream strm1, *strm = &strm1; - int ret, out_len; - - memset(strm, 0, sizeof(*strm)); - - strm->next_in = (uint8_t *)buf; - strm->avail_in = buf_size; - strm->next_out = out_buf; - strm->avail_out = out_buf_size; - - ret = inflateInit2(strm, -12); - if (ret != Z_OK) - return -1; - ret = inflate(strm, Z_FINISH); - out_len = strm->next_out - out_buf; - if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || - out_len != out_buf_size) { - inflateEnd(strm); - return -1; - } - inflateEnd(strm); - return 0; -} - -static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) -{ - BDRVQcowState *s = bs->opaque; - int ret, csize; - uint64_t coffset; - - coffset = cluster_offset & s->cluster_offset_mask; - if (s->cluster_cache_offset != coffset) { - csize = cluster_offset >> (63 - s->cluster_bits); - csize &= (s->cluster_size - 1); - ret = bdrv_pread(bs->file->bs, coffset, s->cluster_data, csize); - if (ret != csize) - return -1; - if (decompress_buffer(s->cluster_cache, 
s->cluster_size, - s->cluster_data, csize) < 0) { - return -1; - } - s->cluster_cache_offset = coffset; - } - return 0; -} - -static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - BDRVQcowState *s = bs->opaque; - int index_in_cluster; - int ret = 0, n; - uint64_t cluster_offset; - struct iovec hd_iov; - QEMUIOVector hd_qiov; - uint8_t *buf; - void *orig_buf; - Error *err = NULL; - - if (qiov->niov > 1) { - buf = orig_buf = qemu_try_blockalign(bs, qiov->size); - if (buf == NULL) { - return -ENOMEM; - } - } else { - orig_buf = NULL; - buf = (uint8_t *)qiov->iov->iov_base; - } - - qemu_co_mutex_lock(&s->lock); - - while (nb_sectors != 0) { - /* prepare next request */ - cluster_offset = get_cluster_offset(bs, sector_num << 9, - 0, 0, 0, 0); - index_in_cluster = sector_num & (s->cluster_sectors - 1); - n = s->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } - - if (!cluster_offset) { - if (bs->backing) { - /* read from the base image */ - hd_iov.iov_base = (void *)buf; - hd_iov.iov_len = n * 512; - qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->backing->bs, sector_num, - n, &hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - goto fail; - } - } else { - /* Note: in this case, no need to wait */ - memset(buf, 0, 512 * n); - } - } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { - /* add AIO support for compressed blocks ? 
*/ - if (decompress_cluster(bs, cluster_offset) < 0) { - goto fail; - } - memcpy(buf, - s->cluster_cache + index_in_cluster * 512, 512 * n); - } else { - if ((cluster_offset & 511) != 0) { - goto fail; - } - hd_iov.iov_base = (void *)buf; - hd_iov.iov_len = n * 512; - qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file->bs, - (cluster_offset >> 9) + index_in_cluster, - n, &hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - break; - } - if (bs->encrypted) { - assert(s->cipher); - if (encrypt_sectors(s, sector_num, buf, buf, - n, false, &err) < 0) { - goto fail; - } - } - } - ret = 0; - - nb_sectors -= n; - sector_num += n; - buf += n * 512; - } - -done: - qemu_co_mutex_unlock(&s->lock); - - if (qiov->niov > 1) { - qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size); - qemu_vfree(orig_buf); - } - - return ret; - -fail: - error_free(err); - ret = -EIO; - goto done; -} - -static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - BDRVQcowState *s = bs->opaque; - int index_in_cluster; - uint64_t cluster_offset; - const uint8_t *src_buf; - int ret = 0, n; - uint8_t *cluster_data = NULL; - struct iovec hd_iov; - QEMUIOVector hd_qiov; - uint8_t *buf; - void *orig_buf; - - s->cluster_cache_offset = -1; /* disable compressed cache */ - - if (qiov->niov > 1) { - buf = orig_buf = qemu_try_blockalign(bs, qiov->size); - if (buf == NULL) { - return -ENOMEM; - } - qemu_iovec_to_buf(qiov, 0, buf, qiov->size); - } else { - orig_buf = NULL; - buf = (uint8_t *)qiov->iov->iov_base; - } - - qemu_co_mutex_lock(&s->lock); - - while (nb_sectors != 0) { - - index_in_cluster = sector_num & (s->cluster_sectors - 1); - n = s->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } - cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0, - index_in_cluster, - index_in_cluster + n); - if (!cluster_offset || (cluster_offset & 511) 
!= 0) { - ret = -EIO; - break; - } - if (bs->encrypted) { - Error *err = NULL; - assert(s->cipher); - if (!cluster_data) { - cluster_data = g_malloc0(s->cluster_size); - } - if (encrypt_sectors(s, sector_num, cluster_data, buf, - n, true, &err) < 0) { - error_free(err); - ret = -EIO; - break; - } - src_buf = cluster_data; - } else { - src_buf = buf; - } - - hd_iov.iov_base = (void *)src_buf; - hd_iov.iov_len = n * 512; - qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_writev(bs->file->bs, - (cluster_offset >> 9) + index_in_cluster, - n, &hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - break; - } - ret = 0; - - nb_sectors -= n; - sector_num += n; - buf += n * 512; - } - qemu_co_mutex_unlock(&s->lock); - - if (qiov->niov > 1) { - qemu_vfree(orig_buf); - } - g_free(cluster_data); - - return ret; -} - -static void qcow_close(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - - qcrypto_cipher_free(s->cipher); - s->cipher = NULL; - g_free(s->l1_table); - qemu_vfree(s->l2_cache); - g_free(s->cluster_cache); - g_free(s->cluster_data); - - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); -} - -static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) -{ - int header_size, backing_filename_len, l1_size, shift, i; - QCowHeader header; - uint8_t *tmp; - int64_t total_size = 0; - char *backing_file = NULL; - int flags = 0; - Error *local_err = NULL; - int ret; - BlockBackend *qcow_blk; - - /* Read out options */ - total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); - if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) { - flags |= BLOCK_FLAG_ENCRYPT; - } - - ret = bdrv_create_file(filename, opts, &local_err); - if (ret < 0) { - error_propagate(errp, local_err); - goto cleanup; - } - - qcow_blk = blk_new_open(filename, NULL, NULL, - BDRV_O_RDWR | 
BDRV_O_PROTOCOL, &local_err); - if (qcow_blk == NULL) { - error_propagate(errp, local_err); - ret = -EIO; - goto cleanup; - } - - blk_set_allow_write_beyond_eof(qcow_blk, true); - - ret = blk_truncate(qcow_blk, 0); - if (ret < 0) { - goto exit; - } - - memset(&header, 0, sizeof(header)); - header.magic = cpu_to_be32(QCOW_MAGIC); - header.version = cpu_to_be32(QCOW_VERSION); - header.size = cpu_to_be64(total_size); - header_size = sizeof(header); - backing_filename_len = 0; - if (backing_file) { - if (strcmp(backing_file, "fat:")) { - header.backing_file_offset = cpu_to_be64(header_size); - backing_filename_len = strlen(backing_file); - header.backing_file_size = cpu_to_be32(backing_filename_len); - header_size += backing_filename_len; - } else { - /* special backing file for vvfat */ - backing_file = NULL; - } - header.cluster_bits = 9; /* 512 byte cluster to avoid copying - unmodified sectors */ - header.l2_bits = 12; /* 32 KB L2 tables */ - } else { - header.cluster_bits = 12; /* 4 KB clusters */ - header.l2_bits = 9; /* 4 KB L2 tables */ - } - header_size = (header_size + 7) & ~7; - shift = header.cluster_bits + header.l2_bits; - l1_size = (total_size + (1LL << shift) - 1) >> shift; - - header.l1_table_offset = cpu_to_be64(header_size); - if (flags & BLOCK_FLAG_ENCRYPT) { - header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); - } else { - header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); - } - - /* write all the data */ - ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header)); - if (ret != sizeof(header)) { - goto exit; - } - - if (backing_file) { - ret = blk_pwrite(qcow_blk, sizeof(header), - backing_file, backing_filename_len); - if (ret != backing_filename_len) { - goto exit; - } - } - - tmp = g_malloc0(BDRV_SECTOR_SIZE); - for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/ - BDRV_SECTOR_SIZE); i++) { - ret = blk_pwrite(qcow_blk, header_size + - BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE); - if (ret != BDRV_SECTOR_SIZE) { - g_free(tmp); - goto 
exit; - } - } - - g_free(tmp); - ret = 0; -exit: - blk_unref(qcow_blk); -cleanup: - g_free(backing_file); - return ret; -} - -static int qcow_make_empty(BlockDriverState *bs) -{ - BDRVQcowState *s = bs->opaque; - uint32_t l1_length = s->l1_size * sizeof(uint64_t); - int ret; - - memset(s->l1_table, 0, l1_length); - if (bdrv_pwrite_sync(bs->file->bs, s->l1_table_offset, s->l1_table, - l1_length) < 0) - return -1; - ret = bdrv_truncate(bs->file->bs, s->l1_table_offset + l1_length); - if (ret < 0) - return ret; - - memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); - memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t)); - memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t)); - - return 0; -} - -/* XXX: put compressed sectors first, then all the cluster aligned - tables to avoid losing bytes in alignment */ -static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - BDRVQcowState *s = bs->opaque; - z_stream strm; - int ret, out_len; - uint8_t *out_buf; - uint64_t cluster_offset; - - if (nb_sectors != s->cluster_sectors) { - ret = -EINVAL; - - /* Zero-pad last write if image size is not cluster aligned */ - if (sector_num + nb_sectors == bs->total_sectors && - nb_sectors < s->cluster_sectors) { - uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); - memset(pad_buf, 0, s->cluster_size); - memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); - ret = qcow_write_compressed(bs, sector_num, - pad_buf, s->cluster_sectors); - qemu_vfree(pad_buf); - } - return ret; - } - - out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); - - /* best compression, small window, no zlib header */ - memset(&strm, 0, sizeof(strm)); - ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, - Z_DEFLATED, -12, - 9, Z_DEFAULT_STRATEGY); - if (ret != 0) { - ret = -EINVAL; - goto fail; - } - - strm.avail_in = s->cluster_size; - strm.next_in = (uint8_t *)buf; - strm.avail_out = 
s->cluster_size; - strm.next_out = out_buf; - - ret = deflate(&strm, Z_FINISH); - if (ret != Z_STREAM_END && ret != Z_OK) { - deflateEnd(&strm); - ret = -EINVAL; - goto fail; - } - out_len = strm.next_out - out_buf; - - deflateEnd(&strm); - - if (ret != Z_STREAM_END || out_len >= s->cluster_size) { - /* could not compress: write normal cluster */ - ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); - if (ret < 0) { - goto fail; - } - } else { - cluster_offset = get_cluster_offset(bs, sector_num << 9, 2, - out_len, 0, 0); - if (cluster_offset == 0) { - ret = -EIO; - goto fail; - } - - cluster_offset &= s->cluster_offset_mask; - ret = bdrv_pwrite(bs->file->bs, cluster_offset, out_buf, out_len); - if (ret < 0) { - goto fail; - } - } - - ret = 0; -fail: - g_free(out_buf); - return ret; -} - -static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - BDRVQcowState *s = bs->opaque; - bdi->cluster_size = s->cluster_size; - return 0; -} - -static QemuOptsList qcow_create_opts = { - .name = "qcow-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(qcow_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_BACKING_FILE, - .type = QEMU_OPT_STRING, - .help = "File name of a base image" - }, - { - .name = BLOCK_OPT_ENCRYPT, - .type = QEMU_OPT_BOOL, - .help = "Encrypt the image", - .def_value_str = "off" - }, - { /* end of list */ } - } -}; - -static BlockDriver bdrv_qcow = { - .format_name = "qcow", - .instance_size = sizeof(BDRVQcowState), - .bdrv_probe = qcow_probe, - .bdrv_open = qcow_open, - .bdrv_close = qcow_close, - .bdrv_reopen_prepare = qcow_reopen_prepare, - .bdrv_create = qcow_create, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - .supports_backing = true, - - .bdrv_co_readv = qcow_co_readv, - .bdrv_co_writev = qcow_co_writev, - .bdrv_co_get_block_status = qcow_co_get_block_status, - - .bdrv_set_key = qcow_set_key, - .bdrv_make_empty = qcow_make_empty, - 
.bdrv_write_compressed = qcow_write_compressed, - .bdrv_get_info = qcow_get_info, - - .create_opts = &qcow_create_opts, -}; - -static void bdrv_qcow_init(void) -{ - bdrv_register(&bdrv_qcow); -} - -block_init(bdrv_qcow_init); diff --git a/qemu/block/qcow2-cache.c b/qemu/block/qcow2-cache.c deleted file mode 100644 index 0fe8edae4..000000000 --- a/qemu/block/qcow2-cache.c +++ /dev/null @@ -1,411 +0,0 @@ -/* - * L2/refcount table cache for the QCOW2 format - * - * Copyright (c) 2010 Kevin Wolf - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -/* Needed for CONFIG_MADVISE */ -#include "qemu/osdep.h" - -#if defined(CONFIG_MADVISE) || defined(CONFIG_POSIX_MADVISE) -#include -#endif - -#include "block/block_int.h" -#include "qemu-common.h" -#include "qcow2.h" -#include "trace.h" - -typedef struct Qcow2CachedTable { - int64_t offset; - uint64_t lru_counter; - int ref; - bool dirty; -} Qcow2CachedTable; - -struct Qcow2Cache { - Qcow2CachedTable *entries; - struct Qcow2Cache *depends; - int size; - bool depends_on_flush; - void *table_array; - uint64_t lru_counter; - uint64_t cache_clean_lru_counter; -}; - -static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs, - Qcow2Cache *c, int table) -{ - BDRVQcow2State *s = bs->opaque; - return (uint8_t *) c->table_array + (size_t) table * s->cluster_size; -} - -static inline int qcow2_cache_get_table_idx(BlockDriverState *bs, - Qcow2Cache *c, void *table) -{ - BDRVQcow2State *s = bs->opaque; - ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array; - int idx = table_offset / s->cluster_size; - assert(idx >= 0 && idx < c->size && table_offset % s->cluster_size == 0); - return idx; -} - -static void qcow2_cache_table_release(BlockDriverState *bs, Qcow2Cache *c, - int i, int num_tables) -{ -#if QEMU_MADV_DONTNEED != QEMU_MADV_INVALID - BDRVQcow2State *s = bs->opaque; - void *t = qcow2_cache_get_table_addr(bs, c, i); - int align = getpagesize(); - size_t mem_size = (size_t) s->cluster_size * num_tables; - size_t offset = QEMU_ALIGN_UP((uintptr_t) t, align) - (uintptr_t) t; - size_t length = QEMU_ALIGN_DOWN(mem_size - offset, align); - if (length > 0) { - qemu_madvise((uint8_t *) t + offset, length, QEMU_MADV_DONTNEED); - } -#endif -} - -static inline bool can_clean_entry(Qcow2Cache *c, int i) -{ - Qcow2CachedTable *t = &c->entries[i]; - return t->ref == 0 && !t->dirty && t->offset != 0 && - t->lru_counter <= c->cache_clean_lru_counter; -} - -void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c) -{ - int i = 0; - while (i < 
c->size) { - int to_clean = 0; - - /* Skip the entries that we don't need to clean */ - while (i < c->size && !can_clean_entry(c, i)) { - i++; - } - - /* And count how many we can clean in a row */ - while (i < c->size && can_clean_entry(c, i)) { - c->entries[i].offset = 0; - c->entries[i].lru_counter = 0; - i++; - to_clean++; - } - - if (to_clean > 0) { - qcow2_cache_table_release(bs, c, i - to_clean, to_clean); - } - } - - c->cache_clean_lru_counter = c->lru_counter; -} - -Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables) -{ - BDRVQcow2State *s = bs->opaque; - Qcow2Cache *c; - - c = g_new0(Qcow2Cache, 1); - c->size = num_tables; - c->entries = g_try_new0(Qcow2CachedTable, num_tables); - c->table_array = qemu_try_blockalign(bs->file->bs, - (size_t) num_tables * s->cluster_size); - - if (!c->entries || !c->table_array) { - qemu_vfree(c->table_array); - g_free(c->entries); - g_free(c); - c = NULL; - } - - return c; -} - -int qcow2_cache_destroy(BlockDriverState *bs, Qcow2Cache *c) -{ - int i; - - for (i = 0; i < c->size; i++) { - assert(c->entries[i].ref == 0); - } - - qemu_vfree(c->table_array); - g_free(c->entries); - g_free(c); - - return 0; -} - -static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c) -{ - int ret; - - ret = qcow2_cache_flush(bs, c->depends); - if (ret < 0) { - return ret; - } - - c->depends = NULL; - c->depends_on_flush = false; - - return 0; -} - -static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) -{ - BDRVQcow2State *s = bs->opaque; - int ret = 0; - - if (!c->entries[i].dirty || !c->entries[i].offset) { - return 0; - } - - trace_qcow2_cache_entry_flush(qemu_coroutine_self(), - c == s->l2_table_cache, i); - - if (c->depends) { - ret = qcow2_cache_flush_dependency(bs, c); - } else if (c->depends_on_flush) { - ret = bdrv_flush(bs->file->bs); - if (ret >= 0) { - c->depends_on_flush = false; - } - } - - if (ret < 0) { - return ret; - } - - if (c == s->refcount_block_cache) { - ret 
= qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_BLOCK, - c->entries[i].offset, s->cluster_size); - } else if (c == s->l2_table_cache) { - ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, - c->entries[i].offset, s->cluster_size); - } else { - ret = qcow2_pre_write_overlap_check(bs, 0, - c->entries[i].offset, s->cluster_size); - } - - if (ret < 0) { - return ret; - } - - if (c == s->refcount_block_cache) { - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); - } else if (c == s->l2_table_cache) { - BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); - } - - ret = bdrv_pwrite(bs->file->bs, c->entries[i].offset, - qcow2_cache_get_table_addr(bs, c, i), s->cluster_size); - if (ret < 0) { - return ret; - } - - c->entries[i].dirty = false; - - return 0; -} - -int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) -{ - BDRVQcow2State *s = bs->opaque; - int result = 0; - int ret; - int i; - - trace_qcow2_cache_flush(qemu_coroutine_self(), c == s->l2_table_cache); - - for (i = 0; i < c->size; i++) { - ret = qcow2_cache_entry_flush(bs, c, i); - if (ret < 0 && result != -ENOSPC) { - result = ret; - } - } - - if (result == 0) { - ret = bdrv_flush(bs->file->bs); - if (ret < 0) { - result = ret; - } - } - - return result; -} - -int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, - Qcow2Cache *dependency) -{ - int ret; - - if (dependency->depends) { - ret = qcow2_cache_flush_dependency(bs, dependency); - if (ret < 0) { - return ret; - } - } - - if (c->depends && (c->depends != dependency)) { - ret = qcow2_cache_flush_dependency(bs, c); - if (ret < 0) { - return ret; - } - } - - c->depends = dependency; - return 0; -} - -void qcow2_cache_depends_on_flush(Qcow2Cache *c) -{ - c->depends_on_flush = true; -} - -int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c) -{ - int ret, i; - - ret = qcow2_cache_flush(bs, c); - if (ret < 0) { - return ret; - } - - for (i = 0; i < c->size; i++) { - assert(c->entries[i].ref == 0); - c->entries[i].offset = 0; - 
c->entries[i].lru_counter = 0; - } - - qcow2_cache_table_release(bs, c, 0, c->size); - - c->lru_counter = 0; - - return 0; -} - -static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, - uint64_t offset, void **table, bool read_from_disk) -{ - BDRVQcow2State *s = bs->opaque; - int i; - int ret; - int lookup_index; - uint64_t min_lru_counter = UINT64_MAX; - int min_lru_index = -1; - - trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache, - offset, read_from_disk); - - /* Check if the table is already cached */ - i = lookup_index = (offset / s->cluster_size * 4) % c->size; - do { - const Qcow2CachedTable *t = &c->entries[i]; - if (t->offset == offset) { - goto found; - } - if (t->ref == 0 && t->lru_counter < min_lru_counter) { - min_lru_counter = t->lru_counter; - min_lru_index = i; - } - if (++i == c->size) { - i = 0; - } - } while (i != lookup_index); - - if (min_lru_index == -1) { - /* This can't happen in current synchronous code, but leave the check - * here as a reminder for whoever starts using AIO with the cache */ - abort(); - } - - /* Cache miss: write a table back and replace it */ - i = min_lru_index; - trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(), - c == s->l2_table_cache, i); - - ret = qcow2_cache_entry_flush(bs, c, i); - if (ret < 0) { - return ret; - } - - trace_qcow2_cache_get_read(qemu_coroutine_self(), - c == s->l2_table_cache, i); - c->entries[i].offset = 0; - if (read_from_disk) { - if (c == s->l2_table_cache) { - BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); - } - - ret = bdrv_pread(bs->file->bs, offset, - qcow2_cache_get_table_addr(bs, c, i), - s->cluster_size); - if (ret < 0) { - return ret; - } - } - - c->entries[i].offset = offset; - - /* And return the right table */ -found: - c->entries[i].ref++; - *table = qcow2_cache_get_table_addr(bs, c, i); - - trace_qcow2_cache_get_done(qemu_coroutine_self(), - c == s->l2_table_cache, i); - - return 0; -} - -int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, 
uint64_t offset, - void **table) -{ - return qcow2_cache_do_get(bs, c, offset, table, true); -} - -int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, - void **table) -{ - return qcow2_cache_do_get(bs, c, offset, table, false); -} - -void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table) -{ - int i = qcow2_cache_get_table_idx(bs, c, *table); - - c->entries[i].ref--; - *table = NULL; - - if (c->entries[i].ref == 0) { - c->entries[i].lru_counter = ++c->lru_counter; - } - - assert(c->entries[i].ref >= 0); -} - -void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c, - void *table) -{ - int i = qcow2_cache_get_table_idx(bs, c, table); - assert(c->entries[i].offset != 0); - c->entries[i].dirty = true; -} diff --git a/qemu/block/qcow2-cluster.c b/qemu/block/qcow2-cluster.c deleted file mode 100644 index 31ecc1030..000000000 --- a/qemu/block/qcow2-cluster.c +++ /dev/null @@ -1,1899 +0,0 @@ -/* - * Block driver for the QCOW version 2 format - * - * Copyright (c) 2004-2006 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include "qemu/osdep.h" -#include - -#include "qapi/error.h" -#include "qemu-common.h" -#include "block/block_int.h" -#include "block/qcow2.h" -#include "trace.h" - -int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, - bool exact_size) -{ - BDRVQcow2State *s = bs->opaque; - int new_l1_size2, ret, i; - uint64_t *new_l1_table; - int64_t old_l1_table_offset, old_l1_size; - int64_t new_l1_table_offset, new_l1_size; - uint8_t data[12]; - - if (min_size <= s->l1_size) - return 0; - - /* Do a sanity check on min_size before trying to calculate new_l1_size - * (this prevents overflows during the while loop for the calculation of - * new_l1_size) */ - if (min_size > INT_MAX / sizeof(uint64_t)) { - return -EFBIG; - } - - if (exact_size) { - new_l1_size = min_size; - } else { - /* Bump size up to reduce the number of times we have to grow */ - new_l1_size = s->l1_size; - if (new_l1_size == 0) { - new_l1_size = 1; - } - while (min_size > new_l1_size) { - new_l1_size = (new_l1_size * 3 + 1) / 2; - } - } - - if (new_l1_size > INT_MAX / sizeof(uint64_t)) { - return -EFBIG; - } - -#ifdef DEBUG_ALLOC2 - fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n", - s->l1_size, new_l1_size); -#endif - - new_l1_size2 = sizeof(uint64_t) * new_l1_size; - new_l1_table = qemu_try_blockalign(bs->file->bs, - align_offset(new_l1_size2, 512)); - if (new_l1_table == NULL) { - return -ENOMEM; - } - memset(new_l1_table, 0, align_offset(new_l1_size2, 512)); - - memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t)); - - /* write new table (align to cluster) */ - BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE); - new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2); - if 
(new_l1_table_offset < 0) { - qemu_vfree(new_l1_table); - return new_l1_table_offset; - } - - ret = qcow2_cache_flush(bs, s->refcount_block_cache); - if (ret < 0) { - goto fail; - } - - /* the L1 position has not yet been updated, so these clusters must - * indeed be completely free */ - ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset, - new_l1_size2); - if (ret < 0) { - goto fail; - } - - BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); - for(i = 0; i < s->l1_size; i++) - new_l1_table[i] = cpu_to_be64(new_l1_table[i]); - ret = bdrv_pwrite_sync(bs->file->bs, new_l1_table_offset, - new_l1_table, new_l1_size2); - if (ret < 0) - goto fail; - for(i = 0; i < s->l1_size; i++) - new_l1_table[i] = be64_to_cpu(new_l1_table[i]); - - /* set new table */ - BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); - cpu_to_be32w((uint32_t*)data, new_l1_size); - stq_be_p(data + 4, new_l1_table_offset); - ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_size), - data, sizeof(data)); - if (ret < 0) { - goto fail; - } - qemu_vfree(s->l1_table); - old_l1_table_offset = s->l1_table_offset; - s->l1_table_offset = new_l1_table_offset; - s->l1_table = new_l1_table; - old_l1_size = s->l1_size; - s->l1_size = new_l1_size; - qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t), - QCOW2_DISCARD_OTHER); - return 0; - fail: - qemu_vfree(new_l1_table); - qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2, - QCOW2_DISCARD_OTHER); - return ret; -} - -/* - * l2_load - * - * Loads a L2 table into memory. If the table is in the cache, the cache - * is used; otherwise the L2 table is loaded from the image file. - * - * Returns a pointer to the L2 table on success, or NULL if the read from - * the image file failed. 
- */ - -static int l2_load(BlockDriverState *bs, uint64_t l2_offset, - uint64_t **l2_table) -{ - BDRVQcow2State *s = bs->opaque; - int ret; - - ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table); - - return ret; -} - -/* - * Writes one sector of the L1 table to the disk (can't update single entries - * and we really don't want bdrv_pread to perform a read-modify-write) - */ -#define L1_ENTRIES_PER_SECTOR (512 / 8) -int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t buf[L1_ENTRIES_PER_SECTOR] = { 0 }; - int l1_start_index; - int i, ret; - - l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1); - for (i = 0; i < L1_ENTRIES_PER_SECTOR && l1_start_index + i < s->l1_size; - i++) - { - buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]); - } - - ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1, - s->l1_table_offset + 8 * l1_start_index, sizeof(buf)); - if (ret < 0) { - return ret; - } - - BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); - ret = bdrv_pwrite_sync(bs->file->bs, - s->l1_table_offset + 8 * l1_start_index, - buf, sizeof(buf)); - if (ret < 0) { - return ret; - } - - return 0; -} - -/* - * l2_allocate - * - * Allocate a new l2 entry in the file. If l1_index points to an already - * used entry in the L2 table (i.e. we are doing a copy on write for the L2 - * table) copy the contents of the old L2 table into the newly allocated one. - * Otherwise the new table is initialized with zeros. 
- * - */ - -static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t old_l2_offset; - uint64_t *l2_table = NULL; - int64_t l2_offset; - int ret; - - old_l2_offset = s->l1_table[l1_index]; - - trace_qcow2_l2_allocate(bs, l1_index); - - /* allocate a new l2 entry */ - - l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t)); - if (l2_offset < 0) { - ret = l2_offset; - goto fail; - } - - ret = qcow2_cache_flush(bs, s->refcount_block_cache); - if (ret < 0) { - goto fail; - } - - /* allocate a new entry in the l2 cache */ - - trace_qcow2_l2_allocate_get_empty(bs, l1_index); - ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table); - if (ret < 0) { - goto fail; - } - - l2_table = *table; - - if ((old_l2_offset & L1E_OFFSET_MASK) == 0) { - /* if there was no old l2 table, clear the new table */ - memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); - } else { - uint64_t* old_table; - - /* if there was an old l2 table, read it from the disk */ - BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ); - ret = qcow2_cache_get(bs, s->l2_table_cache, - old_l2_offset & L1E_OFFSET_MASK, - (void**) &old_table); - if (ret < 0) { - goto fail; - } - - memcpy(l2_table, old_table, s->cluster_size); - - qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table); - } - - /* write the l2 table to the file */ - BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE); - - trace_qcow2_l2_allocate_write_l2(bs, l1_index); - qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); - ret = qcow2_cache_flush(bs, s->l2_table_cache); - if (ret < 0) { - goto fail; - } - - /* update the L1 entry */ - trace_qcow2_l2_allocate_write_l1(bs, l1_index); - s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; - ret = qcow2_write_l1_entry(bs, l1_index); - if (ret < 0) { - goto fail; - } - - *table = l2_table; - trace_qcow2_l2_allocate_done(bs, l1_index, 0); - return 0; - -fail: - trace_qcow2_l2_allocate_done(bs, 
l1_index, ret); - if (l2_table != NULL) { - qcow2_cache_put(bs, s->l2_table_cache, (void**) table); - } - s->l1_table[l1_index] = old_l2_offset; - if (l2_offset > 0) { - qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), - QCOW2_DISCARD_ALWAYS); - } - return ret; -} - -/* - * Checks how many clusters in a given L2 table are contiguous in the image - * file. As soon as one of the flags in the bitmask stop_flags changes compared - * to the first cluster, the search is stopped and the cluster is not counted - * as contiguous. (This allows it, for example, to stop at the first compressed - * cluster which may require a different handling) - */ -static int count_contiguous_clusters(int nb_clusters, int cluster_size, - uint64_t *l2_table, uint64_t stop_flags) -{ - int i; - uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED; - uint64_t first_entry = be64_to_cpu(l2_table[0]); - uint64_t offset = first_entry & mask; - - if (!offset) - return 0; - - assert(qcow2_get_cluster_type(first_entry) == QCOW2_CLUSTER_NORMAL); - - for (i = 0; i < nb_clusters; i++) { - uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask; - if (offset + (uint64_t) i * cluster_size != l2_entry) { - break; - } - } - - return i; -} - -static int count_contiguous_clusters_by_type(int nb_clusters, - uint64_t *l2_table, - int wanted_type) -{ - int i; - - for (i = 0; i < nb_clusters; i++) { - int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i])); - - if (type != wanted_type) { - break; - } - } - - return i; -} - -/* The crypt function is compatible with the linux cryptoloop - algorithm for < 4 GB images. 
NOTE: out_buf == in_buf is - supported */ -int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, - uint8_t *out_buf, const uint8_t *in_buf, - int nb_sectors, bool enc, - Error **errp) -{ - union { - uint64_t ll[2]; - uint8_t b[16]; - } ivec; - int i; - int ret; - - for(i = 0; i < nb_sectors; i++) { - ivec.ll[0] = cpu_to_le64(sector_num); - ivec.ll[1] = 0; - if (qcrypto_cipher_setiv(s->cipher, - ivec.b, G_N_ELEMENTS(ivec.b), - errp) < 0) { - return -1; - } - if (enc) { - ret = qcrypto_cipher_encrypt(s->cipher, - in_buf, - out_buf, - 512, - errp); - } else { - ret = qcrypto_cipher_decrypt(s->cipher, - in_buf, - out_buf, - 512, - errp); - } - if (ret < 0) { - return -1; - } - sector_num++; - in_buf += 512; - out_buf += 512; - } - return 0; -} - -static int coroutine_fn copy_sectors(BlockDriverState *bs, - uint64_t start_sect, - uint64_t cluster_offset, - int n_start, int n_end) -{ - BDRVQcow2State *s = bs->opaque; - QEMUIOVector qiov; - struct iovec iov; - int n, ret; - - n = n_end - n_start; - if (n <= 0) { - return 0; - } - - iov.iov_len = n * BDRV_SECTOR_SIZE; - iov.iov_base = qemu_try_blockalign(bs, iov.iov_len); - if (iov.iov_base == NULL) { - return -ENOMEM; - } - - qemu_iovec_init_external(&qiov, &iov, 1); - - BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); - - if (!bs->drv) { - ret = -ENOMEDIUM; - goto out; - } - - /* Call .bdrv_co_readv() directly instead of using the public block-layer - * interface. This avoids double I/O throttling and request tracking, - * which can lead to deadlock when block layer copy-on-read is enabled. 
- */ - ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov); - if (ret < 0) { - goto out; - } - - if (bs->encrypted) { - Error *err = NULL; - assert(s->cipher); - if (qcow2_encrypt_sectors(s, start_sect + n_start, - iov.iov_base, iov.iov_base, n, - true, &err) < 0) { - ret = -EIO; - error_free(err); - goto out; - } - } - - ret = qcow2_pre_write_overlap_check(bs, 0, - cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE); - if (ret < 0) { - goto out; - } - - BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); - ret = bdrv_co_writev(bs->file->bs, (cluster_offset >> 9) + n_start, n, - &qiov); - if (ret < 0) { - goto out; - } - - ret = 0; -out: - qemu_vfree(iov.iov_base); - return ret; -} - - -/* - * get_cluster_offset - * - * For a given offset of the disk image, find the cluster offset in - * qcow2 file. The offset is stored in *cluster_offset. - * - * on entry, *num is the number of contiguous sectors we'd like to - * access following offset. - * - * on exit, *num is the number of contiguous sectors we can read. - * - * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error - * cases. 
- */ -int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, - int *num, uint64_t *cluster_offset) -{ - BDRVQcow2State *s = bs->opaque; - unsigned int l2_index; - uint64_t l1_index, l2_offset, *l2_table; - int l1_bits, c; - unsigned int index_in_cluster, nb_clusters; - uint64_t nb_available, nb_needed; - int ret; - - index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1); - nb_needed = *num + index_in_cluster; - - l1_bits = s->l2_bits + s->cluster_bits; - - /* compute how many bytes there are between the offset and - * the end of the l1 entry - */ - - nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1)); - - /* compute the number of available sectors */ - - nb_available = (nb_available >> 9) + index_in_cluster; - - if (nb_needed > nb_available) { - nb_needed = nb_available; - } - assert(nb_needed <= INT_MAX); - - *cluster_offset = 0; - - /* seek to the l2 offset in the l1 table */ - - l1_index = offset >> l1_bits; - if (l1_index >= s->l1_size) { - ret = QCOW2_CLUSTER_UNALLOCATED; - goto out; - } - - l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; - if (!l2_offset) { - ret = QCOW2_CLUSTER_UNALLOCATED; - goto out; - } - - if (offset_into_cluster(s, l2_offset)) { - qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64 - " unaligned (L1 index: %#" PRIx64 ")", - l2_offset, l1_index); - return -EIO; - } - - /* load the l2 table in memory */ - - ret = l2_load(bs, l2_offset, &l2_table); - if (ret < 0) { - return ret; - } - - /* find the cluster offset for the given disk offset */ - - l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); - *cluster_offset = be64_to_cpu(l2_table[l2_index]); - - /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */ - nb_clusters = size_to_clusters(s, nb_needed << 9); - - ret = qcow2_get_cluster_type(*cluster_offset); - switch (ret) { - case QCOW2_CLUSTER_COMPRESSED: - /* Compressed clusters can only be processed one by one */ - c = 1; - *cluster_offset &= 
L2E_COMPRESSED_OFFSET_SIZE_MASK; - break; - case QCOW2_CLUSTER_ZERO: - if (s->qcow_version < 3) { - qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found" - " in pre-v3 image (L2 offset: %#" PRIx64 - ", L2 index: %#x)", l2_offset, l2_index); - ret = -EIO; - goto fail; - } - c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index], - QCOW2_CLUSTER_ZERO); - *cluster_offset = 0; - break; - case QCOW2_CLUSTER_UNALLOCATED: - /* how many empty clusters ? */ - c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index], - QCOW2_CLUSTER_UNALLOCATED); - *cluster_offset = 0; - break; - case QCOW2_CLUSTER_NORMAL: - /* how many allocated clusters ? */ - c = count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], QCOW_OFLAG_ZERO); - *cluster_offset &= L2E_OFFSET_MASK; - if (offset_into_cluster(s, *cluster_offset)) { - qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset %#" - PRIx64 " unaligned (L2 offset: %#" PRIx64 - ", L2 index: %#x)", *cluster_offset, - l2_offset, l2_index); - ret = -EIO; - goto fail; - } - break; - default: - abort(); - } - - qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - - nb_available = (c * s->cluster_sectors); - -out: - if (nb_available > nb_needed) - nb_available = nb_needed; - - *num = nb_available - index_in_cluster; - - return ret; - -fail: - qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table); - return ret; -} - -/* - * get_cluster_table - * - * for a given disk offset, load (and allocate if needed) - * the l2 table. - * - * the l2 table offset in the qcow2 file and the cluster index - * in the l2 table are given to the caller. 
- * - * Returns 0 on success, -errno in failure case - */ -static int get_cluster_table(BlockDriverState *bs, uint64_t offset, - uint64_t **new_l2_table, - int *new_l2_index) -{ - BDRVQcow2State *s = bs->opaque; - unsigned int l2_index; - uint64_t l1_index, l2_offset; - uint64_t *l2_table = NULL; - int ret; - - /* seek to the l2 offset in the l1 table */ - - l1_index = offset >> (s->l2_bits + s->cluster_bits); - if (l1_index >= s->l1_size) { - ret = qcow2_grow_l1_table(bs, l1_index + 1, false); - if (ret < 0) { - return ret; - } - } - - assert(l1_index < s->l1_size); - l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; - if (offset_into_cluster(s, l2_offset)) { - qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64 - " unaligned (L1 index: %#" PRIx64 ")", - l2_offset, l1_index); - return -EIO; - } - - /* seek the l2 table of the given l2 offset */ - - if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) { - /* load the l2 table in memory */ - ret = l2_load(bs, l2_offset, &l2_table); - if (ret < 0) { - return ret; - } - } else { - /* First allocate a new L2 table (and do COW if needed) */ - ret = l2_allocate(bs, l1_index, &l2_table); - if (ret < 0) { - return ret; - } - - /* Then decrease the refcount of the old table */ - if (l2_offset) { - qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), - QCOW2_DISCARD_OTHER); - } - } - - /* find the cluster offset for the given disk offset */ - - l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); - - *new_l2_table = l2_table; - *new_l2_index = l2_index; - - return 0; -} - -/* - * alloc_compressed_cluster_offset - * - * For a given offset of the disk image, return cluster offset in - * qcow2 file. - * - * If the offset is not found, allocate a new compressed cluster. - * - * Return the cluster offset if successful, - * Return 0, otherwise. 
- * - */ - -uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, - uint64_t offset, - int compressed_size) -{ - BDRVQcow2State *s = bs->opaque; - int l2_index, ret; - uint64_t *l2_table; - int64_t cluster_offset; - int nb_csectors; - - ret = get_cluster_table(bs, offset, &l2_table, &l2_index); - if (ret < 0) { - return 0; - } - - /* Compression can't overwrite anything. Fail if the cluster was already - * allocated. */ - cluster_offset = be64_to_cpu(l2_table[l2_index]); - if (cluster_offset & L2E_OFFSET_MASK) { - qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - return 0; - } - - cluster_offset = qcow2_alloc_bytes(bs, compressed_size); - if (cluster_offset < 0) { - qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - return 0; - } - - nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) - - (cluster_offset >> 9); - - cluster_offset |= QCOW_OFLAG_COMPRESSED | - ((uint64_t)nb_csectors << s->csize_shift); - - /* update L2 table */ - - /* compressed clusters never have the copied flag */ - - BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED); - qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); - l2_table[l2_index] = cpu_to_be64(cluster_offset); - qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); - - return cluster_offset; -} - -static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r) -{ - BDRVQcow2State *s = bs->opaque; - int ret; - - if (r->nb_sectors == 0) { - return 0; - } - - qemu_co_mutex_unlock(&s->lock); - ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset, - r->offset / BDRV_SECTOR_SIZE, - r->offset / BDRV_SECTOR_SIZE + r->nb_sectors); - qemu_co_mutex_lock(&s->lock); - - if (ret < 0) { - return ret; - } - - /* - * Before we update the L2 table to actually point to the new cluster, we - * need to be sure that the refcounts have been increased and COW was - * handled. 
- */ - qcow2_cache_depends_on_flush(s->l2_table_cache); - - return 0; -} - -int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) -{ - BDRVQcow2State *s = bs->opaque; - int i, j = 0, l2_index, ret; - uint64_t *old_cluster, *l2_table; - uint64_t cluster_offset = m->alloc_offset; - - trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters); - assert(m->nb_clusters > 0); - - old_cluster = g_try_new(uint64_t, m->nb_clusters); - if (old_cluster == NULL) { - ret = -ENOMEM; - goto err; - } - - /* copy content of unmodified sectors */ - ret = perform_cow(bs, m, &m->cow_start); - if (ret < 0) { - goto err; - } - - ret = perform_cow(bs, m, &m->cow_end); - if (ret < 0) { - goto err; - } - - /* Update L2 table. */ - if (s->use_lazy_refcounts) { - qcow2_mark_dirty(bs); - } - if (qcow2_need_accurate_refcounts(s)) { - qcow2_cache_set_dependency(bs, s->l2_table_cache, - s->refcount_block_cache); - } - - ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index); - if (ret < 0) { - goto err; - } - qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); - - assert(l2_index + m->nb_clusters <= s->l2_size); - for (i = 0; i < m->nb_clusters; i++) { - /* if two concurrent writes happen to the same unallocated cluster - * each write allocates separate cluster and writes data concurrently. - * The first one to complete updates l2 table with pointer to its - * cluster the second one has to do RMW (which is done above by - * copy_sectors()), update l2 table with its cluster pointer and free - * old cluster. This is what this loop does */ - if(l2_table[l2_index + i] != 0) - old_cluster[j++] = l2_table[l2_index + i]; - - l2_table[l2_index + i] = cpu_to_be64((cluster_offset + - (i << s->cluster_bits)) | QCOW_OFLAG_COPIED); - } - - - qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); - - /* - * If this was a COW, we need to decrease the refcount of the old cluster. - * - * Don't discard clusters that reach a refcount of 0 (e.g. 
compressed - * clusters), the next write will reuse them anyway. - */ - if (j != 0) { - for (i = 0; i < j; i++) { - qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1, - QCOW2_DISCARD_NEVER); - } - } - - ret = 0; -err: - g_free(old_cluster); - return ret; - } - -/* - * Returns the number of contiguous clusters that can be used for an allocating - * write, but require COW to be performed (this includes yet unallocated space, - * which must copy from the backing file) - */ -static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters, - uint64_t *l2_table, int l2_index) -{ - int i; - - for (i = 0; i < nb_clusters; i++) { - uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]); - int cluster_type = qcow2_get_cluster_type(l2_entry); - - switch(cluster_type) { - case QCOW2_CLUSTER_NORMAL: - if (l2_entry & QCOW_OFLAG_COPIED) { - goto out; - } - break; - case QCOW2_CLUSTER_UNALLOCATED: - case QCOW2_CLUSTER_COMPRESSED: - case QCOW2_CLUSTER_ZERO: - break; - default: - abort(); - } - } - -out: - assert(i <= nb_clusters); - return i; -} - -/* - * Check if there already is an AIO write request in flight which allocates - * the same cluster. In this case we need to wait until the previous - * request has completed and updated the L2 table accordingly. - * - * Returns: - * 0 if there was no dependency. *cur_bytes indicates the number of - * bytes from guest_offset that can be read before the next - * dependency must be processed (or the request is complete) - * - * -EAGAIN if we had to wait for another request, previously gathered - * information on cluster allocation may be invalid now. The caller - * must start over anyway, so consider *cur_bytes undefined. 
- */ -static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, - uint64_t *cur_bytes, QCowL2Meta **m) -{ - BDRVQcow2State *s = bs->opaque; - QCowL2Meta *old_alloc; - uint64_t bytes = *cur_bytes; - - QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) { - - uint64_t start = guest_offset; - uint64_t end = start + bytes; - uint64_t old_start = l2meta_cow_start(old_alloc); - uint64_t old_end = l2meta_cow_end(old_alloc); - - if (end <= old_start || start >= old_end) { - /* No intersection */ - } else { - if (start < old_start) { - /* Stop at the start of a running allocation */ - bytes = old_start - start; - } else { - bytes = 0; - } - - /* Stop if already an l2meta exists. After yielding, it wouldn't - * be valid any more, so we'd have to clean up the old L2Metas - * and deal with requests depending on them before starting to - * gather new ones. Not worth the trouble. */ - if (bytes == 0 && *m) { - *cur_bytes = 0; - return 0; - } - - if (bytes == 0) { - /* Wait for the dependency to complete. We need to recheck - * the free/allocated clusters when we continue. */ - qemu_co_mutex_unlock(&s->lock); - qemu_co_queue_wait(&old_alloc->dependent_requests); - qemu_co_mutex_lock(&s->lock); - return -EAGAIN; - } - } - } - - /* Make sure that existing clusters and new allocations are only used up to - * the next dependency if we shortened the request above */ - *cur_bytes = bytes; - - return 0; -} - -/* - * Checks how many already allocated clusters that don't require a copy on - * write there are at the given guest_offset (up to *bytes). If - * *host_offset is not zero, only physically contiguous clusters beginning at - * this host offset are counted. - * - * Note that guest_offset may not be cluster aligned. In this case, the - * returned *host_offset points to exact byte referenced by guest_offset and - * therefore isn't cluster aligned as well. - * - * Returns: - * 0: if no allocated clusters are available at the given offset. 
- * *bytes is normally unchanged. It is set to 0 if the cluster - * is allocated and doesn't need COW, but doesn't have the right - * physical offset. - * - * 1: if allocated clusters that don't require a COW are available at - * the requested offset. *bytes may have decreased and describes - * the length of the area that can be written to. - * - * -errno: in error cases - */ -static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, - uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) -{ - BDRVQcow2State *s = bs->opaque; - int l2_index; - uint64_t cluster_offset; - uint64_t *l2_table; - uint64_t nb_clusters; - unsigned int keep_clusters; - int ret; - - trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset, - *bytes); - - assert(*host_offset == 0 || offset_into_cluster(s, guest_offset) - == offset_into_cluster(s, *host_offset)); - - /* - * Calculate the number of clusters to look for. We stop at L2 table - * boundaries to keep things simple. - */ - nb_clusters = - size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); - - l2_index = offset_to_l2_index(s, guest_offset); - nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); - assert(nb_clusters <= INT_MAX); - - /* Find L2 entry for the first involved cluster */ - ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); - if (ret < 0) { - return ret; - } - - cluster_offset = be64_to_cpu(l2_table[l2_index]); - - /* Check how many clusters are already allocated and don't need COW */ - if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL - && (cluster_offset & QCOW_OFLAG_COPIED)) - { - /* If a specific host_offset is required, check it */ - bool offset_matches = - (cluster_offset & L2E_OFFSET_MASK) == *host_offset; - - if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) { - qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset " - "%#llx unaligned (guest offset: %#" PRIx64 - ")", cluster_offset & L2E_OFFSET_MASK, - 
guest_offset); - ret = -EIO; - goto out; - } - - if (*host_offset != 0 && !offset_matches) { - *bytes = 0; - ret = 0; - goto out; - } - - /* We keep all QCOW_OFLAG_COPIED clusters */ - keep_clusters = - count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], - QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); - assert(keep_clusters <= nb_clusters); - - *bytes = MIN(*bytes, - keep_clusters * s->cluster_size - - offset_into_cluster(s, guest_offset)); - - ret = 1; - } else { - ret = 0; - } - - /* Cleanup */ -out: - qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); - - /* Only return a host offset if we actually made progress. Otherwise we - * would make requirements for handle_alloc() that it can't fulfill */ - if (ret > 0) { - *host_offset = (cluster_offset & L2E_OFFSET_MASK) - + offset_into_cluster(s, guest_offset); - } - - return ret; -} - -/* - * Allocates new clusters for the given guest_offset. - * - * At most *nb_clusters are allocated, and on return *nb_clusters is updated to - * contain the number of clusters that have been allocated and are contiguous - * in the image file. - * - * If *host_offset is non-zero, it specifies the offset in the image file at - * which the new clusters must start. *nb_clusters can be 0 on return in this - * case if the cluster at host_offset is already in use. If *host_offset is - * zero, the clusters can be allocated anywhere in the image file. - * - * *host_offset is updated to contain the offset into the image file at which - * the first allocated cluster starts. - * - * Return 0 on success and -errno in error cases. -EAGAIN means that the - * function has been waiting for another request and the allocation must be - * restarted, but the whole request should not be failed. 
- */ -static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, - uint64_t *host_offset, uint64_t *nb_clusters) -{ - BDRVQcow2State *s = bs->opaque; - - trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset, - *host_offset, *nb_clusters); - - /* Allocate new clusters */ - trace_qcow2_cluster_alloc_phys(qemu_coroutine_self()); - if (*host_offset == 0) { - int64_t cluster_offset = - qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size); - if (cluster_offset < 0) { - return cluster_offset; - } - *host_offset = cluster_offset; - return 0; - } else { - int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); - if (ret < 0) { - return ret; - } - *nb_clusters = ret; - return 0; - } -} - -/* - * Allocates new clusters for an area that either is yet unallocated or needs a - * copy on write. If *host_offset is non-zero, clusters are only allocated if - * the new allocation can match the specified host offset. - * - * Note that guest_offset may not be cluster aligned. In this case, the - * returned *host_offset points to exact byte referenced by guest_offset and - * therefore isn't cluster aligned as well. - * - * Returns: - * 0: if no clusters could be allocated. *bytes is set to 0, - * *host_offset is left unchanged. - * - * 1: if new clusters were allocated. *bytes may be decreased if the - * new allocation doesn't cover all of the requested area. - * *host_offset is updated to contain the host offset of the first - * newly allocated cluster. 
- * - * -errno: in error cases - */ -static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, - uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) -{ - BDRVQcow2State *s = bs->opaque; - int l2_index; - uint64_t *l2_table; - uint64_t entry; - uint64_t nb_clusters; - int ret; - - uint64_t alloc_cluster_offset; - - trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset, - *bytes); - assert(*bytes > 0); - - /* - * Calculate the number of clusters to look for. We stop at L2 table - * boundaries to keep things simple. - */ - nb_clusters = - size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); - - l2_index = offset_to_l2_index(s, guest_offset); - nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); - assert(nb_clusters <= INT_MAX); - - /* Find L2 entry for the first involved cluster */ - ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); - if (ret < 0) { - return ret; - } - - entry = be64_to_cpu(l2_table[l2_index]); - - /* For the moment, overwrite compressed clusters one by one */ - if (entry & QCOW_OFLAG_COMPRESSED) { - nb_clusters = 1; - } else { - nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index); - } - - /* This function is only called when there were no non-COW clusters, so if - * we can't find any unallocated or COW clusters either, something is - * wrong with our code. */ - assert(nb_clusters > 0); - - qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); - - /* Allocate, if necessary at a given offset in the image file */ - alloc_cluster_offset = start_of_cluster(s, *host_offset); - ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, - &nb_clusters); - if (ret < 0) { - goto fail; - } - - /* Can't extend contiguous allocation */ - if (nb_clusters == 0) { - *bytes = 0; - return 0; - } - - /* !*host_offset would overwrite the image header and is reserved for "no - * host offset preferred". 
If 0 was a valid host offset, it'd trigger the - * following overlap check; do that now to avoid having an invalid value in - * *host_offset. */ - if (!alloc_cluster_offset) { - ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset, - nb_clusters * s->cluster_size); - assert(ret < 0); - goto fail; - } - - /* - * Save info needed for meta data update. - * - * requested_sectors: Number of sectors from the start of the first - * newly allocated cluster to the end of the (possibly shortened - * before) write request. - * - * avail_sectors: Number of sectors from the start of the first - * newly allocated to the end of the last newly allocated cluster. - * - * nb_sectors: The number of sectors from the start of the first - * newly allocated cluster to the end of the area that the write - * request actually writes to (excluding COW at the end) - */ - int requested_sectors = - (*bytes + offset_into_cluster(s, guest_offset)) - >> BDRV_SECTOR_BITS; - int avail_sectors = nb_clusters - << (s->cluster_bits - BDRV_SECTOR_BITS); - int alloc_n_start = offset_into_cluster(s, guest_offset) - >> BDRV_SECTOR_BITS; - int nb_sectors = MIN(requested_sectors, avail_sectors); - QCowL2Meta *old_m = *m; - - *m = g_malloc0(sizeof(**m)); - - **m = (QCowL2Meta) { - .next = old_m, - - .alloc_offset = alloc_cluster_offset, - .offset = start_of_cluster(s, guest_offset), - .nb_clusters = nb_clusters, - .nb_available = nb_sectors, - - .cow_start = { - .offset = 0, - .nb_sectors = alloc_n_start, - }, - .cow_end = { - .offset = nb_sectors * BDRV_SECTOR_SIZE, - .nb_sectors = avail_sectors - nb_sectors, - }, - }; - qemu_co_queue_init(&(*m)->dependent_requests); - QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); - - *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset); - *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE) - - offset_into_cluster(s, guest_offset)); - assert(*bytes != 0); - - return 1; - -fail: - if (*m && (*m)->nb_clusters > 0) { - 
QLIST_REMOVE(*m, next_in_flight); - } - return ret; -} - -/* - * alloc_cluster_offset - * - * For a given offset on the virtual disk, find the cluster offset in qcow2 - * file. If the offset is not found, allocate a new cluster. - * - * If the cluster was already allocated, m->nb_clusters is set to 0 and - * other fields in m are meaningless. - * - * If the cluster is newly allocated, m->nb_clusters is set to the number of - * contiguous clusters that have been allocated. In this case, the other - * fields of m are valid and contain information about the first allocated - * cluster. - * - * If the request conflicts with another write request in flight, the coroutine - * is queued and will be reentered when the dependency has completed. - * - * Return 0 on success and -errno in error cases - */ -int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int *num, uint64_t *host_offset, QCowL2Meta **m) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t start, remaining; - uint64_t cluster_offset; - uint64_t cur_bytes; - int ret; - - trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num); - - assert((offset & ~BDRV_SECTOR_MASK) == 0); - -again: - start = offset; - remaining = (uint64_t)*num << BDRV_SECTOR_BITS; - cluster_offset = 0; - *host_offset = 0; - cur_bytes = 0; - *m = NULL; - - while (true) { - - if (!*host_offset) { - *host_offset = start_of_cluster(s, cluster_offset); - } - - assert(remaining >= cur_bytes); - - start += cur_bytes; - remaining -= cur_bytes; - cluster_offset += cur_bytes; - - if (remaining == 0) { - break; - } - - cur_bytes = remaining; - - /* - * Now start gathering as many contiguous clusters as possible: - * - * 1. Check for overlaps with in-flight allocations - * - * a) Overlap not in the first cluster -> shorten this request and - * let the caller handle the rest in its next loop iteration. - * - * b) Real overlaps of two requests. 
Yield and restart the search - * for contiguous clusters (the situation could have changed - * while we were sleeping) - * - * c) TODO: Request starts in the same cluster as the in-flight - * allocation ends. Shorten the COW of the in-fight allocation, - * set cluster_offset to write to the same cluster and set up - * the right synchronisation between the in-flight request and - * the new one. - */ - ret = handle_dependencies(bs, start, &cur_bytes, m); - if (ret == -EAGAIN) { - /* Currently handle_dependencies() doesn't yield if we already had - * an allocation. If it did, we would have to clean up the L2Meta - * structs before starting over. */ - assert(*m == NULL); - goto again; - } else if (ret < 0) { - return ret; - } else if (cur_bytes == 0) { - break; - } else { - /* handle_dependencies() may have decreased cur_bytes (shortened - * the allocations below) so that the next dependency is processed - * correctly during the next loop iteration. */ - } - - /* - * 2. Count contiguous COPIED clusters. - */ - ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m); - if (ret < 0) { - return ret; - } else if (ret) { - continue; - } else if (cur_bytes == 0) { - break; - } - - /* - * 3. If the request still hasn't completed, allocate new clusters, - * considering any cluster_offset of steps 1c or 2. 
- */ - ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m); - if (ret < 0) { - return ret; - } else if (ret) { - continue; - } else { - assert(cur_bytes == 0); - break; - } - } - - *num -= remaining >> BDRV_SECTOR_BITS; - assert(*num > 0); - assert(*host_offset != 0); - - return 0; -} - -static int decompress_buffer(uint8_t *out_buf, int out_buf_size, - const uint8_t *buf, int buf_size) -{ - z_stream strm1, *strm = &strm1; - int ret, out_len; - - memset(strm, 0, sizeof(*strm)); - - strm->next_in = (uint8_t *)buf; - strm->avail_in = buf_size; - strm->next_out = out_buf; - strm->avail_out = out_buf_size; - - ret = inflateInit2(strm, -12); - if (ret != Z_OK) - return -1; - ret = inflate(strm, Z_FINISH); - out_len = strm->next_out - out_buf; - if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || - out_len != out_buf_size) { - inflateEnd(strm); - return -1; - } - inflateEnd(strm); - return 0; -} - -int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) -{ - BDRVQcow2State *s = bs->opaque; - int ret, csize, nb_csectors, sector_offset; - uint64_t coffset; - - coffset = cluster_offset & s->cluster_offset_mask; - if (s->cluster_cache_offset != coffset) { - nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; - sector_offset = coffset & 511; - csize = nb_csectors * 512 - sector_offset; - BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); - ret = bdrv_read(bs->file->bs, coffset >> 9, s->cluster_data, - nb_csectors); - if (ret < 0) { - return ret; - } - if (decompress_buffer(s->cluster_cache, s->cluster_size, - s->cluster_data + sector_offset, csize) < 0) { - return -EIO; - } - s->cluster_cache_offset = coffset; - } - return 0; -} - -/* - * This discards as many clusters of nb_clusters as possible at once (i.e. - * all clusters in the same L2 table) and returns the number of discarded - * clusters. 
- */ -static int discard_single_l2(BlockDriverState *bs, uint64_t offset, - uint64_t nb_clusters, enum qcow2_discard_type type, - bool full_discard) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t *l2_table; - int l2_index; - int ret; - int i; - - ret = get_cluster_table(bs, offset, &l2_table, &l2_index); - if (ret < 0) { - return ret; - } - - /* Limit nb_clusters to one L2 table */ - nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); - assert(nb_clusters <= INT_MAX); - - for (i = 0; i < nb_clusters; i++) { - uint64_t old_l2_entry; - - old_l2_entry = be64_to_cpu(l2_table[l2_index + i]); - - /* - * If full_discard is false, make sure that a discarded area reads back - * as zeroes for v3 images (we cannot do it for v2 without actually - * writing a zero-filled buffer). We can skip the operation if the - * cluster is already marked as zero, or if it's unallocated and we - * don't have a backing file. - * - * TODO We might want to use bdrv_get_block_status(bs) here, but we're - * holding s->lock, so that doesn't work today. - * - * If full_discard is true, the sector should not read back as zeroes, - * but rather fall through to the backing file. 
- */ - switch (qcow2_get_cluster_type(old_l2_entry)) { - case QCOW2_CLUSTER_UNALLOCATED: - if (full_discard || !bs->backing) { - continue; - } - break; - - case QCOW2_CLUSTER_ZERO: - if (!full_discard) { - continue; - } - break; - - case QCOW2_CLUSTER_NORMAL: - case QCOW2_CLUSTER_COMPRESSED: - break; - - default: - abort(); - } - - /* First remove L2 entries */ - qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); - if (!full_discard && s->qcow_version >= 3) { - l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); - } else { - l2_table[l2_index + i] = cpu_to_be64(0); - } - - /* Then decrease the refcount */ - qcow2_free_any_clusters(bs, old_l2_entry, 1, type); - } - - qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); - - return nb_clusters; -} - -int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, - int nb_sectors, enum qcow2_discard_type type, bool full_discard) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t end_offset; - uint64_t nb_clusters; - int ret; - - end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS); - - /* Round start up and end down */ - offset = align_offset(offset, s->cluster_size); - end_offset = start_of_cluster(s, end_offset); - - if (offset > end_offset) { - return 0; - } - - nb_clusters = size_to_clusters(s, end_offset - offset); - - s->cache_discards = true; - - /* Each L2 table is handled by its own loop iteration */ - while (nb_clusters > 0) { - ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard); - if (ret < 0) { - goto fail; - } - - nb_clusters -= ret; - offset += (ret * s->cluster_size); - } - - ret = 0; -fail: - s->cache_discards = false; - qcow2_process_discards(bs, ret); - - return ret; -} - -/* - * This zeroes as many clusters of nb_clusters as possible at once (i.e. - * all clusters in the same L2 table) and returns the number of zeroed - * clusters. 
- */ -static int zero_single_l2(BlockDriverState *bs, uint64_t offset, - uint64_t nb_clusters) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t *l2_table; - int l2_index; - int ret; - int i; - - ret = get_cluster_table(bs, offset, &l2_table, &l2_index); - if (ret < 0) { - return ret; - } - - /* Limit nb_clusters to one L2 table */ - nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); - assert(nb_clusters <= INT_MAX); - - for (i = 0; i < nb_clusters; i++) { - uint64_t old_offset; - - old_offset = be64_to_cpu(l2_table[l2_index + i]); - - /* Update L2 entries */ - qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); - if (old_offset & QCOW_OFLAG_COMPRESSED) { - l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); - qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); - } else { - l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO); - } - } - - qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); - - return nb_clusters; -} - -int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t nb_clusters; - int ret; - - /* The zero flag is only supported by version 3 and newer */ - if (s->qcow_version < 3) { - return -ENOTSUP; - } - - /* Each L2 table is handled by its own loop iteration */ - nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS); - - s->cache_discards = true; - - while (nb_clusters > 0) { - ret = zero_single_l2(bs, offset, nb_clusters); - if (ret < 0) { - goto fail; - } - - nb_clusters -= ret; - offset += (ret * s->cluster_size); - } - - ret = 0; -fail: - s->cache_discards = false; - qcow2_process_discards(bs, ret); - - return ret; -} - -/* - * Expands all zero clusters in a specific L1 table (or deallocates them, for - * non-backed non-pre-allocated zero clusters). - * - * l1_entries and *visited_l1_entries are used to keep track of progress for - * status_cb(). 
l1_entries contains the total number of L1 entries and - * *visited_l1_entries counts all visited L1 entries. - */ -static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, - int l1_size, int64_t *visited_l1_entries, - int64_t l1_entries, - BlockDriverAmendStatusCB *status_cb, - void *cb_opaque) -{ - BDRVQcow2State *s = bs->opaque; - bool is_active_l1 = (l1_table == s->l1_table); - uint64_t *l2_table = NULL; - int ret; - int i, j; - - if (!is_active_l1) { - /* inactive L2 tables require a buffer to be stored in when loading - * them from disk */ - l2_table = qemu_try_blockalign(bs->file->bs, s->cluster_size); - if (l2_table == NULL) { - return -ENOMEM; - } - } - - for (i = 0; i < l1_size; i++) { - uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK; - bool l2_dirty = false; - uint64_t l2_refcount; - - if (!l2_offset) { - /* unallocated */ - (*visited_l1_entries)++; - if (status_cb) { - status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque); - } - continue; - } - - if (offset_into_cluster(s, l2_offset)) { - qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" - PRIx64 " unaligned (L1 index: %#x)", - l2_offset, i); - ret = -EIO; - goto fail; - } - - if (is_active_l1) { - /* get active L2 tables from cache */ - ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, - (void **)&l2_table); - } else { - /* load inactive L2 tables from disk */ - ret = bdrv_read(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE, - (void *)l2_table, s->cluster_sectors); - } - if (ret < 0) { - goto fail; - } - - ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, - &l2_refcount); - if (ret < 0) { - goto fail; - } - - for (j = 0; j < s->l2_size; j++) { - uint64_t l2_entry = be64_to_cpu(l2_table[j]); - int64_t offset = l2_entry & L2E_OFFSET_MASK; - int cluster_type = qcow2_get_cluster_type(l2_entry); - bool preallocated = offset != 0; - - if (cluster_type != QCOW2_CLUSTER_ZERO) { - continue; - } - - if (!preallocated) { - if (!bs->backing) { - /* not backed; 
therefore we can simply deallocate the - * cluster */ - l2_table[j] = 0; - l2_dirty = true; - continue; - } - - offset = qcow2_alloc_clusters(bs, s->cluster_size); - if (offset < 0) { - ret = offset; - goto fail; - } - - if (l2_refcount > 1) { - /* For shared L2 tables, set the refcount accordingly (it is - * already 1 and needs to be l2_refcount) */ - ret = qcow2_update_cluster_refcount(bs, - offset >> s->cluster_bits, - refcount_diff(1, l2_refcount), false, - QCOW2_DISCARD_OTHER); - if (ret < 0) { - qcow2_free_clusters(bs, offset, s->cluster_size, - QCOW2_DISCARD_OTHER); - goto fail; - } - } - } - - if (offset_into_cluster(s, offset)) { - qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset " - "%#" PRIx64 " unaligned (L2 offset: %#" - PRIx64 ", L2 index: %#x)", offset, - l2_offset, j); - if (!preallocated) { - qcow2_free_clusters(bs, offset, s->cluster_size, - QCOW2_DISCARD_ALWAYS); - } - ret = -EIO; - goto fail; - } - - ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size); - if (ret < 0) { - if (!preallocated) { - qcow2_free_clusters(bs, offset, s->cluster_size, - QCOW2_DISCARD_ALWAYS); - } - goto fail; - } - - ret = bdrv_write_zeroes(bs->file->bs, offset / BDRV_SECTOR_SIZE, - s->cluster_sectors, 0); - if (ret < 0) { - if (!preallocated) { - qcow2_free_clusters(bs, offset, s->cluster_size, - QCOW2_DISCARD_ALWAYS); - } - goto fail; - } - - if (l2_refcount == 1) { - l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED); - } else { - l2_table[j] = cpu_to_be64(offset); - } - l2_dirty = true; - } - - if (is_active_l1) { - if (l2_dirty) { - qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); - qcow2_cache_depends_on_flush(s->l2_table_cache); - } - qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); - } else { - if (l2_dirty) { - ret = qcow2_pre_write_overlap_check(bs, - QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset, - s->cluster_size); - if (ret < 0) { - goto fail; - } - - ret = bdrv_write(bs->file->bs, l2_offset / 
BDRV_SECTOR_SIZE, - (void *)l2_table, s->cluster_sectors); - if (ret < 0) { - goto fail; - } - } - } - - (*visited_l1_entries)++; - if (status_cb) { - status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque); - } - } - - ret = 0; - -fail: - if (l2_table) { - if (!is_active_l1) { - qemu_vfree(l2_table); - } else { - qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); - } - } - return ret; -} - -/* - * For backed images, expands all zero clusters on the image. For non-backed - * images, deallocates all non-pre-allocated zero clusters (and claims the - * allocation for pre-allocated ones). This is important for downgrading to a - * qcow2 version which doesn't yet support metadata zero clusters. - */ -int qcow2_expand_zero_clusters(BlockDriverState *bs, - BlockDriverAmendStatusCB *status_cb, - void *cb_opaque) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t *l1_table = NULL; - int64_t l1_entries = 0, visited_l1_entries = 0; - int ret; - int i, j; - - if (status_cb) { - l1_entries = s->l1_size; - for (i = 0; i < s->nb_snapshots; i++) { - l1_entries += s->snapshots[i].l1_size; - } - } - - ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size, - &visited_l1_entries, l1_entries, - status_cb, cb_opaque); - if (ret < 0) { - goto fail; - } - - /* Inactive L1 tables may point to active L2 tables - therefore it is - * necessary to flush the L2 table cache before trying to access the L2 - * tables pointed to by inactive L1 entries (else we might try to expand - * zero clusters that have already been expanded); furthermore, it is also - * necessary to empty the L2 table cache, since it may contain tables which - * are now going to be modified directly on disk, bypassing the cache. - * qcow2_cache_empty() does both for us. 
*/ - ret = qcow2_cache_empty(bs, s->l2_table_cache); - if (ret < 0) { - goto fail; - } - - for (i = 0; i < s->nb_snapshots; i++) { - int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) + - BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE; - - l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE); - - ret = bdrv_read(bs->file->bs, - s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE, - (void *)l1_table, l1_sectors); - if (ret < 0) { - goto fail; - } - - for (j = 0; j < s->snapshots[i].l1_size; j++) { - be64_to_cpus(&l1_table[j]); - } - - ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size, - &visited_l1_entries, l1_entries, - status_cb, cb_opaque); - if (ret < 0) { - goto fail; - } - } - - ret = 0; - -fail: - g_free(l1_table); - return ret; -} diff --git a/qemu/block/qcow2-refcount.c b/qemu/block/qcow2-refcount.c deleted file mode 100644 index ca6094ff5..000000000 --- a/qemu/block/qcow2-refcount.c +++ /dev/null @@ -1,2921 +0,0 @@ -/* - * Block driver for the QCOW version 2 format - * - * Copyright (c) 2004-2006 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu-common.h" -#include "block/block_int.h" -#include "block/qcow2.h" -#include "qemu/range.h" - -static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size); -static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, - int64_t offset, int64_t length, uint64_t addend, - bool decrease, enum qcow2_discard_type type); - -static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index); -static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index); -static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index); -static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index); -static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index); -static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index); -static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index); - -static void set_refcount_ro0(void *refcount_array, uint64_t index, - uint64_t value); -static void set_refcount_ro1(void *refcount_array, uint64_t index, - uint64_t value); -static void set_refcount_ro2(void *refcount_array, uint64_t index, - uint64_t value); -static void set_refcount_ro3(void *refcount_array, uint64_t index, - uint64_t value); -static void set_refcount_ro4(void *refcount_array, uint64_t index, - uint64_t value); -static void set_refcount_ro5(void *refcount_array, uint64_t index, - uint64_t value); -static void set_refcount_ro6(void *refcount_array, uint64_t index, - uint64_t value); - - -static Qcow2GetRefcountFunc *const get_refcount_funcs[] = { - &get_refcount_ro0, - &get_refcount_ro1, - &get_refcount_ro2, - 
&get_refcount_ro3, - &get_refcount_ro4, - &get_refcount_ro5, - &get_refcount_ro6 -}; - -static Qcow2SetRefcountFunc *const set_refcount_funcs[] = { - &set_refcount_ro0, - &set_refcount_ro1, - &set_refcount_ro2, - &set_refcount_ro3, - &set_refcount_ro4, - &set_refcount_ro5, - &set_refcount_ro6 -}; - - -/*********************************************************/ -/* refcount handling */ - -int qcow2_refcount_init(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - unsigned int refcount_table_size2, i; - int ret; - - assert(s->refcount_order >= 0 && s->refcount_order <= 6); - - s->get_refcount = get_refcount_funcs[s->refcount_order]; - s->set_refcount = set_refcount_funcs[s->refcount_order]; - - assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t)); - refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); - s->refcount_table = g_try_malloc(refcount_table_size2); - - if (s->refcount_table_size > 0) { - if (s->refcount_table == NULL) { - ret = -ENOMEM; - goto fail; - } - BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); - ret = bdrv_pread(bs->file->bs, s->refcount_table_offset, - s->refcount_table, refcount_table_size2); - if (ret < 0) { - goto fail; - } - for(i = 0; i < s->refcount_table_size; i++) - be64_to_cpus(&s->refcount_table[i]); - } - return 0; - fail: - return ret; -} - -void qcow2_refcount_close(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - g_free(s->refcount_table); -} - - -static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index) -{ - return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1; -} - -static void set_refcount_ro0(void *refcount_array, uint64_t index, - uint64_t value) -{ - assert(!(value >> 1)); - ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8)); - ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8); -} - -static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index) -{ - return (((const uint8_t *)refcount_array)[index / 4] >> (2 
* (index % 4))) - & 0x3; -} - -static void set_refcount_ro1(void *refcount_array, uint64_t index, - uint64_t value) -{ - assert(!(value >> 2)); - ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4))); - ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4)); -} - -static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index) -{ - return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2))) - & 0xf; -} - -static void set_refcount_ro2(void *refcount_array, uint64_t index, - uint64_t value) -{ - assert(!(value >> 4)); - ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2))); - ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2)); -} - -static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index) -{ - return ((const uint8_t *)refcount_array)[index]; -} - -static void set_refcount_ro3(void *refcount_array, uint64_t index, - uint64_t value) -{ - assert(!(value >> 8)); - ((uint8_t *)refcount_array)[index] = value; -} - -static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index) -{ - return be16_to_cpu(((const uint16_t *)refcount_array)[index]); -} - -static void set_refcount_ro4(void *refcount_array, uint64_t index, - uint64_t value) -{ - assert(!(value >> 16)); - ((uint16_t *)refcount_array)[index] = cpu_to_be16(value); -} - -static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index) -{ - return be32_to_cpu(((const uint32_t *)refcount_array)[index]); -} - -static void set_refcount_ro5(void *refcount_array, uint64_t index, - uint64_t value) -{ - assert(!(value >> 32)); - ((uint32_t *)refcount_array)[index] = cpu_to_be32(value); -} - -static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index) -{ - return be64_to_cpu(((const uint64_t *)refcount_array)[index]); -} - -static void set_refcount_ro6(void *refcount_array, uint64_t index, - uint64_t value) -{ - ((uint64_t *)refcount_array)[index] = cpu_to_be64(value); -} - 
- -static int load_refcount_block(BlockDriverState *bs, - int64_t refcount_block_offset, - void **refcount_block) -{ - BDRVQcow2State *s = bs->opaque; - int ret; - - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD); - ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, - refcount_block); - - return ret; -} - -/* - * Retrieves the refcount of the cluster given by its index and stores it in - * *refcount. Returns 0 on success and -errno on failure. - */ -int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index, - uint64_t *refcount) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t refcount_table_index, block_index; - int64_t refcount_block_offset; - int ret; - void *refcount_block; - - refcount_table_index = cluster_index >> s->refcount_block_bits; - if (refcount_table_index >= s->refcount_table_size) { - *refcount = 0; - return 0; - } - refcount_block_offset = - s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; - if (!refcount_block_offset) { - *refcount = 0; - return 0; - } - - if (offset_into_cluster(s, refcount_block_offset)) { - qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64 - " unaligned (reftable index: %#" PRIx64 ")", - refcount_block_offset, refcount_table_index); - return -EIO; - } - - ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, - &refcount_block); - if (ret < 0) { - return ret; - } - - block_index = cluster_index & (s->refcount_block_size - 1); - *refcount = s->get_refcount(refcount_block, block_index); - - qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); - - return 0; -} - -/* - * Rounds the refcount table size up to avoid growing the table for each single - * refcount block that is allocated. 
- */ -static unsigned int next_refcount_table_size(BDRVQcow2State *s, - unsigned int min_size) -{ - unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1; - unsigned int refcount_table_clusters = - MAX(1, s->refcount_table_size >> (s->cluster_bits - 3)); - - while (min_clusters > refcount_table_clusters) { - refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2; - } - - return refcount_table_clusters << (s->cluster_bits - 3); -} - - -/* Checks if two offsets are described by the same refcount block */ -static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a, - uint64_t offset_b) -{ - uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits); - uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits); - - return (block_a == block_b); -} - -/* - * Loads a refcount block. If it doesn't exist yet, it is allocated first - * (including growing the refcount table if needed). - * - * Returns 0 on success or -errno in error case - */ -static int alloc_refcount_block(BlockDriverState *bs, - int64_t cluster_index, void **refcount_block) -{ - BDRVQcow2State *s = bs->opaque; - unsigned int refcount_table_index; - int ret; - - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); - - /* Find the refcount block for the given cluster */ - refcount_table_index = cluster_index >> s->refcount_block_bits; - - if (refcount_table_index < s->refcount_table_size) { - - uint64_t refcount_block_offset = - s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; - - /* If it's already there, we're done */ - if (refcount_block_offset) { - if (offset_into_cluster(s, refcount_block_offset)) { - qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" - PRIx64 " unaligned (reftable index: " - "%#x)", refcount_block_offset, - refcount_table_index); - return -EIO; - } - - return load_refcount_block(bs, refcount_block_offset, - refcount_block); - } - } - - /* - * If we came here, we need to allocate something. 
Something is at least - * a cluster for the new refcount block. It may also include a new refcount - * table if the old refcount table is too small. - * - * Note that allocating clusters here needs some special care: - * - * - We can't use the normal qcow2_alloc_clusters(), it would try to - * increase the refcount and very likely we would end up with an endless - * recursion. Instead we must place the refcount blocks in a way that - * they can describe them themselves. - * - * - We need to consider that at this point we are inside update_refcounts - * and potentially doing an initial refcount increase. This means that - * some clusters have already been allocated by the caller, but their - * refcount isn't accurate yet. If we allocate clusters for metadata, we - * need to return -EAGAIN to signal the caller that it needs to restart - * the search for free clusters. - * - * - alloc_clusters_noref and qcow2_free_clusters may load a different - * refcount block into the cache - */ - - *refcount_block = NULL; - - /* We write to the refcount table, so we might depend on L2 tables */ - ret = qcow2_cache_flush(bs, s->l2_table_cache); - if (ret < 0) { - return ret; - } - - /* Allocate the refcount block itself and mark it as used */ - int64_t new_block = alloc_clusters_noref(bs, s->cluster_size); - if (new_block < 0) { - return new_block; - } - -#ifdef DEBUG_ALLOC2 - fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64 - " at %" PRIx64 "\n", - refcount_table_index, cluster_index << s->cluster_bits, new_block); -#endif - - if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) { - /* Zero the new refcount block before updating it */ - ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, - refcount_block); - if (ret < 0) { - goto fail_block; - } - - memset(*refcount_block, 0, s->cluster_size); - - /* The block describes itself, need to update the cache */ - int block_index = (new_block >> s->cluster_bits) & - 
(s->refcount_block_size - 1); - s->set_refcount(*refcount_block, block_index, 1); - } else { - /* Described somewhere else. This can recurse at most twice before we - * arrive at a block that describes itself. */ - ret = update_refcount(bs, new_block, s->cluster_size, 1, false, - QCOW2_DISCARD_NEVER); - if (ret < 0) { - goto fail_block; - } - - ret = qcow2_cache_flush(bs, s->refcount_block_cache); - if (ret < 0) { - goto fail_block; - } - - /* Initialize the new refcount block only after updating its refcount, - * update_refcount uses the refcount cache itself */ - ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, - refcount_block); - if (ret < 0) { - goto fail_block; - } - - memset(*refcount_block, 0, s->cluster_size); - } - - /* Now the new refcount block needs to be written to disk */ - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE); - qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block); - ret = qcow2_cache_flush(bs, s->refcount_block_cache); - if (ret < 0) { - goto fail_block; - } - - /* If the refcount table is big enough, just hook the block up there */ - if (refcount_table_index < s->refcount_table_size) { - uint64_t data64 = cpu_to_be64(new_block); - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP); - ret = bdrv_pwrite_sync(bs->file->bs, - s->refcount_table_offset + refcount_table_index * sizeof(uint64_t), - &data64, sizeof(data64)); - if (ret < 0) { - goto fail_block; - } - - s->refcount_table[refcount_table_index] = new_block; - - /* The new refcount block may be where the caller intended to put its - * data, so let it restart the search. */ - return -EAGAIN; - } - - qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); - - /* - * If we come here, we need to grow the refcount table. Again, a new - * refcount table needs some space and we can't simply allocate to avoid - * endless recursion. 
- * - * Therefore let's grab new refcount blocks at the end of the image, which - * will describe themselves and the new refcount table. This way we can - * reference them only in the new table and do the switch to the new - * refcount table at once without producing an inconsistent state in - * between. - */ - BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW); - - /* Calculate the number of refcount blocks needed so far; this will be the - * basis for calculating the index of the first cluster used for the - * self-describing refcount structures which we are about to create. - * - * Because we reached this point, there cannot be any refcount entries for - * cluster_index or higher indices yet. However, because new_block has been - * allocated to describe that cluster (and it will assume this role later - * on), we cannot use that index; also, new_block may actually have a higher - * cluster index than cluster_index, so it needs to be taken into account - * here (and 1 needs to be added to its value because that cluster is used). 
- */ - uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1, - (new_block >> s->cluster_bits) + 1), - s->refcount_block_size); - - if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) { - return -EFBIG; - } - - /* And now we need at least one block more for the new metadata */ - uint64_t table_size = next_refcount_table_size(s, blocks_used + 1); - uint64_t last_table_size; - uint64_t blocks_clusters; - do { - uint64_t table_clusters = - size_to_clusters(s, table_size * sizeof(uint64_t)); - blocks_clusters = 1 + - ((table_clusters + s->refcount_block_size - 1) - / s->refcount_block_size); - uint64_t meta_clusters = table_clusters + blocks_clusters; - - last_table_size = table_size; - table_size = next_refcount_table_size(s, blocks_used + - ((meta_clusters + s->refcount_block_size - 1) - / s->refcount_block_size)); - - } while (last_table_size != table_size); - -#ifdef DEBUG_ALLOC2 - fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n", - s->refcount_table_size, table_size); -#endif - - /* Create the new refcount table and blocks */ - uint64_t meta_offset = (blocks_used * s->refcount_block_size) * - s->cluster_size; - uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size; - uint64_t *new_table = g_try_new0(uint64_t, table_size); - void *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size); - - assert(table_size > 0 && blocks_clusters > 0); - if (new_table == NULL || new_blocks == NULL) { - ret = -ENOMEM; - goto fail_table; - } - - /* Fill the new refcount table */ - memcpy(new_table, s->refcount_table, - s->refcount_table_size * sizeof(uint64_t)); - new_table[refcount_table_index] = new_block; - - int i; - for (i = 0; i < blocks_clusters; i++) { - new_table[blocks_used + i] = meta_offset + (i * s->cluster_size); - } - - /* Fill the refcount blocks */ - uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t)); - int block = 0; - for (i = 0; i < table_clusters + blocks_clusters; i++) { - 
s->set_refcount(new_blocks, block++, 1); - } - - /* Write refcount blocks to disk */ - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); - ret = bdrv_pwrite_sync(bs->file->bs, meta_offset, new_blocks, - blocks_clusters * s->cluster_size); - g_free(new_blocks); - new_blocks = NULL; - if (ret < 0) { - goto fail_table; - } - - /* Write refcount table to disk */ - for(i = 0; i < table_size; i++) { - cpu_to_be64s(&new_table[i]); - } - - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE); - ret = bdrv_pwrite_sync(bs->file->bs, table_offset, new_table, - table_size * sizeof(uint64_t)); - if (ret < 0) { - goto fail_table; - } - - for(i = 0; i < table_size; i++) { - be64_to_cpus(&new_table[i]); - } - - /* Hook up the new refcount table in the qcow2 header */ - struct QEMU_PACKED { - uint64_t d64; - uint32_t d32; - } data; - cpu_to_be64w(&data.d64, table_offset); - cpu_to_be32w(&data.d32, table_clusters); - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE); - ret = bdrv_pwrite_sync(bs->file->bs, - offsetof(QCowHeader, refcount_table_offset), - &data, sizeof(data)); - if (ret < 0) { - goto fail_table; - } - - /* And switch it in memory */ - uint64_t old_table_offset = s->refcount_table_offset; - uint64_t old_table_size = s->refcount_table_size; - - g_free(s->refcount_table); - s->refcount_table = new_table; - s->refcount_table_size = table_size; - s->refcount_table_offset = table_offset; - - /* Free old table. */ - qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t), - QCOW2_DISCARD_OTHER); - - ret = load_refcount_block(bs, new_block, refcount_block); - if (ret < 0) { - return ret; - } - - /* If we were trying to do the initial refcount update for some cluster - * allocation, we might have used the same clusters to store newly - * allocated metadata. Make the caller search some new space. 
*/ - return -EAGAIN; - -fail_table: - g_free(new_blocks); - g_free(new_table); -fail_block: - if (*refcount_block != NULL) { - qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); - } - return ret; -} - -void qcow2_process_discards(BlockDriverState *bs, int ret) -{ - BDRVQcow2State *s = bs->opaque; - Qcow2DiscardRegion *d, *next; - - QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) { - QTAILQ_REMOVE(&s->discards, d, next); - - /* Discard is optional, ignore the return value */ - if (ret >= 0) { - bdrv_discard(bs->file->bs, - d->offset >> BDRV_SECTOR_BITS, - d->bytes >> BDRV_SECTOR_BITS); - } - - g_free(d); - } -} - -static void update_refcount_discard(BlockDriverState *bs, - uint64_t offset, uint64_t length) -{ - BDRVQcow2State *s = bs->opaque; - Qcow2DiscardRegion *d, *p, *next; - - QTAILQ_FOREACH(d, &s->discards, next) { - uint64_t new_start = MIN(offset, d->offset); - uint64_t new_end = MAX(offset + length, d->offset + d->bytes); - - if (new_end - new_start <= length + d->bytes) { - /* There can't be any overlap, areas ending up here have no - * references any more and therefore shouldn't get freed another - * time. */ - assert(d->bytes + length == new_end - new_start); - d->offset = new_start; - d->bytes = new_end - new_start; - goto found; - } - } - - d = g_malloc(sizeof(*d)); - *d = (Qcow2DiscardRegion) { - .bs = bs, - .offset = offset, - .bytes = length, - }; - QTAILQ_INSERT_TAIL(&s->discards, d, next); - -found: - /* Merge discard requests if they are adjacent now */ - QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) { - if (p == d - || p->offset > d->offset + d->bytes - || d->offset > p->offset + p->bytes) - { - continue; - } - - /* Still no overlap possible */ - assert(p->offset == d->offset + d->bytes - || d->offset == p->offset + p->bytes); - - QTAILQ_REMOVE(&s->discards, p, next); - d->offset = MIN(d->offset, p->offset); - d->bytes += p->bytes; - g_free(p); - } -} - -/* XXX: cache several refcount block clusters ? 
*/ -/* @addend is the absolute value of the addend; if @decrease is set, @addend - * will be subtracted from the current refcount, otherwise it will be added */ -static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, - int64_t offset, - int64_t length, - uint64_t addend, - bool decrease, - enum qcow2_discard_type type) -{ - BDRVQcow2State *s = bs->opaque; - int64_t start, last, cluster_offset; - void *refcount_block = NULL; - int64_t old_table_index = -1; - int ret; - -#ifdef DEBUG_ALLOC2 - fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 - " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "", - addend); -#endif - if (length < 0) { - return -EINVAL; - } else if (length == 0) { - return 0; - } - - if (decrease) { - qcow2_cache_set_dependency(bs, s->refcount_block_cache, - s->l2_table_cache); - } - - start = start_of_cluster(s, offset); - last = start_of_cluster(s, offset + length - 1); - for(cluster_offset = start; cluster_offset <= last; - cluster_offset += s->cluster_size) - { - int block_index; - uint64_t refcount; - int64_t cluster_index = cluster_offset >> s->cluster_bits; - int64_t table_index = cluster_index >> s->refcount_block_bits; - - /* Load the refcount block and allocate it if needed */ - if (table_index != old_table_index) { - if (refcount_block) { - qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); - } - ret = alloc_refcount_block(bs, cluster_index, &refcount_block); - if (ret < 0) { - goto fail; - } - } - old_table_index = table_index; - - qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, - refcount_block); - - /* we can update the count and save it */ - block_index = cluster_index & (s->refcount_block_size - 1); - - refcount = s->get_refcount(refcount_block, block_index); - if (decrease ? 
(refcount - addend > refcount) - : (refcount + addend < refcount || - refcount + addend > s->refcount_max)) - { - ret = -EINVAL; - goto fail; - } - if (decrease) { - refcount -= addend; - } else { - refcount += addend; - } - if (refcount == 0 && cluster_index < s->free_cluster_index) { - s->free_cluster_index = cluster_index; - } - s->set_refcount(refcount_block, block_index, refcount); - - if (refcount == 0 && s->discard_passthrough[type]) { - update_refcount_discard(bs, cluster_offset, s->cluster_size); - } - } - - ret = 0; -fail: - if (!s->cache_discards) { - qcow2_process_discards(bs, ret); - } - - /* Write last changed block to disk */ - if (refcount_block) { - qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); - } - - /* - * Try do undo any updates if an error is returned (This may succeed in - * some cases like ENOSPC for allocating a new refcount block) - */ - if (ret < 0) { - int dummy; - dummy = update_refcount(bs, offset, cluster_offset - offset, addend, - !decrease, QCOW2_DISCARD_NEVER); - (void)dummy; - } - - return ret; -} - -/* - * Increases or decreases the refcount of a given cluster. - * - * @addend is the absolute value of the addend; if @decrease is set, @addend - * will be subtracted from the current refcount, otherwise it will be added. - * - * On success 0 is returned; on failure -errno is returned. 
- */ -int qcow2_update_cluster_refcount(BlockDriverState *bs, - int64_t cluster_index, - uint64_t addend, bool decrease, - enum qcow2_discard_type type) -{ - BDRVQcow2State *s = bs->opaque; - int ret; - - ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend, - decrease, type); - if (ret < 0) { - return ret; - } - - return 0; -} - - - -/*********************************************************/ -/* cluster allocation functions */ - - - -/* return < 0 if error */ -static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t i, nb_clusters, refcount; - int ret; - - /* We can't allocate clusters if they may still be queued for discard. */ - if (s->cache_discards) { - qcow2_process_discards(bs, 0); - } - - nb_clusters = size_to_clusters(s, size); -retry: - for(i = 0; i < nb_clusters; i++) { - uint64_t next_cluster_index = s->free_cluster_index++; - ret = qcow2_get_refcount(bs, next_cluster_index, &refcount); - - if (ret < 0) { - return ret; - } else if (refcount != 0) { - goto retry; - } - } - - /* Make sure that all offsets in the "allocated" range are representable - * in an int64_t */ - if (s->free_cluster_index > 0 && - s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits)) - { - return -EFBIG; - } - -#ifdef DEBUG_ALLOC2 - fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n", - size, - (s->free_cluster_index - nb_clusters) << s->cluster_bits); -#endif - return (s->free_cluster_index - nb_clusters) << s->cluster_bits; -} - -int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) -{ - int64_t offset; - int ret; - - BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); - do { - offset = alloc_clusters_noref(bs, size); - if (offset < 0) { - return offset; - } - - ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); - } while (ret == -EAGAIN); - - if (ret < 0) { - return ret; - } - - return offset; -} - -int64_t qcow2_alloc_clusters_at(BlockDriverState 
*bs, uint64_t offset, - int64_t nb_clusters) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t cluster_index, refcount; - uint64_t i; - int ret; - - assert(nb_clusters >= 0); - if (nb_clusters == 0) { - return 0; - } - - do { - /* Check how many clusters there are free */ - cluster_index = offset >> s->cluster_bits; - for(i = 0; i < nb_clusters; i++) { - ret = qcow2_get_refcount(bs, cluster_index++, &refcount); - if (ret < 0) { - return ret; - } else if (refcount != 0) { - break; - } - } - - /* And then allocate them */ - ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false, - QCOW2_DISCARD_NEVER); - } while (ret == -EAGAIN); - - if (ret < 0) { - return ret; - } - - return i; -} - -/* only used to allocate compressed sectors. We try to allocate - contiguous sectors. size must be <= cluster_size */ -int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) -{ - BDRVQcow2State *s = bs->opaque; - int64_t offset; - size_t free_in_cluster; - int ret; - - BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES); - assert(size > 0 && size <= s->cluster_size); - assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset)); - - offset = s->free_byte_offset; - - if (offset) { - uint64_t refcount; - ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount); - if (ret < 0) { - return ret; - } - - if (refcount == s->refcount_max) { - offset = 0; - } - } - - free_in_cluster = s->cluster_size - offset_into_cluster(s, offset); - do { - if (!offset || free_in_cluster < size) { - int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size); - if (new_cluster < 0) { - return new_cluster; - } - - if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) { - offset = new_cluster; - free_in_cluster = s->cluster_size; - } else { - free_in_cluster += s->cluster_size; - } - } - - assert(offset); - ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); - if (ret < 0) { - offset = 0; - } - } while (ret == -EAGAIN); - if (ret < 0) { - 
return ret; - } - - /* The cluster refcount was incremented; refcount blocks must be flushed - * before the caller's L2 table updates. */ - qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); - - s->free_byte_offset = offset + size; - if (!offset_into_cluster(s, s->free_byte_offset)) { - s->free_byte_offset = 0; - } - - return offset; -} - -void qcow2_free_clusters(BlockDriverState *bs, - int64_t offset, int64_t size, - enum qcow2_discard_type type) -{ - int ret; - - BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE); - ret = update_refcount(bs, offset, size, 1, true, type); - if (ret < 0) { - fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret)); - /* TODO Remember the clusters to free them later and avoid leaking */ - } -} - -/* - * Free a cluster using its L2 entry (handles clusters of all types, e.g. - * normal cluster, compressed cluster, etc.) - */ -void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, - int nb_clusters, enum qcow2_discard_type type) -{ - BDRVQcow2State *s = bs->opaque; - - switch (qcow2_get_cluster_type(l2_entry)) { - case QCOW2_CLUSTER_COMPRESSED: - { - int nb_csectors; - nb_csectors = ((l2_entry >> s->csize_shift) & - s->csize_mask) + 1; - qcow2_free_clusters(bs, - (l2_entry & s->cluster_offset_mask) & ~511, - nb_csectors * 512, type); - } - break; - case QCOW2_CLUSTER_NORMAL: - case QCOW2_CLUSTER_ZERO: - if (l2_entry & L2E_OFFSET_MASK) { - if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) { - qcow2_signal_corruption(bs, false, -1, -1, - "Cannot free unaligned cluster %#llx", - l2_entry & L2E_OFFSET_MASK); - } else { - qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, - nb_clusters << s->cluster_bits, type); - } - } - break; - case QCOW2_CLUSTER_UNALLOCATED: - break; - default: - abort(); - } -} - - - -/*********************************************************/ -/* snapshots and image creation */ - - - -/* update the refcounts of snapshots and the copied flag */ -int 
qcow2_update_snapshot_refcount(BlockDriverState *bs, - int64_t l1_table_offset, int l1_size, int addend) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount; - bool l1_allocated = false; - int64_t old_offset, old_l2_offset; - int i, j, l1_modified = 0, nb_csectors; - int ret; - - assert(addend >= -1 && addend <= 1); - - l2_table = NULL; - l1_table = NULL; - l1_size2 = l1_size * sizeof(uint64_t); - - s->cache_discards = true; - - /* WARNING: qcow2_snapshot_goto relies on this function not using the - * l1_table_offset when it is the current s->l1_table_offset! Be careful - * when changing this! */ - if (l1_table_offset != s->l1_table_offset) { - l1_table = g_try_malloc0(align_offset(l1_size2, 512)); - if (l1_size2 && l1_table == NULL) { - ret = -ENOMEM; - goto fail; - } - l1_allocated = true; - - ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2); - if (ret < 0) { - goto fail; - } - - for(i = 0;i < l1_size; i++) - be64_to_cpus(&l1_table[i]); - } else { - assert(l1_size == s->l1_size); - l1_table = s->l1_table; - l1_allocated = false; - } - - for(i = 0; i < l1_size; i++) { - l2_offset = l1_table[i]; - if (l2_offset) { - old_l2_offset = l2_offset; - l2_offset &= L1E_OFFSET_MASK; - - if (offset_into_cluster(s, l2_offset)) { - qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" - PRIx64 " unaligned (L1 index: %#x)", - l2_offset, i); - ret = -EIO; - goto fail; - } - - ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, - (void**) &l2_table); - if (ret < 0) { - goto fail; - } - - for(j = 0; j < s->l2_size; j++) { - uint64_t cluster_index; - - offset = be64_to_cpu(l2_table[j]); - old_offset = offset; - offset &= ~QCOW_OFLAG_COPIED; - - switch (qcow2_get_cluster_type(offset)) { - case QCOW2_CLUSTER_COMPRESSED: - nb_csectors = ((offset >> s->csize_shift) & - s->csize_mask) + 1; - if (addend != 0) { - ret = update_refcount(bs, - (offset & s->cluster_offset_mask) & ~511, - nb_csectors * 
512, abs(addend), addend < 0, - QCOW2_DISCARD_SNAPSHOT); - if (ret < 0) { - goto fail; - } - } - /* compressed clusters are never modified */ - refcount = 2; - break; - - case QCOW2_CLUSTER_NORMAL: - case QCOW2_CLUSTER_ZERO: - if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) { - qcow2_signal_corruption(bs, true, -1, -1, "Data " - "cluster offset %#llx " - "unaligned (L2 offset: %#" - PRIx64 ", L2 index: %#x)", - offset & L2E_OFFSET_MASK, - l2_offset, j); - ret = -EIO; - goto fail; - } - - cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; - if (!cluster_index) { - /* unallocated */ - refcount = 0; - break; - } - if (addend != 0) { - ret = qcow2_update_cluster_refcount(bs, - cluster_index, abs(addend), addend < 0, - QCOW2_DISCARD_SNAPSHOT); - if (ret < 0) { - goto fail; - } - } - - ret = qcow2_get_refcount(bs, cluster_index, &refcount); - if (ret < 0) { - goto fail; - } - break; - - case QCOW2_CLUSTER_UNALLOCATED: - refcount = 0; - break; - - default: - abort(); - } - - if (refcount == 1) { - offset |= QCOW_OFLAG_COPIED; - } - if (offset != old_offset) { - if (addend > 0) { - qcow2_cache_set_dependency(bs, s->l2_table_cache, - s->refcount_block_cache); - } - l2_table[j] = cpu_to_be64(offset); - qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, - l2_table); - } - } - - qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); - - if (addend != 0) { - ret = qcow2_update_cluster_refcount(bs, l2_offset >> - s->cluster_bits, - abs(addend), addend < 0, - QCOW2_DISCARD_SNAPSHOT); - if (ret < 0) { - goto fail; - } - } - ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, - &refcount); - if (ret < 0) { - goto fail; - } else if (refcount == 1) { - l2_offset |= QCOW_OFLAG_COPIED; - } - if (l2_offset != old_l2_offset) { - l1_table[i] = l2_offset; - l1_modified = 1; - } - } - } - - ret = bdrv_flush(bs); -fail: - if (l2_table) { - qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); - } - - s->cache_discards = false; - 
qcow2_process_discards(bs, ret); - - /* Update L1 only if it isn't deleted anyway (addend = -1) */ - if (ret == 0 && addend >= 0 && l1_modified) { - for (i = 0; i < l1_size; i++) { - cpu_to_be64s(&l1_table[i]); - } - - ret = bdrv_pwrite_sync(bs->file->bs, l1_table_offset, - l1_table, l1_size2); - - for (i = 0; i < l1_size; i++) { - be64_to_cpus(&l1_table[i]); - } - } - if (l1_allocated) - g_free(l1_table); - return ret; -} - - - - -/*********************************************************/ -/* refcount checking functions */ - - -static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries) -{ - /* This assertion holds because there is no way we can address more than - * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because - * offsets have to be representable in bytes); due to every cluster - * corresponding to one refcount entry, we are well below that limit */ - assert(entries < (UINT64_C(1) << (64 - 9))); - - /* Thanks to the assertion this will not overflow, because - * s->refcount_order < 7. - * (note: x << s->refcount_order == x * s->refcount_bits) */ - return DIV_ROUND_UP(entries << s->refcount_order, 8); -} - -/** - * Reallocates *array so that it can hold new_size entries. *size must contain - * the current number of entries in *array. If the reallocation fails, *array - * and *size will not be modified and -errno will be returned. If the - * reallocation is successful, *array will be set to the new buffer, *size - * will be set to new_size and 0 will be returned. The size of the reallocated - * refcount array buffer will be aligned to a cluster boundary, and the newly - * allocated area will be zeroed. 
- */ -static int realloc_refcount_array(BDRVQcow2State *s, void **array, - int64_t *size, int64_t new_size) -{ - int64_t old_byte_size, new_byte_size; - void *new_ptr; - - /* Round to clusters so the array can be directly written to disk */ - old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size)) - * s->cluster_size; - new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size)) - * s->cluster_size; - - if (new_byte_size == old_byte_size) { - *size = new_size; - return 0; - } - - assert(new_byte_size > 0); - - if (new_byte_size > SIZE_MAX) { - return -ENOMEM; - } - - new_ptr = g_try_realloc(*array, new_byte_size); - if (!new_ptr) { - return -ENOMEM; - } - - if (new_byte_size > old_byte_size) { - memset((char *)new_ptr + old_byte_size, 0, - new_byte_size - old_byte_size); - } - - *array = new_ptr; - *size = new_size; - - return 0; -} - -/* - * Increases the refcount for a range of clusters in a given refcount table. - * This is used to construct a temporary refcount table out of L1 and L2 tables - * which can be compared to the refcount table saved in the image. - * - * Modifies the number of errors in res. 
- */ -static int inc_refcounts(BlockDriverState *bs, - BdrvCheckResult *res, - void **refcount_table, - int64_t *refcount_table_size, - int64_t offset, int64_t size) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t start, last, cluster_offset, k, refcount; - int ret; - - if (size <= 0) { - return 0; - } - - start = start_of_cluster(s, offset); - last = start_of_cluster(s, offset + size - 1); - for(cluster_offset = start; cluster_offset <= last; - cluster_offset += s->cluster_size) { - k = cluster_offset >> s->cluster_bits; - if (k >= *refcount_table_size) { - ret = realloc_refcount_array(s, refcount_table, - refcount_table_size, k + 1); - if (ret < 0) { - res->check_errors++; - return ret; - } - } - - refcount = s->get_refcount(*refcount_table, k); - if (refcount == s->refcount_max) { - fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64 - "\n", cluster_offset); - fprintf(stderr, "Use qemu-img amend to increase the refcount entry " - "width or qemu-img convert to create a clean copy if the " - "image cannot be opened for writing\n"); - res->corruptions++; - continue; - } - s->set_refcount(*refcount_table, k, refcount + 1); - } - - return 0; -} - -/* Flags for check_refcounts_l1() and check_refcounts_l2() */ -enum { - CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */ -}; - -/* - * Increases the refcount in the given refcount table for the all clusters - * referenced in the L2 table. While doing so, performs some checks on L2 - * entries. - * - * Returns the number of errors found by the checks or -errno if an internal - * error occurred. 
- */ -static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, - void **refcount_table, - int64_t *refcount_table_size, int64_t l2_offset, - int flags) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t *l2_table, l2_entry; - uint64_t next_contiguous_offset = 0; - int i, l2_size, nb_csectors, ret; - - /* Read L2 table from disk */ - l2_size = s->l2_size * sizeof(uint64_t); - l2_table = g_malloc(l2_size); - - ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, l2_size); - if (ret < 0) { - fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); - res->check_errors++; - goto fail; - } - - /* Do the actual checks */ - for(i = 0; i < s->l2_size; i++) { - l2_entry = be64_to_cpu(l2_table[i]); - - switch (qcow2_get_cluster_type(l2_entry)) { - case QCOW2_CLUSTER_COMPRESSED: - /* Compressed clusters don't have QCOW_OFLAG_COPIED */ - if (l2_entry & QCOW_OFLAG_COPIED) { - fprintf(stderr, "ERROR: cluster %" PRId64 ": " - "copied flag must never be set for compressed " - "clusters\n", l2_entry >> s->cluster_bits); - l2_entry &= ~QCOW_OFLAG_COPIED; - res->corruptions++; - } - - /* Mark cluster as used */ - nb_csectors = ((l2_entry >> s->csize_shift) & - s->csize_mask) + 1; - l2_entry &= s->cluster_offset_mask; - ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, - l2_entry & ~511, nb_csectors * 512); - if (ret < 0) { - goto fail; - } - - if (flags & CHECK_FRAG_INFO) { - res->bfi.allocated_clusters++; - res->bfi.compressed_clusters++; - - /* Compressed clusters are fragmented by nature. Since they - * take up sub-sector space but we only have sector granularity - * I/O we need to re-read the same sectors even for adjacent - * compressed clusters. 
- */ - res->bfi.fragmented_clusters++; - } - break; - - case QCOW2_CLUSTER_ZERO: - if ((l2_entry & L2E_OFFSET_MASK) == 0) { - break; - } - /* fall through */ - - case QCOW2_CLUSTER_NORMAL: - { - uint64_t offset = l2_entry & L2E_OFFSET_MASK; - - if (flags & CHECK_FRAG_INFO) { - res->bfi.allocated_clusters++; - if (next_contiguous_offset && - offset != next_contiguous_offset) { - res->bfi.fragmented_clusters++; - } - next_contiguous_offset = offset + s->cluster_size; - } - - /* Mark cluster as used */ - ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, - offset, s->cluster_size); - if (ret < 0) { - goto fail; - } - - /* Correct offsets are cluster aligned */ - if (offset_into_cluster(s, offset)) { - fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not " - "properly aligned; L2 entry corrupted.\n", offset); - res->corruptions++; - } - break; - } - - case QCOW2_CLUSTER_UNALLOCATED: - break; - - default: - abort(); - } - } - - g_free(l2_table); - return 0; - -fail: - g_free(l2_table); - return ret; -} - -/* - * Increases the refcount for the L1 table, its L2 tables and all referenced - * clusters in the given refcount table. While doing so, performs some checks - * on L1 and L2 entries. - * - * Returns the number of errors found by the checks or -errno if an internal - * error occurred. 
- */ -static int check_refcounts_l1(BlockDriverState *bs, - BdrvCheckResult *res, - void **refcount_table, - int64_t *refcount_table_size, - int64_t l1_table_offset, int l1_size, - int flags) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t *l1_table = NULL, l2_offset, l1_size2; - int i, ret; - - l1_size2 = l1_size * sizeof(uint64_t); - - /* Mark L1 table as used */ - ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, - l1_table_offset, l1_size2); - if (ret < 0) { - goto fail; - } - - /* Read L1 table entries from disk */ - if (l1_size2 > 0) { - l1_table = g_try_malloc(l1_size2); - if (l1_table == NULL) { - ret = -ENOMEM; - res->check_errors++; - goto fail; - } - ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2); - if (ret < 0) { - fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); - res->check_errors++; - goto fail; - } - for(i = 0;i < l1_size; i++) - be64_to_cpus(&l1_table[i]); - } - - /* Do the actual checks */ - for(i = 0; i < l1_size; i++) { - l2_offset = l1_table[i]; - if (l2_offset) { - /* Mark L2 table as used */ - l2_offset &= L1E_OFFSET_MASK; - ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, - l2_offset, s->cluster_size); - if (ret < 0) { - goto fail; - } - - /* L2 tables are cluster aligned */ - if (offset_into_cluster(s, l2_offset)) { - fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " - "cluster aligned; L1 entry corrupted\n", l2_offset); - res->corruptions++; - } - - /* Process and check L2 entries */ - ret = check_refcounts_l2(bs, res, refcount_table, - refcount_table_size, l2_offset, flags); - if (ret < 0) { - goto fail; - } - } - } - g_free(l1_table); - return 0; - -fail: - g_free(l1_table); - return ret; -} - -/* - * Checks the OFLAG_COPIED flag for all L1 and L2 entries. 
- * - * This function does not print an error message nor does it increment - * check_errors if qcow2_get_refcount fails (this is because such an error will - * have been already detected and sufficiently signaled by the calling function - * (qcow2_check_refcounts) by the time this function is called). - */ -static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, - BdrvCheckMode fix) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size); - int ret; - uint64_t refcount; - int i, j; - - for (i = 0; i < s->l1_size; i++) { - uint64_t l1_entry = s->l1_table[i]; - uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK; - bool l2_dirty = false; - - if (!l2_offset) { - continue; - } - - ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, - &refcount); - if (ret < 0) { - /* don't print message nor increment check_errors */ - continue; - } - if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) { - fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d " - "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n", - fix & BDRV_FIX_ERRORS ? "Repairing" : - "ERROR", - i, l1_entry, refcount); - if (fix & BDRV_FIX_ERRORS) { - s->l1_table[i] = refcount == 1 - ? 
l1_entry | QCOW_OFLAG_COPIED - : l1_entry & ~QCOW_OFLAG_COPIED; - ret = qcow2_write_l1_entry(bs, i); - if (ret < 0) { - res->check_errors++; - goto fail; - } - res->corruptions_fixed++; - } else { - res->corruptions++; - } - } - - ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, - s->l2_size * sizeof(uint64_t)); - if (ret < 0) { - fprintf(stderr, "ERROR: Could not read L2 table: %s\n", - strerror(-ret)); - res->check_errors++; - goto fail; - } - - for (j = 0; j < s->l2_size; j++) { - uint64_t l2_entry = be64_to_cpu(l2_table[j]); - uint64_t data_offset = l2_entry & L2E_OFFSET_MASK; - int cluster_type = qcow2_get_cluster_type(l2_entry); - - if ((cluster_type == QCOW2_CLUSTER_NORMAL) || - ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) { - ret = qcow2_get_refcount(bs, - data_offset >> s->cluster_bits, - &refcount); - if (ret < 0) { - /* don't print message nor increment check_errors */ - continue; - } - if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { - fprintf(stderr, "%s OFLAG_COPIED data cluster: " - "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n", - fix & BDRV_FIX_ERRORS ? "Repairing" : - "ERROR", - l2_entry, refcount); - if (fix & BDRV_FIX_ERRORS) { - l2_table[j] = cpu_to_be64(refcount == 1 - ? 
l2_entry | QCOW_OFLAG_COPIED - : l2_entry & ~QCOW_OFLAG_COPIED); - l2_dirty = true; - res->corruptions_fixed++; - } else { - res->corruptions++; - } - } - } - } - - if (l2_dirty) { - ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, - l2_offset, s->cluster_size); - if (ret < 0) { - fprintf(stderr, "ERROR: Could not write L2 table; metadata " - "overlap check failed: %s\n", strerror(-ret)); - res->check_errors++; - goto fail; - } - - ret = bdrv_pwrite(bs->file->bs, l2_offset, l2_table, - s->cluster_size); - if (ret < 0) { - fprintf(stderr, "ERROR: Could not write L2 table: %s\n", - strerror(-ret)); - res->check_errors++; - goto fail; - } - } - } - - ret = 0; - -fail: - qemu_vfree(l2_table); - return ret; -} - -/* - * Checks consistency of refblocks and accounts for each refblock in - * *refcount_table. - */ -static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, - BdrvCheckMode fix, bool *rebuild, - void **refcount_table, int64_t *nb_clusters) -{ - BDRVQcow2State *s = bs->opaque; - int64_t i, size; - int ret; - - for(i = 0; i < s->refcount_table_size; i++) { - uint64_t offset, cluster; - offset = s->refcount_table[i]; - cluster = offset >> s->cluster_bits; - - /* Refcount blocks are cluster aligned */ - if (offset_into_cluster(s, offset)) { - fprintf(stderr, "ERROR refcount block %" PRId64 " is not " - "cluster aligned; refcount table entry corrupted\n", i); - res->corruptions++; - *rebuild = true; - continue; - } - - if (cluster >= *nb_clusters) { - fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n", - fix & BDRV_FIX_ERRORS ? 
"Repairing" : "ERROR", i); - - if (fix & BDRV_FIX_ERRORS) { - int64_t new_nb_clusters; - - if (offset > INT64_MAX - s->cluster_size) { - ret = -EINVAL; - goto resize_fail; - } - - ret = bdrv_truncate(bs->file->bs, offset + s->cluster_size); - if (ret < 0) { - goto resize_fail; - } - size = bdrv_getlength(bs->file->bs); - if (size < 0) { - ret = size; - goto resize_fail; - } - - new_nb_clusters = size_to_clusters(s, size); - assert(new_nb_clusters >= *nb_clusters); - - ret = realloc_refcount_array(s, refcount_table, - nb_clusters, new_nb_clusters); - if (ret < 0) { - res->check_errors++; - return ret; - } - - if (cluster >= *nb_clusters) { - ret = -EINVAL; - goto resize_fail; - } - - res->corruptions_fixed++; - ret = inc_refcounts(bs, res, refcount_table, nb_clusters, - offset, s->cluster_size); - if (ret < 0) { - return ret; - } - /* No need to check whether the refcount is now greater than 1: - * This area was just allocated and zeroed, so it can only be - * exactly 1 after inc_refcounts() */ - continue; - -resize_fail: - res->corruptions++; - *rebuild = true; - fprintf(stderr, "ERROR could not resize image: %s\n", - strerror(-ret)); - } else { - res->corruptions++; - } - continue; - } - - if (offset != 0) { - ret = inc_refcounts(bs, res, refcount_table, nb_clusters, - offset, s->cluster_size); - if (ret < 0) { - return ret; - } - if (s->get_refcount(*refcount_table, cluster) != 1) { - fprintf(stderr, "ERROR refcount block %" PRId64 - " refcount=%" PRIu64 "\n", i, - s->get_refcount(*refcount_table, cluster)); - res->corruptions++; - *rebuild = true; - } - } - } - - return 0; -} - -/* - * Calculates an in-memory refcount table. 
- */ -static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, - BdrvCheckMode fix, bool *rebuild, - void **refcount_table, int64_t *nb_clusters) -{ - BDRVQcow2State *s = bs->opaque; - int64_t i; - QCowSnapshot *sn; - int ret; - - if (!*refcount_table) { - int64_t old_size = 0; - ret = realloc_refcount_array(s, refcount_table, - &old_size, *nb_clusters); - if (ret < 0) { - res->check_errors++; - return ret; - } - } - - /* header */ - ret = inc_refcounts(bs, res, refcount_table, nb_clusters, - 0, s->cluster_size); - if (ret < 0) { - return ret; - } - - /* current L1 table */ - ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, - s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO); - if (ret < 0) { - return ret; - } - - /* snapshots */ - for (i = 0; i < s->nb_snapshots; i++) { - sn = s->snapshots + i; - ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, - sn->l1_table_offset, sn->l1_size, 0); - if (ret < 0) { - return ret; - } - } - ret = inc_refcounts(bs, res, refcount_table, nb_clusters, - s->snapshots_offset, s->snapshots_size); - if (ret < 0) { - return ret; - } - - /* refcount data */ - ret = inc_refcounts(bs, res, refcount_table, nb_clusters, - s->refcount_table_offset, - s->refcount_table_size * sizeof(uint64_t)); - if (ret < 0) { - return ret; - } - - return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters); -} - -/* - * Compares the actual reference count for each cluster in the image against the - * refcount as reported by the refcount structures on-disk. 
- */ -static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res, - BdrvCheckMode fix, bool *rebuild, - int64_t *highest_cluster, - void *refcount_table, int64_t nb_clusters) -{ - BDRVQcow2State *s = bs->opaque; - int64_t i; - uint64_t refcount1, refcount2; - int ret; - - for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) { - ret = qcow2_get_refcount(bs, i, &refcount1); - if (ret < 0) { - fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", - i, strerror(-ret)); - res->check_errors++; - continue; - } - - refcount2 = s->get_refcount(refcount_table, i); - - if (refcount1 > 0 || refcount2 > 0) { - *highest_cluster = i; - } - - if (refcount1 != refcount2) { - /* Check if we're allowed to fix the mismatch */ - int *num_fixed = NULL; - if (refcount1 == 0) { - *rebuild = true; - } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) { - num_fixed = &res->leaks_fixed; - } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) { - num_fixed = &res->corruptions_fixed; - } - - fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64 - " reference=%" PRIu64 "\n", - num_fixed != NULL ? "Repairing" : - refcount1 < refcount2 ? "ERROR" : - "Leaked", - i, refcount1, refcount2); - - if (num_fixed) { - ret = update_refcount(bs, i << s->cluster_bits, 1, - refcount_diff(refcount1, refcount2), - refcount1 > refcount2, - QCOW2_DISCARD_ALWAYS); - if (ret >= 0) { - (*num_fixed)++; - continue; - } - } - - /* And if we couldn't, print an error */ - if (refcount1 < refcount2) { - res->corruptions++; - } else { - res->leaks++; - } - } - } -} - -/* - * Allocates clusters using an in-memory refcount table (IMRT) in contrast to - * the on-disk refcount structures. - * - * On input, *first_free_cluster tells where to start looking, and need not - * actually be a free cluster; the returned offset will not be before that - * cluster. 
On output, *first_free_cluster points to the first gap found, even - * if that gap was too small to be used as the returned offset. - * - * Note that *first_free_cluster is a cluster index whereas the return value is - * an offset. - */ -static int64_t alloc_clusters_imrt(BlockDriverState *bs, - int cluster_count, - void **refcount_table, - int64_t *imrt_nb_clusters, - int64_t *first_free_cluster) -{ - BDRVQcow2State *s = bs->opaque; - int64_t cluster = *first_free_cluster, i; - bool first_gap = true; - int contiguous_free_clusters; - int ret; - - /* Starting at *first_free_cluster, find a range of at least cluster_count - * continuously free clusters */ - for (contiguous_free_clusters = 0; - cluster < *imrt_nb_clusters && - contiguous_free_clusters < cluster_count; - cluster++) - { - if (!s->get_refcount(*refcount_table, cluster)) { - contiguous_free_clusters++; - if (first_gap) { - /* If this is the first free cluster found, update - * *first_free_cluster accordingly */ - *first_free_cluster = cluster; - first_gap = false; - } - } else if (contiguous_free_clusters) { - contiguous_free_clusters = 0; - } - } - - /* If contiguous_free_clusters is greater than zero, it contains the number - * of continuously free clusters until the current cluster; the first free - * cluster in the current "gap" is therefore - * cluster - contiguous_free_clusters */ - - /* If no such range could be found, grow the in-memory refcount table - * accordingly to append free clusters at the end of the image */ - if (contiguous_free_clusters < cluster_count) { - /* contiguous_free_clusters clusters are already empty at the image end; - * we need cluster_count clusters; therefore, we have to allocate - * cluster_count - contiguous_free_clusters new clusters at the end of - * the image (which is the current value of cluster; note that cluster - * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond - * the image end) */ - ret = realloc_refcount_array(s, refcount_table, 
imrt_nb_clusters, - cluster + cluster_count - - contiguous_free_clusters); - if (ret < 0) { - return ret; - } - } - - /* Go back to the first free cluster */ - cluster -= contiguous_free_clusters; - for (i = 0; i < cluster_count; i++) { - s->set_refcount(*refcount_table, cluster + i, 1); - } - - return cluster << s->cluster_bits; -} - -/* - * Creates a new refcount structure based solely on the in-memory information - * given through *refcount_table. All necessary allocations will be reflected - * in that array. - * - * On success, the old refcount structure is leaked (it will be covered by the - * new refcount structure). - */ -static int rebuild_refcount_structure(BlockDriverState *bs, - BdrvCheckResult *res, - void **refcount_table, - int64_t *nb_clusters) -{ - BDRVQcow2State *s = bs->opaque; - int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0; - int64_t refblock_offset, refblock_start, refblock_index; - uint32_t reftable_size = 0; - uint64_t *on_disk_reftable = NULL; - void *on_disk_refblock; - int ret = 0; - struct { - uint64_t reftable_offset; - uint32_t reftable_clusters; - } QEMU_PACKED reftable_offset_and_clusters; - - qcow2_cache_empty(bs, s->refcount_block_cache); - -write_refblocks: - for (; cluster < *nb_clusters; cluster++) { - if (!s->get_refcount(*refcount_table, cluster)) { - continue; - } - - refblock_index = cluster >> s->refcount_block_bits; - refblock_start = refblock_index << s->refcount_block_bits; - - /* Don't allocate a cluster in a refblock already written to disk */ - if (first_free_cluster < refblock_start) { - first_free_cluster = refblock_start; - } - refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table, - nb_clusters, &first_free_cluster); - if (refblock_offset < 0) { - fprintf(stderr, "ERROR allocating refblock: %s\n", - strerror(-refblock_offset)); - res->check_errors++; - ret = refblock_offset; - goto fail; - } - - if (reftable_size <= refblock_index) { - uint32_t old_reftable_size = reftable_size; - uint64_t 
*new_on_disk_reftable; - - reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t), - s->cluster_size) / sizeof(uint64_t); - new_on_disk_reftable = g_try_realloc(on_disk_reftable, - reftable_size * - sizeof(uint64_t)); - if (!new_on_disk_reftable) { - res->check_errors++; - ret = -ENOMEM; - goto fail; - } - on_disk_reftable = new_on_disk_reftable; - - memset(on_disk_reftable + old_reftable_size, 0, - (reftable_size - old_reftable_size) * sizeof(uint64_t)); - - /* The offset we have for the reftable is now no longer valid; - * this will leak that range, but we can easily fix that by running - * a leak-fixing check after this rebuild operation */ - reftable_offset = -1; - } - on_disk_reftable[refblock_index] = refblock_offset; - - /* If this is apparently the last refblock (for now), try to squeeze the - * reftable in */ - if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits && - reftable_offset < 0) - { - uint64_t reftable_clusters = size_to_clusters(s, reftable_size * - sizeof(uint64_t)); - reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, - refcount_table, nb_clusters, - &first_free_cluster); - if (reftable_offset < 0) { - fprintf(stderr, "ERROR allocating reftable: %s\n", - strerror(-reftable_offset)); - res->check_errors++; - ret = reftable_offset; - goto fail; - } - } - - ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset, - s->cluster_size); - if (ret < 0) { - fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); - goto fail; - } - - /* The size of *refcount_table is always cluster-aligned, therefore the - * write operation will not overflow */ - on_disk_refblock = (void *)((char *) *refcount_table + - refblock_index * s->cluster_size); - - ret = bdrv_write(bs->file->bs, refblock_offset / BDRV_SECTOR_SIZE, - on_disk_refblock, s->cluster_sectors); - if (ret < 0) { - fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); - goto fail; - } - - /* Go to the end of this refblock */ - cluster = 
refblock_start + s->refcount_block_size - 1; - } - - if (reftable_offset < 0) { - uint64_t post_refblock_start, reftable_clusters; - - post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size); - reftable_clusters = size_to_clusters(s, - reftable_size * sizeof(uint64_t)); - /* Not pretty but simple */ - if (first_free_cluster < post_refblock_start) { - first_free_cluster = post_refblock_start; - } - reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, - refcount_table, nb_clusters, - &first_free_cluster); - if (reftable_offset < 0) { - fprintf(stderr, "ERROR allocating reftable: %s\n", - strerror(-reftable_offset)); - res->check_errors++; - ret = reftable_offset; - goto fail; - } - - goto write_refblocks; - } - - assert(on_disk_reftable); - - for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { - cpu_to_be64s(&on_disk_reftable[refblock_index]); - } - - ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, - reftable_size * sizeof(uint64_t)); - if (ret < 0) { - fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); - goto fail; - } - - assert(reftable_size < INT_MAX / sizeof(uint64_t)); - ret = bdrv_pwrite(bs->file->bs, reftable_offset, on_disk_reftable, - reftable_size * sizeof(uint64_t)); - if (ret < 0) { - fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); - goto fail; - } - - /* Enter new reftable into the image header */ - cpu_to_be64w(&reftable_offset_and_clusters.reftable_offset, - reftable_offset); - cpu_to_be32w(&reftable_offset_and_clusters.reftable_clusters, - size_to_clusters(s, reftable_size * sizeof(uint64_t))); - ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, - refcount_table_offset), - &reftable_offset_and_clusters, - sizeof(reftable_offset_and_clusters)); - if (ret < 0) { - fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret)); - goto fail; - } - - for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { - 
be64_to_cpus(&on_disk_reftable[refblock_index]); - } - s->refcount_table = on_disk_reftable; - s->refcount_table_offset = reftable_offset; - s->refcount_table_size = reftable_size; - - return 0; - -fail: - g_free(on_disk_reftable); - return ret; -} - -/* - * Checks an image for refcount consistency. - * - * Returns 0 if no errors are found, the number of errors in case the image is - * detected as corrupted, and -errno when an internal error occurred. - */ -int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, - BdrvCheckMode fix) -{ - BDRVQcow2State *s = bs->opaque; - BdrvCheckResult pre_compare_res; - int64_t size, highest_cluster, nb_clusters; - void *refcount_table = NULL; - bool rebuild = false; - int ret; - - size = bdrv_getlength(bs->file->bs); - if (size < 0) { - res->check_errors++; - return size; - } - - nb_clusters = size_to_clusters(s, size); - if (nb_clusters > INT_MAX) { - res->check_errors++; - return -EFBIG; - } - - res->bfi.total_clusters = - size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE); - - ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table, - &nb_clusters); - if (ret < 0) { - goto fail; - } - - /* In case we don't need to rebuild the refcount structure (but want to fix - * something), this function is immediately called again, in which case the - * result should be ignored */ - pre_compare_res = *res; - compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table, - nb_clusters); - - if (rebuild && (fix & BDRV_FIX_ERRORS)) { - BdrvCheckResult old_res = *res; - int fresh_leaks = 0; - - fprintf(stderr, "Rebuilding refcount structure\n"); - ret = rebuild_refcount_structure(bs, res, &refcount_table, - &nb_clusters); - if (ret < 0) { - goto fail; - } - - res->corruptions = 0; - res->leaks = 0; - - /* Because the old reftable has been exchanged for a new one the - * references have to be recalculated */ - rebuild = false; - memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters)); - ret 
= calculate_refcounts(bs, res, 0, &rebuild, &refcount_table, - &nb_clusters); - if (ret < 0) { - goto fail; - } - - if (fix & BDRV_FIX_LEAKS) { - /* The old refcount structures are now leaked, fix it; the result - * can be ignored, aside from leaks which were introduced by - * rebuild_refcount_structure() that could not be fixed */ - BdrvCheckResult saved_res = *res; - *res = (BdrvCheckResult){ 0 }; - - compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild, - &highest_cluster, refcount_table, nb_clusters); - if (rebuild) { - fprintf(stderr, "ERROR rebuilt refcount structure is still " - "broken\n"); - } - - /* Any leaks accounted for here were introduced by - * rebuild_refcount_structure() because that function has created a - * new refcount structure from scratch */ - fresh_leaks = res->leaks; - *res = saved_res; - } - - if (res->corruptions < old_res.corruptions) { - res->corruptions_fixed += old_res.corruptions - res->corruptions; - } - if (res->leaks < old_res.leaks) { - res->leaks_fixed += old_res.leaks - res->leaks; - } - res->leaks += fresh_leaks; - } else if (fix) { - if (rebuild) { - fprintf(stderr, "ERROR need to rebuild refcount structures\n"); - res->check_errors++; - ret = -EIO; - goto fail; - } - - if (res->leaks || res->corruptions) { - *res = pre_compare_res; - compare_refcounts(bs, res, fix, &rebuild, &highest_cluster, - refcount_table, nb_clusters); - } - } - - /* check OFLAG_COPIED */ - ret = check_oflag_copied(bs, res, fix); - if (ret < 0) { - goto fail; - } - - res->image_end_offset = (highest_cluster + 1) * s->cluster_size; - ret = 0; - -fail: - g_free(refcount_table); - - return ret; -} - -#define overlaps_with(ofs, sz) \ - ranges_overlap(offset, size, ofs, sz) - -/* - * Checks if the given offset into the image file is actually free to use by - * looking for overlaps with important metadata sections (L1/L2 tables etc.), - * i.e. a sanity check without relying on the refcount tables. 
- * - * The ign parameter specifies what checks not to perform (being a bitmask of - * QCow2MetadataOverlap values), i.e., what sections to ignore. - * - * Returns: - * - 0 if writing to this offset will not affect the mentioned metadata - * - a positive QCow2MetadataOverlap value indicating one overlapping section - * - a negative value (-errno) indicating an error while performing a check, - * e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2 - */ -int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, - int64_t size) -{ - BDRVQcow2State *s = bs->opaque; - int chk = s->overlap_check & ~ign; - int i, j; - - if (!size) { - return 0; - } - - if (chk & QCOW2_OL_MAIN_HEADER) { - if (offset < s->cluster_size) { - return QCOW2_OL_MAIN_HEADER; - } - } - - /* align range to test to cluster boundaries */ - size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size); - offset = start_of_cluster(s, offset); - - if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) { - if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) { - return QCOW2_OL_ACTIVE_L1; - } - } - - if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) { - if (overlaps_with(s->refcount_table_offset, - s->refcount_table_size * sizeof(uint64_t))) { - return QCOW2_OL_REFCOUNT_TABLE; - } - } - - if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) { - if (overlaps_with(s->snapshots_offset, s->snapshots_size)) { - return QCOW2_OL_SNAPSHOT_TABLE; - } - } - - if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) { - for (i = 0; i < s->nb_snapshots; i++) { - if (s->snapshots[i].l1_size && - overlaps_with(s->snapshots[i].l1_table_offset, - s->snapshots[i].l1_size * sizeof(uint64_t))) { - return QCOW2_OL_INACTIVE_L1; - } - } - } - - if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) { - for (i = 0; i < s->l1_size; i++) { - if ((s->l1_table[i] & L1E_OFFSET_MASK) && - overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK, - s->cluster_size)) { - return QCOW2_OL_ACTIVE_L2; - 
} - } - } - - if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) { - for (i = 0; i < s->refcount_table_size; i++) { - if ((s->refcount_table[i] & REFT_OFFSET_MASK) && - overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK, - s->cluster_size)) { - return QCOW2_OL_REFCOUNT_BLOCK; - } - } - } - - if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) { - for (i = 0; i < s->nb_snapshots; i++) { - uint64_t l1_ofs = s->snapshots[i].l1_table_offset; - uint32_t l1_sz = s->snapshots[i].l1_size; - uint64_t l1_sz2 = l1_sz * sizeof(uint64_t); - uint64_t *l1 = g_try_malloc(l1_sz2); - int ret; - - if (l1_sz2 && l1 == NULL) { - return -ENOMEM; - } - - ret = bdrv_pread(bs->file->bs, l1_ofs, l1, l1_sz2); - if (ret < 0) { - g_free(l1); - return ret; - } - - for (j = 0; j < l1_sz; j++) { - uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK; - if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) { - g_free(l1); - return QCOW2_OL_INACTIVE_L2; - } - } - - g_free(l1); - } - } - - return 0; -} - -static const char *metadata_ol_names[] = { - [QCOW2_OL_MAIN_HEADER_BITNR] = "qcow2_header", - [QCOW2_OL_ACTIVE_L1_BITNR] = "active L1 table", - [QCOW2_OL_ACTIVE_L2_BITNR] = "active L2 table", - [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table", - [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block", - [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table", - [QCOW2_OL_INACTIVE_L1_BITNR] = "inactive L1 table", - [QCOW2_OL_INACTIVE_L2_BITNR] = "inactive L2 table", -}; - -/* - * First performs a check for metadata overlaps (through - * qcow2_check_metadata_overlap); if that fails with a negative value (error - * while performing a check), that value is returned. If an impending overlap - * is detected, the BDS will be made unusable, the qcow2 file marked corrupt - * and -EIO returned. - * - * Returns 0 if there were neither overlaps nor errors while checking for - * overlaps; or a negative value (-errno) on error. 
- */ -int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, - int64_t size) -{ - int ret = qcow2_check_metadata_overlap(bs, ign, offset, size); - - if (ret < 0) { - return ret; - } else if (ret > 0) { - int metadata_ol_bitnr = ctz32(ret); - assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); - - qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid " - "write on metadata (overlaps with %s)", - metadata_ol_names[metadata_ol_bitnr]); - return -EIO; - } - - return 0; -} - -/* A pointer to a function of this type is given to walk_over_reftable(). That - * function will create refblocks and pass them to a RefblockFinishOp once they - * are completed (@refblock). @refblock_empty is set if the refblock is - * completely empty. - * - * Along with the refblock, a corresponding reftable entry is passed, in the - * reftable @reftable (which may be reallocated) at @reftable_index. - * - * @allocated should be set to true if a new cluster has been allocated. - */ -typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable, - uint64_t reftable_index, uint64_t *reftable_size, - void *refblock, bool refblock_empty, - bool *allocated, Error **errp); - -/** - * This "operation" for walk_over_reftable() allocates the refblock on disk (if - * it is not empty) and inserts its offset into the new reftable. The size of - * this new reftable is increased as required. 
- */ -static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable, - uint64_t reftable_index, uint64_t *reftable_size, - void *refblock, bool refblock_empty, bool *allocated, - Error **errp) -{ - BDRVQcow2State *s = bs->opaque; - int64_t offset; - - if (!refblock_empty && reftable_index >= *reftable_size) { - uint64_t *new_reftable; - uint64_t new_reftable_size; - - new_reftable_size = ROUND_UP(reftable_index + 1, - s->cluster_size / sizeof(uint64_t)); - if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) { - error_setg(errp, - "This operation would make the refcount table grow " - "beyond the maximum size supported by QEMU, aborting"); - return -ENOTSUP; - } - - new_reftable = g_try_realloc(*reftable, new_reftable_size * - sizeof(uint64_t)); - if (!new_reftable) { - error_setg(errp, "Failed to increase reftable buffer size"); - return -ENOMEM; - } - - memset(new_reftable + *reftable_size, 0, - (new_reftable_size - *reftable_size) * sizeof(uint64_t)); - - *reftable = new_reftable; - *reftable_size = new_reftable_size; - } - - if (!refblock_empty && !(*reftable)[reftable_index]) { - offset = qcow2_alloc_clusters(bs, s->cluster_size); - if (offset < 0) { - error_setg_errno(errp, -offset, "Failed to allocate refblock"); - return offset; - } - (*reftable)[reftable_index] = offset; - *allocated = true; - } - - return 0; -} - -/** - * This "operation" for walk_over_reftable() writes the refblock to disk at the - * offset specified by the new reftable's entry. It does not modify the new - * reftable or change any refcounts. 
- */ -static int flush_refblock(BlockDriverState *bs, uint64_t **reftable, - uint64_t reftable_index, uint64_t *reftable_size, - void *refblock, bool refblock_empty, bool *allocated, - Error **errp) -{ - BDRVQcow2State *s = bs->opaque; - int64_t offset; - int ret; - - if (reftable_index < *reftable_size && (*reftable)[reftable_index]) { - offset = (*reftable)[reftable_index]; - - ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size); - if (ret < 0) { - error_setg_errno(errp, -ret, "Overlap check failed"); - return ret; - } - - ret = bdrv_pwrite(bs->file->bs, offset, refblock, s->cluster_size); - if (ret < 0) { - error_setg_errno(errp, -ret, "Failed to write refblock"); - return ret; - } - } else { - assert(refblock_empty); - } - - return 0; -} - -/** - * This function walks over the existing reftable and every referenced refblock; - * if @new_set_refcount is non-NULL, it is called for every refcount entry to - * create an equal new entry in the passed @new_refblock. Once that - * @new_refblock is completely filled, @operation will be called. - * - * @status_cb and @cb_opaque are used for the amend operation's status callback. - * @index is the index of the walk_over_reftable() calls and @total is the total - * number of walk_over_reftable() calls per amend operation. Both are used for - * calculating the parameters for the status callback. - * - * @allocated is set to true if a new cluster has been allocated. 
- */ -static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable, - uint64_t *new_reftable_index, - uint64_t *new_reftable_size, - void *new_refblock, int new_refblock_size, - int new_refcount_bits, - RefblockFinishOp *operation, bool *allocated, - Qcow2SetRefcountFunc *new_set_refcount, - BlockDriverAmendStatusCB *status_cb, - void *cb_opaque, int index, int total, - Error **errp) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t reftable_index; - bool new_refblock_empty = true; - int refblock_index; - int new_refblock_index = 0; - int ret; - - for (reftable_index = 0; reftable_index < s->refcount_table_size; - reftable_index++) - { - uint64_t refblock_offset = s->refcount_table[reftable_index] - & REFT_OFFSET_MASK; - - status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index, - (uint64_t)total * s->refcount_table_size, cb_opaque); - - if (refblock_offset) { - void *refblock; - - if (offset_into_cluster(s, refblock_offset)) { - qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" - PRIx64 " unaligned (reftable index: %#" - PRIx64 ")", refblock_offset, - reftable_index); - error_setg(errp, - "Image is corrupt (unaligned refblock offset)"); - return -EIO; - } - - ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset, - &refblock); - if (ret < 0) { - error_setg_errno(errp, -ret, "Failed to retrieve refblock"); - return ret; - } - - for (refblock_index = 0; refblock_index < s->refcount_block_size; - refblock_index++) - { - uint64_t refcount; - - if (new_refblock_index >= new_refblock_size) { - /* new_refblock is now complete */ - ret = operation(bs, new_reftable, *new_reftable_index, - new_reftable_size, new_refblock, - new_refblock_empty, allocated, errp); - if (ret < 0) { - qcow2_cache_put(bs, s->refcount_block_cache, &refblock); - return ret; - } - - (*new_reftable_index)++; - new_refblock_index = 0; - new_refblock_empty = true; - } - - refcount = s->get_refcount(refblock, refblock_index); - if (new_refcount_bits 
< 64 && refcount >> new_refcount_bits) { - uint64_t offset; - - qcow2_cache_put(bs, s->refcount_block_cache, &refblock); - - offset = ((reftable_index << s->refcount_block_bits) - + refblock_index) << s->cluster_bits; - - error_setg(errp, "Cannot decrease refcount entry width to " - "%i bits: Cluster at offset %#" PRIx64 " has a " - "refcount of %" PRIu64, new_refcount_bits, - offset, refcount); - return -EINVAL; - } - - if (new_set_refcount) { - new_set_refcount(new_refblock, new_refblock_index++, - refcount); - } else { - new_refblock_index++; - } - new_refblock_empty = new_refblock_empty && refcount == 0; - } - - qcow2_cache_put(bs, s->refcount_block_cache, &refblock); - } else { - /* No refblock means every refcount is 0 */ - for (refblock_index = 0; refblock_index < s->refcount_block_size; - refblock_index++) - { - if (new_refblock_index >= new_refblock_size) { - /* new_refblock is now complete */ - ret = operation(bs, new_reftable, *new_reftable_index, - new_reftable_size, new_refblock, - new_refblock_empty, allocated, errp); - if (ret < 0) { - return ret; - } - - (*new_reftable_index)++; - new_refblock_index = 0; - new_refblock_empty = true; - } - - if (new_set_refcount) { - new_set_refcount(new_refblock, new_refblock_index++, 0); - } else { - new_refblock_index++; - } - } - } - } - - if (new_refblock_index > 0) { - /* Complete the potentially existing partially filled final refblock */ - if (new_set_refcount) { - for (; new_refblock_index < new_refblock_size; - new_refblock_index++) - { - new_set_refcount(new_refblock, new_refblock_index, 0); - } - } - - ret = operation(bs, new_reftable, *new_reftable_index, - new_reftable_size, new_refblock, new_refblock_empty, - allocated, errp); - if (ret < 0) { - return ret; - } - - (*new_reftable_index)++; - } - - status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size, - (uint64_t)total * s->refcount_table_size, cb_opaque); - - return 0; -} - -int qcow2_change_refcount_order(BlockDriverState *bs, int 
refcount_order, - BlockDriverAmendStatusCB *status_cb, - void *cb_opaque, Error **errp) -{ - BDRVQcow2State *s = bs->opaque; - Qcow2GetRefcountFunc *new_get_refcount; - Qcow2SetRefcountFunc *new_set_refcount; - void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size); - uint64_t *new_reftable = NULL, new_reftable_size = 0; - uint64_t *old_reftable, old_reftable_size, old_reftable_offset; - uint64_t new_reftable_index = 0; - uint64_t i; - int64_t new_reftable_offset = 0, allocated_reftable_size = 0; - int new_refblock_size, new_refcount_bits = 1 << refcount_order; - int old_refcount_order; - int walk_index = 0; - int ret; - bool new_allocation; - - assert(s->qcow_version >= 3); - assert(refcount_order >= 0 && refcount_order <= 6); - - /* see qcow2_open() */ - new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3)); - - new_get_refcount = get_refcount_funcs[refcount_order]; - new_set_refcount = set_refcount_funcs[refcount_order]; - - - do { - int total_walks; - - new_allocation = false; - - /* At least we have to do this walk and the one which writes the - * refblocks; also, at least we have to do this loop here at least - * twice (normally), first to do the allocations, and second to - * determine that everything is correctly allocated, this then makes - * three walks in total */ - total_walks = MAX(walk_index + 2, 3); - - /* First, allocate the structures so they are present in the refcount - * structures */ - ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index, - &new_reftable_size, NULL, new_refblock_size, - new_refcount_bits, &alloc_refblock, - &new_allocation, NULL, status_cb, cb_opaque, - walk_index++, total_walks, errp); - if (ret < 0) { - goto done; - } - - new_reftable_index = 0; - - if (new_allocation) { - if (new_reftable_offset) { - qcow2_free_clusters(bs, new_reftable_offset, - allocated_reftable_size * sizeof(uint64_t), - QCOW2_DISCARD_NEVER); - } - - new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size * - 
sizeof(uint64_t)); - if (new_reftable_offset < 0) { - error_setg_errno(errp, -new_reftable_offset, - "Failed to allocate the new reftable"); - ret = new_reftable_offset; - goto done; - } - allocated_reftable_size = new_reftable_size; - } - } while (new_allocation); - - /* Second, write the new refblocks */ - ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index, - &new_reftable_size, new_refblock, - new_refblock_size, new_refcount_bits, - &flush_refblock, &new_allocation, new_set_refcount, - status_cb, cb_opaque, walk_index, walk_index + 1, - errp); - if (ret < 0) { - goto done; - } - assert(!new_allocation); - - - /* Write the new reftable */ - ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset, - new_reftable_size * sizeof(uint64_t)); - if (ret < 0) { - error_setg_errno(errp, -ret, "Overlap check failed"); - goto done; - } - - for (i = 0; i < new_reftable_size; i++) { - cpu_to_be64s(&new_reftable[i]); - } - - ret = bdrv_pwrite(bs->file->bs, new_reftable_offset, new_reftable, - new_reftable_size * sizeof(uint64_t)); - - for (i = 0; i < new_reftable_size; i++) { - be64_to_cpus(&new_reftable[i]); - } - - if (ret < 0) { - error_setg_errno(errp, -ret, "Failed to write the new reftable"); - goto done; - } - - - /* Empty the refcount cache */ - ret = qcow2_cache_flush(bs, s->refcount_block_cache); - if (ret < 0) { - error_setg_errno(errp, -ret, "Failed to flush the refblock cache"); - goto done; - } - - /* Update the image header to point to the new reftable; this only updates - * the fields which are relevant to qcow2_update_header(); other fields - * such as s->refcount_table or s->refcount_bits stay stale for now - * (because we have to restore everything if qcow2_update_header() fails) */ - old_refcount_order = s->refcount_order; - old_reftable_size = s->refcount_table_size; - old_reftable_offset = s->refcount_table_offset; - - s->refcount_order = refcount_order; - s->refcount_table_size = new_reftable_size; - s->refcount_table_offset = 
new_reftable_offset; - - ret = qcow2_update_header(bs); - if (ret < 0) { - s->refcount_order = old_refcount_order; - s->refcount_table_size = old_reftable_size; - s->refcount_table_offset = old_reftable_offset; - error_setg_errno(errp, -ret, "Failed to update the qcow2 header"); - goto done; - } - - /* Now update the rest of the in-memory information */ - old_reftable = s->refcount_table; - s->refcount_table = new_reftable; - - s->refcount_bits = 1 << refcount_order; - s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1); - s->refcount_max += s->refcount_max - 1; - - s->refcount_block_bits = s->cluster_bits - (refcount_order - 3); - s->refcount_block_size = 1 << s->refcount_block_bits; - - s->get_refcount = new_get_refcount; - s->set_refcount = new_set_refcount; - - /* For cleaning up all old refblocks and the old reftable below the "done" - * label */ - new_reftable = old_reftable; - new_reftable_size = old_reftable_size; - new_reftable_offset = old_reftable_offset; - -done: - if (new_reftable) { - /* On success, new_reftable actually points to the old reftable (and - * new_reftable_size is the old reftable's size); but that is just - * fine */ - for (i = 0; i < new_reftable_size; i++) { - uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK; - if (offset) { - qcow2_free_clusters(bs, offset, s->cluster_size, - QCOW2_DISCARD_OTHER); - } - } - g_free(new_reftable); - - if (new_reftable_offset > 0) { - qcow2_free_clusters(bs, new_reftable_offset, - new_reftable_size * sizeof(uint64_t), - QCOW2_DISCARD_OTHER); - } - } - - qemu_vfree(new_refblock); - return ret; -} diff --git a/qemu/block/qcow2-snapshot.c b/qemu/block/qcow2-snapshot.c deleted file mode 100644 index 5f4a17e47..000000000 --- a/qemu/block/qcow2-snapshot.c +++ /dev/null @@ -1,738 +0,0 @@ -/* - * Block driver for the QCOW version 2 format - * - * Copyright (c) 2004-2006 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated 
documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "block/block_int.h" -#include "block/qcow2.h" -#include "qemu/error-report.h" -#include "qemu/cutils.h" - -void qcow2_free_snapshots(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - int i; - - for(i = 0; i < s->nb_snapshots; i++) { - g_free(s->snapshots[i].name); - g_free(s->snapshots[i].id_str); - } - g_free(s->snapshots); - s->snapshots = NULL; - s->nb_snapshots = 0; -} - -int qcow2_read_snapshots(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - QCowSnapshotHeader h; - QCowSnapshotExtraData extra; - QCowSnapshot *sn; - int i, id_str_size, name_size; - int64_t offset; - uint32_t extra_data_size; - int ret; - - if (!s->nb_snapshots) { - s->snapshots = NULL; - s->snapshots_size = 0; - return 0; - } - - offset = s->snapshots_offset; - s->snapshots = g_new0(QCowSnapshot, s->nb_snapshots); - - for(i = 0; i < s->nb_snapshots; i++) { - /* Read statically sized part of the snapshot header */ - offset = align_offset(offset, 8); - 
ret = bdrv_pread(bs->file->bs, offset, &h, sizeof(h)); - if (ret < 0) { - goto fail; - } - - offset += sizeof(h); - sn = s->snapshots + i; - sn->l1_table_offset = be64_to_cpu(h.l1_table_offset); - sn->l1_size = be32_to_cpu(h.l1_size); - sn->vm_state_size = be32_to_cpu(h.vm_state_size); - sn->date_sec = be32_to_cpu(h.date_sec); - sn->date_nsec = be32_to_cpu(h.date_nsec); - sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec); - extra_data_size = be32_to_cpu(h.extra_data_size); - - id_str_size = be16_to_cpu(h.id_str_size); - name_size = be16_to_cpu(h.name_size); - - /* Read extra data */ - ret = bdrv_pread(bs->file->bs, offset, &extra, - MIN(sizeof(extra), extra_data_size)); - if (ret < 0) { - goto fail; - } - offset += extra_data_size; - - if (extra_data_size >= 8) { - sn->vm_state_size = be64_to_cpu(extra.vm_state_size_large); - } - - if (extra_data_size >= 16) { - sn->disk_size = be64_to_cpu(extra.disk_size); - } else { - sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; - } - - /* Read snapshot ID */ - sn->id_str = g_malloc(id_str_size + 1); - ret = bdrv_pread(bs->file->bs, offset, sn->id_str, id_str_size); - if (ret < 0) { - goto fail; - } - offset += id_str_size; - sn->id_str[id_str_size] = '\0'; - - /* Read snapshot name */ - sn->name = g_malloc(name_size + 1); - ret = bdrv_pread(bs->file->bs, offset, sn->name, name_size); - if (ret < 0) { - goto fail; - } - offset += name_size; - sn->name[name_size] = '\0'; - - if (offset - s->snapshots_offset > QCOW_MAX_SNAPSHOTS_SIZE) { - ret = -EFBIG; - goto fail; - } - } - - assert(offset - s->snapshots_offset <= INT_MAX); - s->snapshots_size = offset - s->snapshots_offset; - return 0; - -fail: - qcow2_free_snapshots(bs); - return ret; -} - -/* add at the end of the file a new list of snapshots */ -static int qcow2_write_snapshots(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - QCowSnapshot *sn; - QCowSnapshotHeader h; - QCowSnapshotExtraData extra; - int i, name_size, id_str_size, snapshots_size; - struct { 
- uint32_t nb_snapshots; - uint64_t snapshots_offset; - } QEMU_PACKED header_data; - int64_t offset, snapshots_offset = 0; - int ret; - - /* compute the size of the snapshots */ - offset = 0; - for(i = 0; i < s->nb_snapshots; i++) { - sn = s->snapshots + i; - offset = align_offset(offset, 8); - offset += sizeof(h); - offset += sizeof(extra); - offset += strlen(sn->id_str); - offset += strlen(sn->name); - - if (offset > QCOW_MAX_SNAPSHOTS_SIZE) { - ret = -EFBIG; - goto fail; - } - } - - assert(offset <= INT_MAX); - snapshots_size = offset; - - /* Allocate space for the new snapshot list */ - snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); - offset = snapshots_offset; - if (offset < 0) { - ret = offset; - goto fail; - } - ret = bdrv_flush(bs); - if (ret < 0) { - goto fail; - } - - /* The snapshot list position has not yet been updated, so these clusters - * must indeed be completely free */ - ret = qcow2_pre_write_overlap_check(bs, 0, offset, snapshots_size); - if (ret < 0) { - goto fail; - } - - - /* Write all snapshots to the new list */ - for(i = 0; i < s->nb_snapshots; i++) { - sn = s->snapshots + i; - memset(&h, 0, sizeof(h)); - h.l1_table_offset = cpu_to_be64(sn->l1_table_offset); - h.l1_size = cpu_to_be32(sn->l1_size); - /* If it doesn't fit in 32 bit, older implementations should treat it - * as a disk-only snapshot rather than truncate the VM state */ - if (sn->vm_state_size <= 0xffffffff) { - h.vm_state_size = cpu_to_be32(sn->vm_state_size); - } - h.date_sec = cpu_to_be32(sn->date_sec); - h.date_nsec = cpu_to_be32(sn->date_nsec); - h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec); - h.extra_data_size = cpu_to_be32(sizeof(extra)); - - memset(&extra, 0, sizeof(extra)); - extra.vm_state_size_large = cpu_to_be64(sn->vm_state_size); - extra.disk_size = cpu_to_be64(sn->disk_size); - - id_str_size = strlen(sn->id_str); - name_size = strlen(sn->name); - assert(id_str_size <= UINT16_MAX && name_size <= UINT16_MAX); - h.id_str_size = 
cpu_to_be16(id_str_size); - h.name_size = cpu_to_be16(name_size); - offset = align_offset(offset, 8); - - ret = bdrv_pwrite(bs->file->bs, offset, &h, sizeof(h)); - if (ret < 0) { - goto fail; - } - offset += sizeof(h); - - ret = bdrv_pwrite(bs->file->bs, offset, &extra, sizeof(extra)); - if (ret < 0) { - goto fail; - } - offset += sizeof(extra); - - ret = bdrv_pwrite(bs->file->bs, offset, sn->id_str, id_str_size); - if (ret < 0) { - goto fail; - } - offset += id_str_size; - - ret = bdrv_pwrite(bs->file->bs, offset, sn->name, name_size); - if (ret < 0) { - goto fail; - } - offset += name_size; - } - - /* - * Update the header to point to the new snapshot table. This requires the - * new table and its refcounts to be stable on disk. - */ - ret = bdrv_flush(bs); - if (ret < 0) { - goto fail; - } - - QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) != - offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots)); - - header_data.nb_snapshots = cpu_to_be32(s->nb_snapshots); - header_data.snapshots_offset = cpu_to_be64(snapshots_offset); - - ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, nb_snapshots), - &header_data, sizeof(header_data)); - if (ret < 0) { - goto fail; - } - - /* free the old snapshot table */ - qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size, - QCOW2_DISCARD_SNAPSHOT); - s->snapshots_offset = snapshots_offset; - s->snapshots_size = snapshots_size; - return 0; - -fail: - if (snapshots_offset > 0) { - qcow2_free_clusters(bs, snapshots_offset, snapshots_size, - QCOW2_DISCARD_ALWAYS); - } - return ret; -} - -static void find_new_snapshot_id(BlockDriverState *bs, - char *id_str, int id_str_size) -{ - BDRVQcow2State *s = bs->opaque; - QCowSnapshot *sn; - int i; - unsigned long id, id_max = 0; - - for(i = 0; i < s->nb_snapshots; i++) { - sn = s->snapshots + i; - id = strtoul(sn->id_str, NULL, 10); - if (id > id_max) - id_max = id; - } - snprintf(id_str, id_str_size, "%lu", id_max + 1); -} - -static int 
find_snapshot_by_id_and_name(BlockDriverState *bs, - const char *id, - const char *name) -{ - BDRVQcow2State *s = bs->opaque; - int i; - - if (id && name) { - for (i = 0; i < s->nb_snapshots; i++) { - if (!strcmp(s->snapshots[i].id_str, id) && - !strcmp(s->snapshots[i].name, name)) { - return i; - } - } - } else if (id) { - for (i = 0; i < s->nb_snapshots; i++) { - if (!strcmp(s->snapshots[i].id_str, id)) { - return i; - } - } - } else if (name) { - for (i = 0; i < s->nb_snapshots; i++) { - if (!strcmp(s->snapshots[i].name, name)) { - return i; - } - } - } - - return -1; -} - -static int find_snapshot_by_id_or_name(BlockDriverState *bs, - const char *id_or_name) -{ - int ret; - - ret = find_snapshot_by_id_and_name(bs, id_or_name, NULL); - if (ret >= 0) { - return ret; - } - return find_snapshot_by_id_and_name(bs, NULL, id_or_name); -} - -/* if no id is provided, a new one is constructed */ -int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) -{ - BDRVQcow2State *s = bs->opaque; - QCowSnapshot *new_snapshot_list = NULL; - QCowSnapshot *old_snapshot_list = NULL; - QCowSnapshot sn1, *sn = &sn1; - int i, ret; - uint64_t *l1_table = NULL; - int64_t l1_table_offset; - - if (s->nb_snapshots >= QCOW_MAX_SNAPSHOTS) { - return -EFBIG; - } - - memset(sn, 0, sizeof(*sn)); - - /* Generate an ID */ - find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); - - /* Check that the ID is unique */ - if (find_snapshot_by_id_and_name(bs, sn_info->id_str, NULL) >= 0) { - return -EEXIST; - } - - /* Populate sn with passed data */ - sn->id_str = g_strdup(sn_info->id_str); - sn->name = g_strdup(sn_info->name); - - sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; - sn->vm_state_size = sn_info->vm_state_size; - sn->date_sec = sn_info->date_sec; - sn->date_nsec = sn_info->date_nsec; - sn->vm_clock_nsec = sn_info->vm_clock_nsec; - - /* Allocate the L1 table of the snapshot and copy the current one there. 
*/ - l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t)); - if (l1_table_offset < 0) { - ret = l1_table_offset; - goto fail; - } - - sn->l1_table_offset = l1_table_offset; - sn->l1_size = s->l1_size; - - l1_table = g_try_new(uint64_t, s->l1_size); - if (s->l1_size && l1_table == NULL) { - ret = -ENOMEM; - goto fail; - } - - for(i = 0; i < s->l1_size; i++) { - l1_table[i] = cpu_to_be64(s->l1_table[i]); - } - - ret = qcow2_pre_write_overlap_check(bs, 0, sn->l1_table_offset, - s->l1_size * sizeof(uint64_t)); - if (ret < 0) { - goto fail; - } - - ret = bdrv_pwrite(bs->file->bs, sn->l1_table_offset, l1_table, - s->l1_size * sizeof(uint64_t)); - if (ret < 0) { - goto fail; - } - - g_free(l1_table); - l1_table = NULL; - - /* - * Increase the refcounts of all clusters and make sure everything is - * stable on disk before updating the snapshot table to contain a pointer - * to the new L1 table. - */ - ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1); - if (ret < 0) { - goto fail; - } - - /* Append the new snapshot to the snapshot list */ - new_snapshot_list = g_new(QCowSnapshot, s->nb_snapshots + 1); - if (s->snapshots) { - memcpy(new_snapshot_list, s->snapshots, - s->nb_snapshots * sizeof(QCowSnapshot)); - old_snapshot_list = s->snapshots; - } - s->snapshots = new_snapshot_list; - s->snapshots[s->nb_snapshots++] = *sn; - - ret = qcow2_write_snapshots(bs); - if (ret < 0) { - g_free(s->snapshots); - s->snapshots = old_snapshot_list; - s->nb_snapshots--; - goto fail; - } - - g_free(old_snapshot_list); - - /* The VM state isn't needed any more in the active L1 table; in fact, it - * hurts by causing expensive COW for the next snapshot. 
*/ - qcow2_discard_clusters(bs, qcow2_vm_state_offset(s), - align_offset(sn->vm_state_size, s->cluster_size) - >> BDRV_SECTOR_BITS, - QCOW2_DISCARD_NEVER, false); - -#ifdef DEBUG_ALLOC - { - BdrvCheckResult result = {0}; - qcow2_check_refcounts(bs, &result, 0); - } -#endif - return 0; - -fail: - g_free(sn->id_str); - g_free(sn->name); - g_free(l1_table); - - return ret; -} - -/* copy the snapshot 'snapshot_name' into the current disk image */ -int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) -{ - BDRVQcow2State *s = bs->opaque; - QCowSnapshot *sn; - int i, snapshot_index; - int cur_l1_bytes, sn_l1_bytes; - int ret; - uint64_t *sn_l1_table = NULL; - - /* Search the snapshot */ - snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); - if (snapshot_index < 0) { - return -ENOENT; - } - sn = &s->snapshots[snapshot_index]; - - if (sn->disk_size != bs->total_sectors * BDRV_SECTOR_SIZE) { - error_report("qcow2: Loading snapshots with different disk " - "size is not implemented"); - ret = -ENOTSUP; - goto fail; - } - - /* - * Make sure that the current L1 table is big enough to contain the whole - * L1 table of the snapshot. If the snapshot L1 table is smaller, the - * current one must be padded with zeros. - */ - ret = qcow2_grow_l1_table(bs, sn->l1_size, true); - if (ret < 0) { - goto fail; - } - - cur_l1_bytes = s->l1_size * sizeof(uint64_t); - sn_l1_bytes = sn->l1_size * sizeof(uint64_t); - - /* - * Copy the snapshot L1 table to the current L1 table. - * - * Before overwriting the old current L1 table on disk, make sure to - * increase all refcounts for the clusters referenced by the new one. - * Decrease the refcount referenced by the old one only when the L1 - * table is overwritten. 
- */ - sn_l1_table = g_try_malloc0(cur_l1_bytes); - if (cur_l1_bytes && sn_l1_table == NULL) { - ret = -ENOMEM; - goto fail; - } - - ret = bdrv_pread(bs->file->bs, sn->l1_table_offset, - sn_l1_table, sn_l1_bytes); - if (ret < 0) { - goto fail; - } - - ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset, - sn->l1_size, 1); - if (ret < 0) { - goto fail; - } - - ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1, - s->l1_table_offset, cur_l1_bytes); - if (ret < 0) { - goto fail; - } - - ret = bdrv_pwrite_sync(bs->file->bs, s->l1_table_offset, sn_l1_table, - cur_l1_bytes); - if (ret < 0) { - goto fail; - } - - /* - * Decrease refcount of clusters of current L1 table. - * - * At this point, the in-memory s->l1_table points to the old L1 table, - * whereas on disk we already have the new one. - * - * qcow2_update_snapshot_refcount special cases the current L1 table to use - * the in-memory data instead of really using the offset to load a new one, - * which is why this works. - */ - ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, - s->l1_size, -1); - - /* - * Now update the in-memory L1 table to be in sync with the on-disk one. We - * need to do this even if updating refcounts failed. - */ - for(i = 0;i < s->l1_size; i++) { - s->l1_table[i] = be64_to_cpu(sn_l1_table[i]); - } - - if (ret < 0) { - goto fail; - } - - g_free(sn_l1_table); - sn_l1_table = NULL; - - /* - * Update QCOW_OFLAG_COPIED in the active L1 table (it may have changed - * when we decreased the refcount of the old snapshot. 
- */ - ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); - if (ret < 0) { - goto fail; - } - -#ifdef DEBUG_ALLOC - { - BdrvCheckResult result = {0}; - qcow2_check_refcounts(bs, &result, 0); - } -#endif - return 0; - -fail: - g_free(sn_l1_table); - return ret; -} - -int qcow2_snapshot_delete(BlockDriverState *bs, - const char *snapshot_id, - const char *name, - Error **errp) -{ - BDRVQcow2State *s = bs->opaque; - QCowSnapshot sn; - int snapshot_index, ret; - - /* Search the snapshot */ - snapshot_index = find_snapshot_by_id_and_name(bs, snapshot_id, name); - if (snapshot_index < 0) { - error_setg(errp, "Can't find the snapshot"); - return -ENOENT; - } - sn = s->snapshots[snapshot_index]; - - /* Remove it from the snapshot list */ - memmove(s->snapshots + snapshot_index, - s->snapshots + snapshot_index + 1, - (s->nb_snapshots - snapshot_index - 1) * sizeof(sn)); - s->nb_snapshots--; - ret = qcow2_write_snapshots(bs); - if (ret < 0) { - error_setg_errno(errp, -ret, - "Failed to remove snapshot from snapshot list"); - return ret; - } - - /* - * The snapshot is now unused, clean up. If we fail after this point, we - * won't recover but just leak clusters. - */ - g_free(sn.id_str); - g_free(sn.name); - - /* - * Now decrease the refcounts of clusters referenced by the snapshot and - * free the L1 table. 
- */ - ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset, - sn.l1_size, -1); - if (ret < 0) { - error_setg_errno(errp, -ret, "Failed to free the cluster and L1 table"); - return ret; - } - qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t), - QCOW2_DISCARD_SNAPSHOT); - - /* must update the copied flag on the current cluster offsets */ - ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); - if (ret < 0) { - error_setg_errno(errp, -ret, - "Failed to update snapshot status in disk"); - return ret; - } - -#ifdef DEBUG_ALLOC - { - BdrvCheckResult result = {0}; - qcow2_check_refcounts(bs, &result, 0); - } -#endif - return 0; -} - -int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) -{ - BDRVQcow2State *s = bs->opaque; - QEMUSnapshotInfo *sn_tab, *sn_info; - QCowSnapshot *sn; - int i; - - if (!s->nb_snapshots) { - *psn_tab = NULL; - return s->nb_snapshots; - } - - sn_tab = g_new0(QEMUSnapshotInfo, s->nb_snapshots); - for(i = 0; i < s->nb_snapshots; i++) { - sn_info = sn_tab + i; - sn = s->snapshots + i; - pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), - sn->id_str); - pstrcpy(sn_info->name, sizeof(sn_info->name), - sn->name); - sn_info->vm_state_size = sn->vm_state_size; - sn_info->date_sec = sn->date_sec; - sn_info->date_nsec = sn->date_nsec; - sn_info->vm_clock_nsec = sn->vm_clock_nsec; - } - *psn_tab = sn_tab; - return s->nb_snapshots; -} - -int qcow2_snapshot_load_tmp(BlockDriverState *bs, - const char *snapshot_id, - const char *name, - Error **errp) -{ - int i, snapshot_index; - BDRVQcow2State *s = bs->opaque; - QCowSnapshot *sn; - uint64_t *new_l1_table; - int new_l1_bytes; - int ret; - - assert(bs->read_only); - - /* Search the snapshot */ - snapshot_index = find_snapshot_by_id_and_name(bs, snapshot_id, name); - if (snapshot_index < 0) { - error_setg(errp, - "Can't find snapshot"); - return -ENOENT; - } - sn = &s->snapshots[snapshot_index]; - - /* Allocate and read in the snapshot's 
L1 table */ - if (sn->l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) { - error_setg(errp, "Snapshot L1 table too large"); - return -EFBIG; - } - new_l1_bytes = sn->l1_size * sizeof(uint64_t); - new_l1_table = qemu_try_blockalign(bs->file->bs, - align_offset(new_l1_bytes, 512)); - if (new_l1_table == NULL) { - return -ENOMEM; - } - - ret = bdrv_pread(bs->file->bs, sn->l1_table_offset, - new_l1_table, new_l1_bytes); - if (ret < 0) { - error_setg(errp, "Failed to read l1 table for snapshot"); - qemu_vfree(new_l1_table); - return ret; - } - - /* Switch the L1 table */ - qemu_vfree(s->l1_table); - - s->l1_size = sn->l1_size; - s->l1_table_offset = sn->l1_table_offset; - s->l1_table = new_l1_table; - - for(i = 0;i < s->l1_size; i++) { - be64_to_cpus(&s->l1_table[i]); - } - - return 0; -} diff --git a/qemu/block/qcow2.c b/qemu/block/qcow2.c deleted file mode 100644 index 470734be9..000000000 --- a/qemu/block/qcow2.c +++ /dev/null @@ -1,3373 +0,0 @@ -/* - * Block driver for the QCOW version 2 format - * - * Copyright (c) 2004-2006 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu/osdep.h" -#include "block/block_int.h" -#include "sysemu/block-backend.h" -#include "qemu/module.h" -#include -#include "block/qcow2.h" -#include "qemu/error-report.h" -#include "qapi/qmp/qerror.h" -#include "qapi/qmp/qbool.h" -#include "qapi/util.h" -#include "qapi/qmp/types.h" -#include "qapi-event.h" -#include "trace.h" -#include "qemu/option_int.h" -#include "qemu/cutils.h" - -/* - Differences with QCOW: - - - Support for multiple incremental snapshots. - - Memory management by reference counts. - - Clusters which have a reference count of one have the bit - QCOW_OFLAG_COPIED to optimize write performance. - - Size of compressed clusters is stored in sectors to reduce bit usage - in the cluster offsets. - - Support for storing additional data (such as the VM state) in the - snapshots. - - If a backing store is used, the cluster size is not constrained - (could be backported to QCOW). - - L2 tables have always a size of one cluster. 
-*/ - - -typedef struct { - uint32_t magic; - uint32_t len; -} QEMU_PACKED QCowExtension; - -#define QCOW2_EXT_MAGIC_END 0 -#define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA -#define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857 - -static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) -{ - const QCowHeader *cow_header = (const void *)buf; - - if (buf_size >= sizeof(QCowHeader) && - be32_to_cpu(cow_header->magic) == QCOW_MAGIC && - be32_to_cpu(cow_header->version) >= 2) - return 100; - else - return 0; -} - - -/* - * read qcow2 extension and fill bs - * start reading from start_offset - * finish reading upon magic of value 0 or when end_offset reached - * unknown magic is skipped (future extension this version knows nothing about) - * return 0 upon success, non-0 otherwise - */ -static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, - uint64_t end_offset, void **p_feature_table, - Error **errp) -{ - BDRVQcow2State *s = bs->opaque; - QCowExtension ext; - uint64_t offset; - int ret; - -#ifdef DEBUG_EXT - printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset); -#endif - offset = start_offset; - while (offset < end_offset) { - -#ifdef DEBUG_EXT - /* Sanity check */ - if (offset > s->cluster_size) - printf("qcow2_read_extension: suspicious offset %lu\n", offset); - - printf("attempting to read extended header in offset %lu\n", offset); -#endif - - ret = bdrv_pread(bs->file->bs, offset, &ext, sizeof(ext)); - if (ret < 0) { - error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: " - "pread fail from offset %" PRIu64, offset); - return 1; - } - be32_to_cpus(&ext.magic); - be32_to_cpus(&ext.len); - offset += sizeof(ext); -#ifdef DEBUG_EXT - printf("ext.magic = 0x%x\n", ext.magic); -#endif - if (offset > end_offset || ext.len > end_offset - offset) { - error_setg(errp, "Header extension too large"); - return -EINVAL; - } - - switch (ext.magic) { - case QCOW2_EXT_MAGIC_END: - return 0; - - case 
QCOW2_EXT_MAGIC_BACKING_FORMAT: - if (ext.len >= sizeof(bs->backing_format)) { - error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32 - " too large (>=%zu)", ext.len, - sizeof(bs->backing_format)); - return 2; - } - ret = bdrv_pread(bs->file->bs, offset, bs->backing_format, ext.len); - if (ret < 0) { - error_setg_errno(errp, -ret, "ERROR: ext_backing_format: " - "Could not read format name"); - return 3; - } - bs->backing_format[ext.len] = '\0'; - s->image_backing_format = g_strdup(bs->backing_format); -#ifdef DEBUG_EXT - printf("Qcow2: Got format extension %s\n", bs->backing_format); -#endif - break; - - case QCOW2_EXT_MAGIC_FEATURE_TABLE: - if (p_feature_table != NULL) { - void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); - ret = bdrv_pread(bs->file->bs, offset , feature_table, ext.len); - if (ret < 0) { - error_setg_errno(errp, -ret, "ERROR: ext_feature_table: " - "Could not read table"); - return ret; - } - - *p_feature_table = feature_table; - } - break; - - default: - /* unknown magic - save it in case we need to rewrite the header */ - { - Qcow2UnknownHeaderExtension *uext; - - uext = g_malloc0(sizeof(*uext) + ext.len); - uext->magic = ext.magic; - uext->len = ext.len; - QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next); - - ret = bdrv_pread(bs->file->bs, offset , uext->data, uext->len); - if (ret < 0) { - error_setg_errno(errp, -ret, "ERROR: unknown extension: " - "Could not read data"); - return ret; - } - } - break; - } - - offset += ((ext.len + 7) & ~7); - } - - return 0; -} - -static void cleanup_unknown_header_ext(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - Qcow2UnknownHeaderExtension *uext, *next; - - QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) { - QLIST_REMOVE(uext, next); - g_free(uext); - } -} - -static void report_unsupported_feature(Error **errp, Qcow2Feature *table, - uint64_t mask) -{ - char *features = g_strdup(""); - char *old; - - while (table && table->name[0] != '\0') { - if 
(table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) { - if (mask & (1ULL << table->bit)) { - old = features; - features = g_strdup_printf("%s%s%.46s", old, *old ? ", " : "", - table->name); - g_free(old); - mask &= ~(1ULL << table->bit); - } - } - table++; - } - - if (mask) { - old = features; - features = g_strdup_printf("%s%sUnknown incompatible feature: %" PRIx64, - old, *old ? ", " : "", mask); - g_free(old); - } - - error_setg(errp, "Unsupported qcow2 feature(s): %s", features); - g_free(features); -} - -/* - * Sets the dirty bit and flushes afterwards if necessary. - * - * The incompatible_features bit is only set if the image file header was - * updated successfully. Therefore it is not required to check the return - * value of this function. - */ -int qcow2_mark_dirty(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t val; - int ret; - - assert(s->qcow_version >= 3); - - if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { - return 0; /* already dirty */ - } - - val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY); - ret = bdrv_pwrite(bs->file->bs, offsetof(QCowHeader, incompatible_features), - &val, sizeof(val)); - if (ret < 0) { - return ret; - } - ret = bdrv_flush(bs->file->bs); - if (ret < 0) { - return ret; - } - - /* Only treat image as dirty if the header was updated successfully */ - s->incompatible_features |= QCOW2_INCOMPAT_DIRTY; - return 0; -} - -/* - * Clears the dirty bit and flushes before if necessary. Only call this - * function when there are no pending requests, it does not guard against - * concurrent requests dirtying the image. - */ -static int qcow2_mark_clean(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - - if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { - int ret; - - s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; - - ret = bdrv_flush(bs); - if (ret < 0) { - return ret; - } - - return qcow2_update_header(bs); - } - return 0; -} - -/* - * Marks the image as corrupt. 
- */ -int qcow2_mark_corrupt(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - - s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT; - return qcow2_update_header(bs); -} - -/* - * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes - * before if necessary. - */ -int qcow2_mark_consistent(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - - if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { - int ret = bdrv_flush(bs); - if (ret < 0) { - return ret; - } - - s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT; - return qcow2_update_header(bs); - } - return 0; -} - -static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, - BdrvCheckMode fix) -{ - int ret = qcow2_check_refcounts(bs, result, fix); - if (ret < 0) { - return ret; - } - - if (fix && result->check_errors == 0 && result->corruptions == 0) { - ret = qcow2_mark_clean(bs); - if (ret < 0) { - return ret; - } - return qcow2_mark_consistent(bs); - } - return ret; -} - -static int validate_table_offset(BlockDriverState *bs, uint64_t offset, - uint64_t entries, size_t entry_len) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t size; - - /* Use signed INT64_MAX as the maximum even for uint64_t header fields, - * because values will be passed to qemu functions taking int64_t. 
*/ - if (entries > INT64_MAX / entry_len) { - return -EINVAL; - } - - size = entries * entry_len; - - if (INT64_MAX - size < offset) { - return -EINVAL; - } - - /* Tables must be cluster aligned */ - if (offset & (s->cluster_size - 1)) { - return -EINVAL; - } - - return 0; -} - -static QemuOptsList qcow2_runtime_opts = { - .name = "qcow2", - .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head), - .desc = { - { - .name = QCOW2_OPT_LAZY_REFCOUNTS, - .type = QEMU_OPT_BOOL, - .help = "Postpone refcount updates", - }, - { - .name = QCOW2_OPT_DISCARD_REQUEST, - .type = QEMU_OPT_BOOL, - .help = "Pass guest discard requests to the layer below", - }, - { - .name = QCOW2_OPT_DISCARD_SNAPSHOT, - .type = QEMU_OPT_BOOL, - .help = "Generate discard requests when snapshot related space " - "is freed", - }, - { - .name = QCOW2_OPT_DISCARD_OTHER, - .type = QEMU_OPT_BOOL, - .help = "Generate discard requests when other clusters are freed", - }, - { - .name = QCOW2_OPT_OVERLAP, - .type = QEMU_OPT_STRING, - .help = "Selects which overlap checks to perform from a range of " - "templates (none, constant, cached, all)", - }, - { - .name = QCOW2_OPT_OVERLAP_TEMPLATE, - .type = QEMU_OPT_STRING, - .help = "Selects which overlap checks to perform from a range of " - "templates (none, constant, cached, all)", - }, - { - .name = QCOW2_OPT_OVERLAP_MAIN_HEADER, - .type = QEMU_OPT_BOOL, - .help = "Check for unintended writes into the main qcow2 header", - }, - { - .name = QCOW2_OPT_OVERLAP_ACTIVE_L1, - .type = QEMU_OPT_BOOL, - .help = "Check for unintended writes into the active L1 table", - }, - { - .name = QCOW2_OPT_OVERLAP_ACTIVE_L2, - .type = QEMU_OPT_BOOL, - .help = "Check for unintended writes into an active L2 table", - }, - { - .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, - .type = QEMU_OPT_BOOL, - .help = "Check for unintended writes into the refcount table", - }, - { - .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, - .type = QEMU_OPT_BOOL, - .help = "Check for unintended writes into a 
refcount block", - }, - { - .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, - .type = QEMU_OPT_BOOL, - .help = "Check for unintended writes into the snapshot table", - }, - { - .name = QCOW2_OPT_OVERLAP_INACTIVE_L1, - .type = QEMU_OPT_BOOL, - .help = "Check for unintended writes into an inactive L1 table", - }, - { - .name = QCOW2_OPT_OVERLAP_INACTIVE_L2, - .type = QEMU_OPT_BOOL, - .help = "Check for unintended writes into an inactive L2 table", - }, - { - .name = QCOW2_OPT_CACHE_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Maximum combined metadata (L2 tables and refcount blocks) " - "cache size", - }, - { - .name = QCOW2_OPT_L2_CACHE_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Maximum L2 table cache size", - }, - { - .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Maximum refcount block cache size", - }, - { - .name = QCOW2_OPT_CACHE_CLEAN_INTERVAL, - .type = QEMU_OPT_NUMBER, - .help = "Clean unused cache entries after this time (in seconds)", - }, - { /* end of list */ } - }, -}; - -static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = { - [QCOW2_OL_MAIN_HEADER_BITNR] = QCOW2_OPT_OVERLAP_MAIN_HEADER, - [QCOW2_OL_ACTIVE_L1_BITNR] = QCOW2_OPT_OVERLAP_ACTIVE_L1, - [QCOW2_OL_ACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_ACTIVE_L2, - [QCOW2_OL_REFCOUNT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, - [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, - [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, - [QCOW2_OL_INACTIVE_L1_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L1, - [QCOW2_OL_INACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L2, -}; - -static void cache_clean_timer_cb(void *opaque) -{ - BlockDriverState *bs = opaque; - BDRVQcow2State *s = bs->opaque; - qcow2_cache_clean_unused(bs, s->l2_table_cache); - qcow2_cache_clean_unused(bs, s->refcount_block_cache); - timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + - (int64_t) s->cache_clean_interval * 1000); -} - -static void 
cache_clean_timer_init(BlockDriverState *bs, AioContext *context) -{ - BDRVQcow2State *s = bs->opaque; - if (s->cache_clean_interval > 0) { - s->cache_clean_timer = aio_timer_new(context, QEMU_CLOCK_VIRTUAL, - SCALE_MS, cache_clean_timer_cb, - bs); - timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + - (int64_t) s->cache_clean_interval * 1000); - } -} - -static void cache_clean_timer_del(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - if (s->cache_clean_timer) { - timer_del(s->cache_clean_timer); - timer_free(s->cache_clean_timer); - s->cache_clean_timer = NULL; - } -} - -static void qcow2_detach_aio_context(BlockDriverState *bs) -{ - cache_clean_timer_del(bs); -} - -static void qcow2_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - cache_clean_timer_init(bs, new_context); -} - -static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts, - uint64_t *l2_cache_size, - uint64_t *refcount_cache_size, Error **errp) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t combined_cache_size; - bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set; - - combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE); - l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE); - refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE); - - combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0); - *l2_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE, 0); - *refcount_cache_size = qemu_opt_get_size(opts, - QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0); - - if (combined_cache_size_set) { - if (l2_cache_size_set && refcount_cache_size_set) { - error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE - " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set " - "the same time"); - return; - } else if (*l2_cache_size > combined_cache_size) { - error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed " - QCOW2_OPT_CACHE_SIZE); - return; - } else if 
(*refcount_cache_size > combined_cache_size) { - error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed " - QCOW2_OPT_CACHE_SIZE); - return; - } - - if (l2_cache_size_set) { - *refcount_cache_size = combined_cache_size - *l2_cache_size; - } else if (refcount_cache_size_set) { - *l2_cache_size = combined_cache_size - *refcount_cache_size; - } else { - *refcount_cache_size = combined_cache_size - / (DEFAULT_L2_REFCOUNT_SIZE_RATIO + 1); - *l2_cache_size = combined_cache_size - *refcount_cache_size; - } - } else { - if (!l2_cache_size_set && !refcount_cache_size_set) { - *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE, - (uint64_t)DEFAULT_L2_CACHE_CLUSTERS - * s->cluster_size); - *refcount_cache_size = *l2_cache_size - / DEFAULT_L2_REFCOUNT_SIZE_RATIO; - } else if (!l2_cache_size_set) { - *l2_cache_size = *refcount_cache_size - * DEFAULT_L2_REFCOUNT_SIZE_RATIO; - } else if (!refcount_cache_size_set) { - *refcount_cache_size = *l2_cache_size - / DEFAULT_L2_REFCOUNT_SIZE_RATIO; - } - } -} - -typedef struct Qcow2ReopenState { - Qcow2Cache *l2_table_cache; - Qcow2Cache *refcount_block_cache; - bool use_lazy_refcounts; - int overlap_check; - bool discard_passthrough[QCOW2_DISCARD_MAX]; - uint64_t cache_clean_interval; -} Qcow2ReopenState; - -static int qcow2_update_options_prepare(BlockDriverState *bs, - Qcow2ReopenState *r, - QDict *options, int flags, - Error **errp) -{ - BDRVQcow2State *s = bs->opaque; - QemuOpts *opts = NULL; - const char *opt_overlap_check, *opt_overlap_check_template; - int overlap_check_template = 0; - uint64_t l2_cache_size, refcount_cache_size; - int i; - Error *local_err = NULL; - int ret; - - opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; - } - - /* get L2 table/refcount block cache size from command line options */ - read_cache_sizes(bs, opts, &l2_cache_size, 
&refcount_cache_size, - &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; - } - - l2_cache_size /= s->cluster_size; - if (l2_cache_size < MIN_L2_CACHE_SIZE) { - l2_cache_size = MIN_L2_CACHE_SIZE; - } - if (l2_cache_size > INT_MAX) { - error_setg(errp, "L2 cache size too big"); - ret = -EINVAL; - goto fail; - } - - refcount_cache_size /= s->cluster_size; - if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) { - refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE; - } - if (refcount_cache_size > INT_MAX) { - error_setg(errp, "Refcount cache size too big"); - ret = -EINVAL; - goto fail; - } - - /* alloc new L2 table/refcount block cache, flush old one */ - if (s->l2_table_cache) { - ret = qcow2_cache_flush(bs, s->l2_table_cache); - if (ret) { - error_setg_errno(errp, -ret, "Failed to flush the L2 table cache"); - goto fail; - } - } - - if (s->refcount_block_cache) { - ret = qcow2_cache_flush(bs, s->refcount_block_cache); - if (ret) { - error_setg_errno(errp, -ret, - "Failed to flush the refcount block cache"); - goto fail; - } - } - - r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size); - r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size); - if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) { - error_setg(errp, "Could not allocate metadata caches"); - ret = -ENOMEM; - goto fail; - } - - /* New interval for cache cleanup timer */ - r->cache_clean_interval = - qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL, - s->cache_clean_interval); - if (r->cache_clean_interval > UINT_MAX) { - error_setg(errp, "Cache clean interval too big"); - ret = -EINVAL; - goto fail; - } - - /* lazy-refcounts; flush if going from enabled to disabled */ - r->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS, - (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)); - if (r->use_lazy_refcounts && s->qcow_version < 3) { - error_setg(errp, "Lazy refcounts require a qcow2 image with at 
least " - "qemu 1.1 compatibility level"); - ret = -EINVAL; - goto fail; - } - - if (s->use_lazy_refcounts && !r->use_lazy_refcounts) { - ret = qcow2_mark_clean(bs); - if (ret < 0) { - error_setg_errno(errp, -ret, "Failed to disable lazy refcounts"); - goto fail; - } - } - - /* Overlap check options */ - opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP); - opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE); - if (opt_overlap_check_template && opt_overlap_check && - strcmp(opt_overlap_check_template, opt_overlap_check)) - { - error_setg(errp, "Conflicting values for qcow2 options '" - QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE - "' ('%s')", opt_overlap_check, opt_overlap_check_template); - ret = -EINVAL; - goto fail; - } - if (!opt_overlap_check) { - opt_overlap_check = opt_overlap_check_template ?: "cached"; - } - - if (!strcmp(opt_overlap_check, "none")) { - overlap_check_template = 0; - } else if (!strcmp(opt_overlap_check, "constant")) { - overlap_check_template = QCOW2_OL_CONSTANT; - } else if (!strcmp(opt_overlap_check, "cached")) { - overlap_check_template = QCOW2_OL_CACHED; - } else if (!strcmp(opt_overlap_check, "all")) { - overlap_check_template = QCOW2_OL_ALL; - } else { - error_setg(errp, "Unsupported value '%s' for qcow2 option " - "'overlap-check'. 
Allowed are any of the following: " - "none, constant, cached, all", opt_overlap_check); - ret = -EINVAL; - goto fail; - } - - r->overlap_check = 0; - for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) { - /* overlap-check defines a template bitmask, but every flag may be - * overwritten through the associated boolean option */ - r->overlap_check |= - qemu_opt_get_bool(opts, overlap_bool_option_names[i], - overlap_check_template & (1 << i)) << i; - } - - r->discard_passthrough[QCOW2_DISCARD_NEVER] = false; - r->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true; - r->discard_passthrough[QCOW2_DISCARD_REQUEST] = - qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST, - flags & BDRV_O_UNMAP); - r->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] = - qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true); - r->discard_passthrough[QCOW2_DISCARD_OTHER] = - qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); - - ret = 0; -fail: - qemu_opts_del(opts); - opts = NULL; - return ret; -} - -static void qcow2_update_options_commit(BlockDriverState *bs, - Qcow2ReopenState *r) -{ - BDRVQcow2State *s = bs->opaque; - int i; - - if (s->l2_table_cache) { - qcow2_cache_destroy(bs, s->l2_table_cache); - } - if (s->refcount_block_cache) { - qcow2_cache_destroy(bs, s->refcount_block_cache); - } - s->l2_table_cache = r->l2_table_cache; - s->refcount_block_cache = r->refcount_block_cache; - - s->overlap_check = r->overlap_check; - s->use_lazy_refcounts = r->use_lazy_refcounts; - - for (i = 0; i < QCOW2_DISCARD_MAX; i++) { - s->discard_passthrough[i] = r->discard_passthrough[i]; - } - - if (s->cache_clean_interval != r->cache_clean_interval) { - cache_clean_timer_del(bs); - s->cache_clean_interval = r->cache_clean_interval; - cache_clean_timer_init(bs, bdrv_get_aio_context(bs)); - } -} - -static void qcow2_update_options_abort(BlockDriverState *bs, - Qcow2ReopenState *r) -{ - if (r->l2_table_cache) { - qcow2_cache_destroy(bs, r->l2_table_cache); - } - if (r->refcount_block_cache) { - 
qcow2_cache_destroy(bs, r->refcount_block_cache); - } -} - -static int qcow2_update_options(BlockDriverState *bs, QDict *options, - int flags, Error **errp) -{ - Qcow2ReopenState r = {}; - int ret; - - ret = qcow2_update_options_prepare(bs, &r, options, flags, errp); - if (ret >= 0) { - qcow2_update_options_commit(bs, &r); - } else { - qcow2_update_options_abort(bs, &r); - } - - return ret; -} - -static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVQcow2State *s = bs->opaque; - unsigned int len, i; - int ret = 0; - QCowHeader header; - Error *local_err = NULL; - uint64_t ext_end; - uint64_t l1_vm_state_index; - - ret = bdrv_pread(bs->file->bs, 0, &header, sizeof(header)); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not read qcow2 header"); - goto fail; - } - be32_to_cpus(&header.magic); - be32_to_cpus(&header.version); - be64_to_cpus(&header.backing_file_offset); - be32_to_cpus(&header.backing_file_size); - be64_to_cpus(&header.size); - be32_to_cpus(&header.cluster_bits); - be32_to_cpus(&header.crypt_method); - be64_to_cpus(&header.l1_table_offset); - be32_to_cpus(&header.l1_size); - be64_to_cpus(&header.refcount_table_offset); - be32_to_cpus(&header.refcount_table_clusters); - be64_to_cpus(&header.snapshots_offset); - be32_to_cpus(&header.nb_snapshots); - - if (header.magic != QCOW_MAGIC) { - error_setg(errp, "Image is not in qcow2 format"); - ret = -EINVAL; - goto fail; - } - if (header.version < 2 || header.version > 3) { - error_setg(errp, "Unsupported qcow2 version %" PRIu32, header.version); - ret = -ENOTSUP; - goto fail; - } - - s->qcow_version = header.version; - - /* Initialise cluster size */ - if (header.cluster_bits < MIN_CLUSTER_BITS || - header.cluster_bits > MAX_CLUSTER_BITS) { - error_setg(errp, "Unsupported cluster size: 2^%" PRIu32, - header.cluster_bits); - ret = -EINVAL; - goto fail; - } - - s->cluster_bits = header.cluster_bits; - s->cluster_size = 1 << s->cluster_bits; - s->cluster_sectors 
= 1 << (s->cluster_bits - 9); - - /* Initialise version 3 header fields */ - if (header.version == 2) { - header.incompatible_features = 0; - header.compatible_features = 0; - header.autoclear_features = 0; - header.refcount_order = 4; - header.header_length = 72; - } else { - be64_to_cpus(&header.incompatible_features); - be64_to_cpus(&header.compatible_features); - be64_to_cpus(&header.autoclear_features); - be32_to_cpus(&header.refcount_order); - be32_to_cpus(&header.header_length); - - if (header.header_length < 104) { - error_setg(errp, "qcow2 header too short"); - ret = -EINVAL; - goto fail; - } - } - - if (header.header_length > s->cluster_size) { - error_setg(errp, "qcow2 header exceeds cluster size"); - ret = -EINVAL; - goto fail; - } - - if (header.header_length > sizeof(header)) { - s->unknown_header_fields_size = header.header_length - sizeof(header); - s->unknown_header_fields = g_malloc(s->unknown_header_fields_size); - ret = bdrv_pread(bs->file->bs, sizeof(header), s->unknown_header_fields, - s->unknown_header_fields_size); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not read unknown qcow2 header " - "fields"); - goto fail; - } - } - - if (header.backing_file_offset > s->cluster_size) { - error_setg(errp, "Invalid backing file offset"); - ret = -EINVAL; - goto fail; - } - - if (header.backing_file_offset) { - ext_end = header.backing_file_offset; - } else { - ext_end = 1 << header.cluster_bits; - } - - /* Handle feature bits */ - s->incompatible_features = header.incompatible_features; - s->compatible_features = header.compatible_features; - s->autoclear_features = header.autoclear_features; - - if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) { - void *feature_table = NULL; - qcow2_read_extensions(bs, header.header_length, ext_end, - &feature_table, NULL); - report_unsupported_feature(errp, feature_table, - s->incompatible_features & - ~QCOW2_INCOMPAT_MASK); - ret = -ENOTSUP; - g_free(feature_table); - goto fail; - } - - if 
(s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { - /* Corrupt images may not be written to unless they are being repaired - */ - if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) { - error_setg(errp, "qcow2: Image is corrupt; cannot be opened " - "read/write"); - ret = -EACCES; - goto fail; - } - } - - /* Check support for various header values */ - if (header.refcount_order > 6) { - error_setg(errp, "Reference count entry width too large; may not " - "exceed 64 bits"); - ret = -EINVAL; - goto fail; - } - s->refcount_order = header.refcount_order; - s->refcount_bits = 1 << s->refcount_order; - s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1); - s->refcount_max += s->refcount_max - 1; - - if (header.crypt_method > QCOW_CRYPT_AES) { - error_setg(errp, "Unsupported encryption method: %" PRIu32, - header.crypt_method); - ret = -EINVAL; - goto fail; - } - if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128)) { - error_setg(errp, "AES cipher not available"); - ret = -EINVAL; - goto fail; - } - s->crypt_method_header = header.crypt_method; - if (s->crypt_method_header) { - if (bdrv_uses_whitelist() && - s->crypt_method_header == QCOW_CRYPT_AES) { - error_report("qcow2 built-in AES encryption is deprecated"); - error_printf("Support for it will be removed in a future release.\n" - "You can use 'qemu-img convert' to switch to an\n" - "unencrypted qcow2 image, or a LUKS raw image.\n"); - } - - bs->encrypted = 1; - } - - s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ - s->l2_size = 1 << s->l2_bits; - /* 2^(s->refcount_order - 3) is the refcount width in bytes */ - s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3); - s->refcount_block_size = 1 << s->refcount_block_bits; - bs->total_sectors = header.size / 512; - s->csize_shift = (62 - (s->cluster_bits - 8)); - s->csize_mask = (1 << (s->cluster_bits - 8)) - 1; - s->cluster_offset_mask = (1LL << s->csize_shift) - 1; - - s->refcount_table_offset = 
header.refcount_table_offset; - s->refcount_table_size = - header.refcount_table_clusters << (s->cluster_bits - 3); - - if (header.refcount_table_clusters > qcow2_max_refcount_clusters(s)) { - error_setg(errp, "Reference count table too large"); - ret = -EINVAL; - goto fail; - } - - ret = validate_table_offset(bs, s->refcount_table_offset, - s->refcount_table_size, sizeof(uint64_t)); - if (ret < 0) { - error_setg(errp, "Invalid reference count table offset"); - goto fail; - } - - /* Snapshot table offset/length */ - if (header.nb_snapshots > QCOW_MAX_SNAPSHOTS) { - error_setg(errp, "Too many snapshots"); - ret = -EINVAL; - goto fail; - } - - ret = validate_table_offset(bs, header.snapshots_offset, - header.nb_snapshots, - sizeof(QCowSnapshotHeader)); - if (ret < 0) { - error_setg(errp, "Invalid snapshot table offset"); - goto fail; - } - - /* read the level 1 table */ - if (header.l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) { - error_setg(errp, "Active L1 table too large"); - ret = -EFBIG; - goto fail; - } - s->l1_size = header.l1_size; - - l1_vm_state_index = size_to_l1(s, header.size); - if (l1_vm_state_index > INT_MAX) { - error_setg(errp, "Image is too big"); - ret = -EFBIG; - goto fail; - } - s->l1_vm_state_index = l1_vm_state_index; - - /* the L1 table must contain at least enough entries to put - header.size bytes */ - if (s->l1_size < s->l1_vm_state_index) { - error_setg(errp, "L1 table is too small"); - ret = -EINVAL; - goto fail; - } - - ret = validate_table_offset(bs, header.l1_table_offset, - header.l1_size, sizeof(uint64_t)); - if (ret < 0) { - error_setg(errp, "Invalid L1 table offset"); - goto fail; - } - s->l1_table_offset = header.l1_table_offset; - - - if (s->l1_size > 0) { - s->l1_table = qemu_try_blockalign(bs->file->bs, - align_offset(s->l1_size * sizeof(uint64_t), 512)); - if (s->l1_table == NULL) { - error_setg(errp, "Could not allocate L1 table"); - ret = -ENOMEM; - goto fail; - } - ret = bdrv_pread(bs->file->bs, s->l1_table_offset, 
s->l1_table, - s->l1_size * sizeof(uint64_t)); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not read L1 table"); - goto fail; - } - for(i = 0;i < s->l1_size; i++) { - be64_to_cpus(&s->l1_table[i]); - } - } - - /* Parse driver-specific options */ - ret = qcow2_update_options(bs, options, flags, errp); - if (ret < 0) { - goto fail; - } - - s->cluster_cache = g_malloc(s->cluster_size); - /* one more sector for decompressed data alignment */ - s->cluster_data = qemu_try_blockalign(bs->file->bs, QCOW_MAX_CRYPT_CLUSTERS - * s->cluster_size + 512); - if (s->cluster_data == NULL) { - error_setg(errp, "Could not allocate temporary cluster buffer"); - ret = -ENOMEM; - goto fail; - } - - s->cluster_cache_offset = -1; - s->flags = flags; - - ret = qcow2_refcount_init(bs); - if (ret != 0) { - error_setg_errno(errp, -ret, "Could not initialize refcount handling"); - goto fail; - } - - QLIST_INIT(&s->cluster_allocs); - QTAILQ_INIT(&s->discards); - - /* read qcow2 extensions */ - if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL, - &local_err)) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; - } - - /* read the backing file name */ - if (header.backing_file_offset != 0) { - len = header.backing_file_size; - if (len > MIN(1023, s->cluster_size - header.backing_file_offset) || - len >= sizeof(bs->backing_file)) { - error_setg(errp, "Backing file name too long"); - ret = -EINVAL; - goto fail; - } - ret = bdrv_pread(bs->file->bs, header.backing_file_offset, - bs->backing_file, len); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not read backing file name"); - goto fail; - } - bs->backing_file[len] = '\0'; - s->image_backing_file = g_strdup(bs->backing_file); - } - - /* Internal snapshots */ - s->snapshots_offset = header.snapshots_offset; - s->nb_snapshots = header.nb_snapshots; - - ret = qcow2_read_snapshots(bs); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not read snapshots"); - goto fail; - } - - /* Clear unknown 
autoclear feature bits */ - if (!bs->read_only && !(flags & BDRV_O_INACTIVE) && s->autoclear_features) { - s->autoclear_features = 0; - ret = qcow2_update_header(bs); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not update qcow2 header"); - goto fail; - } - } - - /* Initialise locks */ - qemu_co_mutex_init(&s->lock); - - /* Repair image if dirty */ - if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only && - (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) { - BdrvCheckResult result = {0}; - - ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS | BDRV_FIX_LEAKS); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not repair dirty image"); - goto fail; - } - } - -#ifdef DEBUG_ALLOC - { - BdrvCheckResult result = {0}; - qcow2_check_refcounts(bs, &result, 0); - } -#endif - return ret; - - fail: - g_free(s->unknown_header_fields); - cleanup_unknown_header_ext(bs); - qcow2_free_snapshots(bs); - qcow2_refcount_close(bs); - qemu_vfree(s->l1_table); - /* else pre-write overlap checks in cache_destroy may crash */ - s->l1_table = NULL; - cache_clean_timer_del(bs); - if (s->l2_table_cache) { - qcow2_cache_destroy(bs, s->l2_table_cache); - } - if (s->refcount_block_cache) { - qcow2_cache_destroy(bs, s->refcount_block_cache); - } - g_free(s->cluster_cache); - qemu_vfree(s->cluster_data); - return ret; -} - -static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp) -{ - BDRVQcow2State *s = bs->opaque; - - bs->bl.write_zeroes_alignment = s->cluster_sectors; -} - -static int qcow2_set_key(BlockDriverState *bs, const char *key) -{ - BDRVQcow2State *s = bs->opaque; - uint8_t keybuf[16]; - int len, i; - Error *err = NULL; - - memset(keybuf, 0, 16); - len = strlen(key); - if (len > 16) - len = 16; - /* XXX: we could compress the chars to 7 bits to increase - entropy */ - for(i = 0;i < len;i++) { - keybuf[i] = key[i]; - } - assert(bs->encrypted); - - qcrypto_cipher_free(s->cipher); - s->cipher = qcrypto_cipher_new( - QCRYPTO_CIPHER_ALG_AES_128, - 
QCRYPTO_CIPHER_MODE_CBC, - keybuf, G_N_ELEMENTS(keybuf), - &err); - - if (!s->cipher) { - /* XXX would be nice if errors in this method could - * be properly propagate to the caller. Would need - * the bdrv_set_key() API signature to be fixed. */ - error_free(err); - return -1; - } - return 0; -} - -static int qcow2_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) -{ - Qcow2ReopenState *r; - int ret; - - r = g_new0(Qcow2ReopenState, 1); - state->opaque = r; - - ret = qcow2_update_options_prepare(state->bs, r, state->options, - state->flags, errp); - if (ret < 0) { - goto fail; - } - - /* We need to write out any unwritten data if we reopen read-only. */ - if ((state->flags & BDRV_O_RDWR) == 0) { - ret = bdrv_flush(state->bs); - if (ret < 0) { - goto fail; - } - - ret = qcow2_mark_clean(state->bs); - if (ret < 0) { - goto fail; - } - } - - return 0; - -fail: - qcow2_update_options_abort(state->bs, r); - g_free(r); - return ret; -} - -static void qcow2_reopen_commit(BDRVReopenState *state) -{ - qcow2_update_options_commit(state->bs, state->opaque); - g_free(state->opaque); -} - -static void qcow2_reopen_abort(BDRVReopenState *state) -{ - qcow2_update_options_abort(state->bs, state->opaque); - g_free(state->opaque); -} - -static void qcow2_join_options(QDict *options, QDict *old_options) -{ - bool has_new_overlap_template = - qdict_haskey(options, QCOW2_OPT_OVERLAP) || - qdict_haskey(options, QCOW2_OPT_OVERLAP_TEMPLATE); - bool has_new_total_cache_size = - qdict_haskey(options, QCOW2_OPT_CACHE_SIZE); - bool has_all_cache_options; - - /* New overlap template overrides all old overlap options */ - if (has_new_overlap_template) { - qdict_del(old_options, QCOW2_OPT_OVERLAP); - qdict_del(old_options, QCOW2_OPT_OVERLAP_TEMPLATE); - qdict_del(old_options, QCOW2_OPT_OVERLAP_MAIN_HEADER); - qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L1); - qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L2); - qdict_del(old_options, 
QCOW2_OPT_OVERLAP_REFCOUNT_TABLE); - qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK); - qdict_del(old_options, QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE); - qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L1); - qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L2); - } - - /* New total cache size overrides all old options */ - if (qdict_haskey(options, QCOW2_OPT_CACHE_SIZE)) { - qdict_del(old_options, QCOW2_OPT_L2_CACHE_SIZE); - qdict_del(old_options, QCOW2_OPT_REFCOUNT_CACHE_SIZE); - } - - qdict_join(options, old_options, false); - - /* - * If after merging all cache size options are set, an old total size is - * overwritten. Do keep all options, however, if all three are new. The - * resulting error message is what we want to happen. - */ - has_all_cache_options = - qdict_haskey(options, QCOW2_OPT_CACHE_SIZE) || - qdict_haskey(options, QCOW2_OPT_L2_CACHE_SIZE) || - qdict_haskey(options, QCOW2_OPT_REFCOUNT_CACHE_SIZE); - - if (has_all_cache_options && !has_new_total_cache_size) { - qdict_del(options, QCOW2_OPT_CACHE_SIZE); - } -} - -static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t cluster_offset; - int index_in_cluster, ret; - int64_t status = 0; - - *pnum = nb_sectors; - qemu_co_mutex_lock(&s->lock); - ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); - qemu_co_mutex_unlock(&s->lock); - if (ret < 0) { - return ret; - } - - if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED && - !s->cipher) { - index_in_cluster = sector_num & (s->cluster_sectors - 1); - cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); - *file = bs->file->bs; - status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset; - } - if (ret == QCOW2_CLUSTER_ZERO) { - status |= BDRV_BLOCK_ZERO; - } else if (ret != QCOW2_CLUSTER_UNALLOCATED) { - status |= BDRV_BLOCK_DATA; - } - return status; -} - -/* handle reading after 
the end of the backing file */ -int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, - int64_t sector_num, int nb_sectors) -{ - int n1; - if ((sector_num + nb_sectors) <= bs->total_sectors) - return nb_sectors; - if (sector_num >= bs->total_sectors) - n1 = 0; - else - n1 = bs->total_sectors - sector_num; - - qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1)); - - return n1; -} - -static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, - int remaining_sectors, QEMUIOVector *qiov) -{ - BDRVQcow2State *s = bs->opaque; - int index_in_cluster, n1; - int ret; - int cur_nr_sectors; /* number of sectors in current iteration */ - uint64_t cluster_offset = 0; - uint64_t bytes_done = 0; - QEMUIOVector hd_qiov; - uint8_t *cluster_data = NULL; - - qemu_iovec_init(&hd_qiov, qiov->niov); - - qemu_co_mutex_lock(&s->lock); - - while (remaining_sectors != 0) { - - /* prepare next request */ - cur_nr_sectors = remaining_sectors; - if (s->cipher) { - cur_nr_sectors = MIN(cur_nr_sectors, - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); - } - - ret = qcow2_get_cluster_offset(bs, sector_num << 9, - &cur_nr_sectors, &cluster_offset); - if (ret < 0) { - goto fail; - } - - index_in_cluster = sector_num & (s->cluster_sectors - 1); - - qemu_iovec_reset(&hd_qiov); - qemu_iovec_concat(&hd_qiov, qiov, bytes_done, - cur_nr_sectors * 512); - - switch (ret) { - case QCOW2_CLUSTER_UNALLOCATED: - - if (bs->backing) { - /* read from the base image */ - n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov, - sector_num, cur_nr_sectors); - if (n1 > 0) { - QEMUIOVector local_qiov; - - qemu_iovec_init(&local_qiov, hd_qiov.niov); - qemu_iovec_concat(&local_qiov, &hd_qiov, 0, - n1 * BDRV_SECTOR_SIZE); - - BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->backing->bs, sector_num, - n1, &local_qiov); - qemu_co_mutex_lock(&s->lock); - - qemu_iovec_destroy(&local_qiov); - - if (ret < 0) { - goto fail; - } - 
} - } else { - /* Note: in this case, no need to wait */ - qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); - } - break; - - case QCOW2_CLUSTER_ZERO: - qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); - break; - - case QCOW2_CLUSTER_COMPRESSED: - /* add AIO support for compressed blocks ? */ - ret = qcow2_decompress_cluster(bs, cluster_offset); - if (ret < 0) { - goto fail; - } - - qemu_iovec_from_buf(&hd_qiov, 0, - s->cluster_cache + index_in_cluster * 512, - 512 * cur_nr_sectors); - break; - - case QCOW2_CLUSTER_NORMAL: - if ((cluster_offset & 511) != 0) { - ret = -EIO; - goto fail; - } - - if (bs->encrypted) { - assert(s->cipher); - - /* - * For encrypted images, read everything into a temporary - * contiguous buffer on which the AES functions can work. - */ - if (!cluster_data) { - cluster_data = - qemu_try_blockalign(bs->file->bs, - QCOW_MAX_CRYPT_CLUSTERS - * s->cluster_size); - if (cluster_data == NULL) { - ret = -ENOMEM; - goto fail; - } - } - - assert(cur_nr_sectors <= - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); - qemu_iovec_reset(&hd_qiov); - qemu_iovec_add(&hd_qiov, cluster_data, - 512 * cur_nr_sectors); - } - - BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file->bs, - (cluster_offset >> 9) + index_in_cluster, - cur_nr_sectors, &hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - goto fail; - } - if (bs->encrypted) { - assert(s->cipher); - Error *err = NULL; - if (qcow2_encrypt_sectors(s, sector_num, cluster_data, - cluster_data, cur_nr_sectors, false, - &err) < 0) { - error_free(err); - ret = -EIO; - goto fail; - } - qemu_iovec_from_buf(qiov, bytes_done, - cluster_data, 512 * cur_nr_sectors); - } - break; - - default: - g_assert_not_reached(); - ret = -EIO; - goto fail; - } - - remaining_sectors -= cur_nr_sectors; - sector_num += cur_nr_sectors; - bytes_done += cur_nr_sectors * 512; - } - ret = 0; - -fail: - qemu_co_mutex_unlock(&s->lock); - - 
qemu_iovec_destroy(&hd_qiov); - qemu_vfree(cluster_data); - - return ret; -} - -static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, - int64_t sector_num, - int remaining_sectors, - QEMUIOVector *qiov) -{ - BDRVQcow2State *s = bs->opaque; - int index_in_cluster; - int ret; - int cur_nr_sectors; /* number of sectors in current iteration */ - uint64_t cluster_offset; - QEMUIOVector hd_qiov; - uint64_t bytes_done = 0; - uint8_t *cluster_data = NULL; - QCowL2Meta *l2meta = NULL; - - trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num, - remaining_sectors); - - qemu_iovec_init(&hd_qiov, qiov->niov); - - s->cluster_cache_offset = -1; /* disable compressed cache */ - - qemu_co_mutex_lock(&s->lock); - - while (remaining_sectors != 0) { - - l2meta = NULL; - - trace_qcow2_writev_start_part(qemu_coroutine_self()); - index_in_cluster = sector_num & (s->cluster_sectors - 1); - cur_nr_sectors = remaining_sectors; - if (bs->encrypted && - cur_nr_sectors > - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) { - cur_nr_sectors = - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster; - } - - ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, - &cur_nr_sectors, &cluster_offset, &l2meta); - if (ret < 0) { - goto fail; - } - - assert((cluster_offset & 511) == 0); - - qemu_iovec_reset(&hd_qiov); - qemu_iovec_concat(&hd_qiov, qiov, bytes_done, - cur_nr_sectors * 512); - - if (bs->encrypted) { - Error *err = NULL; - assert(s->cipher); - if (!cluster_data) { - cluster_data = qemu_try_blockalign(bs->file->bs, - QCOW_MAX_CRYPT_CLUSTERS - * s->cluster_size); - if (cluster_data == NULL) { - ret = -ENOMEM; - goto fail; - } - } - - assert(hd_qiov.size <= - QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); - qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size); - - if (qcow2_encrypt_sectors(s, sector_num, cluster_data, - cluster_data, cur_nr_sectors, - true, &err) < 0) { - error_free(err); - ret = -EIO; - goto fail; - } - - 
qemu_iovec_reset(&hd_qiov); - qemu_iovec_add(&hd_qiov, cluster_data, - cur_nr_sectors * 512); - } - - ret = qcow2_pre_write_overlap_check(bs, 0, - cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE, - cur_nr_sectors * BDRV_SECTOR_SIZE); - if (ret < 0) { - goto fail; - } - - qemu_co_mutex_unlock(&s->lock); - BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); - trace_qcow2_writev_data(qemu_coroutine_self(), - (cluster_offset >> 9) + index_in_cluster); - ret = bdrv_co_writev(bs->file->bs, - (cluster_offset >> 9) + index_in_cluster, - cur_nr_sectors, &hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - goto fail; - } - - while (l2meta != NULL) { - QCowL2Meta *next; - - ret = qcow2_alloc_cluster_link_l2(bs, l2meta); - if (ret < 0) { - goto fail; - } - - /* Take the request off the list of running requests */ - if (l2meta->nb_clusters != 0) { - QLIST_REMOVE(l2meta, next_in_flight); - } - - qemu_co_queue_restart_all(&l2meta->dependent_requests); - - next = l2meta->next; - g_free(l2meta); - l2meta = next; - } - - remaining_sectors -= cur_nr_sectors; - sector_num += cur_nr_sectors; - bytes_done += cur_nr_sectors * 512; - trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors); - } - ret = 0; - -fail: - qemu_co_mutex_unlock(&s->lock); - - while (l2meta != NULL) { - QCowL2Meta *next; - - if (l2meta->nb_clusters != 0) { - QLIST_REMOVE(l2meta, next_in_flight); - } - qemu_co_queue_restart_all(&l2meta->dependent_requests); - - next = l2meta->next; - g_free(l2meta); - l2meta = next; - } - - qemu_iovec_destroy(&hd_qiov); - qemu_vfree(cluster_data); - trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); - - return ret; -} - -static int qcow2_inactivate(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - int ret, result = 0; - - ret = qcow2_cache_flush(bs, s->l2_table_cache); - if (ret) { - result = ret; - error_report("Failed to flush the L2 table cache: %s", - strerror(-ret)); - } - - ret = qcow2_cache_flush(bs, s->refcount_block_cache); - if (ret) { - 
result = ret; - error_report("Failed to flush the refcount block cache: %s", - strerror(-ret)); - } - - if (result == 0) { - qcow2_mark_clean(bs); - } - - return result; -} - -static void qcow2_close(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - qemu_vfree(s->l1_table); - /* else pre-write overlap checks in cache_destroy may crash */ - s->l1_table = NULL; - - if (!(s->flags & BDRV_O_INACTIVE)) { - qcow2_inactivate(bs); - } - - cache_clean_timer_del(bs); - qcow2_cache_destroy(bs, s->l2_table_cache); - qcow2_cache_destroy(bs, s->refcount_block_cache); - - qcrypto_cipher_free(s->cipher); - s->cipher = NULL; - - g_free(s->unknown_header_fields); - cleanup_unknown_header_ext(bs); - - g_free(s->image_backing_file); - g_free(s->image_backing_format); - - g_free(s->cluster_cache); - qemu_vfree(s->cluster_data); - qcow2_refcount_close(bs); - qcow2_free_snapshots(bs); -} - -static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) -{ - BDRVQcow2State *s = bs->opaque; - int flags = s->flags; - QCryptoCipher *cipher = NULL; - QDict *options; - Error *local_err = NULL; - int ret; - - /* - * Backing files are read-only which makes all of their metadata immutable, - * that means we don't have to worry about reopening them here. 
- */ - - cipher = s->cipher; - s->cipher = NULL; - - qcow2_close(bs); - - bdrv_invalidate_cache(bs->file->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - bs->drv = NULL; - return; - } - - memset(s, 0, sizeof(BDRVQcow2State)); - options = qdict_clone_shallow(bs->options); - - flags &= ~BDRV_O_INACTIVE; - ret = qcow2_open(bs, options, flags, &local_err); - QDECREF(options); - if (local_err) { - error_propagate(errp, local_err); - error_prepend(errp, "Could not reopen qcow2 layer: "); - bs->drv = NULL; - return; - } else if (ret < 0) { - error_setg_errno(errp, -ret, "Could not reopen qcow2 layer"); - bs->drv = NULL; - return; - } - - s->cipher = cipher; -} - -static size_t header_ext_add(char *buf, uint32_t magic, const void *s, - size_t len, size_t buflen) -{ - QCowExtension *ext_backing_fmt = (QCowExtension*) buf; - size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7); - - if (buflen < ext_len) { - return -ENOSPC; - } - - *ext_backing_fmt = (QCowExtension) { - .magic = cpu_to_be32(magic), - .len = cpu_to_be32(len), - }; - memcpy(buf + sizeof(QCowExtension), s, len); - - return ext_len; -} - -/* - * Updates the qcow2 header, including the variable length parts of it, i.e. - * the backing file name and all extensions. qcow2 was not designed to allow - * such changes, so if we run out of space (we can only use the first cluster) - * this function may fail. - * - * Returns 0 on success, -errno in error cases. 
- */ -int qcow2_update_header(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - QCowHeader *header; - char *buf; - size_t buflen = s->cluster_size; - int ret; - uint64_t total_size; - uint32_t refcount_table_clusters; - size_t header_length; - Qcow2UnknownHeaderExtension *uext; - - buf = qemu_blockalign(bs, buflen); - - /* Header structure */ - header = (QCowHeader*) buf; - - if (buflen < sizeof(*header)) { - ret = -ENOSPC; - goto fail; - } - - header_length = sizeof(*header) + s->unknown_header_fields_size; - total_size = bs->total_sectors * BDRV_SECTOR_SIZE; - refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3); - - *header = (QCowHeader) { - /* Version 2 fields */ - .magic = cpu_to_be32(QCOW_MAGIC), - .version = cpu_to_be32(s->qcow_version), - .backing_file_offset = 0, - .backing_file_size = 0, - .cluster_bits = cpu_to_be32(s->cluster_bits), - .size = cpu_to_be64(total_size), - .crypt_method = cpu_to_be32(s->crypt_method_header), - .l1_size = cpu_to_be32(s->l1_size), - .l1_table_offset = cpu_to_be64(s->l1_table_offset), - .refcount_table_offset = cpu_to_be64(s->refcount_table_offset), - .refcount_table_clusters = cpu_to_be32(refcount_table_clusters), - .nb_snapshots = cpu_to_be32(s->nb_snapshots), - .snapshots_offset = cpu_to_be64(s->snapshots_offset), - - /* Version 3 fields */ - .incompatible_features = cpu_to_be64(s->incompatible_features), - .compatible_features = cpu_to_be64(s->compatible_features), - .autoclear_features = cpu_to_be64(s->autoclear_features), - .refcount_order = cpu_to_be32(s->refcount_order), - .header_length = cpu_to_be32(header_length), - }; - - /* For older versions, write a shorter header */ - switch (s->qcow_version) { - case 2: - ret = offsetof(QCowHeader, incompatible_features); - break; - case 3: - ret = sizeof(*header); - break; - default: - ret = -EINVAL; - goto fail; - } - - buf += ret; - buflen -= ret; - memset(buf, 0, buflen); - - /* Preserve any unknown field in the header */ - if 
(s->unknown_header_fields_size) { - if (buflen < s->unknown_header_fields_size) { - ret = -ENOSPC; - goto fail; - } - - memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size); - buf += s->unknown_header_fields_size; - buflen -= s->unknown_header_fields_size; - } - - /* Backing file format header extension */ - if (s->image_backing_format) { - ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT, - s->image_backing_format, - strlen(s->image_backing_format), - buflen); - if (ret < 0) { - goto fail; - } - - buf += ret; - buflen -= ret; - } - - /* Feature table */ - if (s->qcow_version >= 3) { - Qcow2Feature features[] = { - { - .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, - .bit = QCOW2_INCOMPAT_DIRTY_BITNR, - .name = "dirty bit", - }, - { - .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, - .bit = QCOW2_INCOMPAT_CORRUPT_BITNR, - .name = "corrupt bit", - }, - { - .type = QCOW2_FEAT_TYPE_COMPATIBLE, - .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, - .name = "lazy refcounts", - }, - }; - - ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE, - features, sizeof(features), buflen); - if (ret < 0) { - goto fail; - } - buf += ret; - buflen -= ret; - } - - /* Keep unknown header extensions */ - QLIST_FOREACH(uext, &s->unknown_header_ext, next) { - ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen); - if (ret < 0) { - goto fail; - } - - buf += ret; - buflen -= ret; - } - - /* End of header extensions */ - ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen); - if (ret < 0) { - goto fail; - } - - buf += ret; - buflen -= ret; - - /* Backing file name */ - if (s->image_backing_file) { - size_t backing_file_len = strlen(s->image_backing_file); - - if (buflen < backing_file_len) { - ret = -ENOSPC; - goto fail; - } - - /* Using strncpy is ok here, since buf is not NUL-terminated. 
*/ - strncpy(buf, s->image_backing_file, buflen); - - header->backing_file_offset = cpu_to_be64(buf - ((char*) header)); - header->backing_file_size = cpu_to_be32(backing_file_len); - } - - /* Write the new header */ - ret = bdrv_pwrite(bs->file->bs, 0, header, s->cluster_size); - if (ret < 0) { - goto fail; - } - - ret = 0; -fail: - qemu_vfree(header); - return ret; -} - -static int qcow2_change_backing_file(BlockDriverState *bs, - const char *backing_file, const char *backing_fmt) -{ - BDRVQcow2State *s = bs->opaque; - - if (backing_file && strlen(backing_file) > 1023) { - return -EINVAL; - } - - pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); - pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); - - g_free(s->image_backing_file); - g_free(s->image_backing_format); - - s->image_backing_file = backing_file ? g_strdup(bs->backing_file) : NULL; - s->image_backing_format = backing_fmt ? g_strdup(bs->backing_format) : NULL; - - return qcow2_update_header(bs); -} - -static int preallocate(BlockDriverState *bs) -{ - uint64_t nb_sectors; - uint64_t offset; - uint64_t host_offset = 0; - int num; - int ret; - QCowL2Meta *meta; - - nb_sectors = bdrv_nb_sectors(bs); - offset = 0; - - while (nb_sectors) { - num = MIN(nb_sectors, INT_MAX >> BDRV_SECTOR_BITS); - ret = qcow2_alloc_cluster_offset(bs, offset, &num, - &host_offset, &meta); - if (ret < 0) { - return ret; - } - - while (meta) { - QCowL2Meta *next = meta->next; - - ret = qcow2_alloc_cluster_link_l2(bs, meta); - if (ret < 0) { - qcow2_free_any_clusters(bs, meta->alloc_offset, - meta->nb_clusters, QCOW2_DISCARD_NEVER); - return ret; - } - - /* There are no dependent requests, but we need to remove our - * request from the list of in-flight requests */ - QLIST_REMOVE(meta, next_in_flight); - - g_free(meta); - meta = next; - } - - /* TODO Preallocate data if requested */ - - nb_sectors -= num; - offset += num << BDRV_SECTOR_BITS; - } - - /* - * It is expected that the image 
file is large enough to actually contain - * all of the allocated clusters (otherwise we get failing reads after - * EOF). Extend the image to the last allocated sector. - */ - if (host_offset != 0) { - uint8_t buf[BDRV_SECTOR_SIZE]; - memset(buf, 0, BDRV_SECTOR_SIZE); - ret = bdrv_write(bs->file->bs, - (host_offset >> BDRV_SECTOR_BITS) + num - 1, - buf, 1); - if (ret < 0) { - return ret; - } - } - - return 0; -} - -static int qcow2_create2(const char *filename, int64_t total_size, - const char *backing_file, const char *backing_format, - int flags, size_t cluster_size, PreallocMode prealloc, - QemuOpts *opts, int version, int refcount_order, - Error **errp) -{ - int cluster_bits; - QDict *options; - - /* Calculate cluster_bits */ - cluster_bits = ctz32(cluster_size); - if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || - (1 << cluster_bits) != cluster_size) - { - error_setg(errp, "Cluster size must be a power of two between %d and " - "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); - return -EINVAL; - } - - /* - * Open the image file and write a minimal qcow2 header. - * - * We keep things simple and start with a zero-sized image. We also - * do without refcount blocks or a L1 table for now. We'll fix the - * inconsistency later. - * - * We do need a refcount table because growing the refcount table means - * allocating two new refcount blocks - the seconds of which would be at - * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file - * size for any qcow2 image. - */ - BlockBackend *blk; - QCowHeader *header; - uint64_t* refcount_table; - Error *local_err = NULL; - int ret; - - if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) { - /* Note: The following calculation does not need to be exact; if it is a - * bit off, either some bytes will be "leaked" (which is fine) or we - * will need to increase the file size by some bytes (which is fine, - * too, as long as the bulk is allocated here). 
Therefore, using - * floating point arithmetic is fine. */ - int64_t meta_size = 0; - uint64_t nreftablee, nrefblocke, nl1e, nl2e; - int64_t aligned_total_size = align_offset(total_size, cluster_size); - int refblock_bits, refblock_size; - /* refcount entry size in bytes */ - double rces = (1 << refcount_order) / 8.; - - /* see qcow2_open() */ - refblock_bits = cluster_bits - (refcount_order - 3); - refblock_size = 1 << refblock_bits; - - /* header: 1 cluster */ - meta_size += cluster_size; - - /* total size of L2 tables */ - nl2e = aligned_total_size / cluster_size; - nl2e = align_offset(nl2e, cluster_size / sizeof(uint64_t)); - meta_size += nl2e * sizeof(uint64_t); - - /* total size of L1 tables */ - nl1e = nl2e * sizeof(uint64_t) / cluster_size; - nl1e = align_offset(nl1e, cluster_size / sizeof(uint64_t)); - meta_size += nl1e * sizeof(uint64_t); - - /* total size of refcount blocks - * - * note: every host cluster is reference-counted, including metadata - * (even refcount blocks are recursively included). 
- * Let: - * a = total_size (this is the guest disk size) - * m = meta size not including refcount blocks and refcount tables - * c = cluster size - * y1 = number of refcount blocks entries - * y2 = meta size including everything - * rces = refcount entry size in bytes - * then, - * y1 = (y2 + a)/c - * y2 = y1 * rces + y1 * rces * sizeof(u64) / c + m - * we can get y1: - * y1 = (a + m) / (c - rces - rces * sizeof(u64) / c) - */ - nrefblocke = (aligned_total_size + meta_size + cluster_size) - / (cluster_size - rces - rces * sizeof(uint64_t) - / cluster_size); - meta_size += DIV_ROUND_UP(nrefblocke, refblock_size) * cluster_size; - - /* total size of refcount tables */ - nreftablee = nrefblocke / refblock_size; - nreftablee = align_offset(nreftablee, cluster_size / sizeof(uint64_t)); - meta_size += nreftablee * sizeof(uint64_t); - - qemu_opt_set_number(opts, BLOCK_OPT_SIZE, - aligned_total_size + meta_size, &error_abort); - qemu_opt_set(opts, BLOCK_OPT_PREALLOC, PreallocMode_lookup[prealloc], - &error_abort); - } - - ret = bdrv_create_file(filename, opts, &local_err); - if (ret < 0) { - error_propagate(errp, local_err); - return ret; - } - - blk = blk_new_open(filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); - if (blk == NULL) { - error_propagate(errp, local_err); - return -EIO; - } - - blk_set_allow_write_beyond_eof(blk, true); - - /* Write the header */ - QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header)); - header = g_malloc0(cluster_size); - *header = (QCowHeader) { - .magic = cpu_to_be32(QCOW_MAGIC), - .version = cpu_to_be32(version), - .cluster_bits = cpu_to_be32(cluster_bits), - .size = cpu_to_be64(0), - .l1_table_offset = cpu_to_be64(0), - .l1_size = cpu_to_be32(0), - .refcount_table_offset = cpu_to_be64(cluster_size), - .refcount_table_clusters = cpu_to_be32(1), - .refcount_order = cpu_to_be32(refcount_order), - .header_length = cpu_to_be32(sizeof(*header)), - }; - - if (flags & BLOCK_FLAG_ENCRYPT) { - header->crypt_method = 
cpu_to_be32(QCOW_CRYPT_AES); - } else { - header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); - } - - if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) { - header->compatible_features |= - cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); - } - - ret = blk_pwrite(blk, 0, header, cluster_size); - g_free(header); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not write qcow2 header"); - goto out; - } - - /* Write a refcount table with one refcount block */ - refcount_table = g_malloc0(2 * cluster_size); - refcount_table[0] = cpu_to_be64(2 * cluster_size); - ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size); - g_free(refcount_table); - - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not write refcount table"); - goto out; - } - - blk_unref(blk); - blk = NULL; - - /* - * And now open the image and make it consistent first (i.e. increase the - * refcount of the cluster that is occupied by the header and the refcount - * table) - */ - options = qdict_new(); - qdict_put(options, "driver", qstring_from_str("qcow2")); - blk = blk_new_open(filename, NULL, options, - BDRV_O_RDWR | BDRV_O_NO_FLUSH, &local_err); - if (blk == NULL) { - error_propagate(errp, local_err); - ret = -EIO; - goto out; - } - - ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 " - "header and refcount table"); - goto out; - - } else if (ret != 0) { - error_report("Huh, first cluster in empty image is already in use?"); - abort(); - } - - /* Create a full header (including things like feature table) */ - ret = qcow2_update_header(blk_bs(blk)); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not update qcow2 header"); - goto out; - } - - /* Okay, now that we have a valid image, let's give it the right size */ - ret = blk_truncate(blk, total_size); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not resize image"); - goto out; - } - - /* Want a backing file? 
There you go.*/ - if (backing_file) { - ret = bdrv_change_backing_file(blk_bs(blk), backing_file, backing_format); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not assign backing file '%s' " - "with format '%s'", backing_file, backing_format); - goto out; - } - } - - /* And if we're supposed to preallocate metadata, do that now */ - if (prealloc != PREALLOC_MODE_OFF) { - BDRVQcow2State *s = blk_bs(blk)->opaque; - qemu_co_mutex_lock(&s->lock); - ret = preallocate(blk_bs(blk)); - qemu_co_mutex_unlock(&s->lock); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not preallocate metadata"); - goto out; - } - } - - blk_unref(blk); - blk = NULL; - - /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */ - options = qdict_new(); - qdict_put(options, "driver", qstring_from_str("qcow2")); - blk = blk_new_open(filename, NULL, options, - BDRV_O_RDWR | BDRV_O_NO_BACKING, &local_err); - if (blk == NULL) { - error_propagate(errp, local_err); - ret = -EIO; - goto out; - } - - ret = 0; -out: - if (blk) { - blk_unref(blk); - } - return ret; -} - -static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) -{ - char *backing_file = NULL; - char *backing_fmt = NULL; - char *buf = NULL; - uint64_t size = 0; - int flags = 0; - size_t cluster_size = DEFAULT_CLUSTER_SIZE; - PreallocMode prealloc; - int version = 3; - uint64_t refcount_bits = 16; - int refcount_order; - Error *local_err = NULL; - int ret; - - /* Read out options */ - size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); - backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT); - if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) { - flags |= BLOCK_FLAG_ENCRYPT; - } - cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, - DEFAULT_CLUSTER_SIZE); - buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); - prealloc = qapi_enum_parse(PreallocMode_lookup, buf, - 
PREALLOC_MODE__MAX, PREALLOC_MODE_OFF, - &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto finish; - } - g_free(buf); - buf = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL); - if (!buf) { - /* keep the default */ - } else if (!strcmp(buf, "0.10")) { - version = 2; - } else if (!strcmp(buf, "1.1")) { - version = 3; - } else { - error_setg(errp, "Invalid compatibility level: '%s'", buf); - ret = -EINVAL; - goto finish; - } - - if (qemu_opt_get_bool_del(opts, BLOCK_OPT_LAZY_REFCOUNTS, false)) { - flags |= BLOCK_FLAG_LAZY_REFCOUNTS; - } - - if (backing_file && prealloc != PREALLOC_MODE_OFF) { - error_setg(errp, "Backing file and preallocation cannot be used at " - "the same time"); - ret = -EINVAL; - goto finish; - } - - if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) { - error_setg(errp, "Lazy refcounts only supported with compatibility " - "level 1.1 and above (use compat=1.1 or greater)"); - ret = -EINVAL; - goto finish; - } - - refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, - refcount_bits); - if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) { - error_setg(errp, "Refcount width must be a power of two and may not " - "exceed 64 bits"); - ret = -EINVAL; - goto finish; - } - - if (version < 3 && refcount_bits != 16) { - error_setg(errp, "Different refcount widths than 16 bits require " - "compatibility level 1.1 or above (use compat=1.1 or " - "greater)"); - ret = -EINVAL; - goto finish; - } - - refcount_order = ctz32(refcount_bits); - - ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags, - cluster_size, prealloc, opts, version, refcount_order, - &local_err); - if (local_err) { - error_propagate(errp, local_err); - } - -finish: - g_free(backing_file); - g_free(backing_fmt); - g_free(buf); - return ret; -} - -static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) -{ - int ret; - BDRVQcow2State *s 
= bs->opaque; - - /* Emulate misaligned zero writes */ - if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) { - return -ENOTSUP; - } - - /* Whatever is left can use real zero clusters */ - qemu_co_mutex_lock(&s->lock); - ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors); - qemu_co_mutex_unlock(&s->lock); - - return ret; -} - -static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) -{ - int ret; - BDRVQcow2State *s = bs->opaque; - - qemu_co_mutex_lock(&s->lock); - ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, - nb_sectors, QCOW2_DISCARD_REQUEST, false); - qemu_co_mutex_unlock(&s->lock); - return ret; -} - -static int qcow2_truncate(BlockDriverState *bs, int64_t offset) -{ - BDRVQcow2State *s = bs->opaque; - int64_t new_l1_size; - int ret; - - if (offset & 511) { - error_report("The new size must be a multiple of 512"); - return -EINVAL; - } - - /* cannot proceed if image has snapshots */ - if (s->nb_snapshots) { - error_report("Can't resize an image which has snapshots"); - return -ENOTSUP; - } - - /* shrinking is currently not supported */ - if (offset < bs->total_sectors * 512) { - error_report("qcow2 doesn't support shrinking images yet"); - return -ENOTSUP; - } - - new_l1_size = size_to_l1(s, offset); - ret = qcow2_grow_l1_table(bs, new_l1_size, true); - if (ret < 0) { - return ret; - } - - /* write updated header.size */ - offset = cpu_to_be64(offset); - ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, size), - &offset, sizeof(uint64_t)); - if (ret < 0) { - return ret; - } - - s->l1_vm_state_index = new_l1_size; - return 0; -} - -/* XXX: put compressed sectors first, then all the cluster aligned - tables to avoid losing bytes in alignment */ -static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - BDRVQcow2State *s = bs->opaque; - z_stream strm; - int ret, out_len; - uint8_t 
*out_buf; - uint64_t cluster_offset; - - if (nb_sectors == 0) { - /* align end of file to a sector boundary to ease reading with - sector based I/Os */ - cluster_offset = bdrv_getlength(bs->file->bs); - return bdrv_truncate(bs->file->bs, cluster_offset); - } - - if (nb_sectors != s->cluster_sectors) { - ret = -EINVAL; - - /* Zero-pad last write if image size is not cluster aligned */ - if (sector_num + nb_sectors == bs->total_sectors && - nb_sectors < s->cluster_sectors) { - uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); - memset(pad_buf, 0, s->cluster_size); - memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); - ret = qcow2_write_compressed(bs, sector_num, - pad_buf, s->cluster_sectors); - qemu_vfree(pad_buf); - } - return ret; - } - - out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); - - /* best compression, small window, no zlib header */ - memset(&strm, 0, sizeof(strm)); - ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, - Z_DEFLATED, -12, - 9, Z_DEFAULT_STRATEGY); - if (ret != 0) { - ret = -EINVAL; - goto fail; - } - - strm.avail_in = s->cluster_size; - strm.next_in = (uint8_t *)buf; - strm.avail_out = s->cluster_size; - strm.next_out = out_buf; - - ret = deflate(&strm, Z_FINISH); - if (ret != Z_STREAM_END && ret != Z_OK) { - deflateEnd(&strm); - ret = -EINVAL; - goto fail; - } - out_len = strm.next_out - out_buf; - - deflateEnd(&strm); - - if (ret != Z_STREAM_END || out_len >= s->cluster_size) { - /* could not compress: write normal cluster */ - ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); - if (ret < 0) { - goto fail; - } - } else { - cluster_offset = qcow2_alloc_compressed_cluster_offset(bs, - sector_num << 9, out_len); - if (!cluster_offset) { - ret = -EIO; - goto fail; - } - cluster_offset &= s->cluster_offset_mask; - - ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len); - if (ret < 0) { - goto fail; - } - - BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); - ret = bdrv_pwrite(bs->file->bs, 
cluster_offset, out_buf, out_len); - if (ret < 0) { - goto fail; - } - } - - ret = 0; -fail: - g_free(out_buf); - return ret; -} - -static int make_completely_empty(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - int ret, l1_clusters; - int64_t offset; - uint64_t *new_reftable = NULL; - uint64_t rt_entry, l1_size2; - struct { - uint64_t l1_offset; - uint64_t reftable_offset; - uint32_t reftable_clusters; - } QEMU_PACKED l1_ofs_rt_ofs_cls; - - ret = qcow2_cache_empty(bs, s->l2_table_cache); - if (ret < 0) { - goto fail; - } - - ret = qcow2_cache_empty(bs, s->refcount_block_cache); - if (ret < 0) { - goto fail; - } - - /* Refcounts will be broken utterly */ - ret = qcow2_mark_dirty(bs); - if (ret < 0) { - goto fail; - } - - BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); - - l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t)); - l1_size2 = (uint64_t)s->l1_size * sizeof(uint64_t); - - /* After this call, neither the in-memory nor the on-disk refcount - * information accurately describe the actual references */ - - ret = bdrv_write_zeroes(bs->file->bs, s->l1_table_offset / BDRV_SECTOR_SIZE, - l1_clusters * s->cluster_sectors, 0); - if (ret < 0) { - goto fail_broken_refcounts; - } - memset(s->l1_table, 0, l1_size2); - - BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE); - - /* Overwrite enough clusters at the beginning of the sectors to place - * the refcount table, a refcount block and the L1 table in; this may - * overwrite parts of the existing refcount and L1 table, which is not - * an issue because the dirty flag is set, complete data loss is in fact - * desired and partial data loss is consequently fine as well */ - ret = bdrv_write_zeroes(bs->file->bs, s->cluster_size / BDRV_SECTOR_SIZE, - (2 + l1_clusters) * s->cluster_size / - BDRV_SECTOR_SIZE, 0); - /* This call (even if it failed overall) may have overwritten on-disk - * refcount structures; in that case, the in-memory refcount information - * will probably differ from the 
on-disk information which makes the BDS - * unusable */ - if (ret < 0) { - goto fail_broken_refcounts; - } - - BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); - BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE); - - /* "Create" an empty reftable (one cluster) directly after the image - * header and an empty L1 table three clusters after the image header; - * the cluster between those two will be used as the first refblock */ - cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size); - cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size); - cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1); - ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_table_offset), - &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls)); - if (ret < 0) { - goto fail_broken_refcounts; - } - - s->l1_table_offset = 3 * s->cluster_size; - - new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t)); - if (!new_reftable) { - ret = -ENOMEM; - goto fail_broken_refcounts; - } - - s->refcount_table_offset = s->cluster_size; - s->refcount_table_size = s->cluster_size / sizeof(uint64_t); - - g_free(s->refcount_table); - s->refcount_table = new_reftable; - new_reftable = NULL; - - /* Now the in-memory refcount information again corresponds to the on-disk - * information (reftable is empty and no refblocks (the refblock cache is - * empty)); however, this means some clusters (e.g. 
the image header) are - * referenced, but not refcounted, but the normal qcow2 code assumes that - * the in-memory information is always correct */ - - BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); - - /* Enter the first refblock into the reftable */ - rt_entry = cpu_to_be64(2 * s->cluster_size); - ret = bdrv_pwrite_sync(bs->file->bs, s->cluster_size, - &rt_entry, sizeof(rt_entry)); - if (ret < 0) { - goto fail_broken_refcounts; - } - s->refcount_table[0] = 2 * s->cluster_size; - - s->free_cluster_index = 0; - assert(3 + l1_clusters <= s->refcount_block_size); - offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2); - if (offset < 0) { - ret = offset; - goto fail_broken_refcounts; - } else if (offset > 0) { - error_report("First cluster in emptied image is in use"); - abort(); - } - - /* Now finally the in-memory information corresponds to the on-disk - * structures and is correct */ - ret = qcow2_mark_clean(bs); - if (ret < 0) { - goto fail; - } - - ret = bdrv_truncate(bs->file->bs, (3 + l1_clusters) * s->cluster_size); - if (ret < 0) { - goto fail; - } - - return 0; - -fail_broken_refcounts: - /* The BDS is unusable at this point. If we wanted to make it usable, we - * would have to call qcow2_refcount_close(), qcow2_refcount_init(), - * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init() - * again. However, because the functions which could have caused this error - * path to be taken are used by those functions as well, it's very likely - * that that sequence will fail as well. Therefore, just eject the BDS. 
*/ - bs->drv = NULL; - -fail: - g_free(new_reftable); - return ret; -} - -static int qcow2_make_empty(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - uint64_t start_sector; - int sector_step = INT_MAX / BDRV_SECTOR_SIZE; - int l1_clusters, ret = 0; - - l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t)); - - if (s->qcow_version >= 3 && !s->snapshots && - 3 + l1_clusters <= s->refcount_block_size) { - /* The following function only works for qcow2 v3 images (it requires - * the dirty flag) and only as long as there are no snapshots (because - * it completely empties the image). Furthermore, the L1 table and three - * additional clusters (image header, refcount table, one refcount - * block) have to fit inside one refcount block. */ - return make_completely_empty(bs); - } - - /* This fallback code simply discards every active cluster; this is slow, - * but works in all cases */ - for (start_sector = 0; start_sector < bs->total_sectors; - start_sector += sector_step) - { - /* As this function is generally used after committing an external - * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the - * default action for this kind of discard is to pass the discard, - * which will ideally result in an actually smaller image file, as - * is probably desired. 
*/ - ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE, - MIN(sector_step, - bs->total_sectors - start_sector), - QCOW2_DISCARD_SNAPSHOT, true); - if (ret < 0) { - break; - } - } - - return ret; -} - -static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - int ret; - - qemu_co_mutex_lock(&s->lock); - ret = qcow2_cache_flush(bs, s->l2_table_cache); - if (ret < 0) { - qemu_co_mutex_unlock(&s->lock); - return ret; - } - - if (qcow2_need_accurate_refcounts(s)) { - ret = qcow2_cache_flush(bs, s->refcount_block_cache); - if (ret < 0) { - qemu_co_mutex_unlock(&s->lock); - return ret; - } - } - qemu_co_mutex_unlock(&s->lock); - - return 0; -} - -static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - BDRVQcow2State *s = bs->opaque; - bdi->unallocated_blocks_are_zero = true; - bdi->can_write_zeroes_with_unmap = (s->qcow_version >= 3); - bdi->cluster_size = s->cluster_size; - bdi->vm_state_offset = qcow2_vm_state_offset(s); - return 0; -} - -static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1); - - *spec_info = (ImageInfoSpecific){ - .type = IMAGE_INFO_SPECIFIC_KIND_QCOW2, - .u.qcow2.data = g_new(ImageInfoSpecificQCow2, 1), - }; - if (s->qcow_version == 2) { - *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){ - .compat = g_strdup("0.10"), - .refcount_bits = s->refcount_bits, - }; - } else if (s->qcow_version == 3) { - *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){ - .compat = g_strdup("1.1"), - .lazy_refcounts = s->compatible_features & - QCOW2_COMPAT_LAZY_REFCOUNTS, - .has_lazy_refcounts = true, - .corrupt = s->incompatible_features & - QCOW2_INCOMPAT_CORRUPT, - .has_corrupt = true, - .refcount_bits = s->refcount_bits, - }; - } else { - /* if this assertion fails, this probably means a new version was - * added without having it covered here */ - assert(false); - } - 
- return spec_info; -} - -#if 0 -static void dump_refcounts(BlockDriverState *bs) -{ - BDRVQcow2State *s = bs->opaque; - int64_t nb_clusters, k, k1, size; - int refcount; - - size = bdrv_getlength(bs->file->bs); - nb_clusters = size_to_clusters(s, size); - for(k = 0; k < nb_clusters;) { - k1 = k; - refcount = get_refcount(bs, k); - k++; - while (k < nb_clusters && get_refcount(bs, k) == refcount) - k++; - printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount, - k - k1); - } -} -#endif - -static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, - int64_t pos) -{ - BDRVQcow2State *s = bs->opaque; - int64_t total_sectors = bs->total_sectors; - bool zero_beyond_eof = bs->zero_beyond_eof; - int ret; - - BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); - bs->zero_beyond_eof = false; - ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); - bs->zero_beyond_eof = zero_beyond_eof; - - /* bdrv_co_do_writev will have increased the total_sectors value to include - * the VM state - the VM state is however not an actual part of the block - * device, therefore, we need to restore the old value. */ - bs->total_sectors = total_sectors; - - return ret; -} - -static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, - int64_t pos, int size) -{ - BDRVQcow2State *s = bs->opaque; - bool zero_beyond_eof = bs->zero_beyond_eof; - int ret; - - BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); - bs->zero_beyond_eof = false; - ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size); - bs->zero_beyond_eof = zero_beyond_eof; - - return ret; -} - -/* - * Downgrades an image's version. To achieve this, any incompatible features - * have to be removed. 
- */ -static int qcow2_downgrade(BlockDriverState *bs, int target_version, - BlockDriverAmendStatusCB *status_cb, void *cb_opaque) -{ - BDRVQcow2State *s = bs->opaque; - int current_version = s->qcow_version; - int ret; - - if (target_version == current_version) { - return 0; - } else if (target_version > current_version) { - return -EINVAL; - } else if (target_version != 2) { - return -EINVAL; - } - - if (s->refcount_order != 4) { - error_report("compat=0.10 requires refcount_bits=16"); - return -ENOTSUP; - } - - /* clear incompatible features */ - if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { - ret = qcow2_mark_clean(bs); - if (ret < 0) { - return ret; - } - } - - /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in - * the first place; if that happens nonetheless, returning -ENOTSUP is the - * best thing to do anyway */ - - if (s->incompatible_features) { - return -ENOTSUP; - } - - /* since we can ignore compatible features, we can set them to 0 as well */ - s->compatible_features = 0; - /* if lazy refcounts have been used, they have already been fixed through - * clearing the dirty flag */ - - /* clearing autoclear features is trivial */ - s->autoclear_features = 0; - - ret = qcow2_expand_zero_clusters(bs, status_cb, cb_opaque); - if (ret < 0) { - return ret; - } - - s->qcow_version = target_version; - ret = qcow2_update_header(bs); - if (ret < 0) { - s->qcow_version = current_version; - return ret; - } - return 0; -} - -typedef enum Qcow2AmendOperation { - /* This is the value Qcow2AmendHelperCBInfo::last_operation will be - * statically initialized to so that the helper CB can discern the first - * invocation from an operation change */ - QCOW2_NO_OPERATION = 0, - - QCOW2_CHANGING_REFCOUNT_ORDER, - QCOW2_DOWNGRADING, -} Qcow2AmendOperation; - -typedef struct Qcow2AmendHelperCBInfo { - /* The code coordinating the amend operations should only modify - * these four fields; the rest will be managed by the CB */ - 
BlockDriverAmendStatusCB *original_status_cb; - void *original_cb_opaque; - - Qcow2AmendOperation current_operation; - - /* Total number of operations to perform (only set once) */ - int total_operations; - - /* The following fields are managed by the CB */ - - /* Number of operations completed */ - int operations_completed; - - /* Cumulative offset of all completed operations */ - int64_t offset_completed; - - Qcow2AmendOperation last_operation; - int64_t last_work_size; -} Qcow2AmendHelperCBInfo; - -static void qcow2_amend_helper_cb(BlockDriverState *bs, - int64_t operation_offset, - int64_t operation_work_size, void *opaque) -{ - Qcow2AmendHelperCBInfo *info = opaque; - int64_t current_work_size; - int64_t projected_work_size; - - if (info->current_operation != info->last_operation) { - if (info->last_operation != QCOW2_NO_OPERATION) { - info->offset_completed += info->last_work_size; - info->operations_completed++; - } - - info->last_operation = info->current_operation; - } - - assert(info->total_operations > 0); - assert(info->operations_completed < info->total_operations); - - info->last_work_size = operation_work_size; - - current_work_size = info->offset_completed + operation_work_size; - - /* current_work_size is the total work size for (operations_completed + 1) - * operations (which includes this one), so multiply it by the number of - * operations not covered and divide it by the number of operations - * covered to get a projection for the operations not covered */ - projected_work_size = current_work_size * (info->total_operations - - info->operations_completed - 1) - / (info->operations_completed + 1); - - info->original_status_cb(bs, info->offset_completed + operation_offset, - current_work_size + projected_work_size, - info->original_cb_opaque); -} - -static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, - BlockDriverAmendStatusCB *status_cb, - void *cb_opaque) -{ - BDRVQcow2State *s = bs->opaque; - int old_version = s->qcow_version, 
new_version = old_version; - uint64_t new_size = 0; - const char *backing_file = NULL, *backing_format = NULL; - bool lazy_refcounts = s->use_lazy_refcounts; - const char *compat = NULL; - uint64_t cluster_size = s->cluster_size; - bool encrypt; - int refcount_bits = s->refcount_bits; - int ret; - QemuOptDesc *desc = opts->list->desc; - Qcow2AmendHelperCBInfo helper_cb_info; - - while (desc && desc->name) { - if (!qemu_opt_find(opts, desc->name)) { - /* only change explicitly defined options */ - desc++; - continue; - } - - if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) { - compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL); - if (!compat) { - /* preserve default */ - } else if (!strcmp(compat, "0.10")) { - new_version = 2; - } else if (!strcmp(compat, "1.1")) { - new_version = 3; - } else { - error_report("Unknown compatibility level %s", compat); - return -EINVAL; - } - } else if (!strcmp(desc->name, BLOCK_OPT_PREALLOC)) { - error_report("Cannot change preallocation mode"); - return -ENOTSUP; - } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) { - new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0); - } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FILE)) { - backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE); - } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) { - backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT); - } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) { - encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT, - !!s->cipher); - - if (encrypt != !!s->cipher) { - error_report("Changing the encryption flag is not supported"); - return -ENOTSUP; - } - } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) { - cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, - cluster_size); - if (cluster_size != s->cluster_size) { - error_report("Changing the cluster size is not supported"); - return -ENOTSUP; - } - } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) { - lazy_refcounts = qemu_opt_get_bool(opts, 
BLOCK_OPT_LAZY_REFCOUNTS, - lazy_refcounts); - } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) { - refcount_bits = qemu_opt_get_number(opts, BLOCK_OPT_REFCOUNT_BITS, - refcount_bits); - - if (refcount_bits <= 0 || refcount_bits > 64 || - !is_power_of_2(refcount_bits)) - { - error_report("Refcount width must be a power of two and may " - "not exceed 64 bits"); - return -EINVAL; - } - } else { - /* if this point is reached, this probably means a new option was - * added without having it covered here */ - abort(); - } - - desc++; - } - - helper_cb_info = (Qcow2AmendHelperCBInfo){ - .original_status_cb = status_cb, - .original_cb_opaque = cb_opaque, - .total_operations = (new_version < old_version) - + (s->refcount_bits != refcount_bits) - }; - - /* Upgrade first (some features may require compat=1.1) */ - if (new_version > old_version) { - s->qcow_version = new_version; - ret = qcow2_update_header(bs); - if (ret < 0) { - s->qcow_version = old_version; - return ret; - } - } - - if (s->refcount_bits != refcount_bits) { - int refcount_order = ctz32(refcount_bits); - Error *local_error = NULL; - - if (new_version < 3 && refcount_bits != 16) { - error_report("Different refcount widths than 16 bits require " - "compatibility level 1.1 or above (use compat=1.1 or " - "greater)"); - return -EINVAL; - } - - helper_cb_info.current_operation = QCOW2_CHANGING_REFCOUNT_ORDER; - ret = qcow2_change_refcount_order(bs, refcount_order, - &qcow2_amend_helper_cb, - &helper_cb_info, &local_error); - if (ret < 0) { - error_report_err(local_error); - return ret; - } - } - - if (backing_file || backing_format) { - ret = qcow2_change_backing_file(bs, - backing_file ?: s->image_backing_file, - backing_format ?: s->image_backing_format); - if (ret < 0) { - return ret; - } - } - - if (s->use_lazy_refcounts != lazy_refcounts) { - if (lazy_refcounts) { - if (new_version < 3) { - error_report("Lazy refcounts only supported with compatibility " - "level 1.1 and above (use compat=1.1 or 
greater)"); - return -EINVAL; - } - s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS; - ret = qcow2_update_header(bs); - if (ret < 0) { - s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS; - return ret; - } - s->use_lazy_refcounts = true; - } else { - /* make image clean first */ - ret = qcow2_mark_clean(bs); - if (ret < 0) { - return ret; - } - /* now disallow lazy refcounts */ - s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS; - ret = qcow2_update_header(bs); - if (ret < 0) { - s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS; - return ret; - } - s->use_lazy_refcounts = false; - } - } - - if (new_size) { - ret = bdrv_truncate(bs, new_size); - if (ret < 0) { - return ret; - } - } - - /* Downgrade last (so unsupported features can be removed before) */ - if (new_version < old_version) { - helper_cb_info.current_operation = QCOW2_DOWNGRADING; - ret = qcow2_downgrade(bs, new_version, &qcow2_amend_helper_cb, - &helper_cb_info); - if (ret < 0) { - return ret; - } - } - - return 0; -} - -/* - * If offset or size are negative, respectively, they will not be included in - * the BLOCK_IMAGE_CORRUPTED event emitted. - * fatal will be ignored for read-only BDS; corruptions found there will always - * be considered non-fatal. - */ -void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, - int64_t size, const char *message_format, ...) 
-{ - BDRVQcow2State *s = bs->opaque; - const char *node_name; - char *message; - va_list ap; - - fatal = fatal && !bs->read_only; - - if (s->signaled_corruption && - (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT))) - { - return; - } - - va_start(ap, message_format); - message = g_strdup_vprintf(message_format, ap); - va_end(ap); - - if (fatal) { - fprintf(stderr, "qcow2: Marking image as corrupt: %s; further " - "corruption events will be suppressed\n", message); - } else { - fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal " - "corruption events will be suppressed\n", message); - } - - node_name = bdrv_get_node_name(bs); - qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), - *node_name != '\0', node_name, - message, offset >= 0, offset, - size >= 0, size, - fatal, &error_abort); - g_free(message); - - if (fatal) { - qcow2_mark_corrupt(bs); - bs->drv = NULL; /* make BDS unusable */ - } - - s->signaled_corruption = true; -} - -static QemuOptsList qcow2_create_opts = { - .name = "qcow2-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_COMPAT_LEVEL, - .type = QEMU_OPT_STRING, - .help = "Compatibility level (0.10 or 1.1)" - }, - { - .name = BLOCK_OPT_BACKING_FILE, - .type = QEMU_OPT_STRING, - .help = "File name of a base image" - }, - { - .name = BLOCK_OPT_BACKING_FMT, - .type = QEMU_OPT_STRING, - .help = "Image format of the base image" - }, - { - .name = BLOCK_OPT_ENCRYPT, - .type = QEMU_OPT_BOOL, - .help = "Encrypt the image", - .def_value_str = "off" - }, - { - .name = BLOCK_OPT_CLUSTER_SIZE, - .type = QEMU_OPT_SIZE, - .help = "qcow2 cluster size", - .def_value_str = stringify(DEFAULT_CLUSTER_SIZE) - }, - { - .name = BLOCK_OPT_PREALLOC, - .type = QEMU_OPT_STRING, - .help = "Preallocation mode (allowed values: off, metadata, " - "falloc, full)" - }, - { - .name = 
BLOCK_OPT_LAZY_REFCOUNTS, - .type = QEMU_OPT_BOOL, - .help = "Postpone refcount updates", - .def_value_str = "off" - }, - { - .name = BLOCK_OPT_REFCOUNT_BITS, - .type = QEMU_OPT_NUMBER, - .help = "Width of a reference count entry in bits", - .def_value_str = "16" - }, - { /* end of list */ } - } -}; - -BlockDriver bdrv_qcow2 = { - .format_name = "qcow2", - .instance_size = sizeof(BDRVQcow2State), - .bdrv_probe = qcow2_probe, - .bdrv_open = qcow2_open, - .bdrv_close = qcow2_close, - .bdrv_reopen_prepare = qcow2_reopen_prepare, - .bdrv_reopen_commit = qcow2_reopen_commit, - .bdrv_reopen_abort = qcow2_reopen_abort, - .bdrv_join_options = qcow2_join_options, - .bdrv_create = qcow2_create, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_get_block_status = qcow2_co_get_block_status, - .bdrv_set_key = qcow2_set_key, - - .bdrv_co_readv = qcow2_co_readv, - .bdrv_co_writev = qcow2_co_writev, - .bdrv_co_flush_to_os = qcow2_co_flush_to_os, - - .bdrv_co_write_zeroes = qcow2_co_write_zeroes, - .bdrv_co_discard = qcow2_co_discard, - .bdrv_truncate = qcow2_truncate, - .bdrv_write_compressed = qcow2_write_compressed, - .bdrv_make_empty = qcow2_make_empty, - - .bdrv_snapshot_create = qcow2_snapshot_create, - .bdrv_snapshot_goto = qcow2_snapshot_goto, - .bdrv_snapshot_delete = qcow2_snapshot_delete, - .bdrv_snapshot_list = qcow2_snapshot_list, - .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp, - .bdrv_get_info = qcow2_get_info, - .bdrv_get_specific_info = qcow2_get_specific_info, - - .bdrv_save_vmstate = qcow2_save_vmstate, - .bdrv_load_vmstate = qcow2_load_vmstate, - - .supports_backing = true, - .bdrv_change_backing_file = qcow2_change_backing_file, - - .bdrv_refresh_limits = qcow2_refresh_limits, - .bdrv_invalidate_cache = qcow2_invalidate_cache, - .bdrv_inactivate = qcow2_inactivate, - - .create_opts = &qcow2_create_opts, - .bdrv_check = qcow2_check, - .bdrv_amend_options = qcow2_amend_options, - - .bdrv_detach_aio_context = qcow2_detach_aio_context, - 
.bdrv_attach_aio_context = qcow2_attach_aio_context, -}; - -static void bdrv_qcow2_init(void) -{ - bdrv_register(&bdrv_qcow2); -} - -block_init(bdrv_qcow2_init); diff --git a/qemu/block/qcow2.h b/qemu/block/qcow2.h deleted file mode 100644 index a063a3c1a..000000000 --- a/qemu/block/qcow2.h +++ /dev/null @@ -1,599 +0,0 @@ -/* - * Block driver for the QCOW version 2 format - * - * Copyright (c) 2004-2006 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -#ifndef BLOCK_QCOW2_H -#define BLOCK_QCOW2_H - -#include "crypto/cipher.h" -#include "qemu/coroutine.h" - -//#define DEBUG_ALLOC -//#define DEBUG_ALLOC2 -//#define DEBUG_EXT - -#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) - -#define QCOW_CRYPT_NONE 0 -#define QCOW_CRYPT_AES 1 - -#define QCOW_MAX_CRYPT_CLUSTERS 32 -#define QCOW_MAX_SNAPSHOTS 65536 - -/* 8 MB refcount table is enough for 2 PB images at 64k cluster size - * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ -#define QCOW_MAX_REFTABLE_SIZE 0x800000 - -/* 32 MB L1 table is enough for 2 PB images at 64k cluster size - * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ -#define QCOW_MAX_L1_SIZE 0x2000000 - -/* Allow for an average of 1k per snapshot table entry, should be plenty of - * space for snapshot names and IDs */ -#define QCOW_MAX_SNAPSHOTS_SIZE (1024 * QCOW_MAX_SNAPSHOTS) - -/* indicate that the refcount of the referenced cluster is exactly one. */ -#define QCOW_OFLAG_COPIED (1ULL << 63) -/* indicate that the cluster is compressed (they never have the copied flag) */ -#define QCOW_OFLAG_COMPRESSED (1ULL << 62) -/* The cluster reads as all zeros */ -#define QCOW_OFLAG_ZERO (1ULL << 0) - -#define MIN_CLUSTER_BITS 9 -#define MAX_CLUSTER_BITS 21 - -/* Must be at least 2 to cover COW */ -#define MIN_L2_CACHE_SIZE 2 /* clusters */ - -/* Must be at least 4 to cover all cases of refcount table growth */ -#define MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */ - -/* Whichever is more */ -#define DEFAULT_L2_CACHE_CLUSTERS 8 /* clusters */ -#define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */ - -/* The refblock cache needs only a fourth of the L2 cache size to cover as many - * clusters */ -#define DEFAULT_L2_REFCOUNT_SIZE_RATIO 4 - -#define DEFAULT_CLUSTER_SIZE 65536 - - -#define QCOW2_OPT_LAZY_REFCOUNTS "lazy-refcounts" -#define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request" -#define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot" -#define 
QCOW2_OPT_DISCARD_OTHER "pass-discard-other" -#define QCOW2_OPT_OVERLAP "overlap-check" -#define QCOW2_OPT_OVERLAP_TEMPLATE "overlap-check.template" -#define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header" -#define QCOW2_OPT_OVERLAP_ACTIVE_L1 "overlap-check.active-l1" -#define QCOW2_OPT_OVERLAP_ACTIVE_L2 "overlap-check.active-l2" -#define QCOW2_OPT_OVERLAP_REFCOUNT_TABLE "overlap-check.refcount-table" -#define QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK "overlap-check.refcount-block" -#define QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE "overlap-check.snapshot-table" -#define QCOW2_OPT_OVERLAP_INACTIVE_L1 "overlap-check.inactive-l1" -#define QCOW2_OPT_OVERLAP_INACTIVE_L2 "overlap-check.inactive-l2" -#define QCOW2_OPT_CACHE_SIZE "cache-size" -#define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size" -#define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size" -#define QCOW2_OPT_CACHE_CLEAN_INTERVAL "cache-clean-interval" - -typedef struct QCowHeader { - uint32_t magic; - uint32_t version; - uint64_t backing_file_offset; - uint32_t backing_file_size; - uint32_t cluster_bits; - uint64_t size; /* in bytes */ - uint32_t crypt_method; - uint32_t l1_size; /* XXX: save number of clusters instead ? 
*/ - uint64_t l1_table_offset; - uint64_t refcount_table_offset; - uint32_t refcount_table_clusters; - uint32_t nb_snapshots; - uint64_t snapshots_offset; - - /* The following fields are only valid for version >= 3 */ - uint64_t incompatible_features; - uint64_t compatible_features; - uint64_t autoclear_features; - - uint32_t refcount_order; - uint32_t header_length; -} QEMU_PACKED QCowHeader; - -typedef struct QEMU_PACKED QCowSnapshotHeader { - /* header is 8 byte aligned */ - uint64_t l1_table_offset; - - uint32_t l1_size; - uint16_t id_str_size; - uint16_t name_size; - - uint32_t date_sec; - uint32_t date_nsec; - - uint64_t vm_clock_nsec; - - uint32_t vm_state_size; - uint32_t extra_data_size; /* for extension */ - /* extra data follows */ - /* id_str follows */ - /* name follows */ -} QCowSnapshotHeader; - -typedef struct QEMU_PACKED QCowSnapshotExtraData { - uint64_t vm_state_size_large; - uint64_t disk_size; -} QCowSnapshotExtraData; - - -typedef struct QCowSnapshot { - uint64_t l1_table_offset; - uint32_t l1_size; - char *id_str; - char *name; - uint64_t disk_size; - uint64_t vm_state_size; - uint32_t date_sec; - uint32_t date_nsec; - uint64_t vm_clock_nsec; -} QCowSnapshot; - -struct Qcow2Cache; -typedef struct Qcow2Cache Qcow2Cache; - -typedef struct Qcow2UnknownHeaderExtension { - uint32_t magic; - uint32_t len; - QLIST_ENTRY(Qcow2UnknownHeaderExtension) next; - uint8_t data[]; -} Qcow2UnknownHeaderExtension; - -enum { - QCOW2_FEAT_TYPE_INCOMPATIBLE = 0, - QCOW2_FEAT_TYPE_COMPATIBLE = 1, - QCOW2_FEAT_TYPE_AUTOCLEAR = 2, -}; - -/* Incompatible feature bits */ -enum { - QCOW2_INCOMPAT_DIRTY_BITNR = 0, - QCOW2_INCOMPAT_CORRUPT_BITNR = 1, - QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR, - QCOW2_INCOMPAT_CORRUPT = 1 << QCOW2_INCOMPAT_CORRUPT_BITNR, - - QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY - | QCOW2_INCOMPAT_CORRUPT, -}; - -/* Compatible feature bits */ -enum { - QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR = 0, - QCOW2_COMPAT_LAZY_REFCOUNTS = 1 << 
QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, - - QCOW2_COMPAT_FEAT_MASK = QCOW2_COMPAT_LAZY_REFCOUNTS, -}; - -enum qcow2_discard_type { - QCOW2_DISCARD_NEVER = 0, - QCOW2_DISCARD_ALWAYS, - QCOW2_DISCARD_REQUEST, - QCOW2_DISCARD_SNAPSHOT, - QCOW2_DISCARD_OTHER, - QCOW2_DISCARD_MAX -}; - -typedef struct Qcow2Feature { - uint8_t type; - uint8_t bit; - char name[46]; -} QEMU_PACKED Qcow2Feature; - -typedef struct Qcow2DiscardRegion { - BlockDriverState *bs; - uint64_t offset; - uint64_t bytes; - QTAILQ_ENTRY(Qcow2DiscardRegion) next; -} Qcow2DiscardRegion; - -typedef uint64_t Qcow2GetRefcountFunc(const void *refcount_array, - uint64_t index); -typedef void Qcow2SetRefcountFunc(void *refcount_array, - uint64_t index, uint64_t value); - -typedef struct BDRVQcow2State { - int cluster_bits; - int cluster_size; - int cluster_sectors; - int l2_bits; - int l2_size; - int l1_size; - int l1_vm_state_index; - int refcount_block_bits; - int refcount_block_size; - int csize_shift; - int csize_mask; - uint64_t cluster_offset_mask; - uint64_t l1_table_offset; - uint64_t *l1_table; - - Qcow2Cache* l2_table_cache; - Qcow2Cache* refcount_block_cache; - QEMUTimer *cache_clean_timer; - unsigned cache_clean_interval; - - uint8_t *cluster_cache; - uint8_t *cluster_data; - uint64_t cluster_cache_offset; - QLIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs; - - uint64_t *refcount_table; - uint64_t refcount_table_offset; - uint32_t refcount_table_size; - uint64_t free_cluster_index; - uint64_t free_byte_offset; - - CoMutex lock; - - QCryptoCipher *cipher; /* current cipher, NULL if no key yet */ - uint32_t crypt_method_header; - uint64_t snapshots_offset; - int snapshots_size; - unsigned int nb_snapshots; - QCowSnapshot *snapshots; - - int flags; - int qcow_version; - bool use_lazy_refcounts; - int refcount_order; - int refcount_bits; - uint64_t refcount_max; - - Qcow2GetRefcountFunc *get_refcount; - Qcow2SetRefcountFunc *set_refcount; - - bool discard_passthrough[QCOW2_DISCARD_MAX]; - - int 
overlap_check; /* bitmask of Qcow2MetadataOverlap values */ - bool signaled_corruption; - - uint64_t incompatible_features; - uint64_t compatible_features; - uint64_t autoclear_features; - - size_t unknown_header_fields_size; - void* unknown_header_fields; - QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext; - QTAILQ_HEAD (, Qcow2DiscardRegion) discards; - bool cache_discards; - - /* Backing file path and format as stored in the image (this is not the - * effective path/format, which may be the result of a runtime option - * override) */ - char *image_backing_file; - char *image_backing_format; -} BDRVQcow2State; - -typedef struct Qcow2COWRegion { - /** - * Offset of the COW region in bytes from the start of the first cluster - * touched by the request. - */ - uint64_t offset; - - /** Number of sectors to copy */ - int nb_sectors; -} Qcow2COWRegion; - -/** - * Describes an in-flight (part of a) write request that writes to clusters - * that are not referenced in their L2 table yet. - */ -typedef struct QCowL2Meta -{ - /** Guest offset of the first newly allocated cluster */ - uint64_t offset; - - /** Host offset of the first newly allocated cluster */ - uint64_t alloc_offset; - - /** - * Number of sectors from the start of the first allocated cluster to - * the end of the (possibly shortened) request - */ - int nb_available; - - /** Number of newly allocated clusters */ - int nb_clusters; - - /** - * Requests that overlap with this allocation and wait to be restarted - * when the allocating request has completed. - */ - CoQueue dependent_requests; - - /** - * The COW Region between the start of the first allocated cluster and the - * area the guest actually writes to. - */ - Qcow2COWRegion cow_start; - - /** - * The COW Region between the area the guest actually writes to and the - * end of the last allocated cluster. 
- */ - Qcow2COWRegion cow_end; - - /** Pointer to next L2Meta of the same write request */ - struct QCowL2Meta *next; - - QLIST_ENTRY(QCowL2Meta) next_in_flight; -} QCowL2Meta; - -enum { - QCOW2_CLUSTER_UNALLOCATED, - QCOW2_CLUSTER_NORMAL, - QCOW2_CLUSTER_COMPRESSED, - QCOW2_CLUSTER_ZERO -}; - -typedef enum QCow2MetadataOverlap { - QCOW2_OL_MAIN_HEADER_BITNR = 0, - QCOW2_OL_ACTIVE_L1_BITNR = 1, - QCOW2_OL_ACTIVE_L2_BITNR = 2, - QCOW2_OL_REFCOUNT_TABLE_BITNR = 3, - QCOW2_OL_REFCOUNT_BLOCK_BITNR = 4, - QCOW2_OL_SNAPSHOT_TABLE_BITNR = 5, - QCOW2_OL_INACTIVE_L1_BITNR = 6, - QCOW2_OL_INACTIVE_L2_BITNR = 7, - - QCOW2_OL_MAX_BITNR = 8, - - QCOW2_OL_NONE = 0, - QCOW2_OL_MAIN_HEADER = (1 << QCOW2_OL_MAIN_HEADER_BITNR), - QCOW2_OL_ACTIVE_L1 = (1 << QCOW2_OL_ACTIVE_L1_BITNR), - QCOW2_OL_ACTIVE_L2 = (1 << QCOW2_OL_ACTIVE_L2_BITNR), - QCOW2_OL_REFCOUNT_TABLE = (1 << QCOW2_OL_REFCOUNT_TABLE_BITNR), - QCOW2_OL_REFCOUNT_BLOCK = (1 << QCOW2_OL_REFCOUNT_BLOCK_BITNR), - QCOW2_OL_SNAPSHOT_TABLE = (1 << QCOW2_OL_SNAPSHOT_TABLE_BITNR), - QCOW2_OL_INACTIVE_L1 = (1 << QCOW2_OL_INACTIVE_L1_BITNR), - /* NOTE: Checking overlaps with inactive L2 tables will result in bdrv - * reads. 
*/ - QCOW2_OL_INACTIVE_L2 = (1 << QCOW2_OL_INACTIVE_L2_BITNR), -} QCow2MetadataOverlap; - -/* Perform all overlap checks which can be done in constant time */ -#define QCOW2_OL_CONSTANT \ - (QCOW2_OL_MAIN_HEADER | QCOW2_OL_ACTIVE_L1 | QCOW2_OL_REFCOUNT_TABLE | \ - QCOW2_OL_SNAPSHOT_TABLE) - -/* Perform all overlap checks which don't require disk access */ -#define QCOW2_OL_CACHED \ - (QCOW2_OL_CONSTANT | QCOW2_OL_ACTIVE_L2 | QCOW2_OL_REFCOUNT_BLOCK | \ - QCOW2_OL_INACTIVE_L1) - -/* Perform all overlap checks */ -#define QCOW2_OL_ALL \ - (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2) - -#define L1E_OFFSET_MASK 0x00fffffffffffe00ULL -#define L2E_OFFSET_MASK 0x00fffffffffffe00ULL -#define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL - -#define REFT_OFFSET_MASK 0xfffffffffffffe00ULL - -static inline int64_t start_of_cluster(BDRVQcow2State *s, int64_t offset) -{ - return offset & ~(s->cluster_size - 1); -} - -static inline int64_t offset_into_cluster(BDRVQcow2State *s, int64_t offset) -{ - return offset & (s->cluster_size - 1); -} - -static inline uint64_t size_to_clusters(BDRVQcow2State *s, uint64_t size) -{ - return (size + (s->cluster_size - 1)) >> s->cluster_bits; -} - -static inline int64_t size_to_l1(BDRVQcow2State *s, int64_t size) -{ - int shift = s->cluster_bits + s->l2_bits; - return (size + (1ULL << shift) - 1) >> shift; -} - -static inline int offset_to_l2_index(BDRVQcow2State *s, int64_t offset) -{ - return (offset >> s->cluster_bits) & (s->l2_size - 1); -} - -static inline int64_t align_offset(int64_t offset, int n) -{ - offset = (offset + n - 1) & ~(n - 1); - return offset; -} - -static inline int64_t qcow2_vm_state_offset(BDRVQcow2State *s) -{ - return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); -} - -static inline uint64_t qcow2_max_refcount_clusters(BDRVQcow2State *s) -{ - return QCOW_MAX_REFTABLE_SIZE >> s->cluster_bits; -} - -static inline int qcow2_get_cluster_type(uint64_t l2_entry) -{ - if (l2_entry & QCOW_OFLAG_COMPRESSED) 
{ - return QCOW2_CLUSTER_COMPRESSED; - } else if (l2_entry & QCOW_OFLAG_ZERO) { - return QCOW2_CLUSTER_ZERO; - } else if (!(l2_entry & L2E_OFFSET_MASK)) { - return QCOW2_CLUSTER_UNALLOCATED; - } else { - return QCOW2_CLUSTER_NORMAL; - } -} - -/* Check whether refcounts are eager or lazy */ -static inline bool qcow2_need_accurate_refcounts(BDRVQcow2State *s) -{ - return !(s->incompatible_features & QCOW2_INCOMPAT_DIRTY); -} - -static inline uint64_t l2meta_cow_start(QCowL2Meta *m) -{ - return m->offset + m->cow_start.offset; -} - -static inline uint64_t l2meta_cow_end(QCowL2Meta *m) -{ - return m->offset + m->cow_end.offset - + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS); -} - -static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2) -{ - return r1 > r2 ? r1 - r2 : r2 - r1; -} - -// FIXME Need qcow2_ prefix to global functions - -/* qcow2.c functions */ -int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, - int64_t sector_num, int nb_sectors); - -int qcow2_mark_dirty(BlockDriverState *bs); -int qcow2_mark_corrupt(BlockDriverState *bs); -int qcow2_mark_consistent(BlockDriverState *bs); -int qcow2_update_header(BlockDriverState *bs); - -void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, - int64_t size, const char *message_format, ...) 
- GCC_FMT_ATTR(5, 6); - -/* qcow2-refcount.c functions */ -int qcow2_refcount_init(BlockDriverState *bs); -void qcow2_refcount_close(BlockDriverState *bs); - -int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index, - uint64_t *refcount); - -int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, - uint64_t addend, bool decrease, - enum qcow2_discard_type type); - -int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size); -int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, - int64_t nb_clusters); -int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); -void qcow2_free_clusters(BlockDriverState *bs, - int64_t offset, int64_t size, - enum qcow2_discard_type type); -void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, - int nb_clusters, enum qcow2_discard_type type); - -int qcow2_update_snapshot_refcount(BlockDriverState *bs, - int64_t l1_table_offset, int l1_size, int addend); - -int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, - BdrvCheckMode fix); - -void qcow2_process_discards(BlockDriverState *bs, int ret); - -int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, - int64_t size); -int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, - int64_t size); - -int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order, - BlockDriverAmendStatusCB *status_cb, - void *cb_opaque, Error **errp); - -/* qcow2-cluster.c functions */ -int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, - bool exact_size); -int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index); -void qcow2_l2_cache_reset(BlockDriverState *bs); -int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); -int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, - uint8_t *out_buf, const uint8_t *in_buf, - int nb_sectors, bool enc, Error **errp); - -int qcow2_get_cluster_offset(BlockDriverState *bs, 
uint64_t offset, - int *num, uint64_t *cluster_offset); -int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, - int *num, uint64_t *host_offset, QCowL2Meta **m); -uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, - uint64_t offset, - int compressed_size); - -int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); -int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, - int nb_sectors, enum qcow2_discard_type type, bool full_discard); -int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors); - -int qcow2_expand_zero_clusters(BlockDriverState *bs, - BlockDriverAmendStatusCB *status_cb, - void *cb_opaque); - -/* qcow2-snapshot.c functions */ -int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); -int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id); -int qcow2_snapshot_delete(BlockDriverState *bs, - const char *snapshot_id, - const char *name, - Error **errp); -int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab); -int qcow2_snapshot_load_tmp(BlockDriverState *bs, - const char *snapshot_id, - const char *name, - Error **errp); - -void qcow2_free_snapshots(BlockDriverState *bs); -int qcow2_read_snapshots(BlockDriverState *bs); - -/* qcow2-cache.c functions */ -Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables); -int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c); - -void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c, - void *table); -int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c); -int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, - Qcow2Cache *dependency); -void qcow2_cache_depends_on_flush(Qcow2Cache *c); - -void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c); -int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c); - -int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, - void **table); -int 
qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, - void **table); -void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); - -#endif diff --git a/qemu/block/qed-check.c b/qemu/block/qed-check.c deleted file mode 100644 index 622f30897..000000000 --- a/qemu/block/qed-check.c +++ /dev/null @@ -1,251 +0,0 @@ -/* - * QEMU Enhanced Disk Format Consistency Check - * - * Copyright IBM, Corp. 2010 - * - * Authors: - * Stefan Hajnoczi - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -#include "qemu/osdep.h" -#include "qed.h" - -typedef struct { - BDRVQEDState *s; - BdrvCheckResult *result; - bool fix; /* whether to fix invalid offsets */ - - uint64_t nclusters; - uint32_t *used_clusters; /* referenced cluster bitmap */ - - QEDRequest request; -} QEDCheck; - -static bool qed_test_bit(uint32_t *bitmap, uint64_t n) { - return !!(bitmap[n / 32] & (1 << (n % 32))); -} - -static void qed_set_bit(uint32_t *bitmap, uint64_t n) { - bitmap[n / 32] |= 1 << (n % 32); -} - -/** - * Set bitmap bits for clusters - * - * @check: Check structure - * @offset: Starting offset in bytes - * @n: Number of clusters - */ -static bool qed_set_used_clusters(QEDCheck *check, uint64_t offset, - unsigned int n) -{ - uint64_t cluster = qed_bytes_to_clusters(check->s, offset); - unsigned int corruptions = 0; - - while (n-- != 0) { - /* Clusters should only be referenced once */ - if (qed_test_bit(check->used_clusters, cluster)) { - corruptions++; - } - - qed_set_bit(check->used_clusters, cluster); - cluster++; - } - - check->result->corruptions += corruptions; - return corruptions == 0; -} - -/** - * Check an L2 table - * - * @ret: Number of invalid cluster offsets - */ -static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table) -{ - BDRVQEDState *s = check->s; - unsigned int i, num_invalid = 0; - uint64_t last_offset = 0; - - for (i = 0; i < 
s->table_nelems; i++) { - uint64_t offset = table->offsets[i]; - - if (qed_offset_is_unalloc_cluster(offset) || - qed_offset_is_zero_cluster(offset)) { - continue; - } - check->result->bfi.allocated_clusters++; - if (last_offset && (last_offset + s->header.cluster_size != offset)) { - check->result->bfi.fragmented_clusters++; - } - last_offset = offset; - - /* Detect invalid cluster offset */ - if (!qed_check_cluster_offset(s, offset)) { - if (check->fix) { - table->offsets[i] = 0; - check->result->corruptions_fixed++; - } else { - check->result->corruptions++; - } - - num_invalid++; - continue; - } - - qed_set_used_clusters(check, offset, 1); - } - - return num_invalid; -} - -/** - * Descend tables and check each cluster is referenced once only - */ -static int qed_check_l1_table(QEDCheck *check, QEDTable *table) -{ - BDRVQEDState *s = check->s; - unsigned int i, num_invalid_l1 = 0; - int ret, last_error = 0; - - /* Mark L1 table clusters used */ - qed_set_used_clusters(check, s->header.l1_table_offset, - s->header.table_size); - - for (i = 0; i < s->table_nelems; i++) { - unsigned int num_invalid_l2; - uint64_t offset = table->offsets[i]; - - if (qed_offset_is_unalloc_cluster(offset)) { - continue; - } - - /* Detect invalid L2 offset */ - if (!qed_check_table_offset(s, offset)) { - /* Clear invalid offset */ - if (check->fix) { - table->offsets[i] = 0; - check->result->corruptions_fixed++; - } else { - check->result->corruptions++; - } - - num_invalid_l1++; - continue; - } - - if (!qed_set_used_clusters(check, offset, s->header.table_size)) { - continue; /* skip an invalid table */ - } - - ret = qed_read_l2_table_sync(s, &check->request, offset); - if (ret) { - check->result->check_errors++; - last_error = ret; - continue; - } - - num_invalid_l2 = qed_check_l2_table(check, - check->request.l2_table->table); - - /* Write out fixed L2 table */ - if (num_invalid_l2 > 0 && check->fix) { - ret = qed_write_l2_table_sync(s, &check->request, 0, - s->table_nelems, false); 
- if (ret) { - check->result->check_errors++; - last_error = ret; - continue; - } - } - } - - /* Drop reference to final table */ - qed_unref_l2_cache_entry(check->request.l2_table); - check->request.l2_table = NULL; - - /* Write out fixed L1 table */ - if (num_invalid_l1 > 0 && check->fix) { - ret = qed_write_l1_table_sync(s, 0, s->table_nelems); - if (ret) { - check->result->check_errors++; - last_error = ret; - } - } - - return last_error; -} - -/** - * Check for unreferenced (leaked) clusters - */ -static void qed_check_for_leaks(QEDCheck *check) -{ - BDRVQEDState *s = check->s; - uint64_t i; - - for (i = s->header.header_size; i < check->nclusters; i++) { - if (!qed_test_bit(check->used_clusters, i)) { - check->result->leaks++; - } - } -} - -/** - * Mark an image clean once it passes check or has been repaired - */ -static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result) -{ - /* Skip if there were unfixable corruptions or I/O errors */ - if (result->corruptions > 0 || result->check_errors > 0) { - return; - } - - /* Skip if image is already marked clean */ - if (!(s->header.features & QED_F_NEED_CHECK)) { - return; - } - - /* Ensure fixes reach storage before clearing check bit */ - bdrv_flush(s->bs); - - s->header.features &= ~QED_F_NEED_CHECK; - qed_write_header_sync(s); -} - -int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix) -{ - QEDCheck check = { - .s = s, - .result = result, - .nclusters = qed_bytes_to_clusters(s, s->file_size), - .request = { .l2_table = NULL }, - .fix = fix, - }; - int ret; - - check.used_clusters = g_try_new0(uint32_t, (check.nclusters + 31) / 32); - if (check.nclusters && check.used_clusters == NULL) { - return -ENOMEM; - } - - check.result->bfi.total_clusters = - (s->header.image_size + s->header.cluster_size - 1) / - s->header.cluster_size; - ret = qed_check_l1_table(&check, s->l1_table); - if (ret == 0) { - /* Only check for leaks if entire image was scanned successfully */ - 
qed_check_for_leaks(&check); - - if (fix) { - qed_check_mark_clean(s, result); - } - } - - g_free(check.used_clusters); - return ret; -} diff --git a/qemu/block/qed-cluster.c b/qemu/block/qed-cluster.c deleted file mode 100644 index c24e75616..000000000 --- a/qemu/block/qed-cluster.c +++ /dev/null @@ -1,166 +0,0 @@ -/* - * QEMU Enhanced Disk Format Cluster functions - * - * Copyright IBM, Corp. 2010 - * - * Authors: - * Stefan Hajnoczi - * Anthony Liguori - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -#include "qemu/osdep.h" -#include "qed.h" - -/** - * Count the number of contiguous data clusters - * - * @s: QED state - * @table: L2 table - * @index: First cluster index - * @n: Maximum number of clusters - * @offset: Set to first cluster offset - * - * This function scans tables for contiguous clusters. A contiguous run of - * clusters may be allocated, unallocated, or zero. - */ -static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s, - QEDTable *table, - unsigned int index, - unsigned int n, - uint64_t *offset) -{ - unsigned int end = MIN(index + n, s->table_nelems); - uint64_t last = table->offsets[index]; - unsigned int i; - - *offset = last; - - for (i = index + 1; i < end; i++) { - if (qed_offset_is_unalloc_cluster(last)) { - /* Counting unallocated clusters */ - if (!qed_offset_is_unalloc_cluster(table->offsets[i])) { - break; - } - } else if (qed_offset_is_zero_cluster(last)) { - /* Counting zero clusters */ - if (!qed_offset_is_zero_cluster(table->offsets[i])) { - break; - } - } else { - /* Counting allocated clusters */ - if (table->offsets[i] != last + s->header.cluster_size) { - break; - } - last = table->offsets[i]; - } - } - return i - index; -} - -typedef struct { - BDRVQEDState *s; - uint64_t pos; - size_t len; - - QEDRequest *request; - - /* User callback */ - QEDFindClusterFunc *cb; - void *opaque; -} QEDFindClusterCB; - -static 
void qed_find_cluster_cb(void *opaque, int ret) -{ - QEDFindClusterCB *find_cluster_cb = opaque; - BDRVQEDState *s = find_cluster_cb->s; - QEDRequest *request = find_cluster_cb->request; - uint64_t offset = 0; - size_t len = 0; - unsigned int index; - unsigned int n; - - if (ret) { - goto out; - } - - index = qed_l2_index(s, find_cluster_cb->pos); - n = qed_bytes_to_clusters(s, - qed_offset_into_cluster(s, find_cluster_cb->pos) + - find_cluster_cb->len); - n = qed_count_contiguous_clusters(s, request->l2_table->table, - index, n, &offset); - - if (qed_offset_is_unalloc_cluster(offset)) { - ret = QED_CLUSTER_L2; - } else if (qed_offset_is_zero_cluster(offset)) { - ret = QED_CLUSTER_ZERO; - } else if (qed_check_cluster_offset(s, offset)) { - ret = QED_CLUSTER_FOUND; - } else { - ret = -EINVAL; - } - - len = MIN(find_cluster_cb->len, n * s->header.cluster_size - - qed_offset_into_cluster(s, find_cluster_cb->pos)); - -out: - find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len); - g_free(find_cluster_cb); -} - -/** - * Find the offset of a data cluster - * - * @s: QED state - * @request: L2 cache entry - * @pos: Byte position in device - * @len: Number of bytes - * @cb: Completion function - * @opaque: User data for completion function - * - * This function translates a position in the block device to an offset in the - * image file. It invokes the cb completion callback to report back the - * translated offset or unallocated range in the image file. - * - * If the L2 table exists, request->l2_table points to the L2 table cache entry - * and the caller must free the reference when they are finished. The cache - * entry is exposed in this way to avoid callers having to read the L2 table - * again later during request processing. If request->l2_table is non-NULL it - * will be unreferenced before taking on the new cache entry. 
- */ -void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, - size_t len, QEDFindClusterFunc *cb, void *opaque) -{ - QEDFindClusterCB *find_cluster_cb; - uint64_t l2_offset; - - /* Limit length to L2 boundary. Requests are broken up at the L2 boundary - * so that a request acts on one L2 table at a time. - */ - len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos); - - l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)]; - if (qed_offset_is_unalloc_cluster(l2_offset)) { - cb(opaque, QED_CLUSTER_L1, 0, len); - return; - } - if (!qed_check_table_offset(s, l2_offset)) { - cb(opaque, -EINVAL, 0, 0); - return; - } - - find_cluster_cb = g_malloc(sizeof(*find_cluster_cb)); - find_cluster_cb->s = s; - find_cluster_cb->pos = pos; - find_cluster_cb->len = len; - find_cluster_cb->cb = cb; - find_cluster_cb->opaque = opaque; - find_cluster_cb->request = request; - - qed_read_l2_table(s, request, l2_offset, - qed_find_cluster_cb, find_cluster_cb); -} diff --git a/qemu/block/qed-gencb.c b/qemu/block/qed-gencb.c deleted file mode 100644 index faf8ecc84..000000000 --- a/qemu/block/qed-gencb.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * QEMU Enhanced Disk Format - * - * Copyright IBM, Corp. 2010 - * - * Authors: - * Stefan Hajnoczi - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. 
- * - */ - -#include "qemu/osdep.h" -#include "qed.h" - -void *gencb_alloc(size_t len, BlockCompletionFunc *cb, void *opaque) -{ - GenericCB *gencb = g_malloc(len); - gencb->cb = cb; - gencb->opaque = opaque; - return gencb; -} - -void gencb_complete(void *opaque, int ret) -{ - GenericCB *gencb = opaque; - BlockCompletionFunc *cb = gencb->cb; - void *user_opaque = gencb->opaque; - - g_free(gencb); - cb(user_opaque, ret); -} diff --git a/qemu/block/qed-l2-cache.c b/qemu/block/qed-l2-cache.c deleted file mode 100644 index 5cba79465..000000000 --- a/qemu/block/qed-l2-cache.c +++ /dev/null @@ -1,188 +0,0 @@ -/* - * QEMU Enhanced Disk Format L2 Cache - * - * Copyright IBM, Corp. 2010 - * - * Authors: - * Anthony Liguori - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -/* - * L2 table cache usage is as follows: - * - * An open image has one L2 table cache that is used to avoid accessing the - * image file for recently referenced L2 tables. - * - * Cluster offset lookup translates the logical offset within the block device - * to a cluster offset within the image file. This is done by indexing into - * the L1 and L2 tables which store cluster offsets. It is here where the L2 - * table cache serves up recently referenced L2 tables. - * - * If there is a cache miss, that L2 table is read from the image file and - * committed to the cache. Subsequent accesses to that L2 table will be served - * from the cache until the table is evicted from the cache. - * - * L2 tables are also committed to the cache when new L2 tables are allocated - * in the image file. Since the L2 table cache is write-through, the new L2 - * table is first written out to the image file and then committed to the - * cache. - * - * Multiple I/O requests may be using an L2 table cache entry at any given - * time. 
That means an entry may be in use across several requests and - * reference counting is needed to free the entry at the correct time. In - * particular, an entry evicted from the cache will only be freed once all - * references are dropped. - * - * An in-flight I/O request will hold a reference to a L2 table cache entry for - * the period during which it needs to access the L2 table. This includes - * cluster offset lookup, L2 table allocation, and L2 table update when a new - * data cluster has been allocated. - * - * An interesting case occurs when two requests need to access an L2 table that - * is not in the cache. Since the operation to read the table from the image - * file takes some time to complete, both requests may see a cache miss and - * start reading the L2 table from the image file. The first to finish will - * commit its L2 table into the cache. When the second tries to commit its - * table will be deleted in favor of the existing cache entry. - */ - -#include "qemu/osdep.h" -#include "trace.h" -#include "qed.h" - -/* Each L2 holds 2GB so this let's us fully cache a 100GB disk */ -#define MAX_L2_CACHE_SIZE 50 - -/** - * Initialize the L2 cache - */ -void qed_init_l2_cache(L2TableCache *l2_cache) -{ - QTAILQ_INIT(&l2_cache->entries); - l2_cache->n_entries = 0; -} - -/** - * Free the L2 cache - */ -void qed_free_l2_cache(L2TableCache *l2_cache) -{ - CachedL2Table *entry, *next_entry; - - QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next_entry) { - qemu_vfree(entry->table); - g_free(entry); - } -} - -/** - * Allocate an uninitialized entry from the cache - * - * The returned entry has a reference count of 1 and is owned by the caller. - * The caller must allocate the actual table field for this entry and it must - * be freeable using qemu_vfree(). 
- */ -CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache) -{ - CachedL2Table *entry; - - entry = g_malloc0(sizeof(*entry)); - entry->ref++; - - trace_qed_alloc_l2_cache_entry(l2_cache, entry); - - return entry; -} - -/** - * Decrease an entry's reference count and free if necessary when the reference - * count drops to zero. - */ -void qed_unref_l2_cache_entry(CachedL2Table *entry) -{ - if (!entry) { - return; - } - - entry->ref--; - trace_qed_unref_l2_cache_entry(entry, entry->ref); - if (entry->ref == 0) { - qemu_vfree(entry->table); - g_free(entry); - } -} - -/** - * Find an entry in the L2 cache. This may return NULL and it's up to the - * caller to satisfy the cache miss. - * - * For a cached entry, this function increases the reference count and returns - * the entry. - */ -CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset) -{ - CachedL2Table *entry; - - QTAILQ_FOREACH(entry, &l2_cache->entries, node) { - if (entry->offset == offset) { - trace_qed_find_l2_cache_entry(l2_cache, entry, offset, entry->ref); - entry->ref++; - return entry; - } - } - return NULL; -} - -/** - * Commit an L2 cache entry into the cache. This is meant to be used as part of - * the process to satisfy a cache miss. A caller would allocate an entry which - * is not actually in the L2 cache and then once the entry was valid and - * present on disk, the entry can be committed into the cache. - * - * Since the cache is write-through, it's important that this function is not - * called until the entry is present on disk and the L1 has been updated to - * point to the entry. - * - * N.B. This function steals a reference to the l2_table from the caller so the - * caller must obtain a new reference by issuing a call to - * qed_find_l2_cache_entry(). 
- */ -void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table) -{ - CachedL2Table *entry; - - entry = qed_find_l2_cache_entry(l2_cache, l2_table->offset); - if (entry) { - qed_unref_l2_cache_entry(entry); - qed_unref_l2_cache_entry(l2_table); - return; - } - - /* Evict an unused cache entry so we have space. If all entries are in use - * we can grow the cache temporarily and we try to shrink back down later. - */ - if (l2_cache->n_entries >= MAX_L2_CACHE_SIZE) { - CachedL2Table *next; - QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next) { - if (entry->ref > 1) { - continue; - } - - QTAILQ_REMOVE(&l2_cache->entries, entry, node); - l2_cache->n_entries--; - qed_unref_l2_cache_entry(entry); - - /* Stop evicting when we've shrunk back to max size */ - if (l2_cache->n_entries < MAX_L2_CACHE_SIZE) { - break; - } - } - } - - l2_cache->n_entries++; - QTAILQ_INSERT_TAIL(&l2_cache->entries, l2_table, node); -} diff --git a/qemu/block/qed-table.c b/qemu/block/qed-table.c deleted file mode 100644 index 802945f5e..000000000 --- a/qemu/block/qed-table.c +++ /dev/null @@ -1,297 +0,0 @@ -/* - * QEMU Enhanced Disk Format Table I/O - * - * Copyright IBM, Corp. 2010 - * - * Authors: - * Stefan Hajnoczi - * Anthony Liguori - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. 
- * - */ - -#include "qemu/osdep.h" -#include "trace.h" -#include "qemu/sockets.h" /* for EINPROGRESS on Windows */ -#include "qed.h" - -typedef struct { - GenericCB gencb; - BDRVQEDState *s; - QEDTable *table; - - struct iovec iov; - QEMUIOVector qiov; -} QEDReadTableCB; - -static void qed_read_table_cb(void *opaque, int ret) -{ - QEDReadTableCB *read_table_cb = opaque; - QEDTable *table = read_table_cb->table; - int noffsets = read_table_cb->qiov.size / sizeof(uint64_t); - int i; - - /* Handle I/O error */ - if (ret) { - goto out; - } - - /* Byteswap offsets */ - for (i = 0; i < noffsets; i++) { - table->offsets[i] = le64_to_cpu(table->offsets[i]); - } - -out: - /* Completion */ - trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret); - gencb_complete(&read_table_cb->gencb, ret); -} - -static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, - BlockCompletionFunc *cb, void *opaque) -{ - QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb), - cb, opaque); - QEMUIOVector *qiov = &read_table_cb->qiov; - - trace_qed_read_table(s, offset, table); - - read_table_cb->s = s; - read_table_cb->table = table; - read_table_cb->iov.iov_base = table->offsets, - read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size, - - qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); - bdrv_aio_readv(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, qiov, - qiov->size / BDRV_SECTOR_SIZE, - qed_read_table_cb, read_table_cb); -} - -typedef struct { - GenericCB gencb; - BDRVQEDState *s; - QEDTable *orig_table; - QEDTable *table; - bool flush; /* flush after write? 
*/ - - struct iovec iov; - QEMUIOVector qiov; -} QEDWriteTableCB; - -static void qed_write_table_cb(void *opaque, int ret) -{ - QEDWriteTableCB *write_table_cb = opaque; - - trace_qed_write_table_cb(write_table_cb->s, - write_table_cb->orig_table, - write_table_cb->flush, - ret); - - if (ret) { - goto out; - } - - if (write_table_cb->flush) { - /* We still need to flush first */ - write_table_cb->flush = false; - bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb, - write_table_cb); - return; - } - -out: - qemu_vfree(write_table_cb->table); - gencb_complete(&write_table_cb->gencb, ret); -} - -/** - * Write out an updated part or all of a table - * - * @s: QED state - * @offset: Offset of table in image file, in bytes - * @table: Table - * @index: Index of first element - * @n: Number of elements - * @flush: Whether or not to sync to disk - * @cb: Completion function - * @opaque: Argument for completion function - */ -static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, - unsigned int index, unsigned int n, bool flush, - BlockCompletionFunc *cb, void *opaque) -{ - QEDWriteTableCB *write_table_cb; - unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1; - unsigned int start, end, i; - size_t len_bytes; - - trace_qed_write_table(s, offset, table, index, n); - - /* Calculate indices of the first and one after last elements */ - start = index & ~sector_mask; - end = (index + n + sector_mask) & ~sector_mask; - - len_bytes = (end - start) * sizeof(uint64_t); - - write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque); - write_table_cb->s = s; - write_table_cb->orig_table = table; - write_table_cb->flush = flush; - write_table_cb->table = qemu_blockalign(s->bs, len_bytes); - write_table_cb->iov.iov_base = write_table_cb->table->offsets; - write_table_cb->iov.iov_len = len_bytes; - qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1); - - /* Byteswap table */ - for (i = start; i < end; i++) { - 
uint64_t le_offset = cpu_to_le64(table->offsets[i]); - write_table_cb->table->offsets[i - start] = le_offset; - } - - /* Adjust for offset into table */ - offset += start * sizeof(uint64_t); - - bdrv_aio_writev(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, - &write_table_cb->qiov, - write_table_cb->qiov.size / BDRV_SECTOR_SIZE, - qed_write_table_cb, write_table_cb); -} - -/** - * Propagate return value from async callback - */ -static void qed_sync_cb(void *opaque, int ret) -{ - *(int *)opaque = ret; -} - -int qed_read_l1_table_sync(BDRVQEDState *s) -{ - int ret = -EINPROGRESS; - - qed_read_table(s, s->header.l1_table_offset, - s->l1_table, qed_sync_cb, &ret); - while (ret == -EINPROGRESS) { - aio_poll(bdrv_get_aio_context(s->bs), true); - } - - return ret; -} - -void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, - BlockCompletionFunc *cb, void *opaque) -{ - BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE); - qed_write_table(s, s->header.l1_table_offset, - s->l1_table, index, n, false, cb, opaque); -} - -int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, - unsigned int n) -{ - int ret = -EINPROGRESS; - - qed_write_l1_table(s, index, n, qed_sync_cb, &ret); - while (ret == -EINPROGRESS) { - aio_poll(bdrv_get_aio_context(s->bs), true); - } - - return ret; -} - -typedef struct { - GenericCB gencb; - BDRVQEDState *s; - uint64_t l2_offset; - QEDRequest *request; -} QEDReadL2TableCB; - -static void qed_read_l2_table_cb(void *opaque, int ret) -{ - QEDReadL2TableCB *read_l2_table_cb = opaque; - QEDRequest *request = read_l2_table_cb->request; - BDRVQEDState *s = read_l2_table_cb->s; - CachedL2Table *l2_table = request->l2_table; - uint64_t l2_offset = read_l2_table_cb->l2_offset; - - if (ret) { - /* can't trust loaded L2 table anymore */ - qed_unref_l2_cache_entry(l2_table); - request->l2_table = NULL; - } else { - l2_table->offset = l2_offset; - - qed_commit_l2_cache_entry(&s->l2_cache, l2_table); - - /* This is guaranteed to succeed 
because we just committed the entry - * to the cache. - */ - request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); - assert(request->l2_table != NULL); - } - - gencb_complete(&read_l2_table_cb->gencb, ret); -} - -void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, - BlockCompletionFunc *cb, void *opaque) -{ - QEDReadL2TableCB *read_l2_table_cb; - - qed_unref_l2_cache_entry(request->l2_table); - - /* Check for cached L2 entry */ - request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset); - if (request->l2_table) { - cb(opaque, 0); - return; - } - - request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); - request->l2_table->table = qed_alloc_table(s); - - read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque); - read_l2_table_cb->s = s; - read_l2_table_cb->l2_offset = offset; - read_l2_table_cb->request = request; - - BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD); - qed_read_table(s, offset, request->l2_table->table, - qed_read_l2_table_cb, read_l2_table_cb); -} - -int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset) -{ - int ret = -EINPROGRESS; - - qed_read_l2_table(s, request, offset, qed_sync_cb, &ret); - while (ret == -EINPROGRESS) { - aio_poll(bdrv_get_aio_context(s->bs), true); - } - - return ret; -} - -void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, - unsigned int index, unsigned int n, bool flush, - BlockCompletionFunc *cb, void *opaque) -{ - BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE); - qed_write_table(s, request->l2_table->offset, - request->l2_table->table, index, n, flush, cb, opaque); -} - -int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, - unsigned int index, unsigned int n, bool flush) -{ - int ret = -EINPROGRESS; - - qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret); - while (ret == -EINPROGRESS) { - aio_poll(bdrv_get_aio_context(s->bs), true); - } - - return ret; -} diff --git a/qemu/block/qed.c 
b/qemu/block/qed.c deleted file mode 100644 index 0af52741d..000000000 --- a/qemu/block/qed.c +++ /dev/null @@ -1,1689 +0,0 @@ -/* - * QEMU Enhanced Disk Format - * - * Copyright IBM, Corp. 2010 - * - * Authors: - * Stefan Hajnoczi - * Anthony Liguori - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu/timer.h" -#include "trace.h" -#include "qed.h" -#include "qapi/qmp/qerror.h" -#include "migration/migration.h" -#include "sysemu/block-backend.h" - -static const AIOCBInfo qed_aiocb_info = { - .aiocb_size = sizeof(QEDAIOCB), -}; - -static int bdrv_qed_probe(const uint8_t *buf, int buf_size, - const char *filename) -{ - const QEDHeader *header = (const QEDHeader *)buf; - - if (buf_size < sizeof(*header)) { - return 0; - } - if (le32_to_cpu(header->magic) != QED_MAGIC) { - return 0; - } - return 100; -} - -/** - * Check whether an image format is raw - * - * @fmt: Backing file format, may be NULL - */ -static bool qed_fmt_is_raw(const char *fmt) -{ - return fmt && strcmp(fmt, "raw") == 0; -} - -static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu) -{ - cpu->magic = le32_to_cpu(le->magic); - cpu->cluster_size = le32_to_cpu(le->cluster_size); - cpu->table_size = le32_to_cpu(le->table_size); - cpu->header_size = le32_to_cpu(le->header_size); - cpu->features = le64_to_cpu(le->features); - cpu->compat_features = le64_to_cpu(le->compat_features); - cpu->autoclear_features = le64_to_cpu(le->autoclear_features); - cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset); - cpu->image_size = le64_to_cpu(le->image_size); - cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset); - cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size); -} - -static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le) -{ - le->magic = cpu_to_le32(cpu->magic); - le->cluster_size = 
cpu_to_le32(cpu->cluster_size); - le->table_size = cpu_to_le32(cpu->table_size); - le->header_size = cpu_to_le32(cpu->header_size); - le->features = cpu_to_le64(cpu->features); - le->compat_features = cpu_to_le64(cpu->compat_features); - le->autoclear_features = cpu_to_le64(cpu->autoclear_features); - le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset); - le->image_size = cpu_to_le64(cpu->image_size); - le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset); - le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size); -} - -int qed_write_header_sync(BDRVQEDState *s) -{ - QEDHeader le; - int ret; - - qed_header_cpu_to_le(&s->header, &le); - ret = bdrv_pwrite(s->bs->file->bs, 0, &le, sizeof(le)); - if (ret != sizeof(le)) { - return ret; - } - return 0; -} - -typedef struct { - GenericCB gencb; - BDRVQEDState *s; - struct iovec iov; - QEMUIOVector qiov; - int nsectors; - uint8_t *buf; -} QEDWriteHeaderCB; - -static void qed_write_header_cb(void *opaque, int ret) -{ - QEDWriteHeaderCB *write_header_cb = opaque; - - qemu_vfree(write_header_cb->buf); - gencb_complete(write_header_cb, ret); -} - -static void qed_write_header_read_cb(void *opaque, int ret) -{ - QEDWriteHeaderCB *write_header_cb = opaque; - BDRVQEDState *s = write_header_cb->s; - - if (ret) { - qed_write_header_cb(write_header_cb, ret); - return; - } - - /* Update header */ - qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf); - - bdrv_aio_writev(s->bs->file->bs, 0, &write_header_cb->qiov, - write_header_cb->nsectors, qed_write_header_cb, - write_header_cb); -} - -/** - * Update header in-place (does not rewrite backing filename or other strings) - * - * This function only updates known header fields in-place and does not affect - * extra data after the QED header. 
- */ -static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb, - void *opaque) -{ - /* We must write full sectors for O_DIRECT but cannot necessarily generate - * the data following the header if an unrecognized compat feature is - * active. Therefore, first read the sectors containing the header, update - * them, and write back. - */ - - int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) / - BDRV_SECTOR_SIZE; - size_t len = nsectors * BDRV_SECTOR_SIZE; - QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb), - cb, opaque); - - write_header_cb->s = s; - write_header_cb->nsectors = nsectors; - write_header_cb->buf = qemu_blockalign(s->bs, len); - write_header_cb->iov.iov_base = write_header_cb->buf; - write_header_cb->iov.iov_len = len; - qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1); - - bdrv_aio_readv(s->bs->file->bs, 0, &write_header_cb->qiov, nsectors, - qed_write_header_read_cb, write_header_cb); -} - -static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size) -{ - uint64_t table_entries; - uint64_t l2_size; - - table_entries = (table_size * cluster_size) / sizeof(uint64_t); - l2_size = table_entries * cluster_size; - - return l2_size * table_entries; -} - -static bool qed_is_cluster_size_valid(uint32_t cluster_size) -{ - if (cluster_size < QED_MIN_CLUSTER_SIZE || - cluster_size > QED_MAX_CLUSTER_SIZE) { - return false; - } - if (cluster_size & (cluster_size - 1)) { - return false; /* not power of 2 */ - } - return true; -} - -static bool qed_is_table_size_valid(uint32_t table_size) -{ - if (table_size < QED_MIN_TABLE_SIZE || - table_size > QED_MAX_TABLE_SIZE) { - return false; - } - if (table_size & (table_size - 1)) { - return false; /* not power of 2 */ - } - return true; -} - -static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size, - uint32_t table_size) -{ - if (image_size % BDRV_SECTOR_SIZE != 0) { - return false; /* not multiple of sector size 
*/ - } - if (image_size > qed_max_image_size(cluster_size, table_size)) { - return false; /* image is too large */ - } - return true; -} - -/** - * Read a string of known length from the image file - * - * @file: Image file - * @offset: File offset to start of string, in bytes - * @n: String length in bytes - * @buf: Destination buffer - * @buflen: Destination buffer length in bytes - * @ret: 0 on success, -errno on failure - * - * The string is NUL-terminated. - */ -static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n, - char *buf, size_t buflen) -{ - int ret; - if (n >= buflen) { - return -EINVAL; - } - ret = bdrv_pread(file, offset, buf, n); - if (ret < 0) { - return ret; - } - buf[n] = '\0'; - return 0; -} - -/** - * Allocate new clusters - * - * @s: QED state - * @n: Number of contiguous clusters to allocate - * @ret: Offset of first allocated cluster - * - * This function only produces the offset where the new clusters should be - * written. It updates BDRVQEDState but does not make any changes to the image - * file. 
- */ -static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n) -{ - uint64_t offset = s->file_size; - s->file_size += n * s->header.cluster_size; - return offset; -} - -QEDTable *qed_alloc_table(BDRVQEDState *s) -{ - /* Honor O_DIRECT memory alignment requirements */ - return qemu_blockalign(s->bs, - s->header.cluster_size * s->header.table_size); -} - -/** - * Allocate a new zeroed L2 table - */ -static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) -{ - CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); - - l2_table->table = qed_alloc_table(s); - l2_table->offset = qed_alloc_clusters(s, s->header.table_size); - - memset(l2_table->table->offsets, 0, - s->header.cluster_size * s->header.table_size); - return l2_table; -} - -static void qed_aio_next_io(void *opaque, int ret); - -static void qed_plug_allocating_write_reqs(BDRVQEDState *s) -{ - assert(!s->allocating_write_reqs_plugged); - - s->allocating_write_reqs_plugged = true; -} - -static void qed_unplug_allocating_write_reqs(BDRVQEDState *s) -{ - QEDAIOCB *acb; - - assert(s->allocating_write_reqs_plugged); - - s->allocating_write_reqs_plugged = false; - - acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); - if (acb) { - qed_aio_next_io(acb, 0); - } -} - -static void qed_finish_clear_need_check(void *opaque, int ret) -{ - /* Do nothing */ -} - -static void qed_flush_after_clear_need_check(void *opaque, int ret) -{ - BDRVQEDState *s = opaque; - - bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s); - - /* No need to wait until flush completes */ - qed_unplug_allocating_write_reqs(s); -} - -static void qed_clear_need_check(void *opaque, int ret) -{ - BDRVQEDState *s = opaque; - - if (ret) { - qed_unplug_allocating_write_reqs(s); - return; - } - - s->header.features &= ~QED_F_NEED_CHECK; - qed_write_header(s, qed_flush_after_clear_need_check, s); -} - -static void qed_need_check_timer_cb(void *opaque) -{ - BDRVQEDState *s = opaque; - - /* The timer should only fire when allocating writes 
have drained */ - assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs)); - - trace_qed_need_check_timer_cb(s); - - qed_plug_allocating_write_reqs(s); - - /* Ensure writes are on disk before clearing flag */ - bdrv_aio_flush(s->bs, qed_clear_need_check, s); -} - -static void qed_start_need_check_timer(BDRVQEDState *s) -{ - trace_qed_start_need_check_timer(s); - - /* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for - * migration. - */ - timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + - NANOSECONDS_PER_SECOND * QED_NEED_CHECK_TIMEOUT); -} - -/* It's okay to call this multiple times or when no timer is started */ -static void qed_cancel_need_check_timer(BDRVQEDState *s) -{ - trace_qed_cancel_need_check_timer(s); - timer_del(s->need_check_timer); -} - -static void bdrv_qed_detach_aio_context(BlockDriverState *bs) -{ - BDRVQEDState *s = bs->opaque; - - qed_cancel_need_check_timer(s); - timer_free(s->need_check_timer); -} - -static void bdrv_qed_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - BDRVQEDState *s = bs->opaque; - - s->need_check_timer = aio_timer_new(new_context, - QEMU_CLOCK_VIRTUAL, SCALE_NS, - qed_need_check_timer_cb, s); - if (s->header.features & QED_F_NEED_CHECK) { - qed_start_need_check_timer(s); - } -} - -static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVQEDState *s = bs->opaque; - QEDHeader le_header; - int64_t file_size; - int ret; - - s->bs = bs; - QSIMPLEQ_INIT(&s->allocating_write_reqs); - - ret = bdrv_pread(bs->file->bs, 0, &le_header, sizeof(le_header)); - if (ret < 0) { - return ret; - } - qed_header_le_to_cpu(&le_header, &s->header); - - if (s->header.magic != QED_MAGIC) { - error_setg(errp, "Image not in QED format"); - return -EINVAL; - } - if (s->header.features & ~QED_FEATURE_MASK) { - /* image uses unsupported feature bits */ - error_setg(errp, "Unsupported QED features: %" PRIx64, - s->header.features & 
~QED_FEATURE_MASK); - return -ENOTSUP; - } - if (!qed_is_cluster_size_valid(s->header.cluster_size)) { - return -EINVAL; - } - - /* Round down file size to the last cluster */ - file_size = bdrv_getlength(bs->file->bs); - if (file_size < 0) { - return file_size; - } - s->file_size = qed_start_of_cluster(s, file_size); - - if (!qed_is_table_size_valid(s->header.table_size)) { - return -EINVAL; - } - if (!qed_is_image_size_valid(s->header.image_size, - s->header.cluster_size, - s->header.table_size)) { - return -EINVAL; - } - if (!qed_check_table_offset(s, s->header.l1_table_offset)) { - return -EINVAL; - } - - s->table_nelems = (s->header.cluster_size * s->header.table_size) / - sizeof(uint64_t); - s->l2_shift = ctz32(s->header.cluster_size); - s->l2_mask = s->table_nelems - 1; - s->l1_shift = s->l2_shift + ctz32(s->table_nelems); - - /* Header size calculation must not overflow uint32_t */ - if (s->header.header_size > UINT32_MAX / s->header.cluster_size) { - return -EINVAL; - } - - if ((s->header.features & QED_F_BACKING_FILE)) { - if ((uint64_t)s->header.backing_filename_offset + - s->header.backing_filename_size > - s->header.cluster_size * s->header.header_size) { - return -EINVAL; - } - - ret = qed_read_string(bs->file->bs, s->header.backing_filename_offset, - s->header.backing_filename_size, bs->backing_file, - sizeof(bs->backing_file)); - if (ret < 0) { - return ret; - } - - if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) { - pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw"); - } - } - - /* Reset unknown autoclear feature bits. This is a backwards - * compatibility mechanism that allows images to be opened by older - * programs, which "knock out" unknown feature bits. When an image is - * opened by a newer program again it can detect that the autoclear - * feature is no longer valid. 
- */ - if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 && - !bdrv_is_read_only(bs->file->bs) && !(flags & BDRV_O_INACTIVE)) { - s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK; - - ret = qed_write_header_sync(s); - if (ret) { - return ret; - } - - /* From here on only known autoclear feature bits are valid */ - bdrv_flush(bs->file->bs); - } - - s->l1_table = qed_alloc_table(s); - qed_init_l2_cache(&s->l2_cache); - - ret = qed_read_l1_table_sync(s); - if (ret) { - goto out; - } - - /* If image was not closed cleanly, check consistency */ - if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) { - /* Read-only images cannot be fixed. There is no risk of corruption - * since write operations are not possible. Therefore, allow - * potentially inconsistent images to be opened read-only. This can - * aid data recovery from an otherwise inconsistent image. - */ - if (!bdrv_is_read_only(bs->file->bs) && - !(flags & BDRV_O_INACTIVE)) { - BdrvCheckResult result = {0}; - - ret = qed_check(s, &result, true); - if (ret) { - goto out; - } - } - } - - bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs)); - -out: - if (ret) { - qed_free_l2_cache(&s->l2_cache); - qemu_vfree(s->l1_table); - } - return ret; -} - -static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp) -{ - BDRVQEDState *s = bs->opaque; - - bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS; -} - -/* We have nothing to do for QED reopen, stubs just return - * success */ -static int bdrv_qed_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) -{ - return 0; -} - -static void bdrv_qed_close(BlockDriverState *bs) -{ - BDRVQEDState *s = bs->opaque; - - bdrv_qed_detach_aio_context(bs); - - /* Ensure writes reach stable storage */ - bdrv_flush(bs->file->bs); - - /* Clean shutdown, no check required on next open */ - if (s->header.features & QED_F_NEED_CHECK) { - s->header.features &= ~QED_F_NEED_CHECK; 
- qed_write_header_sync(s); - } - - qed_free_l2_cache(&s->l2_cache); - qemu_vfree(s->l1_table); -} - -static int qed_create(const char *filename, uint32_t cluster_size, - uint64_t image_size, uint32_t table_size, - const char *backing_file, const char *backing_fmt, - QemuOpts *opts, Error **errp) -{ - QEDHeader header = { - .magic = QED_MAGIC, - .cluster_size = cluster_size, - .table_size = table_size, - .header_size = 1, - .features = 0, - .compat_features = 0, - .l1_table_offset = cluster_size, - .image_size = image_size, - }; - QEDHeader le_header; - uint8_t *l1_table = NULL; - size_t l1_size = header.cluster_size * header.table_size; - Error *local_err = NULL; - int ret = 0; - BlockBackend *blk; - - ret = bdrv_create_file(filename, opts, &local_err); - if (ret < 0) { - error_propagate(errp, local_err); - return ret; - } - - blk = blk_new_open(filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); - if (blk == NULL) { - error_propagate(errp, local_err); - return -EIO; - } - - blk_set_allow_write_beyond_eof(blk, true); - - /* File must start empty and grow, check truncate is supported */ - ret = blk_truncate(blk, 0); - if (ret < 0) { - goto out; - } - - if (backing_file) { - header.features |= QED_F_BACKING_FILE; - header.backing_filename_offset = sizeof(le_header); - header.backing_filename_size = strlen(backing_file); - - if (qed_fmt_is_raw(backing_fmt)) { - header.features |= QED_F_BACKING_FORMAT_NO_PROBE; - } - } - - qed_header_cpu_to_le(&header, &le_header); - ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header)); - if (ret < 0) { - goto out; - } - ret = blk_pwrite(blk, sizeof(le_header), backing_file, - header.backing_filename_size); - if (ret < 0) { - goto out; - } - - l1_table = g_malloc0(l1_size); - ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size); - if (ret < 0) { - goto out; - } - - ret = 0; /* success */ -out: - g_free(l1_table); - blk_unref(blk); - return ret; -} - -static int bdrv_qed_create(const char *filename, 
QemuOpts *opts, Error **errp) -{ - uint64_t image_size = 0; - uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE; - uint32_t table_size = QED_DEFAULT_TABLE_SIZE; - char *backing_file = NULL; - char *backing_fmt = NULL; - int ret; - - image_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); - backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT); - cluster_size = qemu_opt_get_size_del(opts, - BLOCK_OPT_CLUSTER_SIZE, - QED_DEFAULT_CLUSTER_SIZE); - table_size = qemu_opt_get_size_del(opts, BLOCK_OPT_TABLE_SIZE, - QED_DEFAULT_TABLE_SIZE); - - if (!qed_is_cluster_size_valid(cluster_size)) { - error_setg(errp, "QED cluster size must be within range [%u, %u] " - "and power of 2", - QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE); - ret = -EINVAL; - goto finish; - } - if (!qed_is_table_size_valid(table_size)) { - error_setg(errp, "QED table size must be within range [%u, %u] " - "and power of 2", - QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE); - ret = -EINVAL; - goto finish; - } - if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) { - error_setg(errp, "QED image size must be a non-zero multiple of " - "cluster size and less than %" PRIu64 " bytes", - qed_max_image_size(cluster_size, table_size)); - ret = -EINVAL; - goto finish; - } - - ret = qed_create(filename, cluster_size, image_size, table_size, - backing_file, backing_fmt, opts, errp); - -finish: - g_free(backing_file); - g_free(backing_fmt); - return ret; -} - -typedef struct { - BlockDriverState *bs; - Coroutine *co; - uint64_t pos; - int64_t status; - int *pnum; - BlockDriverState **file; -} QEDIsAllocatedCB; - -static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len) -{ - QEDIsAllocatedCB *cb = opaque; - BDRVQEDState *s = cb->bs->opaque; - *cb->pnum = len / BDRV_SECTOR_SIZE; - switch (ret) { - case QED_CLUSTER_FOUND: - offset |= qed_offset_into_cluster(s, cb->pos); - cb->status = 
BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; - *cb->file = cb->bs->file->bs; - break; - case QED_CLUSTER_ZERO: - cb->status = BDRV_BLOCK_ZERO; - break; - case QED_CLUSTER_L2: - case QED_CLUSTER_L1: - cb->status = 0; - break; - default: - assert(ret < 0); - cb->status = ret; - break; - } - - if (cb->co) { - qemu_coroutine_enter(cb->co, NULL); - } -} - -static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum, - BlockDriverState **file) -{ - BDRVQEDState *s = bs->opaque; - size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE; - QEDIsAllocatedCB cb = { - .bs = bs, - .pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE, - .status = BDRV_BLOCK_OFFSET_MASK, - .pnum = pnum, - .file = file, - }; - QEDRequest request = { .l2_table = NULL }; - - qed_find_cluster(s, &request, cb.pos, len, qed_is_allocated_cb, &cb); - - /* Now sleep if the callback wasn't invoked immediately */ - while (cb.status == BDRV_BLOCK_OFFSET_MASK) { - cb.co = qemu_coroutine_self(); - qemu_coroutine_yield(); - } - - qed_unref_l2_cache_entry(request.l2_table); - - return cb.status; -} - -static BDRVQEDState *acb_to_s(QEDAIOCB *acb) -{ - return acb->common.bs->opaque; -} - -/** - * Read from the backing file or zero-fill if no backing file - * - * @s: QED state - * @pos: Byte position in device - * @qiov: Destination I/O vector - * @backing_qiov: Possibly shortened copy of qiov, to be allocated here - * @cb: Completion function - * @opaque: User data for completion function - * - * This function reads qiov->size bytes starting at pos from the backing file. - * If there is no backing file then zeroes are read. - */ -static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, - QEMUIOVector *qiov, - QEMUIOVector **backing_qiov, - BlockCompletionFunc *cb, void *opaque) -{ - uint64_t backing_length = 0; - size_t size; - - /* If there is a backing file, get its length. 
Treat the absence of a - * backing file like a zero length backing file. - */ - if (s->bs->backing) { - int64_t l = bdrv_getlength(s->bs->backing->bs); - if (l < 0) { - cb(opaque, l); - return; - } - backing_length = l; - } - - /* Zero all sectors if reading beyond the end of the backing file */ - if (pos >= backing_length || - pos + qiov->size > backing_length) { - qemu_iovec_memset(qiov, 0, 0, qiov->size); - } - - /* Complete now if there are no backing file sectors to read */ - if (pos >= backing_length) { - cb(opaque, 0); - return; - } - - /* If the read straddles the end of the backing file, shorten it */ - size = MIN((uint64_t)backing_length - pos, qiov->size); - - assert(*backing_qiov == NULL); - *backing_qiov = g_new(QEMUIOVector, 1); - qemu_iovec_init(*backing_qiov, qiov->niov); - qemu_iovec_concat(*backing_qiov, qiov, 0, size); - - BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); - bdrv_aio_readv(s->bs->backing->bs, pos / BDRV_SECTOR_SIZE, - *backing_qiov, size / BDRV_SECTOR_SIZE, cb, opaque); -} - -typedef struct { - GenericCB gencb; - BDRVQEDState *s; - QEMUIOVector qiov; - QEMUIOVector *backing_qiov; - struct iovec iov; - uint64_t offset; -} CopyFromBackingFileCB; - -static void qed_copy_from_backing_file_cb(void *opaque, int ret) -{ - CopyFromBackingFileCB *copy_cb = opaque; - qemu_vfree(copy_cb->iov.iov_base); - gencb_complete(©_cb->gencb, ret); -} - -static void qed_copy_from_backing_file_write(void *opaque, int ret) -{ - CopyFromBackingFileCB *copy_cb = opaque; - BDRVQEDState *s = copy_cb->s; - - if (copy_cb->backing_qiov) { - qemu_iovec_destroy(copy_cb->backing_qiov); - g_free(copy_cb->backing_qiov); - copy_cb->backing_qiov = NULL; - } - - if (ret) { - qed_copy_from_backing_file_cb(copy_cb, ret); - return; - } - - BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE); - bdrv_aio_writev(s->bs->file->bs, copy_cb->offset / BDRV_SECTOR_SIZE, - ©_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE, - qed_copy_from_backing_file_cb, copy_cb); -} - -/** - * Copy 
data from backing file into the image - * - * @s: QED state - * @pos: Byte position in device - * @len: Number of bytes - * @offset: Byte offset in image file - * @cb: Completion function - * @opaque: User data for completion function - */ -static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, - uint64_t len, uint64_t offset, - BlockCompletionFunc *cb, - void *opaque) -{ - CopyFromBackingFileCB *copy_cb; - - /* Skip copy entirely if there is no work to do */ - if (len == 0) { - cb(opaque, 0); - return; - } - - copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque); - copy_cb->s = s; - copy_cb->offset = offset; - copy_cb->backing_qiov = NULL; - copy_cb->iov.iov_base = qemu_blockalign(s->bs, len); - copy_cb->iov.iov_len = len; - qemu_iovec_init_external(©_cb->qiov, ©_cb->iov, 1); - - qed_read_backing_file(s, pos, ©_cb->qiov, ©_cb->backing_qiov, - qed_copy_from_backing_file_write, copy_cb); -} - -/** - * Link one or more contiguous clusters into a table - * - * @s: QED state - * @table: L2 table - * @index: First cluster index - * @n: Number of contiguous clusters - * @cluster: First cluster offset - * - * The cluster offset may be an allocated byte offset in the image file, the - * zero cluster marker, or the unallocated cluster marker. 
- */ -static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index, - unsigned int n, uint64_t cluster) -{ - int i; - for (i = index; i < index + n; i++) { - table->offsets[i] = cluster; - if (!qed_offset_is_unalloc_cluster(cluster) && - !qed_offset_is_zero_cluster(cluster)) { - cluster += s->header.cluster_size; - } - } -} - -static void qed_aio_complete_bh(void *opaque) -{ - QEDAIOCB *acb = opaque; - BlockCompletionFunc *cb = acb->common.cb; - void *user_opaque = acb->common.opaque; - int ret = acb->bh_ret; - - qemu_bh_delete(acb->bh); - qemu_aio_unref(acb); - - /* Invoke callback */ - cb(user_opaque, ret); -} - -static void qed_aio_complete(QEDAIOCB *acb, int ret) -{ - BDRVQEDState *s = acb_to_s(acb); - - trace_qed_aio_complete(s, acb, ret); - - /* Free resources */ - qemu_iovec_destroy(&acb->cur_qiov); - qed_unref_l2_cache_entry(acb->request.l2_table); - - /* Free the buffer we may have allocated for zero writes */ - if (acb->flags & QED_AIOCB_ZERO) { - qemu_vfree(acb->qiov->iov[0].iov_base); - acb->qiov->iov[0].iov_base = NULL; - } - - /* Arrange for a bh to invoke the completion function */ - acb->bh_ret = ret; - acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs), - qed_aio_complete_bh, acb); - qemu_bh_schedule(acb->bh); - - /* Start next allocating write request waiting behind this one. Note that - * requests enqueue themselves when they first hit an unallocated cluster - * but they wait until the entire request is finished before waking up the - * next request in the queue. This ensures that we don't cycle through - * requests multiple times but rather finish one at a time completely. 
- */ - if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { - QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next); - acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); - if (acb) { - qed_aio_next_io(acb, 0); - } else if (s->header.features & QED_F_NEED_CHECK) { - qed_start_need_check_timer(s); - } - } -} - -/** - * Commit the current L2 table to the cache - */ -static void qed_commit_l2_update(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - CachedL2Table *l2_table = acb->request.l2_table; - uint64_t l2_offset = l2_table->offset; - - qed_commit_l2_cache_entry(&s->l2_cache, l2_table); - - /* This is guaranteed to succeed because we just committed the entry to the - * cache. - */ - acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); - assert(acb->request.l2_table != NULL); - - qed_aio_next_io(opaque, ret); -} - -/** - * Update L1 table with new L2 table offset and write it out - */ -static void qed_aio_write_l1_update(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - int index; - - if (ret) { - qed_aio_complete(acb, ret); - return; - } - - index = qed_l1_index(s, acb->cur_pos); - s->l1_table->offsets[index] = acb->request.l2_table->offset; - - qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb); -} - -/** - * Update L2 table with new cluster offsets and write them out - */ -static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset) -{ - BDRVQEDState *s = acb_to_s(acb); - bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1; - int index; - - if (ret) { - goto err; - } - - if (need_alloc) { - qed_unref_l2_cache_entry(acb->request.l2_table); - acb->request.l2_table = qed_new_l2_table(s); - } - - index = qed_l2_index(s, acb->cur_pos); - qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters, - offset); - - if (need_alloc) { - /* Write out the whole new L2 table */ - qed_write_l2_table(s, &acb->request, 0, 
s->table_nelems, true, - qed_aio_write_l1_update, acb); - } else { - /* Write out only the updated part of the L2 table */ - qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false, - qed_aio_next_io, acb); - } - return; - -err: - qed_aio_complete(acb, ret); -} - -static void qed_aio_write_l2_update_cb(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - qed_aio_write_l2_update(acb, ret, acb->cur_cluster); -} - -/** - * Flush new data clusters before updating the L2 table - * - * This flush is necessary when a backing file is in use. A crash during an - * allocating write could result in empty clusters in the image. If the write - * only touched a subregion of the cluster, then backing image sectors have - * been lost in the untouched region. The solution is to flush after writing a - * new data cluster and before updating the L2 table. - */ -static void qed_aio_write_flush_before_l2_update(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - - if (!bdrv_aio_flush(s->bs->file->bs, qed_aio_write_l2_update_cb, opaque)) { - qed_aio_complete(acb, -EIO); - } -} - -/** - * Write data to the image file - */ -static void qed_aio_write_main(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - uint64_t offset = acb->cur_cluster + - qed_offset_into_cluster(s, acb->cur_pos); - BlockCompletionFunc *next_fn; - - trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size); - - if (ret) { - qed_aio_complete(acb, ret); - return; - } - - if (acb->find_cluster_ret == QED_CLUSTER_FOUND) { - next_fn = qed_aio_next_io; - } else { - if (s->bs->backing) { - next_fn = qed_aio_write_flush_before_l2_update; - } else { - next_fn = qed_aio_write_l2_update_cb; - } - } - - BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); - bdrv_aio_writev(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, - &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, - next_fn, acb); -} - -/** - * Populate back untouched region of new 
data cluster - */ -static void qed_aio_write_postfill(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - uint64_t start = acb->cur_pos + acb->cur_qiov.size; - uint64_t len = - qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start; - uint64_t offset = acb->cur_cluster + - qed_offset_into_cluster(s, acb->cur_pos) + - acb->cur_qiov.size; - - if (ret) { - qed_aio_complete(acb, ret); - return; - } - - trace_qed_aio_write_postfill(s, acb, start, len, offset); - qed_copy_from_backing_file(s, start, len, offset, - qed_aio_write_main, acb); -} - -/** - * Populate front untouched region of new data cluster - */ -static void qed_aio_write_prefill(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - uint64_t start = qed_start_of_cluster(s, acb->cur_pos); - uint64_t len = qed_offset_into_cluster(s, acb->cur_pos); - - trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster); - qed_copy_from_backing_file(s, start, len, acb->cur_cluster, - qed_aio_write_postfill, acb); -} - -/** - * Check if the QED_F_NEED_CHECK bit should be set during allocating write - */ -static bool qed_should_set_need_check(BDRVQEDState *s) -{ - /* The flush before L2 update path ensures consistency */ - if (s->bs->backing) { - return false; - } - - return !(s->header.features & QED_F_NEED_CHECK); -} - -static void qed_aio_write_zero_cluster(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - - if (ret) { - qed_aio_complete(acb, ret); - return; - } - - qed_aio_write_l2_update(acb, 0, 1); -} - -/** - * Write new data cluster - * - * @acb: Write request - * @len: Length in bytes - * - * This path is taken when writing to previously unallocated clusters. 
- */ -static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) -{ - BDRVQEDState *s = acb_to_s(acb); - BlockCompletionFunc *cb; - - /* Cancel timer when the first allocating request comes in */ - if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) { - qed_cancel_need_check_timer(s); - } - - /* Freeze this request if another allocating write is in progress */ - if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { - QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next); - } - if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) || - s->allocating_write_reqs_plugged) { - return; /* wait for existing request to finish */ - } - - acb->cur_nclusters = qed_bytes_to_clusters(s, - qed_offset_into_cluster(s, acb->cur_pos) + len); - qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); - - if (acb->flags & QED_AIOCB_ZERO) { - /* Skip ahead if the clusters are already zero */ - if (acb->find_cluster_ret == QED_CLUSTER_ZERO) { - qed_aio_next_io(acb, 0); - return; - } - - cb = qed_aio_write_zero_cluster; - } else { - cb = qed_aio_write_prefill; - acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters); - } - - if (qed_should_set_need_check(s)) { - s->header.features |= QED_F_NEED_CHECK; - qed_write_header(s, cb, acb); - } else { - cb(acb, 0); - } -} - -/** - * Write data cluster in place - * - * @acb: Write request - * @offset: Cluster offset in bytes - * @len: Length in bytes - * - * This path is taken when writing to already allocated clusters. 
- */ -static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) -{ - /* Allocate buffer for zero writes */ - if (acb->flags & QED_AIOCB_ZERO) { - struct iovec *iov = acb->qiov->iov; - - if (!iov->iov_base) { - iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len); - if (iov->iov_base == NULL) { - qed_aio_complete(acb, -ENOMEM); - return; - } - memset(iov->iov_base, 0, iov->iov_len); - } - } - - /* Calculate the I/O vector */ - acb->cur_cluster = offset; - qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); - - /* Do the actual write */ - qed_aio_write_main(acb, 0); -} - -/** - * Write data cluster - * - * @opaque: Write request - * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, - * or -errno - * @offset: Cluster offset in bytes - * @len: Length in bytes - * - * Callback from qed_find_cluster(). - */ -static void qed_aio_write_data(void *opaque, int ret, - uint64_t offset, size_t len) -{ - QEDAIOCB *acb = opaque; - - trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len); - - acb->find_cluster_ret = ret; - - switch (ret) { - case QED_CLUSTER_FOUND: - qed_aio_write_inplace(acb, offset, len); - break; - - case QED_CLUSTER_L2: - case QED_CLUSTER_L1: - case QED_CLUSTER_ZERO: - qed_aio_write_alloc(acb, len); - break; - - default: - qed_aio_complete(acb, ret); - break; - } -} - -/** - * Read data cluster - * - * @opaque: Read request - * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, - * or -errno - * @offset: Cluster offset in bytes - * @len: Length in bytes - * - * Callback from qed_find_cluster(). 
- */ -static void qed_aio_read_data(void *opaque, int ret, - uint64_t offset, size_t len) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - BlockDriverState *bs = acb->common.bs; - - /* Adjust offset into cluster */ - offset += qed_offset_into_cluster(s, acb->cur_pos); - - trace_qed_aio_read_data(s, acb, ret, offset, len); - - if (ret < 0) { - goto err; - } - - qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); - - /* Handle zero cluster and backing file reads */ - if (ret == QED_CLUSTER_ZERO) { - qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size); - qed_aio_next_io(acb, 0); - return; - } else if (ret != QED_CLUSTER_FOUND) { - qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, - &acb->backing_qiov, qed_aio_next_io, acb); - return; - } - - BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - bdrv_aio_readv(bs->file->bs, offset / BDRV_SECTOR_SIZE, - &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, - qed_aio_next_io, acb); - return; - -err: - qed_aio_complete(acb, ret); -} - -/** - * Begin next I/O or complete the request - */ -static void qed_aio_next_io(void *opaque, int ret) -{ - QEDAIOCB *acb = opaque; - BDRVQEDState *s = acb_to_s(acb); - QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ? 
- qed_aio_write_data : qed_aio_read_data; - - trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size); - - if (acb->backing_qiov) { - qemu_iovec_destroy(acb->backing_qiov); - g_free(acb->backing_qiov); - acb->backing_qiov = NULL; - } - - /* Handle I/O error */ - if (ret) { - qed_aio_complete(acb, ret); - return; - } - - acb->qiov_offset += acb->cur_qiov.size; - acb->cur_pos += acb->cur_qiov.size; - qemu_iovec_reset(&acb->cur_qiov); - - /* Complete request */ - if (acb->cur_pos >= acb->end_pos) { - qed_aio_complete(acb, 0); - return; - } - - /* Find next cluster and start I/O */ - qed_find_cluster(s, &acb->request, - acb->cur_pos, acb->end_pos - acb->cur_pos, - io_fn, acb); -} - -static BlockAIOCB *qed_aio_setup(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, - void *opaque, int flags) -{ - QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque); - - trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, - opaque, flags); - - acb->flags = flags; - acb->qiov = qiov; - acb->qiov_offset = 0; - acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; - acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE; - acb->backing_qiov = NULL; - acb->request.l2_table = NULL; - qemu_iovec_init(&acb->cur_qiov, qiov->niov); - - /* Start request */ - qed_aio_next_io(acb, 0); - return &acb->common; -} - -static BlockAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) -{ - return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); -} - -static BlockAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) -{ - return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, - opaque, QED_AIOCB_WRITE); -} - -typedef struct { - Coroutine *co; - int ret; - bool done; -} QEDWriteZeroesCB; - -static void 
coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) -{ - QEDWriteZeroesCB *cb = opaque; - - cb->done = true; - cb->ret = ret; - if (cb->co) { - qemu_coroutine_enter(cb->co, NULL); - } -} - -static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - BdrvRequestFlags flags) -{ - BlockAIOCB *blockacb; - BDRVQEDState *s = bs->opaque; - QEDWriteZeroesCB cb = { .done = false }; - QEMUIOVector qiov; - struct iovec iov; - - /* Refuse if there are untouched backing file sectors */ - if (bs->backing) { - if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) { - return -ENOTSUP; - } - if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) { - return -ENOTSUP; - } - } - - /* Zero writes start without an I/O buffer. If a buffer becomes necessary - * then it will be allocated during request processing. - */ - iov.iov_base = NULL, - iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE, - - qemu_iovec_init_external(&qiov, &iov, 1); - blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors, - qed_co_write_zeroes_cb, &cb, - QED_AIOCB_WRITE | QED_AIOCB_ZERO); - if (!blockacb) { - return -EIO; - } - if (!cb.done) { - cb.co = qemu_coroutine_self(); - qemu_coroutine_yield(); - } - assert(cb.done); - return cb.ret; -} - -static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset) -{ - BDRVQEDState *s = bs->opaque; - uint64_t old_image_size; - int ret; - - if (!qed_is_image_size_valid(offset, s->header.cluster_size, - s->header.table_size)) { - return -EINVAL; - } - - /* Shrinking is currently not supported */ - if ((uint64_t)offset < s->header.image_size) { - return -ENOTSUP; - } - - old_image_size = s->header.image_size; - s->header.image_size = offset; - ret = qed_write_header_sync(s); - if (ret < 0) { - s->header.image_size = old_image_size; - } - return ret; -} - -static int64_t bdrv_qed_getlength(BlockDriverState *bs) -{ - BDRVQEDState *s = bs->opaque; - return s->header.image_size; -} - -static 
int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - BDRVQEDState *s = bs->opaque; - - memset(bdi, 0, sizeof(*bdi)); - bdi->cluster_size = s->header.cluster_size; - bdi->is_dirty = s->header.features & QED_F_NEED_CHECK; - bdi->unallocated_blocks_are_zero = true; - bdi->can_write_zeroes_with_unmap = true; - return 0; -} - -static int bdrv_qed_change_backing_file(BlockDriverState *bs, - const char *backing_file, - const char *backing_fmt) -{ - BDRVQEDState *s = bs->opaque; - QEDHeader new_header, le_header; - void *buffer; - size_t buffer_len, backing_file_len; - int ret; - - /* Refuse to set backing filename if unknown compat feature bits are - * active. If the image uses an unknown compat feature then we may not - * know the layout of data following the header structure and cannot safely - * add a new string. - */ - if (backing_file && (s->header.compat_features & - ~QED_COMPAT_FEATURE_MASK)) { - return -ENOTSUP; - } - - memcpy(&new_header, &s->header, sizeof(new_header)); - - new_header.features &= ~(QED_F_BACKING_FILE | - QED_F_BACKING_FORMAT_NO_PROBE); - - /* Adjust feature flags */ - if (backing_file) { - new_header.features |= QED_F_BACKING_FILE; - - if (qed_fmt_is_raw(backing_fmt)) { - new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE; - } - } - - /* Calculate new header size */ - backing_file_len = 0; - - if (backing_file) { - backing_file_len = strlen(backing_file); - } - - buffer_len = sizeof(new_header); - new_header.backing_filename_offset = buffer_len; - new_header.backing_filename_size = backing_file_len; - buffer_len += backing_file_len; - - /* Make sure we can rewrite header without failing */ - if (buffer_len > new_header.header_size * new_header.cluster_size) { - return -ENOSPC; - } - - /* Prepare new header */ - buffer = g_malloc(buffer_len); - - qed_header_cpu_to_le(&new_header, &le_header); - memcpy(buffer, &le_header, sizeof(le_header)); - buffer_len = sizeof(le_header); - - if (backing_file) { - memcpy(buffer + buffer_len, 
backing_file, backing_file_len); - buffer_len += backing_file_len; - } - - /* Write new header */ - ret = bdrv_pwrite_sync(bs->file->bs, 0, buffer, buffer_len); - g_free(buffer); - if (ret == 0) { - memcpy(&s->header, &new_header, sizeof(new_header)); - } - return ret; -} - -static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp) -{ - BDRVQEDState *s = bs->opaque; - Error *local_err = NULL; - int ret; - - bdrv_qed_close(bs); - - bdrv_invalidate_cache(bs->file->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - - memset(s, 0, sizeof(BDRVQEDState)); - ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err); - if (local_err) { - error_propagate(errp, local_err); - error_prepend(errp, "Could not reopen qed layer: "); - return; - } else if (ret < 0) { - error_setg_errno(errp, -ret, "Could not reopen qed layer"); - return; - } -} - -static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result, - BdrvCheckMode fix) -{ - BDRVQEDState *s = bs->opaque; - - return qed_check(s, result, !!fix); -} - -static QemuOptsList qed_create_opts = { - .name = "qed-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(qed_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_BACKING_FILE, - .type = QEMU_OPT_STRING, - .help = "File name of a base image" - }, - { - .name = BLOCK_OPT_BACKING_FMT, - .type = QEMU_OPT_STRING, - .help = "Image format of the base image" - }, - { - .name = BLOCK_OPT_CLUSTER_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Cluster size (in bytes)", - .def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE) - }, - { - .name = BLOCK_OPT_TABLE_SIZE, - .type = QEMU_OPT_SIZE, - .help = "L1/L2 table size (in clusters)" - }, - { /* end of list */ } - } -}; - -static BlockDriver bdrv_qed = { - .format_name = "qed", - .instance_size = sizeof(BDRVQEDState), - .create_opts = &qed_create_opts, - .supports_backing = true, - - .bdrv_probe 
= bdrv_qed_probe, - .bdrv_open = bdrv_qed_open, - .bdrv_close = bdrv_qed_close, - .bdrv_reopen_prepare = bdrv_qed_reopen_prepare, - .bdrv_create = bdrv_qed_create, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_get_block_status = bdrv_qed_co_get_block_status, - .bdrv_aio_readv = bdrv_qed_aio_readv, - .bdrv_aio_writev = bdrv_qed_aio_writev, - .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes, - .bdrv_truncate = bdrv_qed_truncate, - .bdrv_getlength = bdrv_qed_getlength, - .bdrv_get_info = bdrv_qed_get_info, - .bdrv_refresh_limits = bdrv_qed_refresh_limits, - .bdrv_change_backing_file = bdrv_qed_change_backing_file, - .bdrv_invalidate_cache = bdrv_qed_invalidate_cache, - .bdrv_check = bdrv_qed_check, - .bdrv_detach_aio_context = bdrv_qed_detach_aio_context, - .bdrv_attach_aio_context = bdrv_qed_attach_aio_context, -}; - -static void bdrv_qed_init(void) -{ - bdrv_register(&bdrv_qed); -} - -block_init(bdrv_qed_init); diff --git a/qemu/block/qed.h b/qemu/block/qed.h deleted file mode 100644 index 22b319875..000000000 --- a/qemu/block/qed.h +++ /dev/null @@ -1,344 +0,0 @@ -/* - * QEMU Enhanced Disk Format - * - * Copyright IBM, Corp. 2010 - * - * Authors: - * Stefan Hajnoczi - * Anthony Liguori - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -#ifndef BLOCK_QED_H -#define BLOCK_QED_H - -#include "block/block_int.h" -#include "qemu/cutils.h" - -/* The layout of a QED file is as follows: - * - * +--------+----------+----------+----------+-----+ - * | header | L1 table | cluster0 | cluster1 | ... | - * +--------+----------+----------+----------+-----+ - * - * There is a 2-level pagetable for cluster allocation: - * - * +----------+ - * | L1 table | - * +----------+ - * ,------' | '------. - * +----------+ | +----------+ - * | L2 table | ... | L2 table | - * +----------+ +----------+ - * ,------' | '------. - * +----------+ | +----------+ - * | Data | ... 
| Data | - * +----------+ +----------+ - * - * The L1 table is fixed size and always present. L2 tables are allocated on - * demand. The L1 table size determines the maximum possible image size; it - * can be influenced using the cluster_size and table_size values. - * - * All fields are little-endian on disk. - */ -#define QED_DEFAULT_CLUSTER_SIZE 65536 -enum { - QED_MAGIC = 'Q' | 'E' << 8 | 'D' << 16 | '\0' << 24, - - /* The image supports a backing file */ - QED_F_BACKING_FILE = 0x01, - - /* The image needs a consistency check before use */ - QED_F_NEED_CHECK = 0x02, - - /* The backing file format must not be probed, treat as raw image */ - QED_F_BACKING_FORMAT_NO_PROBE = 0x04, - - /* Feature bits must be used when the on-disk format changes */ - QED_FEATURE_MASK = QED_F_BACKING_FILE | /* supported feature bits */ - QED_F_NEED_CHECK | - QED_F_BACKING_FORMAT_NO_PROBE, - QED_COMPAT_FEATURE_MASK = 0, /* supported compat feature bits */ - QED_AUTOCLEAR_FEATURE_MASK = 0, /* supported autoclear feature bits */ - - /* Data is stored in groups of sectors called clusters. Cluster size must - * be large to avoid keeping too much metadata. I/O requests that have - * sub-cluster size will require read-modify-write. - */ - QED_MIN_CLUSTER_SIZE = 4 * 1024, /* in bytes */ - QED_MAX_CLUSTER_SIZE = 64 * 1024 * 1024, - - /* Allocated clusters are tracked using a 2-level pagetable. Table size is - * a multiple of clusters so large maximum image sizes can be supported - * without jacking up the cluster size too much. 
- */ - QED_MIN_TABLE_SIZE = 1, /* in clusters */ - QED_MAX_TABLE_SIZE = 16, - QED_DEFAULT_TABLE_SIZE = 4, - - /* Delay to flush and clean image after last allocating write completes */ - QED_NEED_CHECK_TIMEOUT = 5, /* in seconds */ -}; - -typedef struct { - uint32_t magic; /* QED\0 */ - - uint32_t cluster_size; /* in bytes */ - uint32_t table_size; /* for L1 and L2 tables, in clusters */ - uint32_t header_size; /* in clusters */ - - uint64_t features; /* format feature bits */ - uint64_t compat_features; /* compatible feature bits */ - uint64_t autoclear_features; /* self-resetting feature bits */ - - uint64_t l1_table_offset; /* in bytes */ - uint64_t image_size; /* total logical image size, in bytes */ - - /* if (features & QED_F_BACKING_FILE) */ - uint32_t backing_filename_offset; /* in bytes from start of header */ - uint32_t backing_filename_size; /* in bytes */ -} QEMU_PACKED QEDHeader; - -typedef struct { - uint64_t offsets[0]; /* in bytes */ -} QEDTable; - -/* The L2 cache is a simple write-through cache for L2 structures */ -typedef struct CachedL2Table { - QEDTable *table; - uint64_t offset; /* offset=0 indicates an invalidate entry */ - QTAILQ_ENTRY(CachedL2Table) node; - int ref; -} CachedL2Table; - -typedef struct { - QTAILQ_HEAD(, CachedL2Table) entries; - unsigned int n_entries; -} L2TableCache; - -typedef struct QEDRequest { - CachedL2Table *l2_table; -} QEDRequest; - -enum { - QED_AIOCB_WRITE = 0x0001, /* read or write? 
*/ - QED_AIOCB_ZERO = 0x0002, /* zero write, used with QED_AIOCB_WRITE */ -}; - -typedef struct QEDAIOCB { - BlockAIOCB common; - QEMUBH *bh; - int bh_ret; /* final return status for completion bh */ - QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */ - int flags; /* QED_AIOCB_* bits ORed together */ - uint64_t end_pos; /* request end on block device, in bytes */ - - /* User scatter-gather list */ - QEMUIOVector *qiov; - size_t qiov_offset; /* byte count already processed */ - - /* Current cluster scatter-gather list */ - QEMUIOVector cur_qiov; - QEMUIOVector *backing_qiov; - uint64_t cur_pos; /* position on block device, in bytes */ - uint64_t cur_cluster; /* cluster offset in image file */ - unsigned int cur_nclusters; /* number of clusters being accessed */ - int find_cluster_ret; /* used for L1/L2 update */ - - QEDRequest request; -} QEDAIOCB; - -typedef struct { - BlockDriverState *bs; /* device */ - uint64_t file_size; /* length of image file, in bytes */ - - QEDHeader header; /* always cpu-endian */ - QEDTable *l1_table; - L2TableCache l2_cache; /* l2 table cache */ - uint32_t table_nelems; - uint32_t l1_shift; - uint32_t l2_shift; - uint32_t l2_mask; - - /* Allocating write request queue */ - QSIMPLEQ_HEAD(, QEDAIOCB) allocating_write_reqs; - bool allocating_write_reqs_plugged; - - /* Periodic flush and clear need check flag */ - QEMUTimer *need_check_timer; -} BDRVQEDState; - -enum { - QED_CLUSTER_FOUND, /* cluster found */ - QED_CLUSTER_ZERO, /* zero cluster found */ - QED_CLUSTER_L2, /* cluster missing in L2 */ - QED_CLUSTER_L1, /* cluster missing in L1 */ -}; - -/** - * qed_find_cluster() completion callback - * - * @opaque: User data for completion callback - * @ret: QED_CLUSTER_FOUND Success - * QED_CLUSTER_L2 Data cluster unallocated in L2 - * QED_CLUSTER_L1 L2 unallocated in L1 - * -errno POSIX error occurred - * @offset: Data cluster offset - * @len: Contiguous bytes starting from cluster offset - * - * This function is invoked when 
qed_find_cluster() completes. - * - * On success ret is QED_CLUSTER_FOUND and offset/len are a contiguous range - * in the image file. - * - * On failure ret is QED_CLUSTER_L2 or QED_CLUSTER_L1 for missing L2 or L1 - * table offset, respectively. len is number of contiguous unallocated bytes. - */ -typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len); - -/** - * Generic callback for chaining async callbacks - */ -typedef struct { - BlockCompletionFunc *cb; - void *opaque; -} GenericCB; - -void *gencb_alloc(size_t len, BlockCompletionFunc *cb, void *opaque); -void gencb_complete(void *opaque, int ret); - -/** - * Header functions - */ -int qed_write_header_sync(BDRVQEDState *s); - -/** - * L2 cache functions - */ -void qed_init_l2_cache(L2TableCache *l2_cache); -void qed_free_l2_cache(L2TableCache *l2_cache); -CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache); -void qed_unref_l2_cache_entry(CachedL2Table *entry); -CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset); -void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table); - -/** - * Table I/O functions - */ -int qed_read_l1_table_sync(BDRVQEDState *s); -void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, - BlockCompletionFunc *cb, void *opaque); -int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, - unsigned int n); -int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, - uint64_t offset); -void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, - BlockCompletionFunc *cb, void *opaque); -void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, - unsigned int index, unsigned int n, bool flush, - BlockCompletionFunc *cb, void *opaque); -int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, - unsigned int index, unsigned int n, bool flush); - -/** - * Cluster functions - */ -void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, 
uint64_t pos, - size_t len, QEDFindClusterFunc *cb, void *opaque); - -/** - * Consistency check - */ -int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix); - -QEDTable *qed_alloc_table(BDRVQEDState *s); - -/** - * Round down to the start of a cluster - */ -static inline uint64_t qed_start_of_cluster(BDRVQEDState *s, uint64_t offset) -{ - return offset & ~(uint64_t)(s->header.cluster_size - 1); -} - -static inline uint64_t qed_offset_into_cluster(BDRVQEDState *s, uint64_t offset) -{ - return offset & (s->header.cluster_size - 1); -} - -static inline uint64_t qed_bytes_to_clusters(BDRVQEDState *s, uint64_t bytes) -{ - return qed_start_of_cluster(s, bytes + (s->header.cluster_size - 1)) / - (s->header.cluster_size - 1); -} - -static inline unsigned int qed_l1_index(BDRVQEDState *s, uint64_t pos) -{ - return pos >> s->l1_shift; -} - -static inline unsigned int qed_l2_index(BDRVQEDState *s, uint64_t pos) -{ - return (pos >> s->l2_shift) & s->l2_mask; -} - -/** - * Test if a cluster offset is valid - */ -static inline bool qed_check_cluster_offset(BDRVQEDState *s, uint64_t offset) -{ - uint64_t header_size = (uint64_t)s->header.header_size * - s->header.cluster_size; - - if (offset & (s->header.cluster_size - 1)) { - return false; - } - return offset >= header_size && offset < s->file_size; -} - -/** - * Test if a table offset is valid - */ -static inline bool qed_check_table_offset(BDRVQEDState *s, uint64_t offset) -{ - uint64_t end_offset = offset + (s->header.table_size - 1) * - s->header.cluster_size; - - /* Overflow check */ - if (end_offset <= offset) { - return false; - } - - return qed_check_cluster_offset(s, offset) && - qed_check_cluster_offset(s, end_offset); -} - -static inline bool qed_offset_is_cluster_aligned(BDRVQEDState *s, - uint64_t offset) -{ - if (qed_offset_into_cluster(s, offset)) { - return false; - } - return true; -} - -static inline bool qed_offset_is_unalloc_cluster(uint64_t offset) -{ - if (offset == 0) { - return true; - } - 
return false; -} - -static inline bool qed_offset_is_zero_cluster(uint64_t offset) -{ - if (offset == 1) { - return true; - } - return false; -} - -#endif /* BLOCK_QED_H */ diff --git a/qemu/block/quorum.c b/qemu/block/quorum.c deleted file mode 100644 index da15465a9..000000000 --- a/qemu/block/quorum.c +++ /dev/null @@ -1,1091 +0,0 @@ -/* - * Quorum Block filter - * - * Copyright (C) 2012-2014 Nodalink, EURL. - * - * Author: - * Benoît Canet - * - * Based on the design and code of blkverify.c (Copyright (C) 2010 IBM, Corp) - * and blkmirror.c (Copyright (C) 2011 Red Hat, Inc). - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - */ - -#include "qemu/osdep.h" -#include "block/block_int.h" -#include "qapi/qmp/qbool.h" -#include "qapi/qmp/qdict.h" -#include "qapi/qmp/qerror.h" -#include "qapi/qmp/qint.h" -#include "qapi/qmp/qjson.h" -#include "qapi/qmp/qlist.h" -#include "qapi/qmp/qstring.h" -#include "qapi-event.h" -#include "crypto/hash.h" - -#define HASH_LENGTH 32 - -#define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" -#define QUORUM_OPT_BLKVERIFY "blkverify" -#define QUORUM_OPT_REWRITE "rewrite-corrupted" -#define QUORUM_OPT_READ_PATTERN "read-pattern" - -/* This union holds a vote hash value */ -typedef union QuorumVoteValue { - uint8_t h[HASH_LENGTH]; /* SHA-256 hash */ - int64_t l; /* simpler 64 bits hash */ -} QuorumVoteValue; - -/* A vote item */ -typedef struct QuorumVoteItem { - int index; - QLIST_ENTRY(QuorumVoteItem) next; -} QuorumVoteItem; - -/* this structure is a vote version. A version is the set of votes sharing the - * same vote value. - * The set of votes will be tracked with the items field and its cardinality is - * vote_count. 
- */ -typedef struct QuorumVoteVersion { - QuorumVoteValue value; - int index; - int vote_count; - QLIST_HEAD(, QuorumVoteItem) items; - QLIST_ENTRY(QuorumVoteVersion) next; -} QuorumVoteVersion; - -/* this structure holds a group of vote versions together */ -typedef struct QuorumVotes { - QLIST_HEAD(, QuorumVoteVersion) vote_list; - bool (*compare)(QuorumVoteValue *a, QuorumVoteValue *b); -} QuorumVotes; - -/* the following structure holds the state of one quorum instance */ -typedef struct BDRVQuorumState { - BdrvChild **children; /* children BlockDriverStates */ - int num_children; /* children count */ - int threshold; /* if less than threshold children reads gave the - * same result a quorum error occurs. - */ - bool is_blkverify; /* true if the driver is in blkverify mode - * Writes are mirrored on two children devices. - * On reads the two children devices' contents are - * compared and if a difference is spotted its - * location is printed and the code aborts. - * It is useful to debug other block drivers by - * comparing them with a reference one. - */ - bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted - * block if Quorum is reached. - */ - - QuorumReadPattern read_pattern; -} BDRVQuorumState; - -typedef struct QuorumAIOCB QuorumAIOCB; - -/* Quorum will create one instance of the following structure per operation it - * performs on its children. - * So for each read/write operation coming from the upper layer there will be - * $children_count QuorumChildRequest. - */ -typedef struct QuorumChildRequest { - BlockAIOCB *aiocb; - QEMUIOVector qiov; - uint8_t *buf; - int ret; - QuorumAIOCB *parent; -} QuorumChildRequest; - -/* Quorum will use the following structure to track progress of each read/write - * operation received by the upper layer. - * This structure hold pointers to the QuorumChildRequest structures instances - * used to do operations on each children and track overall progress. 
- */ -struct QuorumAIOCB { - BlockAIOCB common; - - /* Request metadata */ - uint64_t sector_num; - int nb_sectors; - - QEMUIOVector *qiov; /* calling IOV */ - - QuorumChildRequest *qcrs; /* individual child requests */ - int count; /* number of completed AIOCB */ - int success_count; /* number of successfully completed AIOCB */ - - int rewrite_count; /* number of replica to rewrite: count down to - * zero once writes are fired - */ - - QuorumVotes votes; - - bool is_read; - int vote_ret; - int child_iter; /* which child to read in fifo pattern */ -}; - -static bool quorum_vote(QuorumAIOCB *acb); - -static void quorum_aio_cancel(BlockAIOCB *blockacb) -{ - QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common); - BDRVQuorumState *s = acb->common.bs->opaque; - int i; - - /* cancel all callbacks */ - for (i = 0; i < s->num_children; i++) { - if (acb->qcrs[i].aiocb) { - bdrv_aio_cancel_async(acb->qcrs[i].aiocb); - } - } -} - -static AIOCBInfo quorum_aiocb_info = { - .aiocb_size = sizeof(QuorumAIOCB), - .cancel_async = quorum_aio_cancel, -}; - -static void quorum_aio_finalize(QuorumAIOCB *acb) -{ - int i, ret = 0; - - if (acb->vote_ret) { - ret = acb->vote_ret; - } - - acb->common.cb(acb->common.opaque, ret); - - if (acb->is_read) { - /* on the quorum case acb->child_iter == s->num_children - 1 */ - for (i = 0; i <= acb->child_iter; i++) { - qemu_vfree(acb->qcrs[i].buf); - qemu_iovec_destroy(&acb->qcrs[i].qiov); - } - } - - g_free(acb->qcrs); - qemu_aio_unref(acb); -} - -static bool quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b) -{ - return !memcmp(a->h, b->h, HASH_LENGTH); -} - -static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b) -{ - return a->l == b->l; -} - -static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, - BlockDriverState *bs, - QEMUIOVector *qiov, - uint64_t sector_num, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) -{ - QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque); - int i; 
- - acb->common.bs->opaque = s; - acb->sector_num = sector_num; - acb->nb_sectors = nb_sectors; - acb->qiov = qiov; - acb->qcrs = g_new0(QuorumChildRequest, s->num_children); - acb->count = 0; - acb->success_count = 0; - acb->rewrite_count = 0; - acb->votes.compare = quorum_sha256_compare; - QLIST_INIT(&acb->votes.vote_list); - acb->is_read = false; - acb->vote_ret = 0; - - for (i = 0; i < s->num_children; i++) { - acb->qcrs[i].buf = NULL; - acb->qcrs[i].ret = 0; - acb->qcrs[i].parent = acb; - } - - return acb; -} - -static void quorum_report_bad(QuorumOpType type, uint64_t sector_num, - int nb_sectors, char *node_name, int ret) -{ - const char *msg = NULL; - if (ret < 0) { - msg = strerror(-ret); - } - - qapi_event_send_quorum_report_bad(type, !!msg, msg, node_name, - sector_num, nb_sectors, &error_abort); -} - -static void quorum_report_failure(QuorumAIOCB *acb) -{ - const char *reference = bdrv_get_device_or_node_name(acb->common.bs); - qapi_event_send_quorum_failure(reference, acb->sector_num, - acb->nb_sectors, &error_abort); -} - -static int quorum_vote_error(QuorumAIOCB *acb); - -static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb) -{ - BDRVQuorumState *s = acb->common.bs->opaque; - - if (acb->success_count < s->threshold) { - acb->vote_ret = quorum_vote_error(acb); - quorum_report_failure(acb); - return true; - } - - return false; -} - -static void quorum_rewrite_aio_cb(void *opaque, int ret) -{ - QuorumAIOCB *acb = opaque; - - /* one less rewrite to do */ - acb->rewrite_count--; - - /* wait until all rewrite callbacks have completed */ - if (acb->rewrite_count) { - return; - } - - quorum_aio_finalize(acb); -} - -static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb); - -static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) -{ - int i; - assert(dest->niov == source->niov); - assert(dest->size == source->size); - for (i = 0; i < source->niov; i++) { - assert(dest->iov[i].iov_len == source->iov[i].iov_len); - 
memcpy(dest->iov[i].iov_base, - source->iov[i].iov_base, - source->iov[i].iov_len); - } -} - -static void quorum_aio_cb(void *opaque, int ret) -{ - QuorumChildRequest *sacb = opaque; - QuorumAIOCB *acb = sacb->parent; - BDRVQuorumState *s = acb->common.bs->opaque; - bool rewrite = false; - - if (ret == 0) { - acb->success_count++; - } else { - QuorumOpType type; - type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE; - quorum_report_bad(type, acb->sector_num, acb->nb_sectors, - sacb->aiocb->bs->node_name, ret); - } - - if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) { - /* We try to read next child in FIFO order if we fail to read */ - if (ret < 0 && (acb->child_iter + 1) < s->num_children) { - acb->child_iter++; - read_fifo_child(acb); - return; - } - - if (ret == 0) { - quorum_copy_qiov(acb->qiov, &acb->qcrs[acb->child_iter].qiov); - } - acb->vote_ret = ret; - quorum_aio_finalize(acb); - return; - } - - sacb->ret = ret; - acb->count++; - assert(acb->count <= s->num_children); - assert(acb->success_count <= s->num_children); - if (acb->count < s->num_children) { - return; - } - - /* Do the vote on read */ - if (acb->is_read) { - rewrite = quorum_vote(acb); - } else { - quorum_has_too_much_io_failed(acb); - } - - /* if no rewrite is done the code will finish right away */ - if (!rewrite) { - quorum_aio_finalize(acb); - } -} - -static void quorum_report_bad_versions(BDRVQuorumState *s, - QuorumAIOCB *acb, - QuorumVoteValue *value) -{ - QuorumVoteVersion *version; - QuorumVoteItem *item; - - QLIST_FOREACH(version, &acb->votes.vote_list, next) { - if (acb->votes.compare(&version->value, value)) { - continue; - } - QLIST_FOREACH(item, &version->items, next) { - quorum_report_bad(QUORUM_OP_TYPE_READ, acb->sector_num, - acb->nb_sectors, - s->children[item->index]->bs->node_name, 0); - } - } -} - -static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, - QuorumVoteValue *value) -{ - QuorumVoteVersion *version; - 
QuorumVoteItem *item; - int count = 0; - - /* first count the number of bad versions: done first to avoid concurrency - * issues. - */ - QLIST_FOREACH(version, &acb->votes.vote_list, next) { - if (acb->votes.compare(&version->value, value)) { - continue; - } - QLIST_FOREACH(item, &version->items, next) { - count++; - } - } - - /* quorum_rewrite_aio_cb will count down this to zero */ - acb->rewrite_count = count; - - /* now fire the correcting rewrites */ - QLIST_FOREACH(version, &acb->votes.vote_list, next) { - if (acb->votes.compare(&version->value, value)) { - continue; - } - QLIST_FOREACH(item, &version->items, next) { - bdrv_aio_writev(s->children[item->index]->bs, acb->sector_num, - acb->qiov, acb->nb_sectors, quorum_rewrite_aio_cb, - acb); - } - } - - /* return true if any rewrite is done else false */ - return count; -} - -static void quorum_count_vote(QuorumVotes *votes, - QuorumVoteValue *value, - int index) -{ - QuorumVoteVersion *v = NULL, *version = NULL; - QuorumVoteItem *item; - - /* look if we have something with this hash */ - QLIST_FOREACH(v, &votes->vote_list, next) { - if (votes->compare(&v->value, value)) { - version = v; - break; - } - } - - /* It's a version not yet in the list add it */ - if (!version) { - version = g_new0(QuorumVoteVersion, 1); - QLIST_INIT(&version->items); - memcpy(&version->value, value, sizeof(version->value)); - version->index = index; - version->vote_count = 0; - QLIST_INSERT_HEAD(&votes->vote_list, version, next); - } - - version->vote_count++; - - item = g_new0(QuorumVoteItem, 1); - item->index = index; - QLIST_INSERT_HEAD(&version->items, item, next); -} - -static void quorum_free_vote_list(QuorumVotes *votes) -{ - QuorumVoteVersion *version, *next_version; - QuorumVoteItem *item, *next_item; - - QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) { - QLIST_REMOVE(version, next); - QLIST_FOREACH_SAFE(item, &version->items, next, next_item) { - QLIST_REMOVE(item, next); - g_free(item); - } - 
g_free(version); - } -} - -static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash) -{ - QEMUIOVector *qiov = &acb->qcrs[i].qiov; - size_t len = sizeof(hash->h); - uint8_t *data = hash->h; - - /* XXX - would be nice if we could pass in the Error ** - * and propagate that back, but this quorum code is - * restricted to just errno values currently */ - if (qcrypto_hash_bytesv(QCRYPTO_HASH_ALG_SHA256, - qiov->iov, qiov->niov, - &data, &len, - NULL) < 0) { - return -EINVAL; - } - - return 0; -} - -static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes) -{ - int max = 0; - QuorumVoteVersion *candidate, *winner = NULL; - - QLIST_FOREACH(candidate, &votes->vote_list, next) { - if (candidate->vote_count > max) { - max = candidate->vote_count; - winner = candidate; - } - } - - return winner; -} - -/* qemu_iovec_compare is handy for blkverify mode because it returns the first - * differing byte location. Yet it is handcoded to compare vectors one byte - * after another so it does not benefit from the libc SIMD optimizations. - * quorum_iovec_compare is written for speed and should be used in the non - * blkverify mode of quorum. - */ -static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) -{ - int i; - int result; - - assert(a->niov == b->niov); - for (i = 0; i < a->niov; i++) { - assert(a->iov[i].iov_len == b->iov[i].iov_len); - result = memcmp(a->iov[i].iov_base, - b->iov[i].iov_base, - a->iov[i].iov_len); - if (result) { - return false; - } - } - - return true; -} - -static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb, - const char *fmt, ...) 
-{ - va_list ap; - - va_start(ap, fmt); - fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ", - acb->sector_num, acb->nb_sectors); - vfprintf(stderr, fmt, ap); - fprintf(stderr, "\n"); - va_end(ap); - exit(1); -} - -static bool quorum_compare(QuorumAIOCB *acb, - QEMUIOVector *a, - QEMUIOVector *b) -{ - BDRVQuorumState *s = acb->common.bs->opaque; - ssize_t offset; - - /* This driver will replace blkverify in this particular case */ - if (s->is_blkverify) { - offset = qemu_iovec_compare(a, b); - if (offset != -1) { - quorum_err(acb, "contents mismatch in sector %" PRId64, - acb->sector_num + - (uint64_t)(offset / BDRV_SECTOR_SIZE)); - } - return true; - } - - return quorum_iovec_compare(a, b); -} - -/* Do a vote to get the error code */ -static int quorum_vote_error(QuorumAIOCB *acb) -{ - BDRVQuorumState *s = acb->common.bs->opaque; - QuorumVoteVersion *winner = NULL; - QuorumVotes error_votes; - QuorumVoteValue result_value; - int i, ret = 0; - bool error = false; - - QLIST_INIT(&error_votes.vote_list); - error_votes.compare = quorum_64bits_compare; - - for (i = 0; i < s->num_children; i++) { - ret = acb->qcrs[i].ret; - if (ret) { - error = true; - result_value.l = ret; - quorum_count_vote(&error_votes, &result_value, i); - } - } - - if (error) { - winner = quorum_get_vote_winner(&error_votes); - ret = winner->value.l; - } - - quorum_free_vote_list(&error_votes); - - return ret; -} - -static bool quorum_vote(QuorumAIOCB *acb) -{ - bool quorum = true; - bool rewrite = false; - int i, j, ret; - QuorumVoteValue hash; - BDRVQuorumState *s = acb->common.bs->opaque; - QuorumVoteVersion *winner; - - if (quorum_has_too_much_io_failed(acb)) { - return false; - } - - /* get the index of the first successful read */ - for (i = 0; i < s->num_children; i++) { - if (!acb->qcrs[i].ret) { - break; - } - } - - assert(i < s->num_children); - - /* compare this read with all other successful reads stopping at quorum - * failure - */ - for (j = i + 1; j < s->num_children; 
j++) { - if (acb->qcrs[j].ret) { - continue; - } - quorum = quorum_compare(acb, &acb->qcrs[i].qiov, &acb->qcrs[j].qiov); - if (!quorum) { - break; - } - } - - /* Every successful read agrees */ - if (quorum) { - quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); - return false; - } - - /* compute hashes for each successful read, also store indexes */ - for (i = 0; i < s->num_children; i++) { - if (acb->qcrs[i].ret) { - continue; - } - ret = quorum_compute_hash(acb, i, &hash); - /* if ever the hash computation failed */ - if (ret < 0) { - acb->vote_ret = ret; - goto free_exit; - } - quorum_count_vote(&acb->votes, &hash, i); - } - - /* vote to select the most represented version */ - winner = quorum_get_vote_winner(&acb->votes); - - /* if the winner count is smaller than threshold the read fails */ - if (winner->vote_count < s->threshold) { - quorum_report_failure(acb); - acb->vote_ret = -EIO; - goto free_exit; - } - - /* we have a winner: copy it */ - quorum_copy_qiov(acb->qiov, &acb->qcrs[winner->index].qiov); - - /* some versions are bad print them */ - quorum_report_bad_versions(s, acb, &winner->value); - - /* corruption correction is enabled */ - if (s->rewrite_corrupted) { - rewrite = quorum_rewrite_bad_versions(s, acb, &winner->value); - } - -free_exit: - /* free lists */ - quorum_free_vote_list(&acb->votes); - return rewrite; -} - -static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb) -{ - BDRVQuorumState *s = acb->common.bs->opaque; - int i; - - for (i = 0; i < s->num_children; i++) { - acb->qcrs[i].buf = qemu_blockalign(s->children[i]->bs, acb->qiov->size); - qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov); - qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf); - } - - for (i = 0; i < s->num_children; i++) { - acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i]->bs, acb->sector_num, - &acb->qcrs[i].qiov, acb->nb_sectors, - quorum_aio_cb, &acb->qcrs[i]); - } - - return &acb->common; -} - -static BlockAIOCB *read_fifo_child(QuorumAIOCB 
*acb) -{ - BDRVQuorumState *s = acb->common.bs->opaque; - - acb->qcrs[acb->child_iter].buf = - qemu_blockalign(s->children[acb->child_iter]->bs, acb->qiov->size); - qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov); - qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov, - acb->qcrs[acb->child_iter].buf); - acb->qcrs[acb->child_iter].aiocb = - bdrv_aio_readv(s->children[acb->child_iter]->bs, acb->sector_num, - &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors, - quorum_aio_cb, &acb->qcrs[acb->child_iter]); - - return &acb->common; -} - -static BlockAIOCB *quorum_aio_readv(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) -{ - BDRVQuorumState *s = bs->opaque; - QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, - nb_sectors, cb, opaque); - acb->is_read = true; - - if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) { - acb->child_iter = s->num_children - 1; - return read_quorum_children(acb); - } - - acb->child_iter = 0; - return read_fifo_child(acb); -} - -static BlockAIOCB *quorum_aio_writev(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) -{ - BDRVQuorumState *s = bs->opaque; - QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, nb_sectors, - cb, opaque); - int i; - - for (i = 0; i < s->num_children; i++) { - acb->qcrs[i].aiocb = bdrv_aio_writev(s->children[i]->bs, sector_num, - qiov, nb_sectors, &quorum_aio_cb, - &acb->qcrs[i]); - } - - return &acb->common; -} - -static int64_t quorum_getlength(BlockDriverState *bs) -{ - BDRVQuorumState *s = bs->opaque; - int64_t result; - int i; - - /* check that all file have the same length */ - result = bdrv_getlength(s->children[0]->bs); - if (result < 0) { - return result; - } - for (i = 1; i < s->num_children; i++) { - int64_t value = bdrv_getlength(s->children[i]->bs); - if (value < 0) { - return value; - } - if (value != result) 
{ - return -EIO; - } - } - - return result; -} - -static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp) -{ - BDRVQuorumState *s = bs->opaque; - Error *local_err = NULL; - int i; - - for (i = 0; i < s->num_children; i++) { - bdrv_invalidate_cache(s->children[i]->bs, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; - } - } -} - -static coroutine_fn int quorum_co_flush(BlockDriverState *bs) -{ - BDRVQuorumState *s = bs->opaque; - QuorumVoteVersion *winner = NULL; - QuorumVotes error_votes; - QuorumVoteValue result_value; - int i; - int result = 0; - int success_count = 0; - - QLIST_INIT(&error_votes.vote_list); - error_votes.compare = quorum_64bits_compare; - - for (i = 0; i < s->num_children; i++) { - result = bdrv_co_flush(s->children[i]->bs); - if (result) { - quorum_report_bad(QUORUM_OP_TYPE_FLUSH, 0, - bdrv_nb_sectors(s->children[i]->bs), - s->children[i]->bs->node_name, result); - result_value.l = result; - quorum_count_vote(&error_votes, &result_value, i); - } else { - success_count++; - } - } - - if (success_count >= s->threshold) { - result = 0; - } else { - winner = quorum_get_vote_winner(&error_votes); - result = winner->value.l; - } - quorum_free_vote_list(&error_votes); - - return result; -} - -static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs, - BlockDriverState *candidate) -{ - BDRVQuorumState *s = bs->opaque; - int i; - - for (i = 0; i < s->num_children; i++) { - bool perm = bdrv_recurse_is_first_non_filter(s->children[i]->bs, - candidate); - if (perm) { - return true; - } - } - - return false; -} - -static int quorum_valid_threshold(int threshold, int num_children, Error **errp) -{ - - if (threshold < 1) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, - "vote-threshold", "value >= 1"); - return -ERANGE; - } - - if (threshold > num_children) { - error_setg(errp, "threshold may not exceed children count"); - return -ERANGE; - } - - return 0; -} - -static QemuOptsList quorum_runtime_opts 
= { - .name = "quorum", - .head = QTAILQ_HEAD_INITIALIZER(quorum_runtime_opts.head), - .desc = { - { - .name = QUORUM_OPT_VOTE_THRESHOLD, - .type = QEMU_OPT_NUMBER, - .help = "The number of vote needed for reaching quorum", - }, - { - .name = QUORUM_OPT_BLKVERIFY, - .type = QEMU_OPT_BOOL, - .help = "Trigger block verify mode if set", - }, - { - .name = QUORUM_OPT_REWRITE, - .type = QEMU_OPT_BOOL, - .help = "Rewrite corrupted block on read quorum", - }, - { - .name = QUORUM_OPT_READ_PATTERN, - .type = QEMU_OPT_STRING, - .help = "Allowed pattern: quorum, fifo. Quorum is default", - }, - { /* end of list */ } - }, -}; - -static int parse_read_pattern(const char *opt) -{ - int i; - - if (!opt) { - /* Set quorum as default */ - return QUORUM_READ_PATTERN_QUORUM; - } - - for (i = 0; i < QUORUM_READ_PATTERN__MAX; i++) { - if (!strcmp(opt, QuorumReadPattern_lookup[i])) { - return i; - } - } - - return -EINVAL; -} - -static int quorum_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVQuorumState *s = bs->opaque; - Error *local_err = NULL; - QemuOpts *opts = NULL; - bool *opened; - int i; - int ret = 0; - - qdict_flatten(options); - - /* count how many different children are present */ - s->num_children = qdict_array_entries(options, "children."); - if (s->num_children < 0) { - error_setg(&local_err, "Option children is not a valid array"); - ret = -EINVAL; - goto exit; - } - if (s->num_children < 2) { - error_setg(&local_err, - "Number of provided children must be greater than 1"); - ret = -EINVAL; - goto exit; - } - - opts = qemu_opts_create(&quorum_runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - ret = -EINVAL; - goto exit; - } - - s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0); - /* and validate it against s->num_children */ - ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err); - if (ret < 0) { - goto exit; - } - - ret = 
parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN)); - if (ret < 0) { - error_setg(&local_err, "Please set read-pattern as fifo or quorum"); - goto exit; - } - s->read_pattern = ret; - - if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) { - /* is the driver in blkverify mode */ - if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) && - s->num_children == 2 && s->threshold == 2) { - s->is_blkverify = true; - } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) { - fprintf(stderr, "blkverify mode is set by setting blkverify=on " - "and using two files with vote_threshold=2\n"); - } - - s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, - false); - if (s->rewrite_corrupted && s->is_blkverify) { - error_setg(&local_err, - "rewrite-corrupted=on cannot be used with blkverify=on"); - ret = -EINVAL; - goto exit; - } - } - - /* allocate the children array */ - s->children = g_new0(BdrvChild *, s->num_children); - opened = g_new0(bool, s->num_children); - - for (i = 0; i < s->num_children; i++) { - char indexstr[32]; - ret = snprintf(indexstr, 32, "children.%d", i); - assert(ret < 32); - - s->children[i] = bdrv_open_child(NULL, options, indexstr, bs, - &child_format, false, &local_err); - if (local_err) { - ret = -EINVAL; - goto close_exit; - } - - opened[i] = true; - } - - g_free(opened); - goto exit; - -close_exit: - /* cleanup on error */ - for (i = 0; i < s->num_children; i++) { - if (!opened[i]) { - continue; - } - bdrv_unref_child(bs, s->children[i]); - } - g_free(s->children); - g_free(opened); -exit: - qemu_opts_del(opts); - /* propagate error */ - if (local_err) { - error_propagate(errp, local_err); - } - return ret; -} - -static void quorum_close(BlockDriverState *bs) -{ - BDRVQuorumState *s = bs->opaque; - int i; - - for (i = 0; i < s->num_children; i++) { - bdrv_unref_child(bs, s->children[i]); - } - - g_free(s->children); -} - -static void quorum_detach_aio_context(BlockDriverState *bs) -{ - BDRVQuorumState *s = 
bs->opaque; - int i; - - for (i = 0; i < s->num_children; i++) { - bdrv_detach_aio_context(s->children[i]->bs); - } -} - -static void quorum_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - BDRVQuorumState *s = bs->opaque; - int i; - - for (i = 0; i < s->num_children; i++) { - bdrv_attach_aio_context(s->children[i]->bs, new_context); - } -} - -static void quorum_refresh_filename(BlockDriverState *bs, QDict *options) -{ - BDRVQuorumState *s = bs->opaque; - QDict *opts; - QList *children; - int i; - - for (i = 0; i < s->num_children; i++) { - bdrv_refresh_filename(s->children[i]->bs); - if (!s->children[i]->bs->full_open_options) { - return; - } - } - - children = qlist_new(); - for (i = 0; i < s->num_children; i++) { - QINCREF(s->children[i]->bs->full_open_options); - qlist_append_obj(children, - QOBJECT(s->children[i]->bs->full_open_options)); - } - - opts = qdict_new(); - qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("quorum"))); - qdict_put_obj(opts, QUORUM_OPT_VOTE_THRESHOLD, - QOBJECT(qint_from_int(s->threshold))); - qdict_put_obj(opts, QUORUM_OPT_BLKVERIFY, - QOBJECT(qbool_from_bool(s->is_blkverify))); - qdict_put_obj(opts, QUORUM_OPT_REWRITE, - QOBJECT(qbool_from_bool(s->rewrite_corrupted))); - qdict_put_obj(opts, "children", QOBJECT(children)); - - bs->full_open_options = opts; -} - -static BlockDriver bdrv_quorum = { - .format_name = "quorum", - .protocol_name = "quorum", - - .instance_size = sizeof(BDRVQuorumState), - - .bdrv_file_open = quorum_open, - .bdrv_close = quorum_close, - .bdrv_refresh_filename = quorum_refresh_filename, - - .bdrv_co_flush_to_disk = quorum_co_flush, - - .bdrv_getlength = quorum_getlength, - - .bdrv_aio_readv = quorum_aio_readv, - .bdrv_aio_writev = quorum_aio_writev, - .bdrv_invalidate_cache = quorum_invalidate_cache, - - .bdrv_detach_aio_context = quorum_detach_aio_context, - .bdrv_attach_aio_context = quorum_attach_aio_context, - - .is_filter = true, - .bdrv_recurse_is_first_non_filter = 
quorum_recurse_is_first_non_filter, -}; - -static void bdrv_quorum_init(void) -{ - if (!qcrypto_hash_supports(QCRYPTO_HASH_ALG_SHA256)) { - /* SHA256 hash support is required for quorum device */ - return; - } - bdrv_register(&bdrv_quorum); -} - -block_init(bdrv_quorum_init); diff --git a/qemu/block/raw-aio.h b/qemu/block/raw-aio.h deleted file mode 100644 index 811e37501..000000000 --- a/qemu/block/raw-aio.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Declarations for AIO in the raw protocol - * - * Copyright IBM, Corp. 2008 - * - * Authors: - * Anthony Liguori - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. - */ -#ifndef QEMU_RAW_AIO_H -#define QEMU_RAW_AIO_H - -#include "qemu/iov.h" - -/* AIO request types */ -#define QEMU_AIO_READ 0x0001 -#define QEMU_AIO_WRITE 0x0002 -#define QEMU_AIO_IOCTL 0x0004 -#define QEMU_AIO_FLUSH 0x0008 -#define QEMU_AIO_DISCARD 0x0010 -#define QEMU_AIO_WRITE_ZEROES 0x0020 -#define QEMU_AIO_TYPE_MASK \ - (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \ - QEMU_AIO_DISCARD|QEMU_AIO_WRITE_ZEROES) - -/* AIO flags */ -#define QEMU_AIO_MISALIGNED 0x1000 -#define QEMU_AIO_BLKDEV 0x2000 - - -/* linux-aio.c - Linux native implementation */ -#ifdef CONFIG_LINUX_AIO -void *laio_init(void); -void laio_cleanup(void *s); -BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque, int type); -void laio_detach_aio_context(void *s, AioContext *old_context); -void laio_attach_aio_context(void *s, AioContext *new_context); -void laio_io_plug(BlockDriverState *bs, void *aio_ctx); -void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug); -#endif - -#ifdef _WIN32 -typedef struct QEMUWin32AIOState QEMUWin32AIOState; 
-QEMUWin32AIOState *win32_aio_init(void); -void win32_aio_cleanup(QEMUWin32AIOState *aio); -int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile); -BlockAIOCB *win32_aio_submit(BlockDriverState *bs, - QEMUWin32AIOState *aio, HANDLE hfile, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque, int type); -void win32_aio_detach_aio_context(QEMUWin32AIOState *aio, - AioContext *old_context); -void win32_aio_attach_aio_context(QEMUWin32AIOState *aio, - AioContext *new_context); -#endif - -#endif /* QEMU_RAW_AIO_H */ diff --git a/qemu/block/raw-posix.c b/qemu/block/raw-posix.c deleted file mode 100644 index 906d5c941..000000000 --- a/qemu/block/raw-posix.c +++ /dev/null @@ -1,2701 +0,0 @@ -/* - * Block driver for RAW files (posix) - * - * Copyright (c) 2006 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu/cutils.h" -#include "qemu/error-report.h" -#include "qemu/timer.h" -#include "qemu/log.h" -#include "block/block_int.h" -#include "qemu/module.h" -#include "trace.h" -#include "block/thread-pool.h" -#include "qemu/iov.h" -#include "raw-aio.h" -#include "qapi/util.h" -#include "qapi/qmp/qstring.h" - -#if defined(__APPLE__) && (__MACH__) -#include -#include -#include -#include -#include -#include -#include -//#include -#include -#include -#endif - -#ifdef __sun__ -#define _POSIX_PTHREAD_SEMANTICS 1 -#include -#endif -#ifdef __linux__ -#include -#include -#include -#include -#include -#include -#include -#ifdef __s390__ -#include -#endif -#ifndef FS_NOCOW_FL -#define FS_NOCOW_FL 0x00800000 /* Do not cow file */ -#endif -#endif -#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE) -#include -#endif -#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) -#include -#include -#endif - -#ifdef __OpenBSD__ -#include -#include -#include -#endif - -#ifdef __NetBSD__ -#include -#include -#include -#include -#endif - -#ifdef __DragonFly__ -#include -#include -#endif - -#ifdef CONFIG_XFS -#include -#endif - -//#define DEBUG_BLOCK - -#ifdef DEBUG_BLOCK -# define DEBUG_BLOCK_PRINT 1 -#else -# define DEBUG_BLOCK_PRINT 0 -#endif -#define DPRINTF(fmt, ...) 
\ -do { \ - if (DEBUG_BLOCK_PRINT) { \ - printf(fmt, ## __VA_ARGS__); \ - } \ -} while (0) - -/* OS X does not have O_DSYNC */ -#ifndef O_DSYNC -#ifdef O_SYNC -#define O_DSYNC O_SYNC -#elif defined(O_FSYNC) -#define O_DSYNC O_FSYNC -#endif -#endif - -/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ -#ifndef O_DIRECT -#define O_DIRECT O_DSYNC -#endif - -#define FTYPE_FILE 0 -#define FTYPE_CD 1 - -#define MAX_BLOCKSIZE 4096 - -typedef struct BDRVRawState { - int fd; - int type; - int open_flags; - size_t buf_align; - -#ifdef CONFIG_LINUX_AIO - int use_aio; - void *aio_ctx; -#endif -#ifdef CONFIG_XFS - bool is_xfs:1; -#endif - bool has_discard:1; - bool has_write_zeroes:1; - bool discard_zeroes:1; - bool has_fallocate; - bool needs_alignment; -} BDRVRawState; - -typedef struct BDRVRawReopenState { - int fd; - int open_flags; -#ifdef CONFIG_LINUX_AIO - int use_aio; -#endif -} BDRVRawReopenState; - -static int fd_open(BlockDriverState *bs); -static int64_t raw_getlength(BlockDriverState *bs); - -typedef struct RawPosixAIOData { - BlockDriverState *bs; - int aio_fildes; - union { - struct iovec *aio_iov; - void *aio_ioctl_buf; - }; - int aio_niov; - uint64_t aio_nbytes; -#define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ - off_t aio_offset; - int aio_type; -} RawPosixAIOData; - -#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) -static int cdrom_reopen(BlockDriverState *bs); -#endif - -#if defined(__NetBSD__) -static int raw_normalize_devicepath(const char **filename) -{ - static char namebuf[PATH_MAX]; - const char *dp, *fname; - struct stat sb; - - fname = *filename; - dp = strrchr(fname, '/'); - if (lstat(fname, &sb) < 0) { - fprintf(stderr, "%s: stat failed: %s\n", - fname, strerror(errno)); - return -errno; - } - - if (!S_ISBLK(sb.st_mode)) { - return 0; - } - - if (dp == NULL) { - snprintf(namebuf, PATH_MAX, "r%s", fname); - } else { - snprintf(namebuf, PATH_MAX, "%.*s/r%s", - (int)(dp - fname), fname, dp + 1); - } - fprintf(stderr, 
"%s is a block device", fname); - *filename = namebuf; - fprintf(stderr, ", using %s\n", *filename); - - return 0; -} -#else -static int raw_normalize_devicepath(const char **filename) -{ - return 0; -} -#endif - -/* - * Get logical block size via ioctl. On success store it in @sector_size_p. - */ -static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) -{ - unsigned int sector_size; - bool success = false; - - errno = ENOTSUP; - - /* Try a few ioctls to get the right size */ -#ifdef BLKSSZGET - if (ioctl(fd, BLKSSZGET, §or_size) >= 0) { - *sector_size_p = sector_size; - success = true; - } -#endif -#ifdef DKIOCGETBLOCKSIZE - if (ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) >= 0) { - *sector_size_p = sector_size; - success = true; - } -#endif -#ifdef DIOCGSECTORSIZE - if (ioctl(fd, DIOCGSECTORSIZE, §or_size) >= 0) { - *sector_size_p = sector_size; - success = true; - } -#endif - - return success ? 0 : -errno; -} - -/** - * Get physical block size of @fd. - * On success, store it in @blk_size and return 0. - * On failure, return -errno. - */ -static int probe_physical_blocksize(int fd, unsigned int *blk_size) -{ -#ifdef BLKPBSZGET - if (ioctl(fd, BLKPBSZGET, blk_size) < 0) { - return -errno; - } - return 0; -#else - return -ENOTSUP; -#endif -} - -/* Check if read is allowed with given memory buffer and length. - * - * This function is used to check O_DIRECT memory buffer and request alignment. - */ -static bool raw_is_io_aligned(int fd, void *buf, size_t len) -{ - ssize_t ret = pread(fd, buf, len, 0); - - if (ret >= 0) { - return true; - } - -#ifdef __linux__ - /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads. Ignore - * other errors (e.g. real I/O error), which could happen on a failed - * drive, since we only care about probing alignment. 
- */ - if (errno != EINVAL) { - return true; - } -#endif - - return false; -} - -static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) -{ - BDRVRawState *s = bs->opaque; - char *buf; - size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize()); - - /* For SCSI generic devices the alignment is not really used. - With buffered I/O, we don't have any restrictions. */ - if (bdrv_is_sg(bs) || !s->needs_alignment) { - bs->request_alignment = 1; - s->buf_align = 1; - return; - } - - bs->request_alignment = 0; - s->buf_align = 0; - /* Let's try to use the logical blocksize for the alignment. */ - if (probe_logical_blocksize(fd, &bs->request_alignment) < 0) { - bs->request_alignment = 0; - } -#ifdef CONFIG_XFS - if (s->is_xfs) { - struct dioattr da; - if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) { - bs->request_alignment = da.d_miniosz; - /* The kernel returns wrong information for d_mem */ - /* s->buf_align = da.d_mem; */ - } - } -#endif - - /* If we could not get the sizes so far, we can only guess them */ - if (!s->buf_align) { - size_t align; - buf = qemu_memalign(max_align, 2 * max_align); - for (align = 512; align <= max_align; align <<= 1) { - if (raw_is_io_aligned(fd, buf + align, max_align)) { - s->buf_align = align; - break; - } - } - qemu_vfree(buf); - } - - if (!bs->request_alignment) { - size_t align; - buf = qemu_memalign(s->buf_align, max_align); - for (align = 512; align <= max_align; align <<= 1) { - if (raw_is_io_aligned(fd, buf, align)) { - bs->request_alignment = align; - break; - } - } - qemu_vfree(buf); - } - - if (!s->buf_align || !bs->request_alignment) { - error_setg(errp, "Could not find working O_DIRECT alignment. 
" - "Try cache.direct=off."); - } -} - -static void raw_parse_flags(int bdrv_flags, int *open_flags) -{ - assert(open_flags != NULL); - - *open_flags |= O_BINARY; - *open_flags &= ~O_ACCMODE; - if (bdrv_flags & BDRV_O_RDWR) { - *open_flags |= O_RDWR; - } else { - *open_flags |= O_RDONLY; - } - - /* Use O_DSYNC for write-through caching, no flags for write-back caching, - * and O_DIRECT for no caching. */ - if ((bdrv_flags & BDRV_O_NOCACHE)) { - *open_flags |= O_DIRECT; - } -} - -static void raw_detach_aio_context(BlockDriverState *bs) -{ -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - - if (s->use_aio) { - laio_detach_aio_context(s->aio_ctx, bdrv_get_aio_context(bs)); - } -#endif -} - -static void raw_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - - if (s->use_aio) { - laio_attach_aio_context(s->aio_ctx, new_context); - } -#endif -} - -#ifdef CONFIG_LINUX_AIO -static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags) -{ - int ret = -1; - assert(aio_ctx != NULL); - assert(use_aio != NULL); - /* - * Currently Linux do AIO only for files opened with O_DIRECT - * specified so check NOCACHE flag too - */ - if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) == - (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) { - - /* if non-NULL, laio_init() has already been run */ - if (*aio_ctx == NULL) { - *aio_ctx = laio_init(); - if (!*aio_ctx) { - goto error; - } - } - *use_aio = 1; - } else { - *use_aio = 0; - } - - ret = 0; - -error: - return ret; -} -#endif - -static void raw_parse_filename(const char *filename, QDict *options, - Error **errp) -{ - /* The filename does not have to be prefixed by the protocol name, since - * "file" is the default protocol; therefore, the return value of this - * function call can be ignored. 
*/ - strstart(filename, "file:", &filename); - - qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); -} - -static QemuOptsList raw_runtime_opts = { - .name = "raw", - .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), - .desc = { - { - .name = "filename", - .type = QEMU_OPT_STRING, - .help = "File name of the image", - }, - { /* end of list */ } - }, -}; - -static int raw_open_common(BlockDriverState *bs, QDict *options, - int bdrv_flags, int open_flags, Error **errp) -{ - BDRVRawState *s = bs->opaque; - QemuOpts *opts; - Error *local_err = NULL; - const char *filename = NULL; - int fd, ret; - struct stat st; - - opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; - } - - filename = qemu_opt_get(opts, "filename"); - - ret = raw_normalize_devicepath(&filename); - if (ret != 0) { - error_setg_errno(errp, -ret, "Could not normalize device path"); - goto fail; - } - - s->open_flags = open_flags; - raw_parse_flags(bdrv_flags, &s->open_flags); - - s->fd = -1; - fd = qemu_open(filename, s->open_flags, 0644); - if (fd < 0) { - ret = -errno; - if (ret == -EROFS) { - ret = -EACCES; - } - goto fail; - } - s->fd = fd; - -#ifdef CONFIG_LINUX_AIO - if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) { - qemu_close(fd); - ret = -errno; - error_setg_errno(errp, -ret, "Could not set AIO state"); - goto fail; - } - if (!s->use_aio && (bdrv_flags & BDRV_O_NATIVE_AIO)) { - error_setg(errp, "aio=native was specified, but it requires " - "cache.direct=on, which was not specified."); - ret = -EINVAL; - goto fail; - } -#else - if (bdrv_flags & BDRV_O_NATIVE_AIO) { - error_setg(errp, "aio=native was specified, but is not supported " - "in this build."); - ret = -EINVAL; - goto fail; - } -#endif /* !defined(CONFIG_LINUX_AIO) */ - - s->has_discard = true; - s->has_write_zeroes = true; - if ((bs->open_flags & 
BDRV_O_NOCACHE) != 0) { - s->needs_alignment = true; - } - - if (fstat(s->fd, &st) < 0) { - ret = -errno; - error_setg_errno(errp, errno, "Could not stat file"); - goto fail; - } - if (S_ISREG(st.st_mode)) { - s->discard_zeroes = true; - s->has_fallocate = true; - } - if (S_ISBLK(st.st_mode)) { -#ifdef BLKDISCARDZEROES - unsigned int arg; - if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) { - s->discard_zeroes = true; - } -#endif -#ifdef __linux__ - /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do - * not rely on the contents of discarded blocks unless using O_DIRECT. - * Same for BLKZEROOUT. - */ - if (!(bs->open_flags & BDRV_O_NOCACHE)) { - s->discard_zeroes = false; - s->has_write_zeroes = false; - } -#endif - } -#ifdef __FreeBSD__ - if (S_ISCHR(st.st_mode)) { - /* - * The file is a char device (disk), which on FreeBSD isn't behind - * a pager, so force all requests to be aligned. This is needed - * so QEMU makes sure all IO operations on the device are aligned - * to sector size, or else FreeBSD will reject them with EINVAL. 
- */ - s->needs_alignment = true; - } -#endif - -#ifdef CONFIG_XFS - if (platform_test_xfs_fd(s->fd)) { - s->is_xfs = true; - } -#endif - - raw_attach_aio_context(bs, bdrv_get_aio_context(bs)); - - ret = 0; -fail: - if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { - unlink(filename); - } - qemu_opts_del(opts); - return ret; -} - -static int raw_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVRawState *s = bs->opaque; - Error *local_err = NULL; - int ret; - - s->type = FTYPE_FILE; - ret = raw_open_common(bs, options, flags, 0, &local_err); - if (local_err) { - error_propagate(errp, local_err); - } - return ret; -} - -static int raw_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) -{ - BDRVRawState *s; - BDRVRawReopenState *raw_s; - int ret = 0; - Error *local_err = NULL; - - assert(state != NULL); - assert(state->bs != NULL); - - s = state->bs->opaque; - - state->opaque = g_new0(BDRVRawReopenState, 1); - raw_s = state->opaque; - -#ifdef CONFIG_LINUX_AIO - raw_s->use_aio = s->use_aio; - - /* we can use s->aio_ctx instead of a copy, because the use_aio flag is - * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio() - * won't override aio_ctx if aio_ctx is non-NULL */ - if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) { - error_setg(errp, "Could not set AIO state"); - return -1; - } -#endif - - if (s->type == FTYPE_CD) { - raw_s->open_flags |= O_NONBLOCK; - } - - raw_parse_flags(state->flags, &raw_s->open_flags); - - raw_s->fd = -1; - - int fcntl_flags = O_APPEND | O_NONBLOCK; -#ifdef O_NOATIME - fcntl_flags |= O_NOATIME; -#endif - -#ifdef O_ASYNC - /* Not all operating systems have O_ASYNC, and those that don't - * will not let us track the state into raw_s->open_flags (typically - * you achieve the same effect with an ioctl, for example I_SETSIG - * on Solaris). But we do not use O_ASYNC, so that's fine. 
- */ - assert((s->open_flags & O_ASYNC) == 0); -#endif - - if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) { - /* dup the original fd */ - /* TODO: use qemu fcntl wrapper */ -#ifdef F_DUPFD_CLOEXEC - raw_s->fd = fcntl(s->fd, F_DUPFD_CLOEXEC, 0); -#else - raw_s->fd = dup(s->fd); - if (raw_s->fd != -1) { - qemu_set_cloexec(raw_s->fd); - } -#endif - if (raw_s->fd >= 0) { - ret = fcntl_setfl(raw_s->fd, raw_s->open_flags); - if (ret) { - qemu_close(raw_s->fd); - raw_s->fd = -1; - } - } - } - - /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */ - if (raw_s->fd == -1) { - const char *normalized_filename = state->bs->filename; - ret = raw_normalize_devicepath(&normalized_filename); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not normalize device path"); - } else { - assert(!(raw_s->open_flags & O_CREAT)); - raw_s->fd = qemu_open(normalized_filename, raw_s->open_flags); - if (raw_s->fd == -1) { - error_setg_errno(errp, errno, "Could not reopen file"); - ret = -1; - } - } - } - - /* Fail already reopen_prepare() if we can't get a working O_DIRECT - * alignment with the new fd. 
*/ - if (raw_s->fd != -1) { - raw_probe_alignment(state->bs, raw_s->fd, &local_err); - if (local_err) { - qemu_close(raw_s->fd); - raw_s->fd = -1; - error_propagate(errp, local_err); - ret = -EINVAL; - } - } - - return ret; -} - -static void raw_reopen_commit(BDRVReopenState *state) -{ - BDRVRawReopenState *raw_s = state->opaque; - BDRVRawState *s = state->bs->opaque; - - s->open_flags = raw_s->open_flags; - - qemu_close(s->fd); - s->fd = raw_s->fd; -#ifdef CONFIG_LINUX_AIO - s->use_aio = raw_s->use_aio; -#endif - - g_free(state->opaque); - state->opaque = NULL; -} - - -static void raw_reopen_abort(BDRVReopenState *state) -{ - BDRVRawReopenState *raw_s = state->opaque; - - /* nothing to do if NULL, we didn't get far enough */ - if (raw_s == NULL) { - return; - } - - if (raw_s->fd >= 0) { - qemu_close(raw_s->fd); - raw_s->fd = -1; - } - g_free(state->opaque); - state->opaque = NULL; -} - -static void raw_refresh_limits(BlockDriverState *bs, Error **errp) -{ - BDRVRawState *s = bs->opaque; - - raw_probe_alignment(bs, s->fd, errp); - bs->bl.min_mem_alignment = s->buf_align; - bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize()); -} - -static int check_for_dasd(int fd) -{ -#ifdef BIODASDINFO2 - struct dasd_information2_t info = {0}; - - return ioctl(fd, BIODASDINFO2, &info); -#else - return -1; -#endif -} - -/** - * Try to get @bs's logical and physical block size. - * On success, store them in @bsz and return zero. - * On failure, return negative errno. - */ -static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) -{ - BDRVRawState *s = bs->opaque; - int ret; - - /* If DASD, get blocksizes */ - if (check_for_dasd(s->fd) < 0) { - return -ENOTSUP; - } - ret = probe_logical_blocksize(s->fd, &bsz->log); - if (ret < 0) { - return ret; - } - return probe_physical_blocksize(s->fd, &bsz->phys); -} - -/** - * Try to get @bs's geometry: cyls, heads, sectors. - * On success, store them in @geo and return 0. - * On failure return -errno. 
- * (Allows block driver to assign default geometry values that guest sees) - */ -#ifdef __linux__ -static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) -{ - BDRVRawState *s = bs->opaque; - struct hd_geometry ioctl_geo = {0}; - - /* If DASD, get its geometry */ - if (check_for_dasd(s->fd) < 0) { - return -ENOTSUP; - } - if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) { - return -errno; - } - /* HDIO_GETGEO may return success even though geo contains zeros - (e.g. certain multipath setups) */ - if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) { - return -ENOTSUP; - } - /* Do not return a geometry for partition */ - if (ioctl_geo.start != 0) { - return -ENOTSUP; - } - geo->heads = ioctl_geo.heads; - geo->sectors = ioctl_geo.sectors; - geo->cylinders = ioctl_geo.cylinders; - - return 0; -} -#else /* __linux__ */ -static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) -{ - return -ENOTSUP; -} -#endif - -static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) -{ - int ret; - - ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf); - if (ret == -1) { - return -errno; - } - - return 0; -} - -static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb) -{ - int ret; - - ret = qemu_fdatasync(aiocb->aio_fildes); - if (ret == -1) { - return -errno; - } - return 0; -} - -#ifdef CONFIG_PREADV - -static bool preadv_present = true; - -static ssize_t -qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) -{ - return preadv(fd, iov, nr_iov, offset); -} - -static ssize_t -qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) -{ - return pwritev(fd, iov, nr_iov, offset); -} - -#else - -static bool preadv_present = false; - -static ssize_t -qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) -{ - return -ENOSYS; -} - -static ssize_t -qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) -{ - return -ENOSYS; -} - -#endif - -static ssize_t 
handle_aiocb_rw_vector(RawPosixAIOData *aiocb) -{ - ssize_t len; - - do { - if (aiocb->aio_type & QEMU_AIO_WRITE) - len = qemu_pwritev(aiocb->aio_fildes, - aiocb->aio_iov, - aiocb->aio_niov, - aiocb->aio_offset); - else - len = qemu_preadv(aiocb->aio_fildes, - aiocb->aio_iov, - aiocb->aio_niov, - aiocb->aio_offset); - } while (len == -1 && errno == EINTR); - - if (len == -1) { - return -errno; - } - return len; -} - -/* - * Read/writes the data to/from a given linear buffer. - * - * Returns the number of bytes handles or -errno in case of an error. Short - * reads are only returned if the end of the file is reached. - */ -static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) -{ - ssize_t offset = 0; - ssize_t len; - - while (offset < aiocb->aio_nbytes) { - if (aiocb->aio_type & QEMU_AIO_WRITE) { - len = pwrite(aiocb->aio_fildes, - (const char *)buf + offset, - aiocb->aio_nbytes - offset, - aiocb->aio_offset + offset); - } else { - len = pread(aiocb->aio_fildes, - buf + offset, - aiocb->aio_nbytes - offset, - aiocb->aio_offset + offset); - } - if (len == -1 && errno == EINTR) { - continue; - } else if (len == -1 && errno == EINVAL && - (aiocb->bs->open_flags & BDRV_O_NOCACHE) && - !(aiocb->aio_type & QEMU_AIO_WRITE) && - offset > 0) { - /* O_DIRECT pread() may fail with EINVAL when offset is unaligned - * after a short read. Assume that O_DIRECT short reads only occur - * at EOF. Therefore this is a short read, not an I/O error. - */ - break; - } else if (len == -1) { - offset = -errno; - break; - } else if (len == 0) { - break; - } - offset += len; - } - - return offset; -} - -static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) -{ - ssize_t nbytes; - char *buf; - - if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) { - /* - * If there is just a single buffer, and it is properly aligned - * we can just use plain pread/pwrite without any problems. 
- */ - if (aiocb->aio_niov == 1) { - return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base); - } - /* - * We have more than one iovec, and all are properly aligned. - * - * Try preadv/pwritev first and fall back to linearizing the - * buffer if it's not supported. - */ - if (preadv_present) { - nbytes = handle_aiocb_rw_vector(aiocb); - if (nbytes == aiocb->aio_nbytes || - (nbytes < 0 && nbytes != -ENOSYS)) { - return nbytes; - } - preadv_present = false; - } - - /* - * XXX(hch): short read/write. no easy way to handle the reminder - * using these interfaces. For now retry using plain - * pread/pwrite? - */ - } - - /* - * Ok, we have to do it the hard way, copy all segments into - * a single aligned buffer. - */ - buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes); - if (buf == NULL) { - return -ENOMEM; - } - - if (aiocb->aio_type & QEMU_AIO_WRITE) { - char *p = buf; - int i; - - for (i = 0; i < aiocb->aio_niov; ++i) { - memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len); - p += aiocb->aio_iov[i].iov_len; - } - assert(p - buf == aiocb->aio_nbytes); - } - - nbytes = handle_aiocb_rw_linear(aiocb, buf); - if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { - char *p = buf; - size_t count = aiocb->aio_nbytes, copy; - int i; - - for (i = 0; i < aiocb->aio_niov && count; ++i) { - copy = count; - if (copy > aiocb->aio_iov[i].iov_len) { - copy = aiocb->aio_iov[i].iov_len; - } - memcpy(aiocb->aio_iov[i].iov_base, p, copy); - assert(count >= copy); - p += copy; - count -= copy; - } - assert(count == 0); - } - qemu_vfree(buf); - - return nbytes; -} - -#ifdef CONFIG_XFS -static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) -{ - struct xfs_flock64 fl; - int err; - - memset(&fl, 0, sizeof(fl)); - fl.l_whence = SEEK_SET; - fl.l_start = offset; - fl.l_len = bytes; - - if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) { - err = errno; - DPRINTF("cannot write zero range (%s)\n", strerror(errno)); - return -err; - } - - return 0; -} - 
-static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) -{ - struct xfs_flock64 fl; - int err; - - memset(&fl, 0, sizeof(fl)); - fl.l_whence = SEEK_SET; - fl.l_start = offset; - fl.l_len = bytes; - - if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { - err = errno; - DPRINTF("cannot punch hole (%s)\n", strerror(errno)); - return -err; - } - - return 0; -} -#endif - -static int translate_err(int err) -{ - if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP || - err == -ENOTTY) { - err = -ENOTSUP; - } - return err; -} - -#ifdef CONFIG_FALLOCATE -static int do_fallocate(int fd, int mode, off_t offset, off_t len) -{ - do { - if (fallocate(fd, mode, offset, len) == 0) { - return 0; - } - } while (errno == EINTR); - return translate_err(-errno); -} -#endif - -static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb) -{ - int ret = -ENOTSUP; - BDRVRawState *s = aiocb->bs->opaque; - - if (!s->has_write_zeroes) { - return -ENOTSUP; - } - -#ifdef BLKZEROOUT - do { - uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; - if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { - return 0; - } - } while (errno == EINTR); - - ret = translate_err(-errno); -#endif - - if (ret == -ENOTSUP) { - s->has_write_zeroes = false; - } - return ret; -} - -static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) -{ -#if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS) - BDRVRawState *s = aiocb->bs->opaque; -#endif - - if (aiocb->aio_type & QEMU_AIO_BLKDEV) { - return handle_aiocb_write_zeroes_block(aiocb); - } - -#ifdef CONFIG_XFS - if (s->is_xfs) { - return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes); - } -#endif - -#ifdef CONFIG_FALLOCATE_ZERO_RANGE - if (s->has_write_zeroes) { - int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE, - aiocb->aio_offset, aiocb->aio_nbytes); - if (ret == 0 || ret != -ENOTSUP) { - return ret; - } - s->has_write_zeroes = false; - } -#endif - -#ifdef CONFIG_FALLOCATE_PUNCH_HOLE - if 
(s->has_discard && s->has_fallocate) { - int ret = do_fallocate(s->fd, - FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - aiocb->aio_offset, aiocb->aio_nbytes); - if (ret == 0) { - ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); - if (ret == 0 || ret != -ENOTSUP) { - return ret; - } - s->has_fallocate = false; - } else if (ret != -ENOTSUP) { - return ret; - } else { - s->has_discard = false; - } - } -#endif - -#ifdef CONFIG_FALLOCATE - if (s->has_fallocate && aiocb->aio_offset >= bdrv_getlength(aiocb->bs)) { - int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); - if (ret == 0 || ret != -ENOTSUP) { - return ret; - } - s->has_fallocate = false; - } -#endif - - return -ENOTSUP; -} - -static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) -{ - int ret = -EOPNOTSUPP; - BDRVRawState *s = aiocb->bs->opaque; - - if (!s->has_discard) { - return -ENOTSUP; - } - - if (aiocb->aio_type & QEMU_AIO_BLKDEV) { -#ifdef BLKDISCARD - do { - uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; - if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) { - return 0; - } - } while (errno == EINTR); - - ret = -errno; -#endif - } else { -#ifdef CONFIG_XFS - if (s->is_xfs) { - return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes); - } -#endif - -#ifdef CONFIG_FALLOCATE_PUNCH_HOLE - ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, - aiocb->aio_offset, aiocb->aio_nbytes); -#endif - } - - ret = translate_err(ret); - if (ret == -ENOTSUP) { - s->has_discard = false; - } - return ret; -} - -static int aio_worker(void *arg) -{ - RawPosixAIOData *aiocb = arg; - ssize_t ret = 0; - - switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { - case QEMU_AIO_READ: - ret = handle_aiocb_rw(aiocb); - if (ret >= 0 && ret < aiocb->aio_nbytes) { - iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret, - 0, aiocb->aio_nbytes - ret); - - ret = aiocb->aio_nbytes; - } - if (ret == aiocb->aio_nbytes) { - ret = 0; - } else if (ret >= 0 && ret < 
aiocb->aio_nbytes) { - ret = -EINVAL; - } - break; - case QEMU_AIO_WRITE: - ret = handle_aiocb_rw(aiocb); - if (ret == aiocb->aio_nbytes) { - ret = 0; - } else if (ret >= 0 && ret < aiocb->aio_nbytes) { - ret = -EINVAL; - } - break; - case QEMU_AIO_FLUSH: - ret = handle_aiocb_flush(aiocb); - break; - case QEMU_AIO_IOCTL: - ret = handle_aiocb_ioctl(aiocb); - break; - case QEMU_AIO_DISCARD: - ret = handle_aiocb_discard(aiocb); - break; - case QEMU_AIO_WRITE_ZEROES: - ret = handle_aiocb_write_zeroes(aiocb); - break; - default: - fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); - ret = -EINVAL; - break; - } - - g_free(aiocb); - return ret; -} - -static int paio_submit_co(BlockDriverState *bs, int fd, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - int type) -{ - RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); - ThreadPool *pool; - - acb->bs = bs; - acb->aio_type = type; - acb->aio_fildes = fd; - - acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE; - acb->aio_offset = sector_num * BDRV_SECTOR_SIZE; - - if (qiov) { - acb->aio_iov = qiov->iov; - acb->aio_niov = qiov->niov; - assert(qiov->size == acb->aio_nbytes); - } - - trace_paio_submit_co(sector_num, nb_sectors, type); - pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); - return thread_pool_submit_co(pool, aio_worker, acb); -} - -static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque, int type) -{ - RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); - ThreadPool *pool; - - acb->bs = bs; - acb->aio_type = type; - acb->aio_fildes = fd; - - acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE; - acb->aio_offset = sector_num * BDRV_SECTOR_SIZE; - - if (qiov) { - acb->aio_iov = qiov->iov; - acb->aio_niov = qiov->niov; - assert(qiov->size == acb->aio_nbytes); - } - - trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); - pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); - return 
thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); -} - -static BlockAIOCB *raw_aio_submit(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque, int type) -{ - BDRVRawState *s = bs->opaque; - - if (fd_open(bs) < 0) - return NULL; - - /* - * Check if the underlying device requires requests to be aligned, - * and if the request we are trying to submit is aligned or not. - * If this is the case tell the low-level driver that it needs - * to copy the buffer. - */ - if (s->needs_alignment) { - if (!bdrv_qiov_is_aligned(bs, qiov)) { - type |= QEMU_AIO_MISALIGNED; -#ifdef CONFIG_LINUX_AIO - } else if (s->use_aio) { - return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov, - nb_sectors, cb, opaque, type); -#endif - } - } - - return paio_submit(bs, s->fd, sector_num, qiov, nb_sectors, - cb, opaque, type); -} - -static void raw_aio_plug(BlockDriverState *bs) -{ -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - if (s->use_aio) { - laio_io_plug(bs, s->aio_ctx); - } -#endif -} - -static void raw_aio_unplug(BlockDriverState *bs) -{ -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - if (s->use_aio) { - laio_io_unplug(bs, s->aio_ctx, true); - } -#endif -} - -static void raw_aio_flush_io_queue(BlockDriverState *bs) -{ -#ifdef CONFIG_LINUX_AIO - BDRVRawState *s = bs->opaque; - if (s->use_aio) { - laio_io_unplug(bs, s->aio_ctx, false); - } -#endif -} - -static BlockAIOCB *raw_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return raw_aio_submit(bs, sector_num, qiov, nb_sectors, - cb, opaque, QEMU_AIO_READ); -} - -static BlockAIOCB *raw_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - return raw_aio_submit(bs, sector_num, qiov, nb_sectors, - cb, opaque, QEMU_AIO_WRITE); -} - -static BlockAIOCB 
*raw_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) -{ - BDRVRawState *s = bs->opaque; - - if (fd_open(bs) < 0) - return NULL; - - return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH); -} - -static void raw_close(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; - - raw_detach_aio_context(bs); - -#ifdef CONFIG_LINUX_AIO - if (s->use_aio) { - laio_cleanup(s->aio_ctx); - } -#endif - if (s->fd >= 0) { - qemu_close(s->fd); - s->fd = -1; - } -} - -static int raw_truncate(BlockDriverState *bs, int64_t offset) -{ - BDRVRawState *s = bs->opaque; - struct stat st; - - if (fstat(s->fd, &st)) { - return -errno; - } - - if (S_ISREG(st.st_mode)) { - if (ftruncate(s->fd, offset) < 0) { - return -errno; - } - } else if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { - if (offset > raw_getlength(bs)) { - return -EINVAL; - } - } else { - return -ENOTSUP; - } - - return 0; -} - -#ifdef __OpenBSD__ -static int64_t raw_getlength(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; - int fd = s->fd; - struct stat st; - - if (fstat(fd, &st)) - return -errno; - if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { - struct disklabel dl; - - if (ioctl(fd, DIOCGDINFO, &dl)) - return -errno; - return (uint64_t)dl.d_secsize * - dl.d_partitions[DISKPART(st.st_rdev)].p_size; - } else - return st.st_size; -} -#elif defined(__NetBSD__) -static int64_t raw_getlength(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; - int fd = s->fd; - struct stat st; - - if (fstat(fd, &st)) - return -errno; - if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { - struct dkwedge_info dkw; - - if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) { - return dkw.dkw_size * 512; - } else { - struct disklabel dl; - - if (ioctl(fd, DIOCGDINFO, &dl)) - return -errno; - return (uint64_t)dl.d_secsize * - dl.d_partitions[DISKPART(st.st_rdev)].p_size; - } - } else - return st.st_size; -} -#elif defined(__sun__) -static int64_t raw_getlength(BlockDriverState *bs) -{ - BDRVRawState *s 
= bs->opaque; - struct dk_minfo minfo; - int ret; - int64_t size; - - ret = fd_open(bs); - if (ret < 0) { - return ret; - } - - /* - * Use the DKIOCGMEDIAINFO ioctl to read the size. - */ - ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo); - if (ret != -1) { - return minfo.dki_lbsize * minfo.dki_capacity; - } - - /* - * There are reports that lseek on some devices fails, but - * irc discussion said that contingency on contingency was overkill. - */ - size = lseek(s->fd, 0, SEEK_END); - if (size < 0) { - return -errno; - } - return size; -} -#elif defined(CONFIG_BSD) -static int64_t raw_getlength(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; - int fd = s->fd; - int64_t size; - struct stat sb; -#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) - int reopened = 0; -#endif - int ret; - - ret = fd_open(bs); - if (ret < 0) - return ret; - -#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) -again: -#endif - if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) { -#ifdef DIOCGMEDIASIZE - if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) -#elif defined(DIOCGPART) - { - struct partinfo pi; - if (ioctl(fd, DIOCGPART, &pi) == 0) - size = pi.media_size; - else - size = 0; - } - if (size == 0) -#endif -#if defined(__APPLE__) && defined(__MACH__) - { - uint64_t sectors = 0; - uint32_t sector_size = 0; - - if (ioctl(fd, DKIOCGETBLOCKCOUNT, §ors) == 0 - && ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) == 0) { - size = sectors * sector_size; - } else { - size = lseek(fd, 0LL, SEEK_END); - if (size < 0) { - return -errno; - } - } - } -#else - size = lseek(fd, 0LL, SEEK_END); - if (size < 0) { - return -errno; - } -#endif -#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) - switch(s->type) { - case FTYPE_CD: - /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */ - if (size == 2048LL * (unsigned)-1) - size = 0; - /* XXX no disc? maybe we need to reopen... 
*/ - if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) { - reopened = 1; - goto again; - } - } -#endif - } else { - size = lseek(fd, 0, SEEK_END); - if (size < 0) { - return -errno; - } - } - return size; -} -#else -static int64_t raw_getlength(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; - int ret; - int64_t size; - - ret = fd_open(bs); - if (ret < 0) { - return ret; - } - - size = lseek(s->fd, 0, SEEK_END); - if (size < 0) { - return -errno; - } - return size; -} -#endif - -static int64_t raw_get_allocated_file_size(BlockDriverState *bs) -{ - struct stat st; - BDRVRawState *s = bs->opaque; - - if (fstat(s->fd, &st) < 0) { - return -errno; - } - return (int64_t)st.st_blocks * 512; -} - -static int raw_create(const char *filename, QemuOpts *opts, Error **errp) -{ - int fd; - int result = 0; - int64_t total_size = 0; - bool nocow = false; - PreallocMode prealloc; - char *buf = NULL; - Error *local_err = NULL; - - strstart(filename, "file:", &filename); - - /* Read out options */ - total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false); - buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); - prealloc = qapi_enum_parse(PreallocMode_lookup, buf, - PREALLOC_MODE__MAX, PREALLOC_MODE_OFF, - &local_err); - g_free(buf); - if (local_err) { - error_propagate(errp, local_err); - result = -EINVAL; - goto out; - } - - fd = qemu_open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY, - 0644); - if (fd < 0) { - result = -errno; - error_setg_errno(errp, -result, "Could not create file"); - goto out; - } - - if (nocow) { -#ifdef __linux__ - /* Set NOCOW flag to solve performance issue on fs like btrfs. - * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value - * will be ignored since any failure of this operation should not - * block the left work. 
- */ - int attr; - if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { - attr |= FS_NOCOW_FL; - ioctl(fd, FS_IOC_SETFLAGS, &attr); - } -#endif - } - - if (ftruncate(fd, total_size) != 0) { - result = -errno; - error_setg_errno(errp, -result, "Could not resize file"); - goto out_close; - } - - switch (prealloc) { -#ifdef CONFIG_POSIX_FALLOCATE - case PREALLOC_MODE_FALLOC: - /* posix_fallocate() doesn't set errno. */ - result = -posix_fallocate(fd, 0, total_size); - if (result != 0) { - error_setg_errno(errp, -result, - "Could not preallocate data for the new file"); - } - break; -#endif - case PREALLOC_MODE_FULL: - { - int64_t num = 0, left = total_size; - buf = g_malloc0(65536); - - while (left > 0) { - num = MIN(left, 65536); - result = write(fd, buf, num); - if (result < 0) { - result = -errno; - error_setg_errno(errp, -result, - "Could not write to the new file"); - break; - } - left -= result; - } - if (result >= 0) { - result = fsync(fd); - if (result < 0) { - result = -errno; - error_setg_errno(errp, -result, - "Could not flush new file to disk"); - } - } - g_free(buf); - break; - } - case PREALLOC_MODE_OFF: - break; - default: - result = -EINVAL; - error_setg(errp, "Unsupported preallocation mode: %s", - PreallocMode_lookup[prealloc]); - break; - } - -out_close: - if (qemu_close(fd) != 0 && result == 0) { - result = -errno; - error_setg_errno(errp, -result, "Could not close the new file"); - } -out: - return result; -} - -/* - * Find allocation range in @bs around offset @start. - * May change underlying file descriptor's file offset. - * If @start is not in a hole, store @start in @data, and the - * beginning of the next hole in @hole, and return 0. - * If @start is in a non-trailing hole, store @start in @hole and the - * beginning of the next non-hole in @data, and return 0. - * If @start is in a trailing hole or beyond EOF, return -ENXIO. - * If we can't find out, return a negative errno other than -ENXIO. 
- */ -static int find_allocation(BlockDriverState *bs, off_t start, - off_t *data, off_t *hole) -{ -#if defined SEEK_HOLE && defined SEEK_DATA - BDRVRawState *s = bs->opaque; - off_t offs; - - /* - * SEEK_DATA cases: - * D1. offs == start: start is in data - * D2. offs > start: start is in a hole, next data at offs - * D3. offs < 0, errno = ENXIO: either start is in a trailing hole - * or start is beyond EOF - * If the latter happens, the file has been truncated behind - * our back since we opened it. All bets are off then. - * Treating like a trailing hole is simplest. - * D4. offs < 0, errno != ENXIO: we learned nothing - */ - offs = lseek(s->fd, start, SEEK_DATA); - if (offs < 0) { - return -errno; /* D3 or D4 */ - } - assert(offs >= start); - - if (offs > start) { - /* D2: in hole, next data at offs */ - *hole = start; - *data = offs; - return 0; - } - - /* D1: in data, end not yet known */ - - /* - * SEEK_HOLE cases: - * H1. offs == start: start is in a hole - * If this happens here, a hole has been dug behind our back - * since the previous lseek(). - * H2. offs > start: either start is in data, next hole at offs, - * or start is in trailing hole, EOF at offs - * Linux treats trailing holes like any other hole: offs == - * start. Solaris seeks to EOF instead: offs > start (blech). - * If that happens here, a hole has been dug behind our back - * since the previous lseek(). - * H3. offs < 0, errno = ENXIO: start is beyond EOF - * If this happens, the file has been truncated behind our - * back since we opened it. Treat it like a trailing hole. - * H4. offs < 0, errno != ENXIO: we learned nothing - * Pretend we know nothing at all, i.e. "forget" about D1. - */ - offs = lseek(s->fd, start, SEEK_HOLE); - if (offs < 0) { - return -errno; /* D1 and (H3 or H4) */ - } - assert(offs >= start); - - if (offs > start) { - /* - * D1 and H2: either in data, next hole at offs, or it was in - * data but is now in a trailing hole. In the latter case, - * all bets are off. 
Treating it as if it there was data all - * the way to EOF is safe, so simply do that. - */ - *data = start; - *hole = offs; - return 0; - } - - /* D1 and H1 */ - return -EBUSY; -#else - return -ENOTSUP; -#endif -} - -/* - * Returns the allocation status of the specified sectors. - * - * If 'sector_num' is beyond the end of the disk image the return value is 0 - * and 'pnum' is set to 0. - * - * 'pnum' is set to the number of sectors (including and immediately following - * the specified sector) that are known to be in the same - * allocated/unallocated state. - * - * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes - * beyond the end of the disk image it will be clamped. - */ -static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum, - BlockDriverState **file) -{ - off_t start, data = 0, hole = 0; - int64_t total_size; - int ret; - - ret = fd_open(bs); - if (ret < 0) { - return ret; - } - - start = sector_num * BDRV_SECTOR_SIZE; - total_size = bdrv_getlength(bs); - if (total_size < 0) { - return total_size; - } else if (start >= total_size) { - *pnum = 0; - return 0; - } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) { - nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE); - } - - ret = find_allocation(bs, start, &data, &hole); - if (ret == -ENXIO) { - /* Trailing hole */ - *pnum = nb_sectors; - ret = BDRV_BLOCK_ZERO; - } else if (ret < 0) { - /* No info available, so pretend there are no holes */ - *pnum = nb_sectors; - ret = BDRV_BLOCK_DATA; - } else if (data == start) { - /* On a data extent, compute sectors to the end of the extent, - * possibly including a partial sector at EOF. */ - *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE)); - ret = BDRV_BLOCK_DATA; - } else { - /* On a hole, compute sectors to the beginning of the next extent. 
*/ - assert(hole == start); - *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); - ret = BDRV_BLOCK_ZERO; - } - *file = bs; - return ret | BDRV_BLOCK_OFFSET_VALID | start; -} - -static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - BDRVRawState *s = bs->opaque; - - return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, - cb, opaque, QEMU_AIO_DISCARD); -} - -static int coroutine_fn raw_co_write_zeroes( - BlockDriverState *bs, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) -{ - BDRVRawState *s = bs->opaque; - - if (!(flags & BDRV_REQ_MAY_UNMAP)) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, - QEMU_AIO_WRITE_ZEROES); - } else if (s->discard_zeroes) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, - QEMU_AIO_DISCARD); - } - return -ENOTSUP; -} - -static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - BDRVRawState *s = bs->opaque; - - bdi->unallocated_blocks_are_zero = s->discard_zeroes; - bdi->can_write_zeroes_with_unmap = s->discard_zeroes; - return 0; -} - -static QemuOptsList raw_create_opts = { - .name = "raw-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_NOCOW, - .type = QEMU_OPT_BOOL, - .help = "Turn off copy-on-write (valid only on btrfs)" - }, - { - .name = BLOCK_OPT_PREALLOC, - .type = QEMU_OPT_STRING, - .help = "Preallocation mode (allowed values: off, falloc, full)" - }, - { /* end of list */ } - } -}; - -BlockDriver bdrv_file = { - .format_name = "file", - .protocol_name = "file", - .instance_size = sizeof(BDRVRawState), - .bdrv_needs_filename = true, - .bdrv_probe = NULL, /* no probe for protocols */ - .bdrv_parse_filename = raw_parse_filename, - .bdrv_file_open = raw_open, - .bdrv_reopen_prepare = raw_reopen_prepare, - 
.bdrv_reopen_commit = raw_reopen_commit, - .bdrv_reopen_abort = raw_reopen_abort, - .bdrv_close = raw_close, - .bdrv_create = raw_create, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_get_block_status = raw_co_get_block_status, - .bdrv_co_write_zeroes = raw_co_write_zeroes, - - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, - .bdrv_aio_flush = raw_aio_flush, - .bdrv_aio_discard = raw_aio_discard, - .bdrv_refresh_limits = raw_refresh_limits, - .bdrv_io_plug = raw_aio_plug, - .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, - - .bdrv_truncate = raw_truncate, - .bdrv_getlength = raw_getlength, - .bdrv_get_info = raw_get_info, - .bdrv_get_allocated_file_size - = raw_get_allocated_file_size, - - .bdrv_detach_aio_context = raw_detach_aio_context, - .bdrv_attach_aio_context = raw_attach_aio_context, - - .create_opts = &raw_create_opts, -}; - -/***********************************************/ -/* host device */ - -#if defined(__APPLE__) && defined(__MACH__) -static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, - CFIndex maxPathSize, int flags); -static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator) -{ - kern_return_t kernResult = KERN_FAILURE; - mach_port_t masterPort; - CFMutableDictionaryRef classesToMatch; - const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass}; - char *mediaType = NULL; - - kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort ); - if ( KERN_SUCCESS != kernResult ) { - printf( "IOMasterPort returned %d\n", kernResult ); - } - - int index; - for (index = 0; index < ARRAY_SIZE(matching_array); index++) { - classesToMatch = IOServiceMatching(matching_array[index]); - if (classesToMatch == NULL) { - error_report("IOServiceMatching returned NULL for %s", - matching_array[index]); - continue; - } - CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey), - kCFBooleanTrue); - kernResult = IOServiceGetMatchingServices(masterPort, 
classesToMatch, - mediaIterator); - if (kernResult != KERN_SUCCESS) { - error_report("Note: IOServiceGetMatchingServices returned %d", - kernResult); - continue; - } - - /* If a match was found, leave the loop */ - if (*mediaIterator != 0) { - DPRINTF("Matching using %s\n", matching_array[index]); - mediaType = g_strdup(matching_array[index]); - break; - } - } - return mediaType; -} - -kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, - CFIndex maxPathSize, int flags) -{ - io_object_t nextMedia; - kern_return_t kernResult = KERN_FAILURE; - *bsdPath = '\0'; - nextMedia = IOIteratorNext( mediaIterator ); - if ( nextMedia ) - { - CFTypeRef bsdPathAsCFString; - bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 ); - if ( bsdPathAsCFString ) { - size_t devPathLength; - strcpy( bsdPath, _PATH_DEV ); - if (flags & BDRV_O_NOCACHE) { - strcat(bsdPath, "r"); - } - devPathLength = strlen( bsdPath ); - if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) { - kernResult = KERN_SUCCESS; - } - CFRelease( bsdPathAsCFString ); - } - IOObjectRelease( nextMedia ); - } - - return kernResult; -} - -/* Sets up a real cdrom for use in QEMU */ -static bool setup_cdrom(char *bsd_path, Error **errp) -{ - int index, num_of_test_partitions = 2, fd; - char test_partition[MAXPATHLEN]; - bool partition_found = false; - - /* look for a working partition */ - for (index = 0; index < num_of_test_partitions; index++) { - snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path, - index); - fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE); - if (fd >= 0) { - partition_found = true; - qemu_close(fd); - break; - } - } - - /* if a working partition on the device was not found */ - if (partition_found == false) { - error_setg(errp, "Failed to find a working partition on disc"); - } else { - DPRINTF("Using %s as optical disc\n", 
test_partition); - pstrcpy(bsd_path, MAXPATHLEN, test_partition); - } - return partition_found; -} - -/* Prints directions on mounting and unmounting a device */ -static void print_unmounting_directions(const char *file_name) -{ - error_report("If device %s is mounted on the desktop, unmount" - " it first before using it in QEMU", file_name); - error_report("Command to unmount device: diskutil unmountDisk %s", - file_name); - error_report("Command to mount device: diskutil mountDisk %s", file_name); -} - -#endif /* defined(__APPLE__) && defined(__MACH__) */ - -static int hdev_probe_device(const char *filename) -{ - struct stat st; - - /* allow a dedicated CD-ROM driver to match with a higher priority */ - if (strstart(filename, "/dev/cdrom", NULL)) - return 50; - - if (stat(filename, &st) >= 0 && - (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) { - return 100; - } - - return 0; -} - -static int check_hdev_writable(BDRVRawState *s) -{ -#if defined(BLKROGET) - /* Linux block devices can be configured "read-only" using blockdev(8). - * This is independent of device node permissions and therefore open(2) - * with O_RDWR succeeds. Actual writes fail with EPERM. - * - * bdrv_open() is supposed to fail if the disk is read-only. Explicitly - * check for read-only block devices so that Linux block devices behave - * properly. - */ - struct stat st; - int readonly = 0; - - if (fstat(s->fd, &st)) { - return -errno; - } - - if (!S_ISBLK(st.st_mode)) { - return 0; - } - - if (ioctl(s->fd, BLKROGET, &readonly) < 0) { - return -errno; - } - - if (readonly) { - return -EACCES; - } -#endif /* defined(BLKROGET) */ - return 0; -} - -static void hdev_parse_filename(const char *filename, QDict *options, - Error **errp) -{ - /* The prefix is optional, just as for "file". 
*/ - strstart(filename, "host_device:", &filename); - - qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); -} - -static bool hdev_is_sg(BlockDriverState *bs) -{ - -#if defined(__linux__) - - struct stat st; - struct sg_scsi_id scsiid; - int sg_version; - - if (stat(bs->filename, &st) >= 0 && S_ISCHR(st.st_mode) && - !bdrv_ioctl(bs, SG_GET_VERSION_NUM, &sg_version) && - !bdrv_ioctl(bs, SG_GET_SCSI_ID, &scsiid)) { - DPRINTF("SG device found: type=%d, version=%d\n", - scsiid.scsi_type, sg_version); - return true; - } - -#endif - - return false; -} - -static int hdev_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVRawState *s = bs->opaque; - Error *local_err = NULL; - int ret; - -#if defined(__APPLE__) && defined(__MACH__) - const char *filename = qdict_get_str(options, "filename"); - char bsd_path[MAXPATHLEN] = ""; - bool error_occurred = false; - - /* If using a real cdrom */ - if (strcmp(filename, "/dev/cdrom") == 0) { - char *mediaType = NULL; - kern_return_t ret_val; - io_iterator_t mediaIterator = 0; - - mediaType = FindEjectableOpticalMedia(&mediaIterator); - if (mediaType == NULL) { - error_setg(errp, "Please make sure your CD/DVD is in the optical" - " drive"); - error_occurred = true; - goto hdev_open_Mac_error; - } - - ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags); - if (ret_val != KERN_SUCCESS) { - error_setg(errp, "Could not get BSD path for optical drive"); - error_occurred = true; - goto hdev_open_Mac_error; - } - - /* If a real optical drive was not found */ - if (bsd_path[0] == '\0') { - error_setg(errp, "Failed to obtain bsd path for optical drive"); - error_occurred = true; - goto hdev_open_Mac_error; - } - - /* If using a cdrom disc and finding a partition on the disc failed */ - if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 && - setup_cdrom(bsd_path, errp) == false) { - print_unmounting_directions(bsd_path); - error_occurred = true; - goto hdev_open_Mac_error; - } - - 
qdict_put(options, "filename", qstring_from_str(bsd_path)); - -hdev_open_Mac_error: - g_free(mediaType); - if (mediaIterator) { - IOObjectRelease(mediaIterator); - } - if (error_occurred) { - return -ENOENT; - } - } -#endif /* defined(__APPLE__) && defined(__MACH__) */ - - s->type = FTYPE_FILE; - - ret = raw_open_common(bs, options, flags, 0, &local_err); - if (ret < 0) { - if (local_err) { - error_propagate(errp, local_err); - } -#if defined(__APPLE__) && defined(__MACH__) - if (*bsd_path) { - filename = bsd_path; - } - /* if a physical device experienced an error while being opened */ - if (strncmp(filename, "/dev/", 5) == 0) { - print_unmounting_directions(filename); - } -#endif /* defined(__APPLE__) && defined(__MACH__) */ - return ret; - } - - /* Since this does ioctl the device must be already opened */ - bs->sg = hdev_is_sg(bs); - - if (flags & BDRV_O_RDWR) { - ret = check_hdev_writable(s); - if (ret < 0) { - raw_close(bs); - error_setg_errno(errp, -ret, "The device is not writable"); - return ret; - } - } - - return ret; -} - -#if defined(__linux__) - -static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, - unsigned long int req, void *buf, - BlockCompletionFunc *cb, void *opaque) -{ - BDRVRawState *s = bs->opaque; - RawPosixAIOData *acb; - ThreadPool *pool; - - if (fd_open(bs) < 0) - return NULL; - - acb = g_new(RawPosixAIOData, 1); - acb->bs = bs; - acb->aio_type = QEMU_AIO_IOCTL; - acb->aio_fildes = s->fd; - acb->aio_offset = 0; - acb->aio_ioctl_buf = buf; - acb->aio_ioctl_cmd = req; - pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); - return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); -} -#endif /* linux */ - -static int fd_open(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; - - /* this is just to ensure s->fd is sane (its called by io ops) */ - if (s->fd >= 0) - return 0; - return -EIO; -} - -static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BlockCompletionFunc 
*cb, void *opaque) -{ - BDRVRawState *s = bs->opaque; - - if (fd_open(bs) < 0) { - return NULL; - } - return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, - cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); -} - -static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) -{ - BDRVRawState *s = bs->opaque; - int rc; - - rc = fd_open(bs); - if (rc < 0) { - return rc; - } - if (!(flags & BDRV_REQ_MAY_UNMAP)) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, - QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); - } else if (s->discard_zeroes) { - return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, - QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); - } - return -ENOTSUP; -} - -static int hdev_create(const char *filename, QemuOpts *opts, - Error **errp) -{ - int fd; - int ret = 0; - struct stat stat_buf; - int64_t total_size = 0; - bool has_prefix; - - /* This function is used by both protocol block drivers and therefore either - * of these prefixes may be given. - * The return value has to be stored somewhere, otherwise this is an error - * due to -Werror=unused-value. 
*/ - has_prefix = - strstart(filename, "host_device:", &filename) || - strstart(filename, "host_cdrom:" , &filename); - - (void)has_prefix; - - ret = raw_normalize_devicepath(&filename); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not normalize device path"); - return ret; - } - - /* Read out options */ - total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - - fd = qemu_open(filename, O_WRONLY | O_BINARY); - if (fd < 0) { - ret = -errno; - error_setg_errno(errp, -ret, "Could not open device"); - return ret; - } - - if (fstat(fd, &stat_buf) < 0) { - ret = -errno; - error_setg_errno(errp, -ret, "Could not stat device"); - } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) { - error_setg(errp, - "The given file is neither a block nor a character device"); - ret = -ENODEV; - } else if (lseek(fd, 0, SEEK_END) < total_size) { - error_setg(errp, "Device is too small"); - ret = -ENOSPC; - } - - qemu_close(fd); - return ret; -} - -static BlockDriver bdrv_host_device = { - .format_name = "host_device", - .protocol_name = "host_device", - .instance_size = sizeof(BDRVRawState), - .bdrv_needs_filename = true, - .bdrv_probe_device = hdev_probe_device, - .bdrv_parse_filename = hdev_parse_filename, - .bdrv_file_open = hdev_open, - .bdrv_close = raw_close, - .bdrv_reopen_prepare = raw_reopen_prepare, - .bdrv_reopen_commit = raw_reopen_commit, - .bdrv_reopen_abort = raw_reopen_abort, - .bdrv_create = hdev_create, - .create_opts = &raw_create_opts, - .bdrv_co_write_zeroes = hdev_co_write_zeroes, - - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, - .bdrv_aio_flush = raw_aio_flush, - .bdrv_aio_discard = hdev_aio_discard, - .bdrv_refresh_limits = raw_refresh_limits, - .bdrv_io_plug = raw_aio_plug, - .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, - - .bdrv_truncate = raw_truncate, - .bdrv_getlength = raw_getlength, - .bdrv_get_info = raw_get_info, - 
.bdrv_get_allocated_file_size - = raw_get_allocated_file_size, - .bdrv_probe_blocksizes = hdev_probe_blocksizes, - .bdrv_probe_geometry = hdev_probe_geometry, - - .bdrv_detach_aio_context = raw_detach_aio_context, - .bdrv_attach_aio_context = raw_attach_aio_context, - - /* generic scsi device */ -#ifdef __linux__ - .bdrv_aio_ioctl = hdev_aio_ioctl, -#endif -}; - -#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) -static void cdrom_parse_filename(const char *filename, QDict *options, - Error **errp) -{ - /* The prefix is optional, just as for "file". */ - strstart(filename, "host_cdrom:", &filename); - - qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); -} -#endif - -#ifdef __linux__ -static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVRawState *s = bs->opaque; - Error *local_err = NULL; - int ret; - - s->type = FTYPE_CD; - - /* open will not fail even if no CD is inserted, so add O_NONBLOCK */ - ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err); - if (local_err) { - error_propagate(errp, local_err); - } - return ret; -} - -static int cdrom_probe_device(const char *filename) -{ - int fd, ret; - int prio = 0; - struct stat st; - - fd = qemu_open(filename, O_RDONLY | O_NONBLOCK); - if (fd < 0) { - goto out; - } - ret = fstat(fd, &st); - if (ret == -1 || !S_ISBLK(st.st_mode)) { - goto outc; - } - - /* Attempt to detect via a CDROM specific ioctl */ - ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); - if (ret >= 0) - prio = 100; - -outc: - qemu_close(fd); -out: - return prio; -} - -static bool cdrom_is_inserted(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; - int ret; - - ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); - return ret == CDS_DISC_OK; -} - -static void cdrom_eject(BlockDriverState *bs, bool eject_flag) -{ - BDRVRawState *s = bs->opaque; - - if (eject_flag) { - if (ioctl(s->fd, CDROMEJECT, NULL) < 0) - perror("CDROMEJECT"); - } 
else { - if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0) - perror("CDROMEJECT"); - } -} - -static void cdrom_lock_medium(BlockDriverState *bs, bool locked) -{ - BDRVRawState *s = bs->opaque; - - if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) { - /* - * Note: an error can happen if the distribution automatically - * mounts the CD-ROM - */ - /* perror("CDROM_LOCKDOOR"); */ - } -} - -static BlockDriver bdrv_host_cdrom = { - .format_name = "host_cdrom", - .protocol_name = "host_cdrom", - .instance_size = sizeof(BDRVRawState), - .bdrv_needs_filename = true, - .bdrv_probe_device = cdrom_probe_device, - .bdrv_parse_filename = cdrom_parse_filename, - .bdrv_file_open = cdrom_open, - .bdrv_close = raw_close, - .bdrv_reopen_prepare = raw_reopen_prepare, - .bdrv_reopen_commit = raw_reopen_commit, - .bdrv_reopen_abort = raw_reopen_abort, - .bdrv_create = hdev_create, - .create_opts = &raw_create_opts, - - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, - .bdrv_aio_flush = raw_aio_flush, - .bdrv_refresh_limits = raw_refresh_limits, - .bdrv_io_plug = raw_aio_plug, - .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, - - .bdrv_truncate = raw_truncate, - .bdrv_getlength = raw_getlength, - .has_variable_length = true, - .bdrv_get_allocated_file_size - = raw_get_allocated_file_size, - - .bdrv_detach_aio_context = raw_detach_aio_context, - .bdrv_attach_aio_context = raw_attach_aio_context, - - /* removable device support */ - .bdrv_is_inserted = cdrom_is_inserted, - .bdrv_eject = cdrom_eject, - .bdrv_lock_medium = cdrom_lock_medium, - - /* generic scsi device */ - .bdrv_aio_ioctl = hdev_aio_ioctl, -}; -#endif /* __linux__ */ - -#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) -static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVRawState *s = bs->opaque; - Error *local_err = NULL; - int ret; - - s->type = FTYPE_CD; - - ret = raw_open_common(bs, options, flags, 0, &local_err); - if (ret) 
{ - if (local_err) { - error_propagate(errp, local_err); - } - return ret; - } - - /* make sure the door isn't locked at this time */ - ioctl(s->fd, CDIOCALLOW); - return 0; -} - -static int cdrom_probe_device(const char *filename) -{ - if (strstart(filename, "/dev/cd", NULL) || - strstart(filename, "/dev/acd", NULL)) - return 100; - return 0; -} - -static int cdrom_reopen(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; - int fd; - - /* - * Force reread of possibly changed/newly loaded disc, - * FreeBSD seems to not notice sometimes... - */ - if (s->fd >= 0) - qemu_close(s->fd); - fd = qemu_open(bs->filename, s->open_flags, 0644); - if (fd < 0) { - s->fd = -1; - return -EIO; - } - s->fd = fd; - - /* make sure the door isn't locked at this time */ - ioctl(s->fd, CDIOCALLOW); - return 0; -} - -static bool cdrom_is_inserted(BlockDriverState *bs) -{ - return raw_getlength(bs) > 0; -} - -static void cdrom_eject(BlockDriverState *bs, bool eject_flag) -{ - BDRVRawState *s = bs->opaque; - - if (s->fd < 0) - return; - - (void) ioctl(s->fd, CDIOCALLOW); - - if (eject_flag) { - if (ioctl(s->fd, CDIOCEJECT) < 0) - perror("CDIOCEJECT"); - } else { - if (ioctl(s->fd, CDIOCCLOSE) < 0) - perror("CDIOCCLOSE"); - } - - cdrom_reopen(bs); -} - -static void cdrom_lock_medium(BlockDriverState *bs, bool locked) -{ - BDRVRawState *s = bs->opaque; - - if (s->fd < 0) - return; - if (ioctl(s->fd, (locked ? 
CDIOCPREVENT : CDIOCALLOW)) < 0) { - /* - * Note: an error can happen if the distribution automatically - * mounts the CD-ROM - */ - /* perror("CDROM_LOCKDOOR"); */ - } -} - -static BlockDriver bdrv_host_cdrom = { - .format_name = "host_cdrom", - .protocol_name = "host_cdrom", - .instance_size = sizeof(BDRVRawState), - .bdrv_needs_filename = true, - .bdrv_probe_device = cdrom_probe_device, - .bdrv_parse_filename = cdrom_parse_filename, - .bdrv_file_open = cdrom_open, - .bdrv_close = raw_close, - .bdrv_reopen_prepare = raw_reopen_prepare, - .bdrv_reopen_commit = raw_reopen_commit, - .bdrv_reopen_abort = raw_reopen_abort, - .bdrv_create = hdev_create, - .create_opts = &raw_create_opts, - - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, - .bdrv_aio_flush = raw_aio_flush, - .bdrv_refresh_limits = raw_refresh_limits, - .bdrv_io_plug = raw_aio_plug, - .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, - - .bdrv_truncate = raw_truncate, - .bdrv_getlength = raw_getlength, - .has_variable_length = true, - .bdrv_get_allocated_file_size - = raw_get_allocated_file_size, - - .bdrv_detach_aio_context = raw_detach_aio_context, - .bdrv_attach_aio_context = raw_attach_aio_context, - - /* removable device support */ - .bdrv_is_inserted = cdrom_is_inserted, - .bdrv_eject = cdrom_eject, - .bdrv_lock_medium = cdrom_lock_medium, -}; -#endif /* __FreeBSD__ */ - -static void bdrv_file_init(void) -{ - /* - * Register all the drivers. Note that order is important, the driver - * registered last will get probed first. 
- */ - bdrv_register(&bdrv_file); - bdrv_register(&bdrv_host_device); -#ifdef __linux__ - bdrv_register(&bdrv_host_cdrom); -#endif -#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) - bdrv_register(&bdrv_host_cdrom); -#endif -} - -block_init(bdrv_file_init); diff --git a/qemu/block/raw-win32.c b/qemu/block/raw-win32.c deleted file mode 100644 index fd2389153..000000000 --- a/qemu/block/raw-win32.c +++ /dev/null @@ -1,731 +0,0 @@ -/* - * Block driver for RAW files (win32) - * - * Copyright (c) 2006 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu/cutils.h" -#include "qemu/timer.h" -#include "block/block_int.h" -#include "qemu/module.h" -#include "raw-aio.h" -#include "trace.h" -#include "block/thread-pool.h" -#include "qemu/iov.h" -#include "qapi/qmp/qstring.h" -#include -#include - -#define FTYPE_FILE 0 -#define FTYPE_CD 1 -#define FTYPE_HARDDISK 2 - -typedef struct RawWin32AIOData { - BlockDriverState *bs; - HANDLE hfile; - struct iovec *aio_iov; - int aio_niov; - size_t aio_nbytes; - off64_t aio_offset; - int aio_type; -} RawWin32AIOData; - -typedef struct BDRVRawState { - HANDLE hfile; - int type; - char drive_path[16]; /* format: "d:\" */ - QEMUWin32AIOState *aio; -} BDRVRawState; - -/* - * Read/writes the data to/from a given linear buffer. - * - * Returns the number of bytes handles or -errno in case of an error. Short - * reads are only returned if the end of the file is reached. - */ -static size_t handle_aiocb_rw(RawWin32AIOData *aiocb) -{ - size_t offset = 0; - int i; - - for (i = 0; i < aiocb->aio_niov; i++) { - OVERLAPPED ov; - DWORD ret, ret_count, len; - - memset(&ov, 0, sizeof(ov)); - ov.Offset = (aiocb->aio_offset + offset); - ov.OffsetHigh = (aiocb->aio_offset + offset) >> 32; - len = aiocb->aio_iov[i].iov_len; - if (aiocb->aio_type & QEMU_AIO_WRITE) { - ret = WriteFile(aiocb->hfile, aiocb->aio_iov[i].iov_base, - len, &ret_count, &ov); - } else { - ret = ReadFile(aiocb->hfile, aiocb->aio_iov[i].iov_base, - len, &ret_count, &ov); - } - if (!ret) { - ret_count = 0; - } - if (ret_count != len) { - offset += ret_count; - break; - } - offset += len; - } - - return offset; -} - -static int aio_worker(void *arg) -{ - RawWin32AIOData *aiocb = arg; - ssize_t ret = 0; - size_t count; - - switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { - case QEMU_AIO_READ: - count = handle_aiocb_rw(aiocb); - if (count < aiocb->aio_nbytes) { - /* A short read means that we have reached EOF. Pad the buffer - * with zeros for bytes after EOF. 
*/ - iov_memset(aiocb->aio_iov, aiocb->aio_niov, count, - 0, aiocb->aio_nbytes - count); - - count = aiocb->aio_nbytes; - } - if (count == aiocb->aio_nbytes) { - ret = 0; - } else { - ret = -EINVAL; - } - break; - case QEMU_AIO_WRITE: - count = handle_aiocb_rw(aiocb); - if (count == aiocb->aio_nbytes) { - ret = 0; - } else { - ret = -EINVAL; - } - break; - case QEMU_AIO_FLUSH: - if (!FlushFileBuffers(aiocb->hfile)) { - return -EIO; - } - break; - default: - fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); - ret = -EINVAL; - break; - } - - g_free(aiocb); - return ret; -} - -static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque, int type) -{ - RawWin32AIOData *acb = g_new(RawWin32AIOData, 1); - ThreadPool *pool; - - acb->bs = bs; - acb->hfile = hfile; - acb->aio_type = type; - - if (qiov) { - acb->aio_iov = qiov->iov; - acb->aio_niov = qiov->niov; - } - acb->aio_nbytes = nb_sectors * 512; - acb->aio_offset = sector_num * 512; - - trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); - pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); - return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); -} - -int qemu_ftruncate64(int fd, int64_t length) -{ - LARGE_INTEGER li; - DWORD dw; - LONG high; - HANDLE h; - BOOL res; - - if ((GetVersion() & 0x80000000UL) && (length >> 32) != 0) - return -1; - - h = (HANDLE)_get_osfhandle(fd); - - /* get current position, ftruncate do not change position */ - li.HighPart = 0; - li.LowPart = SetFilePointer (h, 0, &li.HighPart, FILE_CURRENT); - if (li.LowPart == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) { - return -1; - } - - high = length >> 32; - dw = SetFilePointer(h, (DWORD) length, &high, FILE_BEGIN); - if (dw == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) { - return -1; - } - res = SetEndOfFile(h); - - /* back to old position */ - SetFilePointer(h, li.LowPart, 
&li.HighPart, FILE_BEGIN); - return res ? 0 : -1; -} - -static int set_sparse(int fd) -{ - DWORD returned; - return (int) DeviceIoControl((HANDLE)_get_osfhandle(fd), FSCTL_SET_SPARSE, - NULL, 0, NULL, 0, &returned, NULL); -} - -static void raw_detach_aio_context(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; - - if (s->aio) { - win32_aio_detach_aio_context(s->aio, bdrv_get_aio_context(bs)); - } -} - -static void raw_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - BDRVRawState *s = bs->opaque; - - if (s->aio) { - win32_aio_attach_aio_context(s->aio, new_context); - } -} - -static void raw_probe_alignment(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; - DWORD sectorsPerCluster, freeClusters, totalClusters, count; - DISK_GEOMETRY_EX dg; - BOOL status; - - if (s->type == FTYPE_CD) { - bs->request_alignment = 2048; - return; - } - if (s->type == FTYPE_HARDDISK) { - status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, - NULL, 0, &dg, sizeof(dg), &count, NULL); - if (status != 0) { - bs->request_alignment = dg.Geometry.BytesPerSector; - return; - } - /* try GetDiskFreeSpace too */ - } - - if (s->drive_path[0]) { - GetDiskFreeSpace(s->drive_path, §orsPerCluster, - &dg.Geometry.BytesPerSector, - &freeClusters, &totalClusters); - bs->request_alignment = dg.Geometry.BytesPerSector; - } -} - -static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped) -{ - assert(access_flags != NULL); - assert(overlapped != NULL); - - if (flags & BDRV_O_RDWR) { - *access_flags = GENERIC_READ | GENERIC_WRITE; - } else { - *access_flags = GENERIC_READ; - } - - *overlapped = FILE_ATTRIBUTE_NORMAL; - if (flags & BDRV_O_NATIVE_AIO) { - *overlapped |= FILE_FLAG_OVERLAPPED; - } - if (flags & BDRV_O_NOCACHE) { - *overlapped |= FILE_FLAG_NO_BUFFERING; - } -} - -static void raw_parse_filename(const char *filename, QDict *options, - Error **errp) -{ - /* The filename does not have to be prefixed by the protocol name, since - * 
"file" is the default protocol; therefore, the return value of this - * function call can be ignored. */ - strstart(filename, "file:", &filename); - - qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); -} - -static QemuOptsList raw_runtime_opts = { - .name = "raw", - .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), - .desc = { - { - .name = "filename", - .type = QEMU_OPT_STRING, - .help = "File name of the image", - }, - { /* end of list */ } - }, -}; - -static int raw_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVRawState *s = bs->opaque; - int access_flags; - DWORD overlapped; - QemuOpts *opts; - Error *local_err = NULL; - const char *filename; - int ret; - - s->type = FTYPE_FILE; - - opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; - } - - filename = qemu_opt_get(opts, "filename"); - - raw_parse_flags(flags, &access_flags, &overlapped); - - if (filename[0] && filename[1] == ':') { - snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", filename[0]); - } else if (filename[0] == '\\' && filename[1] == '\\') { - s->drive_path[0] = 0; - } else { - /* Relative path. 
*/ - char buf[MAX_PATH]; - GetCurrentDirectory(MAX_PATH, buf); - snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", buf[0]); - } - - s->hfile = CreateFile(filename, access_flags, - FILE_SHARE_READ, NULL, - OPEN_EXISTING, overlapped, NULL); - if (s->hfile == INVALID_HANDLE_VALUE) { - int err = GetLastError(); - - if (err == ERROR_ACCESS_DENIED) { - ret = -EACCES; - } else { - ret = -EINVAL; - } - goto fail; - } - - if (flags & BDRV_O_NATIVE_AIO) { - s->aio = win32_aio_init(); - if (s->aio == NULL) { - CloseHandle(s->hfile); - error_setg(errp, "Could not initialize AIO"); - ret = -EINVAL; - goto fail; - } - - ret = win32_aio_attach(s->aio, s->hfile); - if (ret < 0) { - win32_aio_cleanup(s->aio); - CloseHandle(s->hfile); - error_setg_errno(errp, -ret, "Could not enable AIO"); - goto fail; - } - - win32_aio_attach_aio_context(s->aio, bdrv_get_aio_context(bs)); - } - - raw_probe_alignment(bs); - ret = 0; -fail: - qemu_opts_del(opts); - return ret; -} - -static BlockAIOCB *raw_aio_readv(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - BDRVRawState *s = bs->opaque; - if (s->aio) { - return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov, - nb_sectors, cb, opaque, QEMU_AIO_READ); - } else { - return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors, - cb, opaque, QEMU_AIO_READ); - } -} - -static BlockAIOCB *raw_aio_writev(BlockDriverState *bs, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque) -{ - BDRVRawState *s = bs->opaque; - if (s->aio) { - return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov, - nb_sectors, cb, opaque, QEMU_AIO_WRITE); - } else { - return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors, - cb, opaque, QEMU_AIO_WRITE); - } -} - -static BlockAIOCB *raw_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, void *opaque) -{ - BDRVRawState *s = bs->opaque; - return paio_submit(bs, s->hfile, 0, 
NULL, 0, cb, opaque, QEMU_AIO_FLUSH); -} - -static void raw_close(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; - - if (s->aio) { - win32_aio_detach_aio_context(s->aio, bdrv_get_aio_context(bs)); - win32_aio_cleanup(s->aio); - s->aio = NULL; - } - - CloseHandle(s->hfile); - if (bs->open_flags & BDRV_O_TEMPORARY) { - unlink(bs->filename); - } -} - -static int raw_truncate(BlockDriverState *bs, int64_t offset) -{ - BDRVRawState *s = bs->opaque; - LONG low, high; - DWORD dwPtrLow; - - low = offset; - high = offset >> 32; - - /* - * An error has occurred if the return value is INVALID_SET_FILE_POINTER - * and GetLastError doesn't return NO_ERROR. - */ - dwPtrLow = SetFilePointer(s->hfile, low, &high, FILE_BEGIN); - if (dwPtrLow == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) { - fprintf(stderr, "SetFilePointer error: %lu\n", GetLastError()); - return -EIO; - } - if (SetEndOfFile(s->hfile) == 0) { - fprintf(stderr, "SetEndOfFile error: %lu\n", GetLastError()); - return -EIO; - } - return 0; -} - -static int64_t raw_getlength(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; - LARGE_INTEGER l; - ULARGE_INTEGER available, total, total_free; - DISK_GEOMETRY_EX dg; - DWORD count; - BOOL status; - - switch(s->type) { - case FTYPE_FILE: - l.LowPart = GetFileSize(s->hfile, (PDWORD)&l.HighPart); - if (l.LowPart == 0xffffffffUL && GetLastError() != NO_ERROR) - return -EIO; - break; - case FTYPE_CD: - if (!GetDiskFreeSpaceEx(s->drive_path, &available, &total, &total_free)) - return -EIO; - l.QuadPart = total.QuadPart; - break; - case FTYPE_HARDDISK: - status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, - NULL, 0, &dg, sizeof(dg), &count, NULL); - if (status != 0) { - l = dg.DiskSize; - } - break; - default: - return -EIO; - } - return l.QuadPart; -} - -static int64_t raw_get_allocated_file_size(BlockDriverState *bs) -{ - typedef DWORD (WINAPI * get_compressed_t)(const char *filename, - DWORD * high); - get_compressed_t get_compressed; 
- struct _stati64 st; - const char *filename = bs->filename; - /* WinNT support GetCompressedFileSize to determine allocate size */ - get_compressed = - (get_compressed_t) GetProcAddress(GetModuleHandle("kernel32"), - "GetCompressedFileSizeA"); - if (get_compressed) { - DWORD high, low; - low = get_compressed(filename, &high); - if (low != 0xFFFFFFFFlu || GetLastError() == NO_ERROR) { - return (((int64_t) high) << 32) + low; - } - } - - if (_stati64(filename, &st) < 0) { - return -1; - } - return st.st_size; -} - -static int raw_create(const char *filename, QemuOpts *opts, Error **errp) -{ - int fd; - int64_t total_size = 0; - - strstart(filename, "file:", &filename); - - /* Read out options */ - total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - - fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, - 0644); - if (fd < 0) { - error_setg_errno(errp, errno, "Could not create file"); - return -EIO; - } - set_sparse(fd); - ftruncate(fd, total_size); - qemu_close(fd); - return 0; -} - - -static QemuOptsList raw_create_opts = { - .name = "raw-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { /* end of list */ } - } -}; - -BlockDriver bdrv_file = { - .format_name = "file", - .protocol_name = "file", - .instance_size = sizeof(BDRVRawState), - .bdrv_needs_filename = true, - .bdrv_parse_filename = raw_parse_filename, - .bdrv_file_open = raw_open, - .bdrv_close = raw_close, - .bdrv_create = raw_create, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, - .bdrv_aio_flush = raw_aio_flush, - - .bdrv_truncate = raw_truncate, - .bdrv_getlength = raw_getlength, - .bdrv_get_allocated_file_size - = raw_get_allocated_file_size, - - .create_opts = &raw_create_opts, -}; - -/***********************************************/ -/* host device 
*/ - -static int find_cdrom(char *cdrom_name, int cdrom_name_size) -{ - char drives[256], *pdrv = drives; - UINT type; - - memset(drives, 0, sizeof(drives)); - GetLogicalDriveStrings(sizeof(drives), drives); - while(pdrv[0] != '\0') { - type = GetDriveType(pdrv); - switch(type) { - case DRIVE_CDROM: - snprintf(cdrom_name, cdrom_name_size, "\\\\.\\%c:", pdrv[0]); - return 0; - break; - } - pdrv += lstrlen(pdrv) + 1; - } - return -1; -} - -static int find_device_type(BlockDriverState *bs, const char *filename) -{ - BDRVRawState *s = bs->opaque; - UINT type; - const char *p; - - if (strstart(filename, "\\\\.\\", &p) || - strstart(filename, "//./", &p)) { - if (stristart(p, "PhysicalDrive", NULL)) - return FTYPE_HARDDISK; - snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", p[0]); - type = GetDriveType(s->drive_path); - switch (type) { - case DRIVE_REMOVABLE: - case DRIVE_FIXED: - return FTYPE_HARDDISK; - case DRIVE_CDROM: - return FTYPE_CD; - default: - return FTYPE_FILE; - } - } else { - return FTYPE_FILE; - } -} - -static int hdev_probe_device(const char *filename) -{ - if (strstart(filename, "/dev/cdrom", NULL)) - return 100; - if (is_windows_drive(filename)) - return 100; - return 0; -} - -static void hdev_parse_filename(const char *filename, QDict *options, - Error **errp) -{ - /* The prefix is optional, just as for "file". 
*/ - strstart(filename, "host_device:", &filename); - - qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); -} - -static int hdev_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVRawState *s = bs->opaque; - int access_flags, create_flags; - int ret = 0; - DWORD overlapped; - char device_name[64]; - - Error *local_err = NULL; - const char *filename; - - QemuOpts *opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, - &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto done; - } - - filename = qemu_opt_get(opts, "filename"); - - if (strstart(filename, "/dev/cdrom", NULL)) { - if (find_cdrom(device_name, sizeof(device_name)) < 0) { - error_setg(errp, "Could not open CD-ROM drive"); - ret = -ENOENT; - goto done; - } - filename = device_name; - } else { - /* transform drive letters into device name */ - if (((filename[0] >= 'a' && filename[0] <= 'z') || - (filename[0] >= 'A' && filename[0] <= 'Z')) && - filename[1] == ':' && filename[2] == '\0') { - snprintf(device_name, sizeof(device_name), "\\\\.\\%c:", filename[0]); - filename = device_name; - } - } - s->type = find_device_type(bs, filename); - - raw_parse_flags(flags, &access_flags, &overlapped); - - create_flags = OPEN_EXISTING; - - s->hfile = CreateFile(filename, access_flags, - FILE_SHARE_READ, NULL, - create_flags, overlapped, NULL); - if (s->hfile == INVALID_HANDLE_VALUE) { - int err = GetLastError(); - - if (err == ERROR_ACCESS_DENIED) { - ret = -EACCES; - } else { - ret = -EINVAL; - } - error_setg_errno(errp, -ret, "Could not open device"); - goto done; - } - -done: - qemu_opts_del(opts); - return ret; -} - -static BlockDriver bdrv_host_device = { - .format_name = "host_device", - .protocol_name = "host_device", - .instance_size = sizeof(BDRVRawState), - .bdrv_needs_filename = true, - .bdrv_parse_filename = hdev_parse_filename, - .bdrv_probe_device = 
hdev_probe_device, - .bdrv_file_open = hdev_open, - .bdrv_close = raw_close, - - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, - .bdrv_aio_flush = raw_aio_flush, - - .bdrv_detach_aio_context = raw_detach_aio_context, - .bdrv_attach_aio_context = raw_attach_aio_context, - - .bdrv_getlength = raw_getlength, - .has_variable_length = true, - - .bdrv_get_allocated_file_size - = raw_get_allocated_file_size, -}; - -static void bdrv_file_init(void) -{ - bdrv_register(&bdrv_file); - bdrv_register(&bdrv_host_device); -} - -block_init(bdrv_file_init); diff --git a/qemu/block/raw_bsd.c b/qemu/block/raw_bsd.c deleted file mode 100644 index a6cc7e991..000000000 --- a/qemu/block/raw_bsd.c +++ /dev/null @@ -1,285 +0,0 @@ -/* BlockDriver implementation for "raw" - * - * Copyright (C) 2010, 2013, Red Hat, Inc. - * Copyright (C) 2010, Blue Swirl - * Copyright (C) 2009, Anthony Liguori - * - * Author: - * Laszlo Ersek - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "qemu/osdep.h" -#include "block/block_int.h" -#include "qapi/error.h" -#include "qemu/option.h" - -static QemuOptsList raw_create_opts = { - .name = "raw-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { /* end of list */ } - } -}; - -static int raw_reopen_prepare(BDRVReopenState *reopen_state, - BlockReopenQueue *queue, Error **errp) -{ - return 0; -} - -static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - return bdrv_co_readv(bs->file->bs, sector_num, nb_sectors, qiov); -} - -static int coroutine_fn -raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov, int flags) -{ - void *buf = NULL; - BlockDriver *drv; - QEMUIOVector local_qiov; - int ret; - - if (bs->probed && sector_num == 0) { - /* As long as these conditions are true, we can't get partial writes to - * the probe buffer and can just directly check the request. */ - QEMU_BUILD_BUG_ON(BLOCK_PROBE_BUF_SIZE != 512); - QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != 512); - - if (nb_sectors == 0) { - /* qemu_iovec_to_buf() would fail, but we want to return success - * instead of -EINVAL in this case. 
*/ - return 0; - } - - buf = qemu_try_blockalign(bs->file->bs, 512); - if (!buf) { - ret = -ENOMEM; - goto fail; - } - - ret = qemu_iovec_to_buf(qiov, 0, buf, 512); - if (ret != 512) { - ret = -EINVAL; - goto fail; - } - - drv = bdrv_probe_all(buf, 512, NULL); - if (drv != bs->drv) { - ret = -EPERM; - goto fail; - } - - /* Use the checked buffer, a malicious guest might be overwriting its - * original buffer in the background. */ - qemu_iovec_init(&local_qiov, qiov->niov + 1); - qemu_iovec_add(&local_qiov, buf, 512); - qemu_iovec_concat(&local_qiov, qiov, 512, qiov->size - 512); - qiov = &local_qiov; - } - - BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); - ret = bdrv_co_do_pwritev(bs->file->bs, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE, qiov, flags); - -fail: - if (qiov == &local_qiov) { - qemu_iovec_destroy(&local_qiov); - } - qemu_vfree(buf); - return ret; -} - -static int coroutine_fn -raw_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - QEMUIOVector *qiov) -{ - return raw_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0); -} - -static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, int *pnum, - BlockDriverState **file) -{ - *pnum = nb_sectors; - *file = bs->file->bs; - return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA | - (sector_num << BDRV_SECTOR_BITS); -} - -static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - BdrvRequestFlags flags) -{ - return bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags); -} - -static int coroutine_fn raw_co_discard(BlockDriverState *bs, - int64_t sector_num, int nb_sectors) -{ - return bdrv_co_discard(bs->file->bs, sector_num, nb_sectors); -} - -static int64_t raw_getlength(BlockDriverState *bs) -{ - return bdrv_getlength(bs->file->bs); -} - -static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - return bdrv_get_info(bs->file->bs, 
bdi); -} - -static void raw_refresh_limits(BlockDriverState *bs, Error **errp) -{ - bs->bl = bs->file->bs->bl; -} - -static int raw_truncate(BlockDriverState *bs, int64_t offset) -{ - return bdrv_truncate(bs->file->bs, offset); -} - -static int raw_media_changed(BlockDriverState *bs) -{ - return bdrv_media_changed(bs->file->bs); -} - -static void raw_eject(BlockDriverState *bs, bool eject_flag) -{ - bdrv_eject(bs->file->bs, eject_flag); -} - -static void raw_lock_medium(BlockDriverState *bs, bool locked) -{ - bdrv_lock_medium(bs->file->bs, locked); -} - -static BlockAIOCB *raw_aio_ioctl(BlockDriverState *bs, - unsigned long int req, void *buf, - BlockCompletionFunc *cb, - void *opaque) -{ - return bdrv_aio_ioctl(bs->file->bs, req, buf, cb, opaque); -} - -static int raw_has_zero_init(BlockDriverState *bs) -{ - return bdrv_has_zero_init(bs->file->bs); -} - -static int raw_create(const char *filename, QemuOpts *opts, Error **errp) -{ - Error *local_err = NULL; - int ret; - - ret = bdrv_create_file(filename, opts, &local_err); - if (local_err) { - error_propagate(errp, local_err); - } - return ret; -} - -static int raw_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - bs->sg = bs->file->bs->sg; - - if (bs->probed && !bdrv_is_read_only(bs)) { - fprintf(stderr, - "WARNING: Image format was not specified for '%s' and probing " - "guessed raw.\n" - " Automatically detecting the format is dangerous for " - "raw images, write operations on block 0 will be restricted.\n" - " Specify the 'raw' format explicitly to remove the " - "restrictions.\n", - bs->file->bs->filename); - } - - return 0; -} - -static void raw_close(BlockDriverState *bs) -{ -} - -static int raw_probe(const uint8_t *buf, int buf_size, const char *filename) -{ - /* smallest possible positive score so that raw is used if and only if no - * other block driver works - */ - return 1; -} - -static int raw_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) -{ - return 
bdrv_probe_blocksizes(bs->file->bs, bsz); -} - -static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo) -{ - return bdrv_probe_geometry(bs->file->bs, geo); -} - -BlockDriver bdrv_raw = { - .format_name = "raw", - .bdrv_probe = &raw_probe, - .bdrv_reopen_prepare = &raw_reopen_prepare, - .bdrv_open = &raw_open, - .bdrv_close = &raw_close, - .bdrv_create = &raw_create, - .bdrv_co_readv = &raw_co_readv, - .bdrv_co_writev = &raw_co_writev, - .bdrv_co_writev_flags = &raw_co_writev_flags, - .supported_write_flags = BDRV_REQ_FUA, - .bdrv_co_write_zeroes = &raw_co_write_zeroes, - .bdrv_co_discard = &raw_co_discard, - .bdrv_co_get_block_status = &raw_co_get_block_status, - .bdrv_truncate = &raw_truncate, - .bdrv_getlength = &raw_getlength, - .has_variable_length = true, - .bdrv_get_info = &raw_get_info, - .bdrv_refresh_limits = &raw_refresh_limits, - .bdrv_probe_blocksizes = &raw_probe_blocksizes, - .bdrv_probe_geometry = &raw_probe_geometry, - .bdrv_media_changed = &raw_media_changed, - .bdrv_eject = &raw_eject, - .bdrv_lock_medium = &raw_lock_medium, - .bdrv_aio_ioctl = &raw_aio_ioctl, - .create_opts = &raw_create_opts, - .bdrv_has_zero_init = &raw_has_zero_init -}; - -static void bdrv_raw_init(void) -{ - bdrv_register(&bdrv_raw); -} - -block_init(bdrv_raw_init); diff --git a/qemu/block/rbd.c b/qemu/block/rbd.c deleted file mode 100644 index 5bc5b3253..000000000 --- a/qemu/block/rbd.c +++ /dev/null @@ -1,1015 +0,0 @@ -/* - * QEMU Block driver for RADOS (Ceph) - * - * Copyright (C) 2010-2011 Christian Brunner , - * Josh Durgin - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. 
- */ - -#include "qemu/osdep.h" - -#include "qapi/error.h" -#include "qemu/error-report.h" -#include "block/block_int.h" -#include "crypto/secret.h" -#include "qemu/cutils.h" - -#include - -/* - * When specifying the image filename use: - * - * rbd:poolname/devicename[@snapshotname][:option1=value1[:option2=value2...]] - * - * poolname must be the name of an existing rados pool. - * - * devicename is the name of the rbd image. - * - * Each option given is used to configure rados, and may be any valid - * Ceph option, "id", or "conf". - * - * The "id" option indicates what user we should authenticate as to - * the Ceph cluster. If it is excluded we will use the Ceph default - * (normally 'admin'). - * - * The "conf" option specifies a Ceph configuration file to read. If - * it is not specified, we will read from the default Ceph locations - * (e.g., /etc/ceph/ceph.conf). To avoid reading _any_ configuration - * file, specify conf=/dev/null. - * - * Configuration values containing :, @, or = can be escaped with a - * leading "\". 
- */ - -/* rbd_aio_discard added in 0.1.2 */ -#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 2) -#define LIBRBD_SUPPORTS_DISCARD -#else -#undef LIBRBD_SUPPORTS_DISCARD -#endif - -#define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER) - -#define RBD_MAX_CONF_NAME_SIZE 128 -#define RBD_MAX_CONF_VAL_SIZE 512 -#define RBD_MAX_CONF_SIZE 1024 -#define RBD_MAX_POOL_NAME_SIZE 128 -#define RBD_MAX_SNAP_NAME_SIZE 128 -#define RBD_MAX_SNAPS 100 - -typedef enum { - RBD_AIO_READ, - RBD_AIO_WRITE, - RBD_AIO_DISCARD, - RBD_AIO_FLUSH -} RBDAIOCmd; - -typedef struct RBDAIOCB { - BlockAIOCB common; - QEMUBH *bh; - int64_t ret; - QEMUIOVector *qiov; - char *bounce; - RBDAIOCmd cmd; - int error; - struct BDRVRBDState *s; -} RBDAIOCB; - -typedef struct RADOSCB { - RBDAIOCB *acb; - struct BDRVRBDState *s; - int64_t size; - char *buf; - int64_t ret; -} RADOSCB; - -typedef struct BDRVRBDState { - rados_t cluster; - rados_ioctx_t io_ctx; - rbd_image_t image; - char name[RBD_MAX_IMAGE_NAME_SIZE]; - char *snap; -} BDRVRBDState; - -static int qemu_rbd_next_tok(char *dst, int dst_len, - char *src, char delim, - const char *name, - char **p, Error **errp) -{ - int l; - char *end; - - *p = NULL; - - if (delim != '\0') { - for (end = src; *end; ++end) { - if (*end == delim) { - break; - } - if (*end == '\\' && end[1] != '\0') { - end++; - } - } - if (*end == delim) { - *p = end + 1; - *end = '\0'; - } - } - l = strlen(src); - if (l >= dst_len) { - error_setg(errp, "%s too long", name); - return -EINVAL; - } else if (l == 0) { - error_setg(errp, "%s too short", name); - return -EINVAL; - } - - pstrcpy(dst, dst_len, src); - - return 0; -} - -static void qemu_rbd_unescape(char *src) -{ - char *p; - - for (p = src; *src; ++src, ++p) { - if (*src == '\\' && src[1] != '\0') { - src++; - } - *p = *src; - } - *p = '\0'; -} - -static int qemu_rbd_parsename(const char *filename, - char *pool, int pool_len, - char *snap, int snap_len, - char *name, int name_len, - char *conf, int conf_len, - Error **errp) -{ 
- const char *start; - char *p, *buf; - int ret; - - if (!strstart(filename, "rbd:", &start)) { - error_setg(errp, "File name must start with 'rbd:'"); - return -EINVAL; - } - - buf = g_strdup(start); - p = buf; - *snap = '\0'; - *conf = '\0'; - - ret = qemu_rbd_next_tok(pool, pool_len, p, - '/', "pool name", &p, errp); - if (ret < 0 || !p) { - ret = -EINVAL; - goto done; - } - qemu_rbd_unescape(pool); - - if (strchr(p, '@')) { - ret = qemu_rbd_next_tok(name, name_len, p, - '@', "object name", &p, errp); - if (ret < 0) { - goto done; - } - ret = qemu_rbd_next_tok(snap, snap_len, p, - ':', "snap name", &p, errp); - qemu_rbd_unescape(snap); - } else { - ret = qemu_rbd_next_tok(name, name_len, p, - ':', "object name", &p, errp); - } - qemu_rbd_unescape(name); - if (ret < 0 || !p) { - goto done; - } - - ret = qemu_rbd_next_tok(conf, conf_len, p, - '\0', "configuration", &p, errp); - -done: - g_free(buf); - return ret; -} - -static char *qemu_rbd_parse_clientname(const char *conf, char *clientname) -{ - const char *p = conf; - - while (*p) { - int len; - const char *end = strchr(p, ':'); - - if (end) { - len = end - p; - } else { - len = strlen(p); - } - - if (strncmp(p, "id=", 3) == 0) { - len -= 3; - strncpy(clientname, p + 3, len); - clientname[len] = '\0'; - return clientname; - } - if (end == NULL) { - break; - } - p = end + 1; - } - return NULL; -} - - -static int qemu_rbd_set_auth(rados_t cluster, const char *secretid, - Error **errp) -{ - if (secretid == 0) { - return 0; - } - - gchar *secret = qcrypto_secret_lookup_as_base64(secretid, - errp); - if (!secret) { - return -1; - } - - rados_conf_set(cluster, "key", secret); - g_free(secret); - - return 0; -} - - -static int qemu_rbd_set_conf(rados_t cluster, const char *conf, - bool only_read_conf_file, - Error **errp) -{ - char *p, *buf; - char name[RBD_MAX_CONF_NAME_SIZE]; - char value[RBD_MAX_CONF_VAL_SIZE]; - int ret = 0; - - buf = g_strdup(conf); - p = buf; - - while (p) { - ret = qemu_rbd_next_tok(name, 
sizeof(name), p, - '=', "conf option name", &p, errp); - if (ret < 0) { - break; - } - qemu_rbd_unescape(name); - - if (!p) { - error_setg(errp, "conf option %s has no value", name); - ret = -EINVAL; - break; - } - - ret = qemu_rbd_next_tok(value, sizeof(value), p, - ':', "conf option value", &p, errp); - if (ret < 0) { - break; - } - qemu_rbd_unescape(value); - - if (strcmp(name, "conf") == 0) { - /* read the conf file alone, so it doesn't override more - specific settings for a particular device */ - if (only_read_conf_file) { - ret = rados_conf_read_file(cluster, value); - if (ret < 0) { - error_setg(errp, "error reading conf file %s", value); - break; - } - } - } else if (strcmp(name, "id") == 0) { - /* ignore, this is parsed by qemu_rbd_parse_clientname() */ - } else if (!only_read_conf_file) { - ret = rados_conf_set(cluster, name, value); - if (ret < 0) { - error_setg(errp, "invalid conf option %s", name); - ret = -EINVAL; - break; - } - } - } - - g_free(buf); - return ret; -} - -static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) -{ - Error *local_err = NULL; - int64_t bytes = 0; - int64_t objsize; - int obj_order = 0; - char pool[RBD_MAX_POOL_NAME_SIZE]; - char name[RBD_MAX_IMAGE_NAME_SIZE]; - char snap_buf[RBD_MAX_SNAP_NAME_SIZE]; - char conf[RBD_MAX_CONF_SIZE]; - char clientname_buf[RBD_MAX_CONF_SIZE]; - char *clientname; - const char *secretid; - rados_t cluster; - rados_ioctx_t io_ctx; - int ret; - - secretid = qemu_opt_get(opts, "password-secret"); - - if (qemu_rbd_parsename(filename, pool, sizeof(pool), - snap_buf, sizeof(snap_buf), - name, sizeof(name), - conf, sizeof(conf), &local_err) < 0) { - error_propagate(errp, local_err); - return -EINVAL; - } - - /* Read out options */ - bytes = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - objsize = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, 0); - if (objsize) { - if ((objsize - 1) & objsize) { /* not a power of 2? 
*/ - error_setg(errp, "obj size needs to be power of 2"); - return -EINVAL; - } - if (objsize < 4096) { - error_setg(errp, "obj size too small"); - return -EINVAL; - } - obj_order = ctz32(objsize); - } - - clientname = qemu_rbd_parse_clientname(conf, clientname_buf); - if (rados_create(&cluster, clientname) < 0) { - error_setg(errp, "error initializing"); - return -EIO; - } - - if (strstr(conf, "conf=") == NULL) { - /* try default location, but ignore failure */ - rados_conf_read_file(cluster, NULL); - } else if (conf[0] != '\0' && - qemu_rbd_set_conf(cluster, conf, true, &local_err) < 0) { - rados_shutdown(cluster); - error_propagate(errp, local_err); - return -EIO; - } - - if (conf[0] != '\0' && - qemu_rbd_set_conf(cluster, conf, false, &local_err) < 0) { - rados_shutdown(cluster); - error_propagate(errp, local_err); - return -EIO; - } - - if (qemu_rbd_set_auth(cluster, secretid, errp) < 0) { - rados_shutdown(cluster); - return -EIO; - } - - if (rados_connect(cluster) < 0) { - error_setg(errp, "error connecting"); - rados_shutdown(cluster); - return -EIO; - } - - if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) { - error_setg(errp, "error opening pool %s", pool); - rados_shutdown(cluster); - return -EIO; - } - - ret = rbd_create(io_ctx, name, bytes, &obj_order); - rados_ioctx_destroy(io_ctx); - rados_shutdown(cluster); - - return ret; -} - -/* - * This aio completion is being called from rbd_finish_bh() and runs in qemu - * BH context. 
- */ -static void qemu_rbd_complete_aio(RADOSCB *rcb) -{ - RBDAIOCB *acb = rcb->acb; - int64_t r; - - r = rcb->ret; - - if (acb->cmd != RBD_AIO_READ) { - if (r < 0) { - acb->ret = r; - acb->error = 1; - } else if (!acb->error) { - acb->ret = rcb->size; - } - } else { - if (r < 0) { - memset(rcb->buf, 0, rcb->size); - acb->ret = r; - acb->error = 1; - } else if (r < rcb->size) { - memset(rcb->buf + r, 0, rcb->size - r); - if (!acb->error) { - acb->ret = rcb->size; - } - } else if (!acb->error) { - acb->ret = r; - } - } - - g_free(rcb); - - if (acb->cmd == RBD_AIO_READ) { - qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); - } - qemu_vfree(acb->bounce); - acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); - - qemu_aio_unref(acb); -} - -/* TODO Convert to fine grained options */ -static QemuOptsList runtime_opts = { - .name = "rbd", - .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), - .desc = { - { - .name = "filename", - .type = QEMU_OPT_STRING, - .help = "Specification of the rbd image", - }, - { - .name = "password-secret", - .type = QEMU_OPT_STRING, - .help = "ID of secret providing the password", - }, - { /* end of list */ } - }, -}; - -static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVRBDState *s = bs->opaque; - char pool[RBD_MAX_POOL_NAME_SIZE]; - char snap_buf[RBD_MAX_SNAP_NAME_SIZE]; - char conf[RBD_MAX_CONF_SIZE]; - char clientname_buf[RBD_MAX_CONF_SIZE]; - char *clientname; - const char *secretid; - QemuOpts *opts; - Error *local_err = NULL; - const char *filename; - int r; - - opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - qemu_opts_del(opts); - return -EINVAL; - } - - filename = qemu_opt_get(opts, "filename"); - secretid = qemu_opt_get(opts, "password-secret"); - - if (qemu_rbd_parsename(filename, pool, sizeof(pool), - snap_buf, sizeof(snap_buf), - 
s->name, sizeof(s->name), - conf, sizeof(conf), errp) < 0) { - r = -EINVAL; - goto failed_opts; - } - - clientname = qemu_rbd_parse_clientname(conf, clientname_buf); - r = rados_create(&s->cluster, clientname); - if (r < 0) { - error_setg(errp, "error initializing"); - goto failed_opts; - } - - s->snap = NULL; - if (snap_buf[0] != '\0') { - s->snap = g_strdup(snap_buf); - } - - if (strstr(conf, "conf=") == NULL) { - /* try default location, but ignore failure */ - rados_conf_read_file(s->cluster, NULL); - } else if (conf[0] != '\0') { - r = qemu_rbd_set_conf(s->cluster, conf, true, errp); - if (r < 0) { - goto failed_shutdown; - } - } - - if (conf[0] != '\0') { - r = qemu_rbd_set_conf(s->cluster, conf, false, errp); - if (r < 0) { - goto failed_shutdown; - } - } - - if (qemu_rbd_set_auth(s->cluster, secretid, errp) < 0) { - r = -EIO; - goto failed_shutdown; - } - - /* - * Fallback to more conservative semantics if setting cache - * options fails. Ignore errors from setting rbd_cache because the - * only possible error is that the option does not exist, and - * librbd defaults to no caching. If write through caching cannot - * be set up, fall back to no caching. 
- */ - if (flags & BDRV_O_NOCACHE) { - rados_conf_set(s->cluster, "rbd_cache", "false"); - } else { - rados_conf_set(s->cluster, "rbd_cache", "true"); - } - - r = rados_connect(s->cluster); - if (r < 0) { - error_setg(errp, "error connecting"); - goto failed_shutdown; - } - - r = rados_ioctx_create(s->cluster, pool, &s->io_ctx); - if (r < 0) { - error_setg(errp, "error opening pool %s", pool); - goto failed_shutdown; - } - - r = rbd_open(s->io_ctx, s->name, &s->image, s->snap); - if (r < 0) { - error_setg(errp, "error reading header from %s", s->name); - goto failed_open; - } - - bs->read_only = (s->snap != NULL); - - qemu_opts_del(opts); - return 0; - -failed_open: - rados_ioctx_destroy(s->io_ctx); -failed_shutdown: - rados_shutdown(s->cluster); - g_free(s->snap); -failed_opts: - qemu_opts_del(opts); - return r; -} - -static void qemu_rbd_close(BlockDriverState *bs) -{ - BDRVRBDState *s = bs->opaque; - - rbd_close(s->image); - rados_ioctx_destroy(s->io_ctx); - g_free(s->snap); - rados_shutdown(s->cluster); -} - -static const AIOCBInfo rbd_aiocb_info = { - .aiocb_size = sizeof(RBDAIOCB), -}; - -static void rbd_finish_bh(void *opaque) -{ - RADOSCB *rcb = opaque; - qemu_bh_delete(rcb->acb->bh); - qemu_rbd_complete_aio(rcb); -} - -/* - * This is the callback function for rbd_aio_read and _write - * - * Note: this function is being called from a non qemu thread so - * we need to be careful about what we do here. Generally we only - * schedule a BH, and do the rest of the io completion handling - * from rbd_finish_bh() which runs in a qemu context. 
- */ -static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb) -{ - RBDAIOCB *acb = rcb->acb; - - rcb->ret = rbd_aio_get_return_value(c); - rbd_aio_release(c); - - acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs), - rbd_finish_bh, rcb); - qemu_bh_schedule(acb->bh); -} - -static int rbd_aio_discard_wrapper(rbd_image_t image, - uint64_t off, - uint64_t len, - rbd_completion_t comp) -{ -#ifdef LIBRBD_SUPPORTS_DISCARD - return rbd_aio_discard(image, off, len, comp); -#else - return -ENOTSUP; -#endif -} - -static int rbd_aio_flush_wrapper(rbd_image_t image, - rbd_completion_t comp) -{ -#ifdef LIBRBD_SUPPORTS_AIO_FLUSH - return rbd_aio_flush(image, comp); -#else - return -ENOTSUP; -#endif -} - -static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque, - RBDAIOCmd cmd) -{ - RBDAIOCB *acb; - RADOSCB *rcb = NULL; - rbd_completion_t c; - int64_t off, size; - char *buf; - int r; - - BDRVRBDState *s = bs->opaque; - - acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque); - acb->cmd = cmd; - acb->qiov = qiov; - if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) { - acb->bounce = NULL; - } else { - acb->bounce = qemu_try_blockalign(bs, qiov->size); - if (acb->bounce == NULL) { - goto failed; - } - } - acb->ret = 0; - acb->error = 0; - acb->s = s; - acb->bh = NULL; - - if (cmd == RBD_AIO_WRITE) { - qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); - } - - buf = acb->bounce; - - off = sector_num * BDRV_SECTOR_SIZE; - size = nb_sectors * BDRV_SECTOR_SIZE; - - rcb = g_new(RADOSCB, 1); - rcb->acb = acb; - rcb->buf = buf; - rcb->s = acb->s; - rcb->size = size; - r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c); - if (r < 0) { - goto failed; - } - - switch (cmd) { - case RBD_AIO_WRITE: - r = rbd_aio_write(s->image, off, size, buf, c); - break; - case RBD_AIO_READ: - r = rbd_aio_read(s->image, off, size, buf, c); - break; - case 
RBD_AIO_DISCARD: - r = rbd_aio_discard_wrapper(s->image, off, size, c); - break; - case RBD_AIO_FLUSH: - r = rbd_aio_flush_wrapper(s->image, c); - break; - default: - r = -EINVAL; - } - - if (r < 0) { - goto failed_completion; - } - - return &acb->common; - -failed_completion: - rbd_aio_release(c); -failed: - g_free(rcb); - qemu_vfree(acb->bounce); - qemu_aio_unref(acb); - return NULL; -} - -static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) -{ - return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque, - RBD_AIO_READ); -} - -static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs, - int64_t sector_num, - QEMUIOVector *qiov, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) -{ - return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque, - RBD_AIO_WRITE); -} - -#ifdef LIBRBD_SUPPORTS_AIO_FLUSH -static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs, - BlockCompletionFunc *cb, - void *opaque) -{ - return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH); -} - -#else - -static int qemu_rbd_co_flush(BlockDriverState *bs) -{ -#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1) - /* rbd_flush added in 0.1.1 */ - BDRVRBDState *s = bs->opaque; - return rbd_flush(s->image); -#else - return 0; -#endif -} -#endif - -static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - BDRVRBDState *s = bs->opaque; - rbd_image_info_t info; - int r; - - r = rbd_stat(s->image, &info, sizeof(info)); - if (r < 0) { - return r; - } - - bdi->cluster_size = info.obj_size; - return 0; -} - -static int64_t qemu_rbd_getlength(BlockDriverState *bs) -{ - BDRVRBDState *s = bs->opaque; - rbd_image_info_t info; - int r; - - r = rbd_stat(s->image, &info, sizeof(info)); - if (r < 0) { - return r; - } - - return info.size; -} - -static int qemu_rbd_truncate(BlockDriverState *bs, int64_t offset) -{ - BDRVRBDState *s = bs->opaque; - int 
r; - - r = rbd_resize(s->image, offset); - if (r < 0) { - return r; - } - - return 0; -} - -static int qemu_rbd_snap_create(BlockDriverState *bs, - QEMUSnapshotInfo *sn_info) -{ - BDRVRBDState *s = bs->opaque; - int r; - - if (sn_info->name[0] == '\0') { - return -EINVAL; /* we need a name for rbd snapshots */ - } - - /* - * rbd snapshots are using the name as the user controlled unique identifier - * we can't use the rbd snapid for that purpose, as it can't be set - */ - if (sn_info->id_str[0] != '\0' && - strcmp(sn_info->id_str, sn_info->name) != 0) { - return -EINVAL; - } - - if (strlen(sn_info->name) >= sizeof(sn_info->id_str)) { - return -ERANGE; - } - - r = rbd_snap_create(s->image, sn_info->name); - if (r < 0) { - error_report("failed to create snap: %s", strerror(-r)); - return r; - } - - return 0; -} - -static int qemu_rbd_snap_remove(BlockDriverState *bs, - const char *snapshot_id, - const char *snapshot_name, - Error **errp) -{ - BDRVRBDState *s = bs->opaque; - int r; - - if (!snapshot_name) { - error_setg(errp, "rbd need a valid snapshot name"); - return -EINVAL; - } - - /* If snapshot_id is specified, it must be equal to name, see - qemu_rbd_snap_list() */ - if (snapshot_id && strcmp(snapshot_id, snapshot_name)) { - error_setg(errp, - "rbd do not support snapshot id, it should be NULL or " - "equal to snapshot name"); - return -EINVAL; - } - - r = rbd_snap_remove(s->image, snapshot_name); - if (r < 0) { - error_setg_errno(errp, -r, "Failed to remove the snapshot"); - } - return r; -} - -static int qemu_rbd_snap_rollback(BlockDriverState *bs, - const char *snapshot_name) -{ - BDRVRBDState *s = bs->opaque; - int r; - - r = rbd_snap_rollback(s->image, snapshot_name); - return r; -} - -static int qemu_rbd_snap_list(BlockDriverState *bs, - QEMUSnapshotInfo **psn_tab) -{ - BDRVRBDState *s = bs->opaque; - QEMUSnapshotInfo *sn_info, *sn_tab = NULL; - int i, snap_count; - rbd_snap_info_t *snaps; - int max_snaps = RBD_MAX_SNAPS; - - do { - snaps = 
g_new(rbd_snap_info_t, max_snaps); - snap_count = rbd_snap_list(s->image, snaps, &max_snaps); - if (snap_count <= 0) { - g_free(snaps); - } - } while (snap_count == -ERANGE); - - if (snap_count <= 0) { - goto done; - } - - sn_tab = g_new0(QEMUSnapshotInfo, snap_count); - - for (i = 0; i < snap_count; i++) { - const char *snap_name = snaps[i].name; - - sn_info = sn_tab + i; - pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), snap_name); - pstrcpy(sn_info->name, sizeof(sn_info->name), snap_name); - - sn_info->vm_state_size = snaps[i].size; - sn_info->date_sec = 0; - sn_info->date_nsec = 0; - sn_info->vm_clock_nsec = 0; - } - rbd_snap_list_end(snaps); - g_free(snaps); - - done: - *psn_tab = sn_tab; - return snap_count; -} - -#ifdef LIBRBD_SUPPORTS_DISCARD -static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - BlockCompletionFunc *cb, - void *opaque) -{ - return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque, - RBD_AIO_DISCARD); -} -#endif - -#ifdef LIBRBD_SUPPORTS_INVALIDATE -static void qemu_rbd_invalidate_cache(BlockDriverState *bs, - Error **errp) -{ - BDRVRBDState *s = bs->opaque; - int r = rbd_invalidate_cache(s->image); - if (r < 0) { - error_setg_errno(errp, -r, "Failed to invalidate the cache"); - } -} -#endif - -static QemuOptsList qemu_rbd_create_opts = { - .name = "rbd-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(qemu_rbd_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_CLUSTER_SIZE, - .type = QEMU_OPT_SIZE, - .help = "RBD object size" - }, - { - .name = "password-secret", - .type = QEMU_OPT_STRING, - .help = "ID of secret providing the password", - }, - { /* end of list */ } - } -}; - -static BlockDriver bdrv_rbd = { - .format_name = "rbd", - .instance_size = sizeof(BDRVRBDState), - .bdrv_needs_filename = true, - .bdrv_file_open = qemu_rbd_open, - .bdrv_close = qemu_rbd_close, - .bdrv_create = 
qemu_rbd_create, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_get_info = qemu_rbd_getinfo, - .create_opts = &qemu_rbd_create_opts, - .bdrv_getlength = qemu_rbd_getlength, - .bdrv_truncate = qemu_rbd_truncate, - .protocol_name = "rbd", - - .bdrv_aio_readv = qemu_rbd_aio_readv, - .bdrv_aio_writev = qemu_rbd_aio_writev, - -#ifdef LIBRBD_SUPPORTS_AIO_FLUSH - .bdrv_aio_flush = qemu_rbd_aio_flush, -#else - .bdrv_co_flush_to_disk = qemu_rbd_co_flush, -#endif - -#ifdef LIBRBD_SUPPORTS_DISCARD - .bdrv_aio_discard = qemu_rbd_aio_discard, -#endif - - .bdrv_snapshot_create = qemu_rbd_snap_create, - .bdrv_snapshot_delete = qemu_rbd_snap_remove, - .bdrv_snapshot_list = qemu_rbd_snap_list, - .bdrv_snapshot_goto = qemu_rbd_snap_rollback, -#ifdef LIBRBD_SUPPORTS_INVALIDATE - .bdrv_invalidate_cache = qemu_rbd_invalidate_cache, -#endif -}; - -static void bdrv_rbd_init(void) -{ - bdrv_register(&bdrv_rbd); -} - -block_init(bdrv_rbd_init); diff --git a/qemu/block/sheepdog.c b/qemu/block/sheepdog.c deleted file mode 100644 index 33e0a3382..000000000 --- a/qemu/block/sheepdog.c +++ /dev/null @@ -1,3042 +0,0 @@ -/* - * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. 
- */ - -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu/uri.h" -#include "qemu/error-report.h" -#include "qemu/sockets.h" -#include "block/block_int.h" -#include "sysemu/block-backend.h" -#include "qemu/bitops.h" -#include "qemu/cutils.h" - -#define SD_PROTO_VER 0x01 - -#define SD_DEFAULT_ADDR "localhost" -#define SD_DEFAULT_PORT 7000 - -#define SD_OP_CREATE_AND_WRITE_OBJ 0x01 -#define SD_OP_READ_OBJ 0x02 -#define SD_OP_WRITE_OBJ 0x03 -/* 0x04 is used internally by Sheepdog */ - -#define SD_OP_NEW_VDI 0x11 -#define SD_OP_LOCK_VDI 0x12 -#define SD_OP_RELEASE_VDI 0x13 -#define SD_OP_GET_VDI_INFO 0x14 -#define SD_OP_READ_VDIS 0x15 -#define SD_OP_FLUSH_VDI 0x16 -#define SD_OP_DEL_VDI 0x17 -#define SD_OP_GET_CLUSTER_DEFAULT 0x18 - -#define SD_FLAG_CMD_WRITE 0x01 -#define SD_FLAG_CMD_COW 0x02 -#define SD_FLAG_CMD_CACHE 0x04 /* Writeback mode for cache */ -#define SD_FLAG_CMD_DIRECT 0x08 /* Don't use cache */ - -#define SD_RES_SUCCESS 0x00 /* Success */ -#define SD_RES_UNKNOWN 0x01 /* Unknown error */ -#define SD_RES_NO_OBJ 0x02 /* No object found */ -#define SD_RES_EIO 0x03 /* I/O error */ -#define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */ -#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */ -#define SD_RES_SYSTEM_ERROR 0x06 /* System error */ -#define SD_RES_VDI_LOCKED 0x07 /* Vdi is locked */ -#define SD_RES_NO_VDI 0x08 /* No vdi found */ -#define SD_RES_NO_BASE_VDI 0x09 /* No base vdi found */ -#define SD_RES_VDI_READ 0x0A /* Cannot read requested vdi */ -#define SD_RES_VDI_WRITE 0x0B /* Cannot write requested vdi */ -#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */ -#define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base vdi */ -#define SD_RES_NO_TAG 0x0E /* Requested tag is not found */ -#define SD_RES_STARTUP 0x0F /* Sheepdog is on starting up */ -#define SD_RES_VDI_NOT_LOCKED 0x10 /* Vdi is not locked */ -#define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */ -#define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */ 
-#define SD_RES_FULL_VDI 0x13 /* we already have the maximum vdis */ -#define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */ -#define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */ -#define SD_RES_WAIT_FOR_FORMAT 0x16 /* Waiting for a format operation */ -#define SD_RES_WAIT_FOR_JOIN 0x17 /* Waiting for other nodes joining */ -#define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */ -#define SD_RES_HALT 0x19 /* Sheepdog is stopped serving IO request */ -#define SD_RES_READONLY 0x1A /* Object is read-only */ - -/* - * Object ID rules - * - * 0 - 19 (20 bits): data object space - * 20 - 31 (12 bits): reserved data object space - * 32 - 55 (24 bits): vdi object space - * 56 - 59 ( 4 bits): reserved vdi object space - * 60 - 63 ( 4 bits): object type identifier space - */ - -#define VDI_SPACE_SHIFT 32 -#define VDI_BIT (UINT64_C(1) << 63) -#define VMSTATE_BIT (UINT64_C(1) << 62) -#define MAX_DATA_OBJS (UINT64_C(1) << 20) -#define MAX_CHILDREN 1024 -#define SD_MAX_VDI_LEN 256 -#define SD_MAX_VDI_TAG_LEN 256 -#define SD_NR_VDIS (1U << 24) -#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) -#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) -#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22 -/* - * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and - * (SD_EC_MAX_STRIP - 1) for parity strips - * - * SD_MAX_COPIES is sum of number of data strips and parity strips. 
- */ -#define SD_EC_MAX_STRIP 16 -#define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1) - -#define SD_INODE_SIZE (sizeof(SheepdogInode)) -#define CURRENT_VDI_ID 0 - -#define LOCK_TYPE_NORMAL 0 -#define LOCK_TYPE_SHARED 1 /* for iSCSI multipath */ - -typedef struct SheepdogReq { - uint8_t proto_ver; - uint8_t opcode; - uint16_t flags; - uint32_t epoch; - uint32_t id; - uint32_t data_length; - uint32_t opcode_specific[8]; -} SheepdogReq; - -typedef struct SheepdogRsp { - uint8_t proto_ver; - uint8_t opcode; - uint16_t flags; - uint32_t epoch; - uint32_t id; - uint32_t data_length; - uint32_t result; - uint32_t opcode_specific[7]; -} SheepdogRsp; - -typedef struct SheepdogObjReq { - uint8_t proto_ver; - uint8_t opcode; - uint16_t flags; - uint32_t epoch; - uint32_t id; - uint32_t data_length; - uint64_t oid; - uint64_t cow_oid; - uint8_t copies; - uint8_t copy_policy; - uint8_t reserved[6]; - uint64_t offset; -} SheepdogObjReq; - -typedef struct SheepdogObjRsp { - uint8_t proto_ver; - uint8_t opcode; - uint16_t flags; - uint32_t epoch; - uint32_t id; - uint32_t data_length; - uint32_t result; - uint8_t copies; - uint8_t copy_policy; - uint8_t reserved[2]; - uint32_t pad[6]; -} SheepdogObjRsp; - -typedef struct SheepdogVdiReq { - uint8_t proto_ver; - uint8_t opcode; - uint16_t flags; - uint32_t epoch; - uint32_t id; - uint32_t data_length; - uint64_t vdi_size; - uint32_t base_vdi_id; - uint8_t copies; - uint8_t copy_policy; - uint8_t store_policy; - uint8_t block_size_shift; - uint32_t snapid; - uint32_t type; - uint32_t pad[2]; -} SheepdogVdiReq; - -typedef struct SheepdogVdiRsp { - uint8_t proto_ver; - uint8_t opcode; - uint16_t flags; - uint32_t epoch; - uint32_t id; - uint32_t data_length; - uint32_t result; - uint32_t rsvd; - uint32_t vdi_id; - uint32_t pad[5]; -} SheepdogVdiRsp; - -typedef struct SheepdogClusterRsp { - uint8_t proto_ver; - uint8_t opcode; - uint16_t flags; - uint32_t epoch; - uint32_t id; - uint32_t data_length; - uint32_t result; - uint8_t nr_copies; 
- uint8_t copy_policy; - uint8_t block_size_shift; - uint8_t __pad1; - uint32_t __pad2[6]; -} SheepdogClusterRsp; - -typedef struct SheepdogInode { - char name[SD_MAX_VDI_LEN]; - char tag[SD_MAX_VDI_TAG_LEN]; - uint64_t ctime; - uint64_t snap_ctime; - uint64_t vm_clock_nsec; - uint64_t vdi_size; - uint64_t vm_state_size; - uint16_t copy_policy; - uint8_t nr_copies; - uint8_t block_size_shift; - uint32_t snap_id; - uint32_t vdi_id; - uint32_t parent_vdi_id; - uint32_t child_vdi_id[MAX_CHILDREN]; - uint32_t data_vdi_id[MAX_DATA_OBJS]; -} SheepdogInode; - -#define SD_INODE_HEADER_SIZE offsetof(SheepdogInode, data_vdi_id) - -/* - * 64 bit FNV-1a non-zero initial basis - */ -#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL) - -/* - * 64 bit Fowler/Noll/Vo FNV-1a hash code - */ -static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval) -{ - unsigned char *bp = buf; - unsigned char *be = bp + len; - while (bp < be) { - hval ^= (uint64_t) *bp++; - hval += (hval << 1) + (hval << 4) + (hval << 5) + - (hval << 7) + (hval << 8) + (hval << 40); - } - return hval; -} - -static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx) -{ - return inode->vdi_id == inode->data_vdi_id[idx]; -} - -static inline bool is_data_obj(uint64_t oid) -{ - return !(VDI_BIT & oid); -} - -static inline uint64_t data_oid_to_idx(uint64_t oid) -{ - return oid & (MAX_DATA_OBJS - 1); -} - -static inline uint32_t oid_to_vid(uint64_t oid) -{ - return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT; -} - -static inline uint64_t vid_to_vdi_oid(uint32_t vid) -{ - return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT); -} - -static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx) -{ - return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; -} - -static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx) -{ - return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; -} - -static inline bool is_snapshot(struct SheepdogInode *inode) -{ - return !!inode->snap_ctime; -} 
- -static inline size_t count_data_objs(const struct SheepdogInode *inode) -{ - return DIV_ROUND_UP(inode->vdi_size, - (1UL << inode->block_size_shift)); -} - -#undef DPRINTF -#ifdef DEBUG_SDOG -#define DPRINTF(fmt, args...) \ - do { \ - fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \ - } while (0) -#else -#define DPRINTF(fmt, args...) -#endif - -typedef struct SheepdogAIOCB SheepdogAIOCB; - -typedef struct AIOReq { - SheepdogAIOCB *aiocb; - unsigned int iov_offset; - - uint64_t oid; - uint64_t base_oid; - uint64_t offset; - unsigned int data_len; - uint8_t flags; - uint32_t id; - bool create; - - QLIST_ENTRY(AIOReq) aio_siblings; -} AIOReq; - -enum AIOCBState { - AIOCB_WRITE_UDATA, - AIOCB_READ_UDATA, - AIOCB_FLUSH_CACHE, - AIOCB_DISCARD_OBJ, -}; - -#define AIOCBOverlapping(x, y) \ - (!(x->max_affect_data_idx < y->min_affect_data_idx \ - || y->max_affect_data_idx < x->min_affect_data_idx)) - -struct SheepdogAIOCB { - BlockAIOCB common; - - QEMUIOVector *qiov; - - int64_t sector_num; - int nb_sectors; - - int ret; - enum AIOCBState aiocb_type; - - Coroutine *coroutine; - void (*aio_done_func)(SheepdogAIOCB *); - - bool cancelable; - int nr_pending; - - uint32_t min_affect_data_idx; - uint32_t max_affect_data_idx; - - /* - * The difference between affect_data_idx and dirty_data_idx: - * affect_data_idx represents range of index of all request types. - * dirty_data_idx represents range of index updated by COW requests. - * dirty_data_idx is used for updating an inode object. 
- */ - uint32_t min_dirty_data_idx; - uint32_t max_dirty_data_idx; - - QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings; -}; - -typedef struct BDRVSheepdogState { - BlockDriverState *bs; - AioContext *aio_context; - - SheepdogInode inode; - - char name[SD_MAX_VDI_LEN]; - bool is_snapshot; - uint32_t cache_flags; - bool discard_supported; - - char *host_spec; - bool is_unix; - int fd; - - CoMutex lock; - Coroutine *co_send; - Coroutine *co_recv; - - uint32_t aioreq_seq_num; - - /* Every aio request must be linked to either of these queues. */ - QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head; - QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head; - - CoQueue overlapping_queue; - QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head; -} BDRVSheepdogState; - -typedef struct BDRVSheepdogReopenState { - int fd; - int cache_flags; -} BDRVSheepdogReopenState; - -static const char * sd_strerror(int err) -{ - int i; - - static const struct { - int err; - const char *desc; - } errors[] = { - {SD_RES_SUCCESS, "Success"}, - {SD_RES_UNKNOWN, "Unknown error"}, - {SD_RES_NO_OBJ, "No object found"}, - {SD_RES_EIO, "I/O error"}, - {SD_RES_VDI_EXIST, "VDI exists already"}, - {SD_RES_INVALID_PARMS, "Invalid parameters"}, - {SD_RES_SYSTEM_ERROR, "System error"}, - {SD_RES_VDI_LOCKED, "VDI is already locked"}, - {SD_RES_NO_VDI, "No vdi found"}, - {SD_RES_NO_BASE_VDI, "No base VDI found"}, - {SD_RES_VDI_READ, "Failed read the requested VDI"}, - {SD_RES_VDI_WRITE, "Failed to write the requested VDI"}, - {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"}, - {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"}, - {SD_RES_NO_TAG, "Failed to find the requested tag"}, - {SD_RES_STARTUP, "The system is still booting"}, - {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"}, - {SD_RES_SHUTDOWN, "The system is shutting down"}, - {SD_RES_NO_MEM, "Out of memory on the server"}, - {SD_RES_FULL_VDI, "We already have the maximum vdis"}, - {SD_RES_VER_MISMATCH, "Protocol version mismatch"}, 
- {SD_RES_NO_SPACE, "Server has no space for new objects"}, - {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"}, - {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"}, - {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"}, - {SD_RES_HALT, "Sheepdog is stopped serving IO request"}, - {SD_RES_READONLY, "Object is read-only"}, - }; - - for (i = 0; i < ARRAY_SIZE(errors); ++i) { - if (errors[i].err == err) { - return errors[i].desc; - } - } - - return "Invalid error code"; -} - -/* - * Sheepdog I/O handling: - * - * 1. In sd_co_rw_vector, we send the I/O requests to the server and - * link the requests to the inflight_list in the - * BDRVSheepdogState. The function exits without waiting for - * receiving the response. - * - * 2. We receive the response in aio_read_response, the fd handler to - * the sheepdog connection. If metadata update is needed, we send - * the write request to the vdi object in sd_write_done, the write - * completion function. We switch back to sd_co_readv/writev after - * all the requests belonging to the AIOCB are finished. 
- */ - -static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb, - uint64_t oid, unsigned int data_len, - uint64_t offset, uint8_t flags, bool create, - uint64_t base_oid, unsigned int iov_offset) -{ - AIOReq *aio_req; - - aio_req = g_malloc(sizeof(*aio_req)); - aio_req->aiocb = acb; - aio_req->iov_offset = iov_offset; - aio_req->oid = oid; - aio_req->base_oid = base_oid; - aio_req->offset = offset; - aio_req->data_len = data_len; - aio_req->flags = flags; - aio_req->id = s->aioreq_seq_num++; - aio_req->create = create; - - acb->nr_pending++; - return aio_req; -} - -static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) -{ - SheepdogAIOCB *acb = aio_req->aiocb; - - acb->cancelable = false; - QLIST_REMOVE(aio_req, aio_siblings); - g_free(aio_req); - - acb->nr_pending--; -} - -static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb) -{ - qemu_coroutine_enter(acb->coroutine, NULL); - qemu_aio_unref(acb); -} - -/* - * Check whether the specified acb can be canceled - * - * We can cancel aio when any request belonging to the acb is: - * - Not processed by the sheepdog server. - * - Not linked to the inflight queue. - */ -static bool sd_acb_cancelable(const SheepdogAIOCB *acb) -{ - BDRVSheepdogState *s = acb->common.bs->opaque; - AIOReq *aioreq; - - if (!acb->cancelable) { - return false; - } - - QLIST_FOREACH(aioreq, &s->inflight_aio_head, aio_siblings) { - if (aioreq->aiocb == acb) { - return false; - } - } - - return true; -} - -static void sd_aio_cancel(BlockAIOCB *blockacb) -{ - SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb; - BDRVSheepdogState *s = acb->common.bs->opaque; - AIOReq *aioreq, *next; - - if (sd_acb_cancelable(acb)) { - /* Remove outstanding requests from failed queue. 
*/ - QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings, - next) { - if (aioreq->aiocb == acb) { - free_aio_req(s, aioreq); - } - } - - assert(acb->nr_pending == 0); - if (acb->common.cb) { - acb->common.cb(acb->common.opaque, -ECANCELED); - } - sd_finish_aiocb(acb); - } -} - -static const AIOCBInfo sd_aiocb_info = { - .aiocb_size = sizeof(SheepdogAIOCB), - .cancel_async = sd_aio_cancel, -}; - -static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, - int64_t sector_num, int nb_sectors) -{ - SheepdogAIOCB *acb; - uint32_t object_size; - BDRVSheepdogState *s = bs->opaque; - - object_size = (UINT32_C(1) << s->inode.block_size_shift); - - acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL); - - acb->qiov = qiov; - - acb->sector_num = sector_num; - acb->nb_sectors = nb_sectors; - - acb->aio_done_func = NULL; - acb->cancelable = true; - acb->coroutine = qemu_coroutine_self(); - acb->ret = 0; - acb->nr_pending = 0; - - acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size; - acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE + - acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size; - - acb->min_dirty_data_idx = UINT32_MAX; - acb->max_dirty_data_idx = 0; - - return acb; -} - -/* Return -EIO in case of error, file descriptor on success */ -static int connect_to_sdog(BDRVSheepdogState *s, Error **errp) -{ - int fd; - - if (s->is_unix) { - fd = unix_connect(s->host_spec, errp); - } else { - fd = inet_connect(s->host_spec, errp); - - if (fd >= 0) { - int ret = socket_set_nodelay(fd); - if (ret < 0) { - error_report("%s", strerror(errno)); - } - } - } - - if (fd >= 0) { - qemu_set_nonblock(fd); - } else { - fd = -EIO; - } - - return fd; -} - -/* Return 0 on success and -errno in case of error */ -static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, - unsigned int *wlen) -{ - int ret; - - ret = qemu_co_send(sockfd, hdr, sizeof(*hdr)); - if (ret != sizeof(*hdr)) { - error_report("failed to send 
a req, %s", strerror(errno)); - return -errno; - } - - ret = qemu_co_send(sockfd, data, *wlen); - if (ret != *wlen) { - error_report("failed to send a req, %s", strerror(errno)); - return -errno; - } - - return ret; -} - -static void restart_co_req(void *opaque) -{ - Coroutine *co = opaque; - - qemu_coroutine_enter(co, NULL); -} - -typedef struct SheepdogReqCo { - int sockfd; - AioContext *aio_context; - SheepdogReq *hdr; - void *data; - unsigned int *wlen; - unsigned int *rlen; - int ret; - bool finished; -} SheepdogReqCo; - -static coroutine_fn void do_co_req(void *opaque) -{ - int ret; - Coroutine *co; - SheepdogReqCo *srco = opaque; - int sockfd = srco->sockfd; - SheepdogReq *hdr = srco->hdr; - void *data = srco->data; - unsigned int *wlen = srco->wlen; - unsigned int *rlen = srco->rlen; - - co = qemu_coroutine_self(); - aio_set_fd_handler(srco->aio_context, sockfd, false, - NULL, restart_co_req, co); - - ret = send_co_req(sockfd, hdr, data, wlen); - if (ret < 0) { - goto out; - } - - aio_set_fd_handler(srco->aio_context, sockfd, false, - restart_co_req, NULL, co); - - ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr)); - if (ret != sizeof(*hdr)) { - error_report("failed to get a rsp, %s", strerror(errno)); - ret = -errno; - goto out; - } - - if (*rlen > hdr->data_length) { - *rlen = hdr->data_length; - } - - if (*rlen) { - ret = qemu_co_recv(sockfd, data, *rlen); - if (ret != *rlen) { - error_report("failed to get the data, %s", strerror(errno)); - ret = -errno; - goto out; - } - } - ret = 0; -out: - /* there is at most one request for this sockfd, so it is safe to - * set each handler to NULL. */ - aio_set_fd_handler(srco->aio_context, sockfd, false, - NULL, NULL, NULL); - - srco->ret = ret; - srco->finished = true; -} - -/* - * Send the request to the sheep in a synchronous manner. - * - * Return 0 on success, -errno in case of error. 
- */ -static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr, - void *data, unsigned int *wlen, unsigned int *rlen) -{ - Coroutine *co; - SheepdogReqCo srco = { - .sockfd = sockfd, - .aio_context = aio_context, - .hdr = hdr, - .data = data, - .wlen = wlen, - .rlen = rlen, - .ret = 0, - .finished = false, - }; - - if (qemu_in_coroutine()) { - do_co_req(&srco); - } else { - co = qemu_coroutine_create(do_co_req); - qemu_coroutine_enter(co, &srco); - while (!srco.finished) { - aio_poll(aio_context, true); - } - } - - return srco.ret; -} - -static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, - struct iovec *iov, int niov, - enum AIOCBState aiocb_type); -static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req); -static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag); -static int get_sheep_fd(BDRVSheepdogState *s, Error **errp); -static void co_write_request(void *opaque); - -static coroutine_fn void reconnect_to_sdog(void *opaque) -{ - BDRVSheepdogState *s = opaque; - AIOReq *aio_req, *next; - - aio_set_fd_handler(s->aio_context, s->fd, false, NULL, - NULL, NULL); - close(s->fd); - s->fd = -1; - - /* Wait for outstanding write requests to be completed. */ - while (s->co_send != NULL) { - co_write_request(opaque); - } - - /* Try to reconnect the sheepdog server every one second. */ - while (s->fd < 0) { - Error *local_err = NULL; - s->fd = get_sheep_fd(s, &local_err); - if (s->fd < 0) { - DPRINTF("Wait for connection to be established\n"); - error_report_err(local_err); - co_aio_sleep_ns(bdrv_get_aio_context(s->bs), QEMU_CLOCK_REALTIME, - 1000000000ULL); - } - }; - - /* - * Now we have to resend all the request in the inflight queue. However, - * resend_aioreq() can yield and newly created requests can be added to the - * inflight queue before the coroutine is resumed. 
To avoid mixing them, we - * have to move all the inflight requests to the failed queue before - * resend_aioreq() is called. - */ - QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) { - QLIST_REMOVE(aio_req, aio_siblings); - QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings); - } - - /* Resend all the failed aio requests. */ - while (!QLIST_EMPTY(&s->failed_aio_head)) { - aio_req = QLIST_FIRST(&s->failed_aio_head); - QLIST_REMOVE(aio_req, aio_siblings); - QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - resend_aioreq(s, aio_req); - } -} - -/* - * Receive responses of the I/O requests. - * - * This function is registered as a fd handler, and called from the - * main loop when s->fd is ready for reading responses. - */ -static void coroutine_fn aio_read_response(void *opaque) -{ - SheepdogObjRsp rsp; - BDRVSheepdogState *s = opaque; - int fd = s->fd; - int ret; - AIOReq *aio_req = NULL; - SheepdogAIOCB *acb; - uint64_t idx; - - /* read a header */ - ret = qemu_co_recv(fd, &rsp, sizeof(rsp)); - if (ret != sizeof(rsp)) { - error_report("failed to get the header, %s", strerror(errno)); - goto err; - } - - /* find the right aio_req from the inflight aio list */ - QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) { - if (aio_req->id == rsp.id) { - break; - } - } - if (!aio_req) { - error_report("cannot find aio_req %x", rsp.id); - goto err; - } - - acb = aio_req->aiocb; - - switch (acb->aiocb_type) { - case AIOCB_WRITE_UDATA: - /* this coroutine context is no longer suitable for co_recv - * because we may send data to update vdi objects */ - s->co_recv = NULL; - if (!is_data_obj(aio_req->oid)) { - break; - } - idx = data_oid_to_idx(aio_req->oid); - - if (aio_req->create) { - /* - * If the object is newly created one, we need to update - * the vdi object (metadata object). min_dirty_data_idx - * and max_dirty_data_idx are changed to include updated - * index between them. 
- */ - if (rsp.result == SD_RES_SUCCESS) { - s->inode.data_vdi_id[idx] = s->inode.vdi_id; - acb->max_dirty_data_idx = MAX(idx, acb->max_dirty_data_idx); - acb->min_dirty_data_idx = MIN(idx, acb->min_dirty_data_idx); - } - } - break; - case AIOCB_READ_UDATA: - ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov, - aio_req->iov_offset, rsp.data_length); - if (ret != rsp.data_length) { - error_report("failed to get the data, %s", strerror(errno)); - goto err; - } - break; - case AIOCB_FLUSH_CACHE: - if (rsp.result == SD_RES_INVALID_PARMS) { - DPRINTF("disable cache since the server doesn't support it\n"); - s->cache_flags = SD_FLAG_CMD_DIRECT; - rsp.result = SD_RES_SUCCESS; - } - break; - case AIOCB_DISCARD_OBJ: - switch (rsp.result) { - case SD_RES_INVALID_PARMS: - error_report("sheep(%s) doesn't support discard command", - s->host_spec); - rsp.result = SD_RES_SUCCESS; - s->discard_supported = false; - break; - default: - break; - } - } - - switch (rsp.result) { - case SD_RES_SUCCESS: - break; - case SD_RES_READONLY: - if (s->inode.vdi_id == oid_to_vid(aio_req->oid)) { - ret = reload_inode(s, 0, ""); - if (ret < 0) { - goto err; - } - } - if (is_data_obj(aio_req->oid)) { - aio_req->oid = vid_to_data_oid(s->inode.vdi_id, - data_oid_to_idx(aio_req->oid)); - } else { - aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id); - } - resend_aioreq(s, aio_req); - goto out; - default: - acb->ret = -EIO; - error_report("%s", sd_strerror(rsp.result)); - break; - } - - free_aio_req(s, aio_req); - if (!acb->nr_pending) { - /* - * We've finished all requests which belong to the AIOCB, so - * we can switch back to sd_co_readv/writev now. 
- */ - acb->aio_done_func(acb); - } -out: - s->co_recv = NULL; - return; -err: - s->co_recv = NULL; - reconnect_to_sdog(opaque); -} - -static void co_read_response(void *opaque) -{ - BDRVSheepdogState *s = opaque; - - if (!s->co_recv) { - s->co_recv = qemu_coroutine_create(aio_read_response); - } - - qemu_coroutine_enter(s->co_recv, opaque); -} - -static void co_write_request(void *opaque) -{ - BDRVSheepdogState *s = opaque; - - qemu_coroutine_enter(s->co_send, NULL); -} - -/* - * Return a socket descriptor to read/write objects. - * - * We cannot use this descriptor for other operations because - * the block driver may be on waiting response from the server. - */ -static int get_sheep_fd(BDRVSheepdogState *s, Error **errp) -{ - int fd; - - fd = connect_to_sdog(s, errp); - if (fd < 0) { - return fd; - } - - aio_set_fd_handler(s->aio_context, fd, false, - co_read_response, NULL, s); - return fd; -} - -static int sd_parse_uri(BDRVSheepdogState *s, const char *filename, - char *vdi, uint32_t *snapid, char *tag) -{ - URI *uri; - QueryParams *qp = NULL; - int ret = 0; - - uri = uri_parse(filename); - if (!uri) { - return -EINVAL; - } - - /* transport */ - if (!strcmp(uri->scheme, "sheepdog")) { - s->is_unix = false; - } else if (!strcmp(uri->scheme, "sheepdog+tcp")) { - s->is_unix = false; - } else if (!strcmp(uri->scheme, "sheepdog+unix")) { - s->is_unix = true; - } else { - ret = -EINVAL; - goto out; - } - - if (uri->path == NULL || !strcmp(uri->path, "/")) { - ret = -EINVAL; - goto out; - } - pstrcpy(vdi, SD_MAX_VDI_LEN, uri->path + 1); - - qp = query_params_parse(uri->query); - if (qp->n > 1 || (s->is_unix && !qp->n) || (!s->is_unix && qp->n)) { - ret = -EINVAL; - goto out; - } - - if (s->is_unix) { - /* sheepdog+unix:///vdiname?socket=path */ - if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) { - ret = -EINVAL; - goto out; - } - s->host_spec = g_strdup(qp->p[0].value); - } else { - /* sheepdog[+tcp]://[host:port]/vdiname */ - s->host_spec = 
g_strdup_printf("%s:%d", uri->server ?: SD_DEFAULT_ADDR, - uri->port ?: SD_DEFAULT_PORT); - } - - /* snapshot tag */ - if (uri->fragment) { - *snapid = strtoul(uri->fragment, NULL, 10); - if (*snapid == 0) { - pstrcpy(tag, SD_MAX_VDI_TAG_LEN, uri->fragment); - } - } else { - *snapid = CURRENT_VDI_ID; /* search current vdi */ - } - -out: - if (qp) { - query_params_free(qp); - } - uri_free(uri); - return ret; -} - -/* - * Parse a filename (old syntax) - * - * filename must be one of the following formats: - * 1. [vdiname] - * 2. [vdiname]:[snapid] - * 3. [vdiname]:[tag] - * 4. [hostname]:[port]:[vdiname] - * 5. [hostname]:[port]:[vdiname]:[snapid] - * 6. [hostname]:[port]:[vdiname]:[tag] - * - * You can boot from the snapshot images by specifying `snapid` or - * `tag'. - * - * You can run VMs outside the Sheepdog cluster by specifying - * `hostname' and `port' (experimental). - */ -static int parse_vdiname(BDRVSheepdogState *s, const char *filename, - char *vdi, uint32_t *snapid, char *tag) -{ - char *p, *q, *uri; - const char *host_spec, *vdi_spec; - int nr_sep, ret; - - strstart(filename, "sheepdog:", (const char **)&filename); - p = q = g_strdup(filename); - - /* count the number of separators */ - nr_sep = 0; - while (*p) { - if (*p == ':') { - nr_sep++; - } - p++; - } - p = q; - - /* use the first two tokens as host_spec. 
*/ - if (nr_sep >= 2) { - host_spec = p; - p = strchr(p, ':'); - p++; - p = strchr(p, ':'); - *p++ = '\0'; - } else { - host_spec = ""; - } - - vdi_spec = p; - - p = strchr(vdi_spec, ':'); - if (p) { - *p++ = '#'; - } - - uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec); - - ret = sd_parse_uri(s, uri, vdi, snapid, tag); - - g_free(q); - g_free(uri); - - return ret; -} - -static int find_vdi_name(BDRVSheepdogState *s, const char *filename, - uint32_t snapid, const char *tag, uint32_t *vid, - bool lock, Error **errp) -{ - int ret, fd; - SheepdogVdiReq hdr; - SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; - unsigned int wlen, rlen = 0; - char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN]; - - fd = connect_to_sdog(s, errp); - if (fd < 0) { - return fd; - } - - /* This pair of strncpy calls ensures that the buffer is zero-filled, - * which is desirable since we'll soon be sending those bytes, and - * don't want the send_req to read uninitialized data. - */ - strncpy(buf, filename, SD_MAX_VDI_LEN); - strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); - - memset(&hdr, 0, sizeof(hdr)); - if (lock) { - hdr.opcode = SD_OP_LOCK_VDI; - hdr.type = LOCK_TYPE_NORMAL; - } else { - hdr.opcode = SD_OP_GET_VDI_INFO; - } - wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN; - hdr.proto_ver = SD_PROTO_VER; - hdr.data_length = wlen; - hdr.snapid = snapid; - hdr.flags = SD_FLAG_CMD_WRITE; - - ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); - if (ret) { - error_setg_errno(errp, -ret, "cannot get vdi info"); - goto out; - } - - if (rsp->result != SD_RES_SUCCESS) { - error_setg(errp, "cannot get vdi info, %s, %s %" PRIu32 " %s", - sd_strerror(rsp->result), filename, snapid, tag); - if (rsp->result == SD_RES_NO_VDI) { - ret = -ENOENT; - } else if (rsp->result == SD_RES_VDI_LOCKED) { - ret = -EBUSY; - } else { - ret = -EIO; - } - goto out; - } - *vid = rsp->vdi_id; - - ret = 0; -out: - closesocket(fd); - return ret; -} - -static void coroutine_fn 
add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, - struct iovec *iov, int niov, - enum AIOCBState aiocb_type) -{ - int nr_copies = s->inode.nr_copies; - SheepdogObjReq hdr; - unsigned int wlen = 0; - int ret; - uint64_t oid = aio_req->oid; - unsigned int datalen = aio_req->data_len; - uint64_t offset = aio_req->offset; - uint8_t flags = aio_req->flags; - uint64_t old_oid = aio_req->base_oid; - bool create = aio_req->create; - - if (!nr_copies) { - error_report("bug"); - } - - memset(&hdr, 0, sizeof(hdr)); - - switch (aiocb_type) { - case AIOCB_FLUSH_CACHE: - hdr.opcode = SD_OP_FLUSH_VDI; - break; - case AIOCB_READ_UDATA: - hdr.opcode = SD_OP_READ_OBJ; - hdr.flags = flags; - break; - case AIOCB_WRITE_UDATA: - if (create) { - hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; - } else { - hdr.opcode = SD_OP_WRITE_OBJ; - } - wlen = datalen; - hdr.flags = SD_FLAG_CMD_WRITE | flags; - break; - case AIOCB_DISCARD_OBJ: - hdr.opcode = SD_OP_WRITE_OBJ; - hdr.flags = SD_FLAG_CMD_WRITE | flags; - s->inode.data_vdi_id[data_oid_to_idx(oid)] = 0; - offset = offsetof(SheepdogInode, - data_vdi_id[data_oid_to_idx(oid)]); - oid = vid_to_vdi_oid(s->inode.vdi_id); - wlen = datalen = sizeof(uint32_t); - break; - } - - if (s->cache_flags) { - hdr.flags |= s->cache_flags; - } - - hdr.oid = oid; - hdr.cow_oid = old_oid; - hdr.copies = s->inode.nr_copies; - - hdr.data_length = datalen; - hdr.offset = offset; - - hdr.id = aio_req->id; - - qemu_co_mutex_lock(&s->lock); - s->co_send = qemu_coroutine_self(); - aio_set_fd_handler(s->aio_context, s->fd, false, - co_read_response, co_write_request, s); - socket_set_cork(s->fd, 1); - - /* send a header */ - ret = qemu_co_send(s->fd, &hdr, sizeof(hdr)); - if (ret != sizeof(hdr)) { - error_report("failed to send a req, %s", strerror(errno)); - goto out; - } - - if (wlen) { - ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen); - if (ret != wlen) { - error_report("failed to send a data, %s", strerror(errno)); - } - } -out: - 
socket_set_cork(s->fd, 0); - aio_set_fd_handler(s->aio_context, s->fd, false, - co_read_response, NULL, s); - s->co_send = NULL; - qemu_co_mutex_unlock(&s->lock); -} - -static int read_write_object(int fd, AioContext *aio_context, char *buf, - uint64_t oid, uint8_t copies, - unsigned int datalen, uint64_t offset, - bool write, bool create, uint32_t cache_flags) -{ - SheepdogObjReq hdr; - SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr; - unsigned int wlen, rlen; - int ret; - - memset(&hdr, 0, sizeof(hdr)); - - if (write) { - wlen = datalen; - rlen = 0; - hdr.flags = SD_FLAG_CMD_WRITE; - if (create) { - hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; - } else { - hdr.opcode = SD_OP_WRITE_OBJ; - } - } else { - wlen = 0; - rlen = datalen; - hdr.opcode = SD_OP_READ_OBJ; - } - - hdr.flags |= cache_flags; - - hdr.oid = oid; - hdr.data_length = datalen; - hdr.offset = offset; - hdr.copies = copies; - - ret = do_req(fd, aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); - if (ret) { - error_report("failed to send a request to the sheep"); - return ret; - } - - switch (rsp->result) { - case SD_RES_SUCCESS: - return 0; - default: - error_report("%s", sd_strerror(rsp->result)); - return -EIO; - } -} - -static int read_object(int fd, AioContext *aio_context, char *buf, - uint64_t oid, uint8_t copies, - unsigned int datalen, uint64_t offset, - uint32_t cache_flags) -{ - return read_write_object(fd, aio_context, buf, oid, copies, - datalen, offset, false, - false, cache_flags); -} - -static int write_object(int fd, AioContext *aio_context, char *buf, - uint64_t oid, uint8_t copies, - unsigned int datalen, uint64_t offset, bool create, - uint32_t cache_flags) -{ - return read_write_object(fd, aio_context, buf, oid, copies, - datalen, offset, true, - create, cache_flags); -} - -/* update inode with the latest state */ -static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag) -{ - Error *local_err = NULL; - SheepdogInode *inode; - int ret = 0, fd; - uint32_t vid = 
0; - - fd = connect_to_sdog(s, &local_err); - if (fd < 0) { - error_report_err(local_err); - return -EIO; - } - - inode = g_malloc(SD_INODE_HEADER_SIZE); - - ret = find_vdi_name(s, s->name, snapid, tag, &vid, false, &local_err); - if (ret) { - error_report_err(local_err); - goto out; - } - - ret = read_object(fd, s->aio_context, (char *)inode, vid_to_vdi_oid(vid), - s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0, - s->cache_flags); - if (ret < 0) { - goto out; - } - - if (inode->vdi_id != s->inode.vdi_id) { - memcpy(&s->inode, inode, SD_INODE_HEADER_SIZE); - } - -out: - g_free(inode); - closesocket(fd); - - return ret; -} - -static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) -{ - SheepdogAIOCB *acb = aio_req->aiocb; - - aio_req->create = false; - - /* check whether this request becomes a CoW one */ - if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) { - int idx = data_oid_to_idx(aio_req->oid); - - if (is_data_obj_writable(&s->inode, idx)) { - goto out; - } - - if (s->inode.data_vdi_id[idx]) { - aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx); - aio_req->flags |= SD_FLAG_CMD_COW; - } - aio_req->create = true; - } -out: - if (is_data_obj(aio_req->oid)) { - add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, - acb->aiocb_type); - } else { - struct iovec iov; - iov.iov_base = &s->inode; - iov.iov_len = sizeof(s->inode); - add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA); - } -} - -static void sd_detach_aio_context(BlockDriverState *bs) -{ - BDRVSheepdogState *s = bs->opaque; - - aio_set_fd_handler(s->aio_context, s->fd, false, NULL, - NULL, NULL); -} - -static void sd_attach_aio_context(BlockDriverState *bs, - AioContext *new_context) -{ - BDRVSheepdogState *s = bs->opaque; - - s->aio_context = new_context; - aio_set_fd_handler(new_context, s->fd, false, - co_read_response, NULL, s); -} - -/* TODO Convert to fine grained options */ -static QemuOptsList runtime_opts = { - .name = 
"sheepdog", - .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), - .desc = { - { - .name = "filename", - .type = QEMU_OPT_STRING, - .help = "URL to the sheepdog image", - }, - { /* end of list */ } - }, -}; - -static int sd_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - int ret, fd; - uint32_t vid = 0; - BDRVSheepdogState *s = bs->opaque; - char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN]; - uint32_t snapid; - char *buf = NULL; - QemuOpts *opts; - Error *local_err = NULL; - const char *filename; - - s->bs = bs; - s->aio_context = bdrv_get_aio_context(bs); - - opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto out; - } - - filename = qemu_opt_get(opts, "filename"); - - QLIST_INIT(&s->inflight_aio_head); - QLIST_INIT(&s->failed_aio_head); - QLIST_INIT(&s->inflight_aiocb_head); - s->fd = -1; - - memset(vdi, 0, sizeof(vdi)); - memset(tag, 0, sizeof(tag)); - - if (strstr(filename, "://")) { - ret = sd_parse_uri(s, filename, vdi, &snapid, tag); - } else { - ret = parse_vdiname(s, filename, vdi, &snapid, tag); - } - if (ret < 0) { - error_setg(errp, "Can't parse filename"); - goto out; - } - s->fd = get_sheep_fd(s, errp); - if (s->fd < 0) { - ret = s->fd; - goto out; - } - - ret = find_vdi_name(s, vdi, snapid, tag, &vid, true, errp); - if (ret) { - goto out; - } - - /* - * QEMU block layer emulates writethrough cache as 'writeback + flush', so - * we always set SD_FLAG_CMD_CACHE (writeback cache) as default. 
- */ - s->cache_flags = SD_FLAG_CMD_CACHE; - if (flags & BDRV_O_NOCACHE) { - s->cache_flags = SD_FLAG_CMD_DIRECT; - } - s->discard_supported = true; - - if (snapid || tag[0] != '\0') { - DPRINTF("%" PRIx32 " snapshot inode was open.\n", vid); - s->is_snapshot = true; - } - - fd = connect_to_sdog(s, errp); - if (fd < 0) { - ret = fd; - goto out; - } - - buf = g_malloc(SD_INODE_SIZE); - ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid), - 0, SD_INODE_SIZE, 0, s->cache_flags); - - closesocket(fd); - - if (ret) { - error_setg(errp, "Can't read snapshot inode"); - goto out; - } - - memcpy(&s->inode, buf, sizeof(s->inode)); - - bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE; - pstrcpy(s->name, sizeof(s->name), vdi); - qemu_co_mutex_init(&s->lock); - qemu_co_queue_init(&s->overlapping_queue); - qemu_opts_del(opts); - g_free(buf); - return 0; -out: - aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, - false, NULL, NULL, NULL); - if (s->fd >= 0) { - closesocket(s->fd); - } - qemu_opts_del(opts); - g_free(buf); - return ret; -} - -static int sd_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue, - Error **errp) -{ - BDRVSheepdogState *s = state->bs->opaque; - BDRVSheepdogReopenState *re_s; - int ret = 0; - - re_s = state->opaque = g_new0(BDRVSheepdogReopenState, 1); - - re_s->cache_flags = SD_FLAG_CMD_CACHE; - if (state->flags & BDRV_O_NOCACHE) { - re_s->cache_flags = SD_FLAG_CMD_DIRECT; - } - - re_s->fd = get_sheep_fd(s, errp); - if (re_s->fd < 0) { - ret = re_s->fd; - return ret; - } - - return ret; -} - -static void sd_reopen_commit(BDRVReopenState *state) -{ - BDRVSheepdogReopenState *re_s = state->opaque; - BDRVSheepdogState *s = state->bs->opaque; - - if (s->fd) { - aio_set_fd_handler(s->aio_context, s->fd, false, - NULL, NULL, NULL); - closesocket(s->fd); - } - - s->fd = re_s->fd; - s->cache_flags = re_s->cache_flags; - - g_free(state->opaque); - state->opaque = NULL; - - return; -} - -static void sd_reopen_abort(BDRVReopenState 
*state) -{ - BDRVSheepdogReopenState *re_s = state->opaque; - BDRVSheepdogState *s = state->bs->opaque; - - if (re_s == NULL) { - return; - } - - if (re_s->fd) { - aio_set_fd_handler(s->aio_context, re_s->fd, false, - NULL, NULL, NULL); - closesocket(re_s->fd); - } - - g_free(state->opaque); - state->opaque = NULL; - - return; -} - -static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, - Error **errp) -{ - SheepdogVdiReq hdr; - SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; - int fd, ret; - unsigned int wlen, rlen = 0; - char buf[SD_MAX_VDI_LEN]; - - fd = connect_to_sdog(s, errp); - if (fd < 0) { - return fd; - } - - /* FIXME: would it be better to fail (e.g., return -EIO) when filename - * does not fit in buf? For now, just truncate and avoid buffer overrun. - */ - memset(buf, 0, sizeof(buf)); - pstrcpy(buf, sizeof(buf), s->name); - - memset(&hdr, 0, sizeof(hdr)); - hdr.opcode = SD_OP_NEW_VDI; - hdr.base_vdi_id = s->inode.vdi_id; - - wlen = SD_MAX_VDI_LEN; - - hdr.flags = SD_FLAG_CMD_WRITE; - hdr.snapid = snapshot; - - hdr.data_length = wlen; - hdr.vdi_size = s->inode.vdi_size; - hdr.copy_policy = s->inode.copy_policy; - hdr.copies = s->inode.nr_copies; - hdr.block_size_shift = s->inode.block_size_shift; - - ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); - - closesocket(fd); - - if (ret) { - error_setg_errno(errp, -ret, "create failed"); - return ret; - } - - if (rsp->result != SD_RES_SUCCESS) { - error_setg(errp, "%s, %s", sd_strerror(rsp->result), s->inode.name); - return -EIO; - } - - if (vdi_id) { - *vdi_id = rsp->vdi_id; - } - - return 0; -} - -static int sd_prealloc(const char *filename, Error **errp) -{ - BlockBackend *blk = NULL; - BDRVSheepdogState *base = NULL; - unsigned long buf_size; - uint32_t idx, max_idx; - uint32_t object_size; - int64_t vdi_size; - void *buf = NULL; - int ret; - - blk = blk_new_open(filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, errp); - if (blk == NULL) { - ret = -EIO; - 
goto out_with_err_set; - } - - blk_set_allow_write_beyond_eof(blk, true); - - vdi_size = blk_getlength(blk); - if (vdi_size < 0) { - ret = vdi_size; - goto out; - } - - base = blk_bs(blk)->opaque; - object_size = (UINT32_C(1) << base->inode.block_size_shift); - buf_size = MIN(object_size, SD_DATA_OBJ_SIZE); - buf = g_malloc0(buf_size); - - max_idx = DIV_ROUND_UP(vdi_size, buf_size); - - for (idx = 0; idx < max_idx; idx++) { - /* - * The created image can be a cloned image, so we need to read - * a data from the source image. - */ - ret = blk_pread(blk, idx * buf_size, buf, buf_size); - if (ret < 0) { - goto out; - } - ret = blk_pwrite(blk, idx * buf_size, buf, buf_size); - if (ret < 0) { - goto out; - } - } - - ret = 0; -out: - if (ret < 0) { - error_setg_errno(errp, -ret, "Can't pre-allocate"); - } -out_with_err_set: - if (blk) { - blk_unref(blk); - } - g_free(buf); - - return ret; -} - -/* - * Sheepdog support two kinds of redundancy, full replication and erasure - * coding. - * - * # create a fully replicated vdi with x copies - * -o redundancy=x (1 <= x <= SD_MAX_COPIES) - * - * # create a erasure coded vdi with x data strips and y parity strips - * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP) - */ -static int parse_redundancy(BDRVSheepdogState *s, const char *opt) -{ - struct SheepdogInode *inode = &s->inode; - const char *n1, *n2; - long copy, parity; - char p[10]; - - pstrcpy(p, sizeof(p), opt); - n1 = strtok(p, ":"); - n2 = strtok(NULL, ":"); - - if (!n1) { - return -EINVAL; - } - - copy = strtol(n1, NULL, 10); - if (copy > SD_MAX_COPIES || copy < 1) { - return -EINVAL; - } - if (!n2) { - inode->copy_policy = 0; - inode->nr_copies = copy; - return 0; - } - - if (copy != 2 && copy != 4 && copy != 8 && copy != 16) { - return -EINVAL; - } - - parity = strtol(n2, NULL, 10); - if (parity >= SD_EC_MAX_STRIP || parity < 1) { - return -EINVAL; - } - - /* - * 4 bits for parity and 4 bits for data. 
- * We have to compress upper data bits because it can't represent 16 - */ - inode->copy_policy = ((copy / 2) << 4) + parity; - inode->nr_copies = copy + parity; - - return 0; -} - -static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt) -{ - struct SheepdogInode *inode = &s->inode; - uint64_t object_size; - int obj_order; - - object_size = qemu_opt_get_size_del(opt, BLOCK_OPT_OBJECT_SIZE, 0); - if (object_size) { - if ((object_size - 1) & object_size) { /* not a power of 2? */ - return -EINVAL; - } - obj_order = ctz32(object_size); - if (obj_order < 20 || obj_order > 31) { - return -EINVAL; - } - inode->block_size_shift = (uint8_t)obj_order; - } - - return 0; -} - -static int sd_create(const char *filename, QemuOpts *opts, - Error **errp) -{ - int ret = 0; - uint32_t vid = 0; - char *backing_file = NULL; - char *buf = NULL; - BDRVSheepdogState *s; - char tag[SD_MAX_VDI_TAG_LEN]; - uint32_t snapid; - uint64_t max_vdi_size; - bool prealloc = false; - - s = g_new0(BDRVSheepdogState, 1); - - memset(tag, 0, sizeof(tag)); - if (strstr(filename, "://")) { - ret = sd_parse_uri(s, filename, s->name, &snapid, tag); - } else { - ret = parse_vdiname(s, filename, s->name, &snapid, tag); - } - if (ret < 0) { - error_setg(errp, "Can't parse filename"); - goto out; - } - - s->inode.vdi_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); - buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); - if (!buf || !strcmp(buf, "off")) { - prealloc = false; - } else if (!strcmp(buf, "full")) { - prealloc = true; - } else { - error_setg(errp, "Invalid preallocation mode: '%s'", buf); - ret = -EINVAL; - goto out; - } - - g_free(buf); - buf = qemu_opt_get_del(opts, BLOCK_OPT_REDUNDANCY); - if (buf) { - ret = parse_redundancy(s, buf); - if (ret < 0) { - error_setg(errp, "Invalid redundancy mode: '%s'", buf); - goto out; - } - } - ret = parse_block_size_shift(s, opts); - if (ret < 0) { - 
error_setg(errp, "Invalid object_size." - " obect_size needs to be power of 2" - " and be limited from 2^20 to 2^31"); - goto out; - } - - if (backing_file) { - BlockBackend *blk; - BDRVSheepdogState *base; - BlockDriver *drv; - - /* Currently, only Sheepdog backing image is supported. */ - drv = bdrv_find_protocol(backing_file, true, NULL); - if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) { - error_setg(errp, "backing_file must be a sheepdog image"); - ret = -EINVAL; - goto out; - } - - blk = blk_new_open(backing_file, NULL, NULL, - BDRV_O_PROTOCOL, errp); - if (blk == NULL) { - ret = -EIO; - goto out; - } - - base = blk_bs(blk)->opaque; - - if (!is_snapshot(&base->inode)) { - error_setg(errp, "cannot clone from a non snapshot vdi"); - blk_unref(blk); - ret = -EINVAL; - goto out; - } - s->inode.vdi_id = base->inode.vdi_id; - blk_unref(blk); - } - - s->aio_context = qemu_get_aio_context(); - - /* if block_size_shift is not specified, get cluster default value */ - if (s->inode.block_size_shift == 0) { - SheepdogVdiReq hdr; - SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr; - Error *local_err = NULL; - int fd; - unsigned int wlen = 0, rlen = 0; - - fd = connect_to_sdog(s, &local_err); - if (fd < 0) { - error_report_err(local_err); - ret = -EIO; - goto out; - } - - memset(&hdr, 0, sizeof(hdr)); - hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT; - hdr.proto_ver = SD_PROTO_VER; - - ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, - NULL, &wlen, &rlen); - closesocket(fd); - if (ret) { - error_setg_errno(errp, -ret, "failed to get cluster default"); - goto out; - } - if (rsp->result == SD_RES_SUCCESS) { - s->inode.block_size_shift = rsp->block_size_shift; - } else { - s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT; - } - } - - max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS; - - if (s->inode.vdi_size > max_vdi_size) { - error_setg(errp, "An image is too large." 
- " The maximum image size is %"PRIu64 "GB", - max_vdi_size / 1024 / 1024 / 1024); - ret = -EINVAL; - goto out; - } - - ret = do_sd_create(s, &vid, 0, errp); - if (ret) { - goto out; - } - - if (prealloc) { - ret = sd_prealloc(filename, errp); - } -out: - g_free(backing_file); - g_free(buf); - g_free(s); - return ret; -} - -static void sd_close(BlockDriverState *bs) -{ - Error *local_err = NULL; - BDRVSheepdogState *s = bs->opaque; - SheepdogVdiReq hdr; - SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; - unsigned int wlen, rlen = 0; - int fd, ret; - - DPRINTF("%s\n", s->name); - - fd = connect_to_sdog(s, &local_err); - if (fd < 0) { - error_report_err(local_err); - return; - } - - memset(&hdr, 0, sizeof(hdr)); - - hdr.opcode = SD_OP_RELEASE_VDI; - hdr.type = LOCK_TYPE_NORMAL; - hdr.base_vdi_id = s->inode.vdi_id; - wlen = strlen(s->name) + 1; - hdr.data_length = wlen; - hdr.flags = SD_FLAG_CMD_WRITE; - - ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, - s->name, &wlen, &rlen); - - closesocket(fd); - - if (!ret && rsp->result != SD_RES_SUCCESS && - rsp->result != SD_RES_VDI_NOT_LOCKED) { - error_report("%s, %s", sd_strerror(rsp->result), s->name); - } - - aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, - false, NULL, NULL, NULL); - closesocket(s->fd); - g_free(s->host_spec); -} - -static int64_t sd_getlength(BlockDriverState *bs) -{ - BDRVSheepdogState *s = bs->opaque; - - return s->inode.vdi_size; -} - -static int sd_truncate(BlockDriverState *bs, int64_t offset) -{ - Error *local_err = NULL; - BDRVSheepdogState *s = bs->opaque; - int ret, fd; - unsigned int datalen; - uint64_t max_vdi_size; - - max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS; - if (offset < s->inode.vdi_size) { - error_report("shrinking is not supported"); - return -EINVAL; - } else if (offset > max_vdi_size) { - error_report("too big image size"); - return -EINVAL; - } - - fd = connect_to_sdog(s, &local_err); - if (fd < 0) { - error_report_err(local_err); - return 
fd; - } - - /* we don't need to update entire object */ - datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id); - s->inode.vdi_size = offset; - ret = write_object(fd, s->aio_context, (char *)&s->inode, - vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies, - datalen, 0, false, s->cache_flags); - close(fd); - - if (ret < 0) { - error_report("failed to update an inode."); - } - - return ret; -} - -/* - * This function is called after writing data objects. If we need to - * update metadata, this sends a write request to the vdi object. - * Otherwise, this switches back to sd_co_readv/writev. - */ -static void coroutine_fn sd_write_done(SheepdogAIOCB *acb) -{ - BDRVSheepdogState *s = acb->common.bs->opaque; - struct iovec iov; - AIOReq *aio_req; - uint32_t offset, data_len, mn, mx; - - mn = acb->min_dirty_data_idx; - mx = acb->max_dirty_data_idx; - if (mn <= mx) { - /* we need to update the vdi object. */ - offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) + - mn * sizeof(s->inode.data_vdi_id[0]); - data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]); - - acb->min_dirty_data_idx = UINT32_MAX; - acb->max_dirty_data_idx = 0; - - iov.iov_base = &s->inode; - iov.iov_len = sizeof(s->inode); - aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), - data_len, offset, 0, false, 0, offset); - QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA); - - acb->aio_done_func = sd_finish_aiocb; - acb->aiocb_type = AIOCB_WRITE_UDATA; - return; - } - - sd_finish_aiocb(acb); -} - -/* Delete current working VDI on the snapshot chain */ -static bool sd_delete(BDRVSheepdogState *s) -{ - Error *local_err = NULL; - unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0; - SheepdogVdiReq hdr = { - .opcode = SD_OP_DEL_VDI, - .base_vdi_id = s->inode.vdi_id, - .data_length = wlen, - .flags = SD_FLAG_CMD_WRITE, - }; - SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; - int fd, ret; - - fd = connect_to_sdog(s, 
&local_err); - if (fd < 0) { - error_report_err(local_err); - return false; - } - - ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, - s->name, &wlen, &rlen); - closesocket(fd); - if (ret) { - return false; - } - switch (rsp->result) { - case SD_RES_NO_VDI: - error_report("%s was already deleted", s->name); - /* fall through */ - case SD_RES_SUCCESS: - break; - default: - error_report("%s, %s", sd_strerror(rsp->result), s->name); - return false; - } - - return true; -} - -/* - * Create a writable VDI from a snapshot - */ -static int sd_create_branch(BDRVSheepdogState *s) -{ - Error *local_err = NULL; - int ret, fd; - uint32_t vid; - char *buf; - bool deleted; - - DPRINTF("%" PRIx32 " is snapshot.\n", s->inode.vdi_id); - - buf = g_malloc(SD_INODE_SIZE); - - /* - * Even If deletion fails, we will just create extra snapshot based on - * the working VDI which was supposed to be deleted. So no need to - * false bail out. - */ - deleted = sd_delete(s); - ret = do_sd_create(s, &vid, !deleted, &local_err); - if (ret) { - error_report_err(local_err); - goto out; - } - - DPRINTF("%" PRIx32 " is created.\n", vid); - - fd = connect_to_sdog(s, &local_err); - if (fd < 0) { - error_report_err(local_err); - ret = fd; - goto out; - } - - ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid), - s->inode.nr_copies, SD_INODE_SIZE, 0, s->cache_flags); - - closesocket(fd); - - if (ret < 0) { - goto out; - } - - memcpy(&s->inode, buf, sizeof(s->inode)); - - s->is_snapshot = false; - ret = 0; - DPRINTF("%" PRIx32 " was newly created.\n", s->inode.vdi_id); - -out: - g_free(buf); - - return ret; -} - -/* - * Send I/O requests to the server. - * - * This function sends requests to the server, links the requests to - * the inflight_list in BDRVSheepdogState, and exits without - * waiting the response. The responses are received in the - * `aio_read_response' function which is called from the main loop as - * a fd handler. 
- * - * Returns 1 when we need to wait a response, 0 when there is no sent - * request and -errno in error cases. - */ -static int coroutine_fn sd_co_rw_vector(void *p) -{ - SheepdogAIOCB *acb = p; - int ret = 0; - unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE; - unsigned long idx; - uint32_t object_size; - uint64_t oid; - uint64_t offset; - BDRVSheepdogState *s = acb->common.bs->opaque; - SheepdogInode *inode = &s->inode; - AIOReq *aio_req; - - if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) { - /* - * In the case we open the snapshot VDI, Sheepdog creates the - * writable VDI when we do a write operation first. - */ - ret = sd_create_branch(s); - if (ret) { - acb->ret = -EIO; - goto out; - } - } - - object_size = (UINT32_C(1) << inode->block_size_shift); - idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size; - offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size; - - /* - * Make sure we don't free the aiocb before we are done with all requests. - * This additional reference is dropped at the end of this function. - */ - acb->nr_pending++; - - while (done != total) { - uint8_t flags = 0; - uint64_t old_oid = 0; - bool create = false; - - oid = vid_to_data_oid(inode->data_vdi_id[idx], idx); - - len = MIN(total - done, object_size - offset); - - switch (acb->aiocb_type) { - case AIOCB_READ_UDATA: - if (!inode->data_vdi_id[idx]) { - qemu_iovec_memset(acb->qiov, done, 0, len); - goto done; - } - break; - case AIOCB_WRITE_UDATA: - if (!inode->data_vdi_id[idx]) { - create = true; - } else if (!is_data_obj_writable(inode, idx)) { - /* Copy-On-Write */ - create = true; - old_oid = oid; - flags = SD_FLAG_CMD_COW; - } - break; - case AIOCB_DISCARD_OBJ: - /* - * We discard the object only when the whole object is - * 1) allocated 2) trimmed. Otherwise, simply skip it. 
- */ - if (len != object_size || inode->data_vdi_id[idx] == 0) { - goto done; - } - break; - default: - break; - } - - if (create) { - DPRINTF("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n", - inode->vdi_id, oid, - vid_to_data_oid(inode->data_vdi_id[idx], idx), idx); - oid = vid_to_data_oid(inode->vdi_id, idx); - DPRINTF("new oid %" PRIx64 "\n", oid); - } - - aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create, - old_oid, - acb->aiocb_type == AIOCB_DISCARD_OBJ ? - 0 : done); - QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - - add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, - acb->aiocb_type); - done: - offset = 0; - idx++; - done += len; - } -out: - if (!--acb->nr_pending) { - return acb->ret; - } - return 1; -} - -static bool check_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb) -{ - SheepdogAIOCB *cb; - - QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) { - if (AIOCBOverlapping(aiocb, cb)) { - return true; - } - } - - QLIST_INSERT_HEAD(&s->inflight_aiocb_head, aiocb, aiocb_siblings); - return false; -} - -static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - SheepdogAIOCB *acb; - int ret; - int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE; - BDRVSheepdogState *s = bs->opaque; - - if (offset > s->inode.vdi_size) { - ret = sd_truncate(bs, offset); - if (ret < 0) { - return ret; - } - } - - acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); - acb->aio_done_func = sd_write_done; - acb->aiocb_type = AIOCB_WRITE_UDATA; - -retry: - if (check_overlapping_aiocb(s, acb)) { - qemu_co_queue_wait(&s->overlapping_queue); - goto retry; - } - - ret = sd_co_rw_vector(acb); - if (ret <= 0) { - QLIST_REMOVE(acb, aiocb_siblings); - qemu_co_queue_restart_all(&s->overlapping_queue); - qemu_aio_unref(acb); - return ret; - } - - qemu_coroutine_yield(); - - QLIST_REMOVE(acb, aiocb_siblings); - 
qemu_co_queue_restart_all(&s->overlapping_queue); - - return acb->ret; -} - -static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - SheepdogAIOCB *acb; - int ret; - BDRVSheepdogState *s = bs->opaque; - - acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); - acb->aiocb_type = AIOCB_READ_UDATA; - acb->aio_done_func = sd_finish_aiocb; - -retry: - if (check_overlapping_aiocb(s, acb)) { - qemu_co_queue_wait(&s->overlapping_queue); - goto retry; - } - - ret = sd_co_rw_vector(acb); - if (ret <= 0) { - QLIST_REMOVE(acb, aiocb_siblings); - qemu_co_queue_restart_all(&s->overlapping_queue); - qemu_aio_unref(acb); - return ret; - } - - qemu_coroutine_yield(); - - QLIST_REMOVE(acb, aiocb_siblings); - qemu_co_queue_restart_all(&s->overlapping_queue); - return acb->ret; -} - -static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) -{ - BDRVSheepdogState *s = bs->opaque; - SheepdogAIOCB *acb; - AIOReq *aio_req; - - if (s->cache_flags != SD_FLAG_CMD_CACHE) { - return 0; - } - - acb = sd_aio_setup(bs, NULL, 0, 0); - acb->aiocb_type = AIOCB_FLUSH_CACHE; - acb->aio_done_func = sd_finish_aiocb; - - aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), - 0, 0, 0, false, 0, 0); - QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); - add_aio_request(s, aio_req, NULL, 0, acb->aiocb_type); - - qemu_coroutine_yield(); - return acb->ret; -} - -static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) -{ - Error *local_err = NULL; - BDRVSheepdogState *s = bs->opaque; - int ret, fd; - uint32_t new_vid; - SheepdogInode *inode; - unsigned int datalen; - - DPRINTF("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " " - "is_snapshot %d\n", sn_info->name, sn_info->id_str, - s->name, sn_info->vm_state_size, s->is_snapshot); - - if (s->is_snapshot) { - error_report("You can't create a snapshot of a snapshot VDI, " - "%s (%" PRIu32 ").", s->name, s->inode.vdi_id); - - 
return -EINVAL; - } - - DPRINTF("%s %s\n", sn_info->name, sn_info->id_str); - - s->inode.vm_state_size = sn_info->vm_state_size; - s->inode.vm_clock_nsec = sn_info->vm_clock_nsec; - /* It appears that inode.tag does not require a NUL terminator, - * which means this use of strncpy is ok. - */ - strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag)); - /* we don't need to update entire object */ - datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id); - inode = g_malloc(datalen); - - /* refresh inode. */ - fd = connect_to_sdog(s, &local_err); - if (fd < 0) { - error_report_err(local_err); - ret = fd; - goto cleanup; - } - - ret = write_object(fd, s->aio_context, (char *)&s->inode, - vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies, - datalen, 0, false, s->cache_flags); - if (ret < 0) { - error_report("failed to write snapshot's inode."); - goto cleanup; - } - - ret = do_sd_create(s, &new_vid, 1, &local_err); - if (ret < 0) { - error_reportf_err(local_err, - "failed to create inode for snapshot: "); - goto cleanup; - } - - ret = read_object(fd, s->aio_context, (char *)inode, - vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0, - s->cache_flags); - - if (ret < 0) { - error_report("failed to read new inode info. 
%s", strerror(errno)); - goto cleanup; - } - - memcpy(&s->inode, inode, datalen); - DPRINTF("s->inode: name %s snap_id %x oid %x\n", - s->inode.name, s->inode.snap_id, s->inode.vdi_id); - -cleanup: - g_free(inode); - closesocket(fd); - return ret; -} - -/* - * We implement rollback(loadvm) operation to the specified snapshot by - * 1) switch to the snapshot - * 2) rely on sd_create_branch to delete working VDI and - * 3) create a new working VDI based on the specified snapshot - */ -static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) -{ - BDRVSheepdogState *s = bs->opaque; - BDRVSheepdogState *old_s; - char tag[SD_MAX_VDI_TAG_LEN]; - uint32_t snapid = 0; - int ret = 0; - - old_s = g_new(BDRVSheepdogState, 1); - - memcpy(old_s, s, sizeof(BDRVSheepdogState)); - - snapid = strtoul(snapshot_id, NULL, 10); - if (snapid) { - tag[0] = 0; - } else { - pstrcpy(tag, sizeof(tag), snapshot_id); - } - - ret = reload_inode(s, snapid, tag); - if (ret) { - goto out; - } - - ret = sd_create_branch(s); - if (ret) { - goto out; - } - - g_free(old_s); - - return 0; -out: - /* recover bdrv_sd_state */ - memcpy(s, old_s, sizeof(BDRVSheepdogState)); - g_free(old_s); - - error_report("failed to open. 
recover old bdrv_sd_state."); - - return ret; -} - -#define NR_BATCHED_DISCARD 128 - -static bool remove_objects(BDRVSheepdogState *s) -{ - int fd, i = 0, nr_objs = 0; - Error *local_err = NULL; - int ret = 0; - bool result = true; - SheepdogInode *inode = &s->inode; - - fd = connect_to_sdog(s, &local_err); - if (fd < 0) { - error_report_err(local_err); - return false; - } - - nr_objs = count_data_objs(inode); - while (i < nr_objs) { - int start_idx, nr_filled_idx; - - while (i < nr_objs && !inode->data_vdi_id[i]) { - i++; - } - start_idx = i; - - nr_filled_idx = 0; - while (i < nr_objs && nr_filled_idx < NR_BATCHED_DISCARD) { - if (inode->data_vdi_id[i]) { - inode->data_vdi_id[i] = 0; - nr_filled_idx++; - } - - i++; - } - - ret = write_object(fd, s->aio_context, - (char *)&inode->data_vdi_id[start_idx], - vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies, - (i - start_idx) * sizeof(uint32_t), - offsetof(struct SheepdogInode, - data_vdi_id[start_idx]), - false, s->cache_flags); - if (ret < 0) { - error_report("failed to discard snapshot inode."); - result = false; - goto out; - } - } - -out: - closesocket(fd); - return result; -} - -static int sd_snapshot_delete(BlockDriverState *bs, - const char *snapshot_id, - const char *name, - Error **errp) -{ - unsigned long snap_id = 0; - char snap_tag[SD_MAX_VDI_TAG_LEN]; - Error *local_err = NULL; - int fd, ret; - char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN]; - BDRVSheepdogState *s = bs->opaque; - unsigned int wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN, rlen = 0; - uint32_t vid; - SheepdogVdiReq hdr = { - .opcode = SD_OP_DEL_VDI, - .data_length = wlen, - .flags = SD_FLAG_CMD_WRITE, - }; - SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; - - if (!remove_objects(s)) { - return -1; - } - - memset(buf, 0, sizeof(buf)); - memset(snap_tag, 0, sizeof(snap_tag)); - pstrcpy(buf, SD_MAX_VDI_LEN, s->name); - ret = qemu_strtoul(snapshot_id, NULL, 10, &snap_id); - if (ret || snap_id > UINT32_MAX) { - error_setg(errp, "Invalid snapshot 
ID: %s", - snapshot_id ? snapshot_id : ""); - return -EINVAL; - } - - if (snap_id) { - hdr.snapid = (uint32_t) snap_id; - } else { - pstrcpy(snap_tag, sizeof(snap_tag), snapshot_id); - pstrcpy(buf + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag); - } - - ret = find_vdi_name(s, s->name, snap_id, snap_tag, &vid, true, - &local_err); - if (ret) { - return ret; - } - - fd = connect_to_sdog(s, &local_err); - if (fd < 0) { - error_report_err(local_err); - return -1; - } - - ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, - buf, &wlen, &rlen); - closesocket(fd); - if (ret) { - return ret; - } - - switch (rsp->result) { - case SD_RES_NO_VDI: - error_report("%s was already deleted", s->name); - case SD_RES_SUCCESS: - break; - default: - error_report("%s, %s", sd_strerror(rsp->result), s->name); - return -1; - } - - return ret; -} - -static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) -{ - Error *local_err = NULL; - BDRVSheepdogState *s = bs->opaque; - SheepdogReq req; - int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long); - QEMUSnapshotInfo *sn_tab = NULL; - unsigned wlen, rlen; - int found = 0; - static SheepdogInode inode; - unsigned long *vdi_inuse; - unsigned int start_nr; - uint64_t hval; - uint32_t vid; - - vdi_inuse = g_malloc(max); - - fd = connect_to_sdog(s, &local_err); - if (fd < 0) { - error_report_err(local_err); - ret = fd; - goto out; - } - - rlen = max; - wlen = 0; - - memset(&req, 0, sizeof(req)); - - req.opcode = SD_OP_READ_VDIS; - req.data_length = max; - - ret = do_req(fd, s->aio_context, (SheepdogReq *)&req, - vdi_inuse, &wlen, &rlen); - - closesocket(fd); - if (ret) { - goto out; - } - - sn_tab = g_new0(QEMUSnapshotInfo, nr); - - /* calculate a vdi id with hash function */ - hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT); - start_nr = hval & (SD_NR_VDIS - 1); - - fd = connect_to_sdog(s, &local_err); - if (fd < 0) { - error_report_err(local_err); - ret = fd; - goto out; - } - - for (vid = 
start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) { - if (!test_bit(vid, vdi_inuse)) { - break; - } - - /* we don't need to read entire object */ - ret = read_object(fd, s->aio_context, (char *)&inode, - vid_to_vdi_oid(vid), - 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0, - s->cache_flags); - - if (ret) { - continue; - } - - if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) { - sn_tab[found].date_sec = inode.snap_ctime >> 32; - sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff; - sn_tab[found].vm_state_size = inode.vm_state_size; - sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec; - - snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), - "%" PRIu32, inode.snap_id); - pstrcpy(sn_tab[found].name, - MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)), - inode.tag); - found++; - } - } - - closesocket(fd); -out: - *psn_tab = sn_tab; - - g_free(vdi_inuse); - - if (ret < 0) { - return ret; - } - - return found; -} - -static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data, - int64_t pos, int size, int load) -{ - Error *local_err = NULL; - bool create; - int fd, ret = 0, remaining = size; - unsigned int data_len; - uint64_t vmstate_oid; - uint64_t offset; - uint32_t vdi_index; - uint32_t vdi_id = load ? 
s->inode.parent_vdi_id : s->inode.vdi_id; - uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift); - - fd = connect_to_sdog(s, &local_err); - if (fd < 0) { - error_report_err(local_err); - return fd; - } - - while (remaining) { - vdi_index = pos / object_size; - offset = pos % object_size; - - data_len = MIN(remaining, object_size - offset); - - vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index); - - create = (offset == 0); - if (load) { - ret = read_object(fd, s->aio_context, (char *)data, vmstate_oid, - s->inode.nr_copies, data_len, offset, - s->cache_flags); - } else { - ret = write_object(fd, s->aio_context, (char *)data, vmstate_oid, - s->inode.nr_copies, data_len, offset, create, - s->cache_flags); - } - - if (ret < 0) { - error_report("failed to save vmstate %s", strerror(errno)); - goto cleanup; - } - - pos += data_len; - data += data_len; - remaining -= data_len; - } - ret = size; -cleanup: - closesocket(fd); - return ret; -} - -static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, - int64_t pos) -{ - BDRVSheepdogState *s = bs->opaque; - void *buf; - int ret; - - buf = qemu_blockalign(bs, qiov->size); - qemu_iovec_to_buf(qiov, 0, buf, qiov->size); - ret = do_load_save_vmstate(s, (uint8_t *) buf, pos, qiov->size, 0); - qemu_vfree(buf); - - return ret; -} - -static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data, - int64_t pos, int size) -{ - BDRVSheepdogState *s = bs->opaque; - - return do_load_save_vmstate(s, data, pos, size, 1); -} - - -static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, - int nb_sectors) -{ - SheepdogAIOCB *acb; - BDRVSheepdogState *s = bs->opaque; - int ret; - QEMUIOVector discard_iov; - struct iovec iov; - uint32_t zero = 0; - - if (!s->discard_supported) { - return 0; - } - - memset(&discard_iov, 0, sizeof(discard_iov)); - memset(&iov, 0, sizeof(iov)); - iov.iov_base = &zero; - iov.iov_len = sizeof(zero); - discard_iov.iov = &iov; - discard_iov.niov = 1; - acb = 
sd_aio_setup(bs, &discard_iov, sector_num, nb_sectors); - acb->aiocb_type = AIOCB_DISCARD_OBJ; - acb->aio_done_func = sd_finish_aiocb; - -retry: - if (check_overlapping_aiocb(s, acb)) { - qemu_co_queue_wait(&s->overlapping_queue); - goto retry; - } - - ret = sd_co_rw_vector(acb); - if (ret <= 0) { - QLIST_REMOVE(acb, aiocb_siblings); - qemu_co_queue_restart_all(&s->overlapping_queue); - qemu_aio_unref(acb); - return ret; - } - - qemu_coroutine_yield(); - - QLIST_REMOVE(acb, aiocb_siblings); - qemu_co_queue_restart_all(&s->overlapping_queue); - - return acb->ret; -} - -static coroutine_fn int64_t -sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - int *pnum, BlockDriverState **file) -{ - BDRVSheepdogState *s = bs->opaque; - SheepdogInode *inode = &s->inode; - uint32_t object_size = (UINT32_C(1) << inode->block_size_shift); - uint64_t offset = sector_num * BDRV_SECTOR_SIZE; - unsigned long start = offset / object_size, - end = DIV_ROUND_UP((sector_num + nb_sectors) * - BDRV_SECTOR_SIZE, object_size); - unsigned long idx; - int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; - - for (idx = start; idx < end; idx++) { - if (inode->data_vdi_id[idx] == 0) { - break; - } - } - if (idx == start) { - /* Get the longest length of unallocated sectors */ - ret = 0; - for (idx = start + 1; idx < end; idx++) { - if (inode->data_vdi_id[idx] != 0) { - break; - } - } - } - - *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE; - if (*pnum > nb_sectors) { - *pnum = nb_sectors; - } - if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) { - *file = bs; - } - return ret; -} - -static int64_t sd_get_allocated_file_size(BlockDriverState *bs) -{ - BDRVSheepdogState *s = bs->opaque; - SheepdogInode *inode = &s->inode; - uint32_t object_size = (UINT32_C(1) << inode->block_size_shift); - unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size); - uint64_t size = 0; - - for (i = 0; i < last; i++) { - if (inode->data_vdi_id[i] == 0) { - 
continue; - } - size += object_size; - } - return size; -} - -static QemuOptsList sd_create_opts = { - .name = "sheepdog-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(sd_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_BACKING_FILE, - .type = QEMU_OPT_STRING, - .help = "File name of a base image" - }, - { - .name = BLOCK_OPT_PREALLOC, - .type = QEMU_OPT_STRING, - .help = "Preallocation mode (allowed values: off, full)" - }, - { - .name = BLOCK_OPT_REDUNDANCY, - .type = QEMU_OPT_STRING, - .help = "Redundancy of the image" - }, - { - .name = BLOCK_OPT_OBJECT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Object size of the image" - }, - { /* end of list */ } - } -}; - -static BlockDriver bdrv_sheepdog = { - .format_name = "sheepdog", - .protocol_name = "sheepdog", - .instance_size = sizeof(BDRVSheepdogState), - .bdrv_needs_filename = true, - .bdrv_file_open = sd_open, - .bdrv_reopen_prepare = sd_reopen_prepare, - .bdrv_reopen_commit = sd_reopen_commit, - .bdrv_reopen_abort = sd_reopen_abort, - .bdrv_close = sd_close, - .bdrv_create = sd_create, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_getlength = sd_getlength, - .bdrv_get_allocated_file_size = sd_get_allocated_file_size, - .bdrv_truncate = sd_truncate, - - .bdrv_co_readv = sd_co_readv, - .bdrv_co_writev = sd_co_writev, - .bdrv_co_flush_to_disk = sd_co_flush_to_disk, - .bdrv_co_discard = sd_co_discard, - .bdrv_co_get_block_status = sd_co_get_block_status, - - .bdrv_snapshot_create = sd_snapshot_create, - .bdrv_snapshot_goto = sd_snapshot_goto, - .bdrv_snapshot_delete = sd_snapshot_delete, - .bdrv_snapshot_list = sd_snapshot_list, - - .bdrv_save_vmstate = sd_save_vmstate, - .bdrv_load_vmstate = sd_load_vmstate, - - .bdrv_detach_aio_context = sd_detach_aio_context, - .bdrv_attach_aio_context = sd_attach_aio_context, - - .create_opts = &sd_create_opts, -}; - -static BlockDriver bdrv_sheepdog_tcp = { - .format_name = 
"sheepdog", - .protocol_name = "sheepdog+tcp", - .instance_size = sizeof(BDRVSheepdogState), - .bdrv_needs_filename = true, - .bdrv_file_open = sd_open, - .bdrv_reopen_prepare = sd_reopen_prepare, - .bdrv_reopen_commit = sd_reopen_commit, - .bdrv_reopen_abort = sd_reopen_abort, - .bdrv_close = sd_close, - .bdrv_create = sd_create, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_getlength = sd_getlength, - .bdrv_get_allocated_file_size = sd_get_allocated_file_size, - .bdrv_truncate = sd_truncate, - - .bdrv_co_readv = sd_co_readv, - .bdrv_co_writev = sd_co_writev, - .bdrv_co_flush_to_disk = sd_co_flush_to_disk, - .bdrv_co_discard = sd_co_discard, - .bdrv_co_get_block_status = sd_co_get_block_status, - - .bdrv_snapshot_create = sd_snapshot_create, - .bdrv_snapshot_goto = sd_snapshot_goto, - .bdrv_snapshot_delete = sd_snapshot_delete, - .bdrv_snapshot_list = sd_snapshot_list, - - .bdrv_save_vmstate = sd_save_vmstate, - .bdrv_load_vmstate = sd_load_vmstate, - - .bdrv_detach_aio_context = sd_detach_aio_context, - .bdrv_attach_aio_context = sd_attach_aio_context, - - .create_opts = &sd_create_opts, -}; - -static BlockDriver bdrv_sheepdog_unix = { - .format_name = "sheepdog", - .protocol_name = "sheepdog+unix", - .instance_size = sizeof(BDRVSheepdogState), - .bdrv_needs_filename = true, - .bdrv_file_open = sd_open, - .bdrv_reopen_prepare = sd_reopen_prepare, - .bdrv_reopen_commit = sd_reopen_commit, - .bdrv_reopen_abort = sd_reopen_abort, - .bdrv_close = sd_close, - .bdrv_create = sd_create, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_getlength = sd_getlength, - .bdrv_get_allocated_file_size = sd_get_allocated_file_size, - .bdrv_truncate = sd_truncate, - - .bdrv_co_readv = sd_co_readv, - .bdrv_co_writev = sd_co_writev, - .bdrv_co_flush_to_disk = sd_co_flush_to_disk, - .bdrv_co_discard = sd_co_discard, - .bdrv_co_get_block_status = sd_co_get_block_status, - - .bdrv_snapshot_create = sd_snapshot_create, - .bdrv_snapshot_goto = sd_snapshot_goto, - 
.bdrv_snapshot_delete = sd_snapshot_delete, - .bdrv_snapshot_list = sd_snapshot_list, - - .bdrv_save_vmstate = sd_save_vmstate, - .bdrv_load_vmstate = sd_load_vmstate, - - .bdrv_detach_aio_context = sd_detach_aio_context, - .bdrv_attach_aio_context = sd_attach_aio_context, - - .create_opts = &sd_create_opts, -}; - -static void bdrv_sheepdog_init(void) -{ - bdrv_register(&bdrv_sheepdog); - bdrv_register(&bdrv_sheepdog_tcp); - bdrv_register(&bdrv_sheepdog_unix); -} -block_init(bdrv_sheepdog_init); diff --git a/qemu/block/snapshot.c b/qemu/block/snapshot.c deleted file mode 100644 index e9d721df6..000000000 --- a/qemu/block/snapshot.c +++ /dev/null @@ -1,493 +0,0 @@ -/* - * Block layer snapshot related functions - * - * Copyright (c) 2003-2008 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -#include "qemu/osdep.h" -#include "block/snapshot.h" -#include "block/block_int.h" -#include "qapi/error.h" -#include "qapi/qmp/qerror.h" - -QemuOptsList internal_snapshot_opts = { - .name = "snapshot", - .head = QTAILQ_HEAD_INITIALIZER(internal_snapshot_opts.head), - .desc = { - { - .name = SNAPSHOT_OPT_ID, - .type = QEMU_OPT_STRING, - .help = "snapshot id" - },{ - .name = SNAPSHOT_OPT_NAME, - .type = QEMU_OPT_STRING, - .help = "snapshot name" - },{ - /* end of list */ - } - }, -}; - -int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, - const char *name) -{ - QEMUSnapshotInfo *sn_tab, *sn; - int nb_sns, i, ret; - - ret = -ENOENT; - nb_sns = bdrv_snapshot_list(bs, &sn_tab); - if (nb_sns < 0) { - return ret; - } - for (i = 0; i < nb_sns; i++) { - sn = &sn_tab[i]; - if (!strcmp(sn->id_str, name) || !strcmp(sn->name, name)) { - *sn_info = *sn; - ret = 0; - break; - } - } - g_free(sn_tab); - return ret; -} - -/** - * Look up an internal snapshot by @id and @name. - * @bs: block device to search - * @id: unique snapshot ID, or NULL - * @name: snapshot name, or NULL - * @sn_info: location to store information on the snapshot found - * @errp: location to store error, will be set only for exception - * - * This function will traverse snapshot list in @bs to search the matching - * one, @id and @name are the matching condition: - * If both @id and @name are specified, find the first one with id @id and - * name @name. - * If only @id is specified, find the first one with id @id. - * If only @name is specified, find the first one with name @name. - * if none is specified, abort(). - * - * Returns: true when a snapshot is found and @sn_info will be filled, false - * when error or not found. If all operation succeed but no matching one is - * found, @errp will NOT be set. 
- */ -bool bdrv_snapshot_find_by_id_and_name(BlockDriverState *bs, - const char *id, - const char *name, - QEMUSnapshotInfo *sn_info, - Error **errp) -{ - QEMUSnapshotInfo *sn_tab, *sn; - int nb_sns, i; - bool ret = false; - - assert(id || name); - - nb_sns = bdrv_snapshot_list(bs, &sn_tab); - if (nb_sns < 0) { - error_setg_errno(errp, -nb_sns, "Failed to get a snapshot list"); - return false; - } else if (nb_sns == 0) { - return false; - } - - if (id && name) { - for (i = 0; i < nb_sns; i++) { - sn = &sn_tab[i]; - if (!strcmp(sn->id_str, id) && !strcmp(sn->name, name)) { - *sn_info = *sn; - ret = true; - break; - } - } - } else if (id) { - for (i = 0; i < nb_sns; i++) { - sn = &sn_tab[i]; - if (!strcmp(sn->id_str, id)) { - *sn_info = *sn; - ret = true; - break; - } - } - } else if (name) { - for (i = 0; i < nb_sns; i++) { - sn = &sn_tab[i]; - if (!strcmp(sn->name, name)) { - *sn_info = *sn; - ret = true; - break; - } - } - } - - g_free(sn_tab); - return ret; -} - -int bdrv_can_snapshot(BlockDriverState *bs) -{ - BlockDriver *drv = bs->drv; - if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { - return 0; - } - - if (!drv->bdrv_snapshot_create) { - if (bs->file != NULL) { - return bdrv_can_snapshot(bs->file->bs); - } - return 0; - } - - return 1; -} - -int bdrv_snapshot_create(BlockDriverState *bs, - QEMUSnapshotInfo *sn_info) -{ - BlockDriver *drv = bs->drv; - if (!drv) { - return -ENOMEDIUM; - } - if (drv->bdrv_snapshot_create) { - return drv->bdrv_snapshot_create(bs, sn_info); - } - if (bs->file) { - return bdrv_snapshot_create(bs->file->bs, sn_info); - } - return -ENOTSUP; -} - -int bdrv_snapshot_goto(BlockDriverState *bs, - const char *snapshot_id) -{ - BlockDriver *drv = bs->drv; - int ret, open_ret; - - if (!drv) { - return -ENOMEDIUM; - } - if (drv->bdrv_snapshot_goto) { - return drv->bdrv_snapshot_goto(bs, snapshot_id); - } - - if (bs->file) { - drv->bdrv_close(bs); - ret = bdrv_snapshot_goto(bs->file->bs, snapshot_id); - open_ret = 
drv->bdrv_open(bs, NULL, bs->open_flags, NULL); - if (open_ret < 0) { - bdrv_unref(bs->file->bs); - bs->drv = NULL; - return open_ret; - } - return ret; - } - - return -ENOTSUP; -} - -/** - * Delete an internal snapshot by @snapshot_id and @name. - * @bs: block device used in the operation - * @snapshot_id: unique snapshot ID, or NULL - * @name: snapshot name, or NULL - * @errp: location to store error - * - * If both @snapshot_id and @name are specified, delete the first one with - * id @snapshot_id and name @name. - * If only @snapshot_id is specified, delete the first one with id - * @snapshot_id. - * If only @name is specified, delete the first one with name @name. - * if none is specified, return -EINVAL. - * - * Returns: 0 on success, -errno on failure. If @bs is not inserted, return - * -ENOMEDIUM. If @snapshot_id and @name are both NULL, return -EINVAL. If @bs - * does not support internal snapshot deletion, return -ENOTSUP. If @bs does - * not support parameter @snapshot_id or @name, or one of them is not correctly - * specified, return -EINVAL. If @bs can't find one matching @id and @name, - * return -ENOENT. If @errp != NULL, it will always be filled with error - * message on failure. 
- */ -int bdrv_snapshot_delete(BlockDriverState *bs, - const char *snapshot_id, - const char *name, - Error **errp) -{ - BlockDriver *drv = bs->drv; - int ret; - - if (!drv) { - error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); - return -ENOMEDIUM; - } - if (!snapshot_id && !name) { - error_setg(errp, "snapshot_id and name are both NULL"); - return -EINVAL; - } - - /* drain all pending i/o before deleting snapshot */ - bdrv_drained_begin(bs); - - if (drv->bdrv_snapshot_delete) { - ret = drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); - } else if (bs->file) { - ret = bdrv_snapshot_delete(bs->file->bs, snapshot_id, name, errp); - } else { - error_setg(errp, "Block format '%s' used by device '%s' " - "does not support internal snapshot deletion", - drv->format_name, bdrv_get_device_name(bs)); - ret = -ENOTSUP; - } - - bdrv_drained_end(bs); - return ret; -} - -int bdrv_snapshot_delete_by_id_or_name(BlockDriverState *bs, - const char *id_or_name, - Error **errp) -{ - int ret; - Error *local_err = NULL; - - ret = bdrv_snapshot_delete(bs, id_or_name, NULL, &local_err); - if (ret == -ENOENT || ret == -EINVAL) { - error_free(local_err); - local_err = NULL; - ret = bdrv_snapshot_delete(bs, NULL, id_or_name, &local_err); - } - - if (ret < 0) { - error_propagate(errp, local_err); - } - return ret; -} - -int bdrv_snapshot_list(BlockDriverState *bs, - QEMUSnapshotInfo **psn_info) -{ - BlockDriver *drv = bs->drv; - if (!drv) { - return -ENOMEDIUM; - } - if (drv->bdrv_snapshot_list) { - return drv->bdrv_snapshot_list(bs, psn_info); - } - if (bs->file) { - return bdrv_snapshot_list(bs->file->bs, psn_info); - } - return -ENOTSUP; -} - -/** - * Temporarily load an internal snapshot by @snapshot_id and @name. 
- * @bs: block device used in the operation - * @snapshot_id: unique snapshot ID, or NULL - * @name: snapshot name, or NULL - * @errp: location to store error - * - * If both @snapshot_id and @name are specified, load the first one with - * id @snapshot_id and name @name. - * If only @snapshot_id is specified, load the first one with id - * @snapshot_id. - * If only @name is specified, load the first one with name @name. - * if none is specified, return -EINVAL. - * - * Returns: 0 on success, -errno on fail. If @bs is not inserted, return - * -ENOMEDIUM. If @bs is not readonly, return -EINVAL. If @bs did not support - * internal snapshot, return -ENOTSUP. If qemu can't find a matching @id and - * @name, return -ENOENT. If @errp != NULL, it will always be filled on - * failure. - */ -int bdrv_snapshot_load_tmp(BlockDriverState *bs, - const char *snapshot_id, - const char *name, - Error **errp) -{ - BlockDriver *drv = bs->drv; - - if (!drv) { - error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); - return -ENOMEDIUM; - } - if (!snapshot_id && !name) { - error_setg(errp, "snapshot_id and name are both NULL"); - return -EINVAL; - } - if (!bs->read_only) { - error_setg(errp, "Device is not readonly"); - return -EINVAL; - } - if (drv->bdrv_snapshot_load_tmp) { - return drv->bdrv_snapshot_load_tmp(bs, snapshot_id, name, errp); - } - error_setg(errp, "Block format '%s' used by device '%s' " - "does not support temporarily loading internal snapshots", - drv->format_name, bdrv_get_device_name(bs)); - return -ENOTSUP; -} - -int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, - const char *id_or_name, - Error **errp) -{ - int ret; - Error *local_err = NULL; - - ret = bdrv_snapshot_load_tmp(bs, id_or_name, NULL, &local_err); - if (ret == -ENOENT || ret == -EINVAL) { - error_free(local_err); - local_err = NULL; - ret = bdrv_snapshot_load_tmp(bs, NULL, id_or_name, &local_err); - } - - if (local_err) { - error_propagate(errp, local_err); - } - - return 
ret; -} - - -/* Group operations. All block drivers are involved. - * These functions will properly handle dataplane (take aio_context_acquire - * when appropriate for appropriate block drivers) */ - -bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs) -{ - bool ok = true; - BlockDriverState *bs = NULL; - - while (ok && (bs = bdrv_next(bs))) { - AioContext *ctx = bdrv_get_aio_context(bs); - - aio_context_acquire(ctx); - if (bdrv_is_inserted(bs) && !bdrv_is_read_only(bs)) { - ok = bdrv_can_snapshot(bs); - } - aio_context_release(ctx); - } - - *first_bad_bs = bs; - return ok; -} - -int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, - Error **err) -{ - int ret = 0; - BlockDriverState *bs = NULL; - QEMUSnapshotInfo sn1, *snapshot = &sn1; - - while (ret == 0 && (bs = bdrv_next(bs))) { - AioContext *ctx = bdrv_get_aio_context(bs); - - aio_context_acquire(ctx); - if (bdrv_can_snapshot(bs) && - bdrv_snapshot_find(bs, snapshot, name) >= 0) { - ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err); - } - aio_context_release(ctx); - } - - *first_bad_bs = bs; - return ret; -} - - -int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs) -{ - int err = 0; - BlockDriverState *bs = NULL; - - while (err == 0 && (bs = bdrv_next(bs))) { - AioContext *ctx = bdrv_get_aio_context(bs); - - aio_context_acquire(ctx); - if (bdrv_can_snapshot(bs)) { - err = bdrv_snapshot_goto(bs, name); - } - aio_context_release(ctx); - } - - *first_bad_bs = bs; - return err; -} - -int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs) -{ - QEMUSnapshotInfo sn; - int err = 0; - BlockDriverState *bs = NULL; - - while (err == 0 && (bs = bdrv_next(bs))) { - AioContext *ctx = bdrv_get_aio_context(bs); - - aio_context_acquire(ctx); - if (bdrv_can_snapshot(bs)) { - err = bdrv_snapshot_find(bs, &sn, name); - } - aio_context_release(ctx); - } - - *first_bad_bs = bs; - return err; -} - -int bdrv_all_create_snapshot(QEMUSnapshotInfo 
*sn, - BlockDriverState *vm_state_bs, - uint64_t vm_state_size, - BlockDriverState **first_bad_bs) -{ - int err = 0; - BlockDriverState *bs = NULL; - - while (err == 0 && (bs = bdrv_next(bs))) { - AioContext *ctx = bdrv_get_aio_context(bs); - - aio_context_acquire(ctx); - if (bs == vm_state_bs) { - sn->vm_state_size = vm_state_size; - err = bdrv_snapshot_create(bs, sn); - } else if (bdrv_can_snapshot(bs)) { - sn->vm_state_size = 0; - err = bdrv_snapshot_create(bs, sn); - } - aio_context_release(ctx); - } - - *first_bad_bs = bs; - return err; -} - -BlockDriverState *bdrv_all_find_vmstate_bs(void) -{ - bool not_found = true; - BlockDriverState *bs = NULL; - - while (not_found && (bs = bdrv_next(bs))) { - AioContext *ctx = bdrv_get_aio_context(bs); - - aio_context_acquire(ctx); - not_found = !bdrv_can_snapshot(bs); - aio_context_release(ctx); - } - return bs; -} diff --git a/qemu/block/ssh.c b/qemu/block/ssh.c deleted file mode 100644 index 06928ed93..000000000 --- a/qemu/block/ssh.c +++ /dev/null @@ -1,1111 +0,0 @@ -/* - * Secure Shell (ssh) backend for QEMU. - * - * Copyright (C) 2013 Red Hat Inc., Richard W.M. Jones - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include "qemu/osdep.h" - -#include -#include - -#include "block/block_int.h" -#include "qapi/error.h" -#include "qemu/error-report.h" -#include "qemu/sockets.h" -#include "qemu/uri.h" -#include "qapi/qmp/qint.h" -#include "qapi/qmp/qstring.h" - -/* DEBUG_SSH=1 enables the DPRINTF (debugging printf) statements in - * this block driver code. - * - * TRACE_LIBSSH2= enables tracing in libssh2 itself. Note - * that this requires that libssh2 was specially compiled with the - * `./configure --enable-debug' option, so most likely you will have - * to compile it yourself. The meaning of is described - * here: http://www.libssh2.org/libssh2_trace.html - */ -#define DEBUG_SSH 0 -#define TRACE_LIBSSH2 0 /* or try: LIBSSH2_TRACE_SFTP */ - -#define DPRINTF(fmt, ...) \ - do { \ - if (DEBUG_SSH) { \ - fprintf(stderr, "ssh: %-15s " fmt "\n", \ - __func__, ##__VA_ARGS__); \ - } \ - } while (0) - -typedef struct BDRVSSHState { - /* Coroutine. */ - CoMutex lock; - - /* SSH connection. */ - int sock; /* socket */ - LIBSSH2_SESSION *session; /* ssh session */ - LIBSSH2_SFTP *sftp; /* sftp session */ - LIBSSH2_SFTP_HANDLE *sftp_handle; /* sftp remote file handle */ - - /* See ssh_seek() function below. */ - int64_t offset; - bool offset_op_read; - - /* File attributes at open. We try to keep the .filesize field - * updated if it changes (eg by writing at the end of the file). - */ - LIBSSH2_SFTP_ATTRIBUTES attrs; - - /* Used to warn if 'flush' is not supported. 
*/ - char *hostport; - bool unsafe_flush_warning; -} BDRVSSHState; - -static void ssh_state_init(BDRVSSHState *s) -{ - memset(s, 0, sizeof *s); - s->sock = -1; - s->offset = -1; - qemu_co_mutex_init(&s->lock); -} - -static void ssh_state_free(BDRVSSHState *s) -{ - g_free(s->hostport); - if (s->sftp_handle) { - libssh2_sftp_close(s->sftp_handle); - } - if (s->sftp) { - libssh2_sftp_shutdown(s->sftp); - } - if (s->session) { - libssh2_session_disconnect(s->session, - "from qemu ssh client: " - "user closed the connection"); - libssh2_session_free(s->session); - } - if (s->sock >= 0) { - close(s->sock); - } -} - -static void GCC_FMT_ATTR(3, 4) -session_error_setg(Error **errp, BDRVSSHState *s, const char *fs, ...) -{ - va_list args; - char *msg; - - va_start(args, fs); - msg = g_strdup_vprintf(fs, args); - va_end(args); - - if (s->session) { - char *ssh_err; - int ssh_err_code; - - /* This is not an errno. See . */ - ssh_err_code = libssh2_session_last_error(s->session, - &ssh_err, NULL, 0); - error_setg(errp, "%s: %s (libssh2 error code: %d)", - msg, ssh_err, ssh_err_code); - } else { - error_setg(errp, "%s", msg); - } - g_free(msg); -} - -static void GCC_FMT_ATTR(3, 4) -sftp_error_setg(Error **errp, BDRVSSHState *s, const char *fs, ...) -{ - va_list args; - char *msg; - - va_start(args, fs); - msg = g_strdup_vprintf(fs, args); - va_end(args); - - if (s->sftp) { - char *ssh_err; - int ssh_err_code; - unsigned long sftp_err_code; - - /* This is not an errno. See . */ - ssh_err_code = libssh2_session_last_error(s->session, - &ssh_err, NULL, 0); - /* See . */ - sftp_err_code = libssh2_sftp_last_error((s)->sftp); - - error_setg(errp, - "%s: %s (libssh2 error code: %d, sftp error code: %lu)", - msg, ssh_err, ssh_err_code, sftp_err_code); - } else { - error_setg(errp, "%s", msg); - } - g_free(msg); -} - -static void GCC_FMT_ATTR(2, 3) -sftp_error_report(BDRVSSHState *s, const char *fs, ...) 
-{ - va_list args; - - va_start(args, fs); - error_vprintf(fs, args); - - if ((s)->sftp) { - char *ssh_err; - int ssh_err_code; - unsigned long sftp_err_code; - - /* This is not an errno. See . */ - ssh_err_code = libssh2_session_last_error(s->session, - &ssh_err, NULL, 0); - /* See . */ - sftp_err_code = libssh2_sftp_last_error((s)->sftp); - - error_printf(": %s (libssh2 error code: %d, sftp error code: %lu)", - ssh_err, ssh_err_code, sftp_err_code); - } - - va_end(args); - error_printf("\n"); -} - -static int parse_uri(const char *filename, QDict *options, Error **errp) -{ - URI *uri = NULL; - QueryParams *qp; - int i; - - uri = uri_parse(filename); - if (!uri) { - return -EINVAL; - } - - if (strcmp(uri->scheme, "ssh") != 0) { - error_setg(errp, "URI scheme must be 'ssh'"); - goto err; - } - - if (!uri->server || strcmp(uri->server, "") == 0) { - error_setg(errp, "missing hostname in URI"); - goto err; - } - - if (!uri->path || strcmp(uri->path, "") == 0) { - error_setg(errp, "missing remote path in URI"); - goto err; - } - - qp = query_params_parse(uri->query); - if (!qp) { - error_setg(errp, "could not parse query parameters"); - goto err; - } - - if(uri->user && strcmp(uri->user, "") != 0) { - qdict_put(options, "user", qstring_from_str(uri->user)); - } - - qdict_put(options, "host", qstring_from_str(uri->server)); - - if (uri->port) { - qdict_put(options, "port", qint_from_int(uri->port)); - } - - qdict_put(options, "path", qstring_from_str(uri->path)); - - /* Pick out any query parameters that we understand, and ignore - * the rest. 
- */ - for (i = 0; i < qp->n; ++i) { - if (strcmp(qp->p[i].name, "host_key_check") == 0) { - qdict_put(options, "host_key_check", - qstring_from_str(qp->p[i].value)); - } - } - - query_params_free(qp); - uri_free(uri); - return 0; - - err: - if (uri) { - uri_free(uri); - } - return -EINVAL; -} - -static void ssh_parse_filename(const char *filename, QDict *options, - Error **errp) -{ - if (qdict_haskey(options, "user") || - qdict_haskey(options, "host") || - qdict_haskey(options, "port") || - qdict_haskey(options, "path") || - qdict_haskey(options, "host_key_check")) { - error_setg(errp, "user, host, port, path, host_key_check cannot be used at the same time as a file option"); - return; - } - - parse_uri(filename, options, errp); -} - -static int check_host_key_knownhosts(BDRVSSHState *s, - const char *host, int port, Error **errp) -{ - const char *home; - char *knh_file = NULL; - LIBSSH2_KNOWNHOSTS *knh = NULL; - struct libssh2_knownhost *found; - int ret, r; - const char *hostkey; - size_t len; - int type; - - hostkey = libssh2_session_hostkey(s->session, &len, &type); - if (!hostkey) { - ret = -EINVAL; - session_error_setg(errp, s, "failed to read remote host key"); - goto out; - } - - knh = libssh2_knownhost_init(s->session); - if (!knh) { - ret = -EINVAL; - session_error_setg(errp, s, - "failed to initialize known hosts support"); - goto out; - } - - home = getenv("HOME"); - if (home) { - knh_file = g_strdup_printf("%s/.ssh/known_hosts", home); - } else { - knh_file = g_strdup_printf("/root/.ssh/known_hosts"); - } - - /* Read all known hosts from OpenSSH-style known_hosts file. 
*/ - libssh2_knownhost_readfile(knh, knh_file, LIBSSH2_KNOWNHOST_FILE_OPENSSH); - - r = libssh2_knownhost_checkp(knh, host, port, hostkey, len, - LIBSSH2_KNOWNHOST_TYPE_PLAIN| - LIBSSH2_KNOWNHOST_KEYENC_RAW, - &found); - switch (r) { - case LIBSSH2_KNOWNHOST_CHECK_MATCH: - /* OK */ - DPRINTF("host key OK: %s", found->key); - break; - case LIBSSH2_KNOWNHOST_CHECK_MISMATCH: - ret = -EINVAL; - session_error_setg(errp, s, - "host key does not match the one in known_hosts" - " (found key %s)", found->key); - goto out; - case LIBSSH2_KNOWNHOST_CHECK_NOTFOUND: - ret = -EINVAL; - session_error_setg(errp, s, "no host key was found in known_hosts"); - goto out; - case LIBSSH2_KNOWNHOST_CHECK_FAILURE: - ret = -EINVAL; - session_error_setg(errp, s, - "failure matching the host key with known_hosts"); - goto out; - default: - ret = -EINVAL; - session_error_setg(errp, s, "unknown error matching the host key" - " with known_hosts (%d)", r); - goto out; - } - - /* known_hosts checking successful. */ - ret = 0; - - out: - if (knh != NULL) { - libssh2_knownhost_free(knh); - } - g_free(knh_file); - return ret; -} - -static unsigned hex2decimal(char ch) -{ - if (ch >= '0' && ch <= '9') { - return (ch - '0'); - } else if (ch >= 'a' && ch <= 'f') { - return 10 + (ch - 'a'); - } else if (ch >= 'A' && ch <= 'F') { - return 10 + (ch - 'A'); - } - - return -1; -} - -/* Compare the binary fingerprint (hash of host key) with the - * host_key_check parameter. 
- */ -static int compare_fingerprint(const unsigned char *fingerprint, size_t len, - const char *host_key_check) -{ - unsigned c; - - while (len > 0) { - while (*host_key_check == ':') - host_key_check++; - if (!qemu_isxdigit(host_key_check[0]) || - !qemu_isxdigit(host_key_check[1])) - return 1; - c = hex2decimal(host_key_check[0]) * 16 + - hex2decimal(host_key_check[1]); - if (c - *fingerprint != 0) - return c - *fingerprint; - fingerprint++; - len--; - host_key_check += 2; - } - return *host_key_check - '\0'; -} - -static int -check_host_key_hash(BDRVSSHState *s, const char *hash, - int hash_type, size_t fingerprint_len, Error **errp) -{ - const char *fingerprint; - - fingerprint = libssh2_hostkey_hash(s->session, hash_type); - if (!fingerprint) { - session_error_setg(errp, s, "failed to read remote host key"); - return -EINVAL; - } - - if(compare_fingerprint((unsigned char *) fingerprint, fingerprint_len, - hash) != 0) { - error_setg(errp, "remote host key does not match host_key_check '%s'", - hash); - return -EPERM; - } - - return 0; -} - -static int check_host_key(BDRVSSHState *s, const char *host, int port, - const char *host_key_check, Error **errp) -{ - /* host_key_check=no */ - if (strcmp(host_key_check, "no") == 0) { - return 0; - } - - /* host_key_check=md5:xx:yy:zz:... */ - if (strncmp(host_key_check, "md5:", 4) == 0) { - return check_host_key_hash(s, &host_key_check[4], - LIBSSH2_HOSTKEY_HASH_MD5, 16, errp); - } - - /* host_key_check=sha1:xx:yy:zz:... 
*/ - if (strncmp(host_key_check, "sha1:", 5) == 0) { - return check_host_key_hash(s, &host_key_check[5], - LIBSSH2_HOSTKEY_HASH_SHA1, 20, errp); - } - - /* host_key_check=yes */ - if (strcmp(host_key_check, "yes") == 0) { - return check_host_key_knownhosts(s, host, port, errp); - } - - error_setg(errp, "unknown host_key_check setting (%s)", host_key_check); - return -EINVAL; -} - -static int authenticate(BDRVSSHState *s, const char *user, Error **errp) -{ - int r, ret; - const char *userauthlist; - LIBSSH2_AGENT *agent = NULL; - struct libssh2_agent_publickey *identity; - struct libssh2_agent_publickey *prev_identity = NULL; - - userauthlist = libssh2_userauth_list(s->session, user, strlen(user)); - if (strstr(userauthlist, "publickey") == NULL) { - ret = -EPERM; - error_setg(errp, - "remote server does not support \"publickey\" authentication"); - goto out; - } - - /* Connect to ssh-agent and try each identity in turn. */ - agent = libssh2_agent_init(s->session); - if (!agent) { - ret = -EINVAL; - session_error_setg(errp, s, "failed to initialize ssh-agent support"); - goto out; - } - if (libssh2_agent_connect(agent)) { - ret = -ECONNREFUSED; - session_error_setg(errp, s, "failed to connect to ssh-agent"); - goto out; - } - if (libssh2_agent_list_identities(agent)) { - ret = -EINVAL; - session_error_setg(errp, s, - "failed requesting identities from ssh-agent"); - goto out; - } - - for(;;) { - r = libssh2_agent_get_identity(agent, &identity, prev_identity); - if (r == 1) { /* end of list */ - break; - } - if (r < 0) { - ret = -EINVAL; - session_error_setg(errp, s, - "failed to obtain identity from ssh-agent"); - goto out; - } - r = libssh2_agent_userauth(agent, user, identity); - if (r == 0) { - /* Authenticated! */ - ret = 0; - goto out; - } - /* Failed to authenticate with this identity, try the next one. 
*/ - prev_identity = identity; - } - - ret = -EPERM; - error_setg(errp, "failed to authenticate using publickey authentication " - "and the identities held by your ssh-agent"); - - out: - if (agent != NULL) { - /* Note: libssh2 implementation implicitly calls - * libssh2_agent_disconnect if necessary. - */ - libssh2_agent_free(agent); - } - - return ret; -} - -static int connect_to_ssh(BDRVSSHState *s, QDict *options, - int ssh_flags, int creat_mode, Error **errp) -{ - int r, ret; - const char *host, *user, *path, *host_key_check; - int port; - - if (!qdict_haskey(options, "host")) { - ret = -EINVAL; - error_setg(errp, "No hostname was specified"); - goto err; - } - host = qdict_get_str(options, "host"); - - if (qdict_haskey(options, "port")) { - port = qdict_get_int(options, "port"); - } else { - port = 22; - } - - if (!qdict_haskey(options, "path")) { - ret = -EINVAL; - error_setg(errp, "No path was specified"); - goto err; - } - path = qdict_get_str(options, "path"); - - if (qdict_haskey(options, "user")) { - user = qdict_get_str(options, "user"); - } else { - user = g_get_user_name(); - if (!user) { - error_setg_errno(errp, errno, "Can't get user name"); - ret = -errno; - goto err; - } - } - - if (qdict_haskey(options, "host_key_check")) { - host_key_check = qdict_get_str(options, "host_key_check"); - } else { - host_key_check = "yes"; - } - - /* Construct the host:port name for inet_connect. */ - g_free(s->hostport); - s->hostport = g_strdup_printf("%s:%d", host, port); - - /* Open the socket and connect. */ - s->sock = inet_connect(s->hostport, errp); - if (s->sock < 0) { - ret = -EIO; - goto err; - } - - /* Create SSH session. 
*/ - s->session = libssh2_session_init(); - if (!s->session) { - ret = -EINVAL; - session_error_setg(errp, s, "failed to initialize libssh2 session"); - goto err; - } - -#if TRACE_LIBSSH2 != 0 - libssh2_trace(s->session, TRACE_LIBSSH2); -#endif - - r = libssh2_session_handshake(s->session, s->sock); - if (r != 0) { - ret = -EINVAL; - session_error_setg(errp, s, "failed to establish SSH session"); - goto err; - } - - /* Check the remote host's key against known_hosts. */ - ret = check_host_key(s, host, port, host_key_check, errp); - if (ret < 0) { - goto err; - } - - /* Authenticate. */ - ret = authenticate(s, user, errp); - if (ret < 0) { - goto err; - } - - /* Start SFTP. */ - s->sftp = libssh2_sftp_init(s->session); - if (!s->sftp) { - session_error_setg(errp, s, "failed to initialize sftp handle"); - ret = -EINVAL; - goto err; - } - - /* Open the remote file. */ - DPRINTF("opening file %s flags=0x%x creat_mode=0%o", - path, ssh_flags, creat_mode); - s->sftp_handle = libssh2_sftp_open(s->sftp, path, ssh_flags, creat_mode); - if (!s->sftp_handle) { - session_error_setg(errp, s, "failed to open remote file '%s'", path); - ret = -EINVAL; - goto err; - } - - r = libssh2_sftp_fstat(s->sftp_handle, &s->attrs); - if (r < 0) { - sftp_error_setg(errp, s, "failed to read file attributes"); - return -EINVAL; - } - - /* Delete the options we've used; any not deleted will cause the - * block layer to give an error about unused options. 
- */ - qdict_del(options, "host"); - qdict_del(options, "port"); - qdict_del(options, "user"); - qdict_del(options, "path"); - qdict_del(options, "host_key_check"); - - return 0; - - err: - if (s->sftp_handle) { - libssh2_sftp_close(s->sftp_handle); - } - s->sftp_handle = NULL; - if (s->sftp) { - libssh2_sftp_shutdown(s->sftp); - } - s->sftp = NULL; - if (s->session) { - libssh2_session_disconnect(s->session, - "from qemu ssh client: " - "error opening connection"); - libssh2_session_free(s->session); - } - s->session = NULL; - - return ret; -} - -static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags, - Error **errp) -{ - BDRVSSHState *s = bs->opaque; - int ret; - int ssh_flags; - - ssh_state_init(s); - - ssh_flags = LIBSSH2_FXF_READ; - if (bdrv_flags & BDRV_O_RDWR) { - ssh_flags |= LIBSSH2_FXF_WRITE; - } - - /* Start up SSH. */ - ret = connect_to_ssh(s, options, ssh_flags, 0, errp); - if (ret < 0) { - goto err; - } - - /* Go non-blocking. */ - libssh2_session_set_blocking(s->session, 0); - - return 0; - - err: - if (s->sock >= 0) { - close(s->sock); - } - s->sock = -1; - - return ret; -} - -static QemuOptsList ssh_create_opts = { - .name = "ssh-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(ssh_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { /* end of list */ } - } -}; - -static int ssh_create(const char *filename, QemuOpts *opts, Error **errp) -{ - int r, ret; - int64_t total_size = 0; - QDict *uri_options = NULL; - BDRVSSHState s; - ssize_t r2; - char c[1] = { '\0' }; - - ssh_state_init(&s); - - /* Get desired file size. 
*/ - total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - DPRINTF("total_size=%" PRIi64, total_size); - - uri_options = qdict_new(); - r = parse_uri(filename, uri_options, errp); - if (r < 0) { - ret = r; - goto out; - } - - r = connect_to_ssh(&s, uri_options, - LIBSSH2_FXF_READ|LIBSSH2_FXF_WRITE| - LIBSSH2_FXF_CREAT|LIBSSH2_FXF_TRUNC, - 0644, errp); - if (r < 0) { - ret = r; - goto out; - } - - if (total_size > 0) { - libssh2_sftp_seek64(s.sftp_handle, total_size-1); - r2 = libssh2_sftp_write(s.sftp_handle, c, 1); - if (r2 < 0) { - sftp_error_setg(errp, &s, "truncate failed"); - ret = -EINVAL; - goto out; - } - s.attrs.filesize = total_size; - } - - ret = 0; - - out: - ssh_state_free(&s); - if (uri_options != NULL) { - QDECREF(uri_options); - } - return ret; -} - -static void ssh_close(BlockDriverState *bs) -{ - BDRVSSHState *s = bs->opaque; - - ssh_state_free(s); -} - -static int ssh_has_zero_init(BlockDriverState *bs) -{ - BDRVSSHState *s = bs->opaque; - /* Assume false, unless we can positively prove it's true. 
*/ - int has_zero_init = 0; - - if (s->attrs.flags & LIBSSH2_SFTP_ATTR_PERMISSIONS) { - if (s->attrs.permissions & LIBSSH2_SFTP_S_IFREG) { - has_zero_init = 1; - } - } - - return has_zero_init; -} - -static void restart_coroutine(void *opaque) -{ - Coroutine *co = opaque; - - DPRINTF("co=%p", co); - - qemu_coroutine_enter(co, NULL); -} - -static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs) -{ - int r; - IOHandler *rd_handler = NULL, *wr_handler = NULL; - Coroutine *co = qemu_coroutine_self(); - - r = libssh2_session_block_directions(s->session); - - if (r & LIBSSH2_SESSION_BLOCK_INBOUND) { - rd_handler = restart_coroutine; - } - if (r & LIBSSH2_SESSION_BLOCK_OUTBOUND) { - wr_handler = restart_coroutine; - } - - DPRINTF("s->sock=%d rd_handler=%p wr_handler=%p", s->sock, - rd_handler, wr_handler); - - aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, - false, rd_handler, wr_handler, co); -} - -static coroutine_fn void clear_fd_handler(BDRVSSHState *s, - BlockDriverState *bs) -{ - DPRINTF("s->sock=%d", s->sock); - aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, - false, NULL, NULL, NULL); -} - -/* A non-blocking call returned EAGAIN, so yield, ensuring the - * handlers are set up so that we'll be rescheduled when there is an - * interesting event on the socket. - */ -static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs) -{ - set_fd_handler(s, bs); - qemu_coroutine_yield(); - clear_fd_handler(s, bs); -} - -/* SFTP has a function `libssh2_sftp_seek64' which seeks to a position - * in the remote file. Notice that it just updates a field in the - * sftp_handle structure, so there is no network traffic and it cannot - * fail. - * - * However, `libssh2_sftp_seek64' does have a catastrophic effect on - * performance since it causes the handle to throw away all in-flight - * reads and buffered readahead data. Therefore this function tries - * to be intelligent about when to call the underlying libssh2 function. 
- */ -#define SSH_SEEK_WRITE 0 -#define SSH_SEEK_READ 1 -#define SSH_SEEK_FORCE 2 - -static void ssh_seek(BDRVSSHState *s, int64_t offset, int flags) -{ - bool op_read = (flags & SSH_SEEK_READ) != 0; - bool force = (flags & SSH_SEEK_FORCE) != 0; - - if (force || op_read != s->offset_op_read || offset != s->offset) { - DPRINTF("seeking to offset=%" PRIi64, offset); - libssh2_sftp_seek64(s->sftp_handle, offset); - s->offset = offset; - s->offset_op_read = op_read; - } -} - -static coroutine_fn int ssh_read(BDRVSSHState *s, BlockDriverState *bs, - int64_t offset, size_t size, - QEMUIOVector *qiov) -{ - ssize_t r; - size_t got; - char *buf, *end_of_vec; - struct iovec *i; - - DPRINTF("offset=%" PRIi64 " size=%zu", offset, size); - - ssh_seek(s, offset, SSH_SEEK_READ); - - /* This keeps track of the current iovec element ('i'), where we - * will write to next ('buf'), and the end of the current iovec - * ('end_of_vec'). - */ - i = &qiov->iov[0]; - buf = i->iov_base; - end_of_vec = i->iov_base + i->iov_len; - - /* libssh2 has a hard-coded limit of 2000 bytes per request, - * although it will also do readahead behind our backs. Therefore - * we may have to do repeated reads here until we have read 'size' - * bytes. - */ - for (got = 0; got < size; ) { - again: - DPRINTF("sftp_read buf=%p size=%zu", buf, end_of_vec - buf); - r = libssh2_sftp_read(s->sftp_handle, buf, end_of_vec - buf); - DPRINTF("sftp_read returned %zd", r); - - if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) { - co_yield(s, bs); - goto again; - } - if (r < 0) { - sftp_error_report(s, "read failed"); - s->offset = -1; - return -EIO; - } - if (r == 0) { - /* EOF: Short read so pad the buffer with zeroes and return it. 
*/ - qemu_iovec_memset(qiov, got, 0, size - got); - return 0; - } - - got += r; - buf += r; - s->offset += r; - if (buf >= end_of_vec && got < size) { - i++; - buf = i->iov_base; - end_of_vec = i->iov_base + i->iov_len; - } - } - - return 0; -} - -static coroutine_fn int ssh_co_readv(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - BDRVSSHState *s = bs->opaque; - int ret; - - qemu_co_mutex_lock(&s->lock); - ret = ssh_read(s, bs, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE, qiov); - qemu_co_mutex_unlock(&s->lock); - - return ret; -} - -static int ssh_write(BDRVSSHState *s, BlockDriverState *bs, - int64_t offset, size_t size, - QEMUIOVector *qiov) -{ - ssize_t r; - size_t written; - char *buf, *end_of_vec; - struct iovec *i; - - DPRINTF("offset=%" PRIi64 " size=%zu", offset, size); - - ssh_seek(s, offset, SSH_SEEK_WRITE); - - /* This keeps track of the current iovec element ('i'), where we - * will read from next ('buf'), and the end of the current iovec - * ('end_of_vec'). - */ - i = &qiov->iov[0]; - buf = i->iov_base; - end_of_vec = i->iov_base + i->iov_len; - - for (written = 0; written < size; ) { - again: - DPRINTF("sftp_write buf=%p size=%zu", buf, end_of_vec - buf); - r = libssh2_sftp_write(s->sftp_handle, buf, end_of_vec - buf); - DPRINTF("sftp_write returned %zd", r); - - if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) { - co_yield(s, bs); - goto again; - } - if (r < 0) { - sftp_error_report(s, "write failed"); - s->offset = -1; - return -EIO; - } - /* The libssh2 API is very unclear about this. A comment in - * the code says "nothing was acked, and no EAGAIN was - * received!" which apparently means that no data got sent - * out, and the underlying channel didn't return any EAGAIN - * indication. I think this is a bug in either libssh2 or - * OpenSSH (server-side). In any case, forcing a seek (to - * discard libssh2 internal buffers), and then trying again - * works for me. 
- */ - if (r == 0) { - ssh_seek(s, offset + written, SSH_SEEK_WRITE|SSH_SEEK_FORCE); - co_yield(s, bs); - goto again; - } - - written += r; - buf += r; - s->offset += r; - if (buf >= end_of_vec && written < size) { - i++; - buf = i->iov_base; - end_of_vec = i->iov_base + i->iov_len; - } - - if (offset + written > s->attrs.filesize) - s->attrs.filesize = offset + written; - } - - return 0; -} - -static coroutine_fn int ssh_co_writev(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - BDRVSSHState *s = bs->opaque; - int ret; - - qemu_co_mutex_lock(&s->lock); - ret = ssh_write(s, bs, sector_num * BDRV_SECTOR_SIZE, - nb_sectors * BDRV_SECTOR_SIZE, qiov); - qemu_co_mutex_unlock(&s->lock); - - return ret; -} - -static void unsafe_flush_warning(BDRVSSHState *s, const char *what) -{ - if (!s->unsafe_flush_warning) { - error_report("warning: ssh server %s does not support fsync", - s->hostport); - if (what) { - error_report("to support fsync, you need %s", what); - } - s->unsafe_flush_warning = true; - } -} - -#ifdef HAS_LIBSSH2_SFTP_FSYNC - -static coroutine_fn int ssh_flush(BDRVSSHState *s, BlockDriverState *bs) -{ - int r; - - DPRINTF("fsync"); - again: - r = libssh2_sftp_fsync(s->sftp_handle); - if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) { - co_yield(s, bs); - goto again; - } - if (r == LIBSSH2_ERROR_SFTP_PROTOCOL && - libssh2_sftp_last_error(s->sftp) == LIBSSH2_FX_OP_UNSUPPORTED) { - unsafe_flush_warning(s, "OpenSSH >= 6.3"); - return 0; - } - if (r < 0) { - sftp_error_report(s, "fsync failed"); - return -EIO; - } - - return 0; -} - -static coroutine_fn int ssh_co_flush(BlockDriverState *bs) -{ - BDRVSSHState *s = bs->opaque; - int ret; - - qemu_co_mutex_lock(&s->lock); - ret = ssh_flush(s, bs); - qemu_co_mutex_unlock(&s->lock); - - return ret; -} - -#else /* !HAS_LIBSSH2_SFTP_FSYNC */ - -static coroutine_fn int ssh_co_flush(BlockDriverState *bs) -{ - BDRVSSHState *s = bs->opaque; - - unsafe_flush_warning(s, "libssh2 
>= 1.4.4"); - return 0; -} - -#endif /* !HAS_LIBSSH2_SFTP_FSYNC */ - -static int64_t ssh_getlength(BlockDriverState *bs) -{ - BDRVSSHState *s = bs->opaque; - int64_t length; - - /* Note we cannot make a libssh2 call here. */ - length = (int64_t) s->attrs.filesize; - DPRINTF("length=%" PRIi64, length); - - return length; -} - -static BlockDriver bdrv_ssh = { - .format_name = "ssh", - .protocol_name = "ssh", - .instance_size = sizeof(BDRVSSHState), - .bdrv_parse_filename = ssh_parse_filename, - .bdrv_file_open = ssh_file_open, - .bdrv_create = ssh_create, - .bdrv_close = ssh_close, - .bdrv_has_zero_init = ssh_has_zero_init, - .bdrv_co_readv = ssh_co_readv, - .bdrv_co_writev = ssh_co_writev, - .bdrv_getlength = ssh_getlength, - .bdrv_co_flush_to_disk = ssh_co_flush, - .create_opts = &ssh_create_opts, -}; - -static void bdrv_ssh_init(void) -{ - int r; - - r = libssh2_init(0); - if (r != 0) { - fprintf(stderr, "libssh2 initialization failed, %d\n", r); - exit(EXIT_FAILURE); - } - - bdrv_register(&bdrv_ssh); -} - -block_init(bdrv_ssh_init); diff --git a/qemu/block/stream.c b/qemu/block/stream.c deleted file mode 100644 index 332b9a183..000000000 --- a/qemu/block/stream.c +++ /dev/null @@ -1,246 +0,0 @@ -/* - * Image streaming - * - * Copyright IBM, Corp. 2011 - * - * Authors: - * Stefan Hajnoczi - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -#include "qemu/osdep.h" -#include "trace.h" -#include "block/block_int.h" -#include "block/blockjob.h" -#include "qapi/error.h" -#include "qapi/qmp/qerror.h" -#include "qemu/ratelimit.h" -#include "sysemu/block-backend.h" - -enum { - /* - * Size of data buffer for populating the image file. This should be large - * enough to process multiple clusters in a single call, so that populating - * contiguous regions of the image is efficient. 
- */ - STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */ -}; - -#define SLICE_TIME 100000000ULL /* ns */ - -typedef struct StreamBlockJob { - BlockJob common; - RateLimit limit; - BlockDriverState *base; - BlockdevOnError on_error; - char *backing_file_str; -} StreamBlockJob; - -static int coroutine_fn stream_populate(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - void *buf) -{ - struct iovec iov = { - .iov_base = buf, - .iov_len = nb_sectors * BDRV_SECTOR_SIZE, - }; - QEMUIOVector qiov; - - qemu_iovec_init_external(&qiov, &iov, 1); - - /* Copy-on-read the unallocated clusters */ - return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov); -} - -typedef struct { - int ret; - bool reached_end; -} StreamCompleteData; - -static void stream_complete(BlockJob *job, void *opaque) -{ - StreamBlockJob *s = container_of(job, StreamBlockJob, common); - StreamCompleteData *data = opaque; - BlockDriverState *base = s->base; - - if (!block_job_is_cancelled(&s->common) && data->reached_end && - data->ret == 0) { - const char *base_id = NULL, *base_fmt = NULL; - if (base) { - base_id = s->backing_file_str; - if (base->drv) { - base_fmt = base->drv->format_name; - } - } - data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt); - bdrv_set_backing_hd(job->bs, base); - } - - g_free(s->backing_file_str); - block_job_completed(&s->common, data->ret); - g_free(data); -} - -static void coroutine_fn stream_run(void *opaque) -{ - StreamBlockJob *s = opaque; - StreamCompleteData *data; - BlockDriverState *bs = s->common.bs; - BlockDriverState *base = s->base; - int64_t sector_num = 0; - int64_t end = -1; - int error = 0; - int ret = 0; - int n = 0; - void *buf; - - if (!bs->backing) { - goto out; - } - - s->common.len = bdrv_getlength(bs); - if (s->common.len < 0) { - ret = s->common.len; - goto out; - } - - end = s->common.len >> BDRV_SECTOR_BITS; - buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE); - - /* Turn on copy-on-read for the whole block device so that 
guest read - * requests help us make progress. Only do this when copying the entire - * backing chain since the copy-on-read operation does not take base into - * account. - */ - if (!base) { - bdrv_enable_copy_on_read(bs); - } - - for (sector_num = 0; sector_num < end; sector_num += n) { - uint64_t delay_ns = 0; - bool copy; - -wait: - /* Note that even when no rate limit is applied we need to yield - * with no pending I/O here so that bdrv_drain_all() returns. - */ - block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); - if (block_job_is_cancelled(&s->common)) { - break; - } - - copy = false; - - ret = bdrv_is_allocated(bs, sector_num, - STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); - if (ret == 1) { - /* Allocated in the top, no need to copy. */ - } else if (ret >= 0) { - /* Copy if allocated in the intermediate images. Limit to the - * known-unallocated area [sector_num, sector_num+n). */ - ret = bdrv_is_allocated_above(backing_bs(bs), base, - sector_num, n, &n); - - /* Finish early if end of backing file has been reached */ - if (ret == 0 && n == 0) { - n = end - sector_num; - } - - copy = (ret == 1); - } - trace_stream_one_iteration(s, sector_num, n, ret); - if (copy) { - if (s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, n); - if (delay_ns > 0) { - goto wait; - } - } - ret = stream_populate(bs, sector_num, n, buf); - } - if (ret < 0) { - BlockErrorAction action = - block_job_error_action(&s->common, s->common.bs, s->on_error, - true, -ret); - if (action == BLOCK_ERROR_ACTION_STOP) { - n = 0; - continue; - } - if (error == 0) { - error = ret; - } - if (action == BLOCK_ERROR_ACTION_REPORT) { - break; - } - } - ret = 0; - - /* Publish progress */ - s->common.offset += n * BDRV_SECTOR_SIZE; - } - - if (!base) { - bdrv_disable_copy_on_read(bs); - } - - /* Do not remove the backing file if an error was there but ignored. 
*/ - ret = error; - - qemu_vfree(buf); - -out: - /* Modify backing chain and close BDSes in main loop */ - data = g_malloc(sizeof(*data)); - data->ret = ret; - data->reached_end = sector_num == end; - block_job_defer_to_main_loop(&s->common, stream_complete, data); -} - -static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp) -{ - StreamBlockJob *s = container_of(job, StreamBlockJob, common); - - if (speed < 0) { - error_setg(errp, QERR_INVALID_PARAMETER, "speed"); - return; - } - ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); -} - -static const BlockJobDriver stream_job_driver = { - .instance_size = sizeof(StreamBlockJob), - .job_type = BLOCK_JOB_TYPE_STREAM, - .set_speed = stream_set_speed, -}; - -void stream_start(BlockDriverState *bs, BlockDriverState *base, - const char *backing_file_str, int64_t speed, - BlockdevOnError on_error, - BlockCompletionFunc *cb, - void *opaque, Error **errp) -{ - StreamBlockJob *s; - - if ((on_error == BLOCKDEV_ON_ERROR_STOP || - on_error == BLOCKDEV_ON_ERROR_ENOSPC) && - (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { - error_setg(errp, QERR_INVALID_PARAMETER, "on-error"); - return; - } - - s = block_job_create(&stream_job_driver, bs, speed, cb, opaque, errp); - if (!s) { - return; - } - - s->base = base; - s->backing_file_str = g_strdup(backing_file_str); - - s->on_error = on_error; - s->common.co = qemu_coroutine_create(stream_run); - trace_stream_start(bs, base, s, s->common.co, opaque); - qemu_coroutine_enter(s->common.co, s); -} diff --git a/qemu/block/throttle-groups.c b/qemu/block/throttle-groups.c deleted file mode 100644 index 4920e0949..000000000 --- a/qemu/block/throttle-groups.c +++ /dev/null @@ -1,483 +0,0 @@ -/* - * QEMU block throttling group infrastructure - * - * Copyright (C) Nodalink, EURL. 2014 - * Copyright (C) Igalia, S.L. 
2015 - * - * Authors: - * Benoît Canet - * Alberto Garcia - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 or - * (at your option) version 3 of the License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, see . - */ - -#include "qemu/osdep.h" -#include "block/throttle-groups.h" -#include "qemu/queue.h" -#include "qemu/thread.h" -#include "sysemu/qtest.h" - -/* The ThrottleGroup structure (with its ThrottleState) is shared - * among different BlockDriverState and it's independent from - * AioContext, so in order to use it from different threads it needs - * its own locking. - * - * This locking is however handled internally in this file, so it's - * transparent to outside users. - * - * The whole ThrottleGroup structure is private and invisible to - * outside users, that only use it through its ThrottleState. - * - * In addition to the ThrottleGroup structure, BlockDriverState has - * fields that need to be accessed by other members of the group and - * therefore also need to be protected by this lock. Once a BDS is - * registered in a group those fields can be accessed by other threads - * any time. - * - * Again, all this is handled internally and is mostly transparent to - * the outside. The 'throttle_timers' field however has an additional - * constraint because it may be temporarily invalid (see for example - * bdrv_set_aio_context()). Therefore in this file a thread will - * access some other BDS's timers only after verifying that that BDS - * has throttled requests in the queue. 
- */ -typedef struct ThrottleGroup { - char *name; /* This is constant during the lifetime of the group */ - - QemuMutex lock; /* This lock protects the following four fields */ - ThrottleState ts; - QLIST_HEAD(, BlockDriverState) head; - BlockDriverState *tokens[2]; - bool any_timer_armed[2]; - - /* These two are protected by the global throttle_groups_lock */ - unsigned refcount; - QTAILQ_ENTRY(ThrottleGroup) list; -} ThrottleGroup; - -static QemuMutex throttle_groups_lock; -static QTAILQ_HEAD(, ThrottleGroup) throttle_groups = - QTAILQ_HEAD_INITIALIZER(throttle_groups); - -/* Increments the reference count of a ThrottleGroup given its name. - * - * If no ThrottleGroup is found with the given name a new one is - * created. - * - * @name: the name of the ThrottleGroup - * @ret: the ThrottleState member of the ThrottleGroup - */ -ThrottleState *throttle_group_incref(const char *name) -{ - ThrottleGroup *tg = NULL; - ThrottleGroup *iter; - - qemu_mutex_lock(&throttle_groups_lock); - - /* Look for an existing group with that name */ - QTAILQ_FOREACH(iter, &throttle_groups, list) { - if (!strcmp(name, iter->name)) { - tg = iter; - break; - } - } - - /* Create a new one if not found */ - if (!tg) { - tg = g_new0(ThrottleGroup, 1); - tg->name = g_strdup(name); - qemu_mutex_init(&tg->lock); - throttle_init(&tg->ts); - QLIST_INIT(&tg->head); - - QTAILQ_INSERT_TAIL(&throttle_groups, tg, list); - } - - tg->refcount++; - - qemu_mutex_unlock(&throttle_groups_lock); - - return &tg->ts; -} - -/* Decrease the reference count of a ThrottleGroup. - * - * When the reference count reaches zero the ThrottleGroup is - * destroyed. 
- * - * @ts: The ThrottleGroup to unref, given by its ThrottleState member - */ -void throttle_group_unref(ThrottleState *ts) -{ - ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); - - qemu_mutex_lock(&throttle_groups_lock); - if (--tg->refcount == 0) { - QTAILQ_REMOVE(&throttle_groups, tg, list); - qemu_mutex_destroy(&tg->lock); - g_free(tg->name); - g_free(tg); - } - qemu_mutex_unlock(&throttle_groups_lock); -} - -/* Get the name from a BlockDriverState's ThrottleGroup. The name (and - * the pointer) is guaranteed to remain constant during the lifetime - * of the group. - * - * @bs: a BlockDriverState that is member of a throttling group - * @ret: the name of the group. - */ -const char *throttle_group_get_name(BlockDriverState *bs) -{ - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); - return tg->name; -} - -/* Return the next BlockDriverState in the round-robin sequence, - * simulating a circular list. - * - * This assumes that tg->lock is held. - * - * @bs: the current BlockDriverState - * @ret: the next BlockDriverState in the sequence - */ -static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs) -{ - ThrottleState *ts = bs->throttle_state; - ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); - BlockDriverState *next = QLIST_NEXT(bs, round_robin); - - if (!next) { - return QLIST_FIRST(&tg->head); - } - - return next; -} - -/* Return the next BlockDriverState in the round-robin sequence with - * pending I/O requests. - * - * This assumes that tg->lock is held. - * - * @bs: the current BlockDriverState - * @is_write: the type of operation (read/write) - * @ret: the next BlockDriverState with pending requests, or bs - * if there is none. 
- */ -static BlockDriverState *next_throttle_token(BlockDriverState *bs, - bool is_write) -{ - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); - BlockDriverState *token, *start; - - start = token = tg->tokens[is_write]; - - /* get next bs round in round robin style */ - token = throttle_group_next_bs(token); - while (token != start && !token->pending_reqs[is_write]) { - token = throttle_group_next_bs(token); - } - - /* If no IO are queued for scheduling on the next round robin token - * then decide the token is the current bs because chances are - * the current bs get the current request queued. - */ - if (token == start && !token->pending_reqs[is_write]) { - token = bs; - } - - return token; -} - -/* Check if the next I/O request for a BlockDriverState needs to be - * throttled or not. If there's no timer set in this group, set one - * and update the token accordingly. - * - * This assumes that tg->lock is held. - * - * @bs: the current BlockDriverState - * @is_write: the type of operation (read/write) - * @ret: whether the I/O request needs to be throttled or not - */ -static bool throttle_group_schedule_timer(BlockDriverState *bs, - bool is_write) -{ - ThrottleState *ts = bs->throttle_state; - ThrottleTimers *tt = &bs->throttle_timers; - ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); - bool must_wait; - - /* Check if any of the timers in this group is already armed */ - if (tg->any_timer_armed[is_write]) { - return true; - } - - must_wait = throttle_schedule_timer(ts, tt, is_write); - - /* If a timer just got armed, set bs as the current token */ - if (must_wait) { - tg->tokens[is_write] = bs; - tg->any_timer_armed[is_write] = true; - } - - return must_wait; -} - -/* Look for the next pending I/O request and schedule it. - * - * This assumes that tg->lock is held. 
- * - * @bs: the current BlockDriverState - * @is_write: the type of operation (read/write) - */ -static void schedule_next_request(BlockDriverState *bs, bool is_write) -{ - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); - bool must_wait; - BlockDriverState *token; - - /* Check if there's any pending request to schedule next */ - token = next_throttle_token(bs, is_write); - if (!token->pending_reqs[is_write]) { - return; - } - - /* Set a timer for the request if it needs to be throttled */ - must_wait = throttle_group_schedule_timer(token, is_write); - - /* If it doesn't have to wait, queue it for immediate execution */ - if (!must_wait) { - /* Give preference to requests from the current bs */ - if (qemu_in_coroutine() && - qemu_co_queue_next(&bs->throttled_reqs[is_write])) { - token = bs; - } else { - ThrottleTimers *tt = &token->throttle_timers; - int64_t now = qemu_clock_get_ns(tt->clock_type); - timer_mod(tt->timers[is_write], now + 1); - tg->any_timer_armed[is_write] = true; - } - tg->tokens[is_write] = token; - } -} - -/* Check if an I/O request needs to be throttled, wait and set a timer - * if necessary, and schedule the next request using a round robin - * algorithm. - * - * @bs: the current BlockDriverState - * @bytes: the number of bytes for this I/O - * @is_write: the type of operation (read/write) - */ -void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs, - unsigned int bytes, - bool is_write) -{ - bool must_wait; - BlockDriverState *token; - - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); - qemu_mutex_lock(&tg->lock); - - /* First we check if this I/O has to be throttled. 
*/ - token = next_throttle_token(bs, is_write); - must_wait = throttle_group_schedule_timer(token, is_write); - - /* Wait if there's a timer set or queued requests of this type */ - if (must_wait || bs->pending_reqs[is_write]) { - bs->pending_reqs[is_write]++; - qemu_mutex_unlock(&tg->lock); - qemu_co_queue_wait(&bs->throttled_reqs[is_write]); - qemu_mutex_lock(&tg->lock); - bs->pending_reqs[is_write]--; - } - - /* The I/O will be executed, so do the accounting */ - throttle_account(bs->throttle_state, is_write, bytes); - - /* Schedule the next request */ - schedule_next_request(bs, is_write); - - qemu_mutex_unlock(&tg->lock); -} - -/* Update the throttle configuration for a particular group. Similar - * to throttle_config(), but guarantees atomicity within the - * throttling group. - * - * @bs: a BlockDriverState that is member of the group - * @cfg: the configuration to set - */ -void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg) -{ - ThrottleTimers *tt = &bs->throttle_timers; - ThrottleState *ts = bs->throttle_state; - ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); - qemu_mutex_lock(&tg->lock); - /* throttle_config() cancels the timers */ - if (timer_pending(tt->timers[0])) { - tg->any_timer_armed[0] = false; - } - if (timer_pending(tt->timers[1])) { - tg->any_timer_armed[1] = false; - } - throttle_config(ts, tt, cfg); - qemu_mutex_unlock(&tg->lock); -} - -/* Get the throttle configuration from a particular group. Similar to - * throttle_get_config(), but guarantees atomicity within the - * throttling group. - * - * @bs: a BlockDriverState that is member of the group - * @cfg: the configuration will be written here - */ -void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg) -{ - ThrottleState *ts = bs->throttle_state; - ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); - qemu_mutex_lock(&tg->lock); - throttle_get_config(ts, cfg); - qemu_mutex_unlock(&tg->lock); -} - -/* ThrottleTimers callback. 
This wakes up a request that was waiting - * because it had been throttled. - * - * @bs: the BlockDriverState whose request had been throttled - * @is_write: the type of operation (read/write) - */ -static void timer_cb(BlockDriverState *bs, bool is_write) -{ - ThrottleState *ts = bs->throttle_state; - ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); - bool empty_queue; - - /* The timer has just been fired, so we can update the flag */ - qemu_mutex_lock(&tg->lock); - tg->any_timer_armed[is_write] = false; - qemu_mutex_unlock(&tg->lock); - - /* Run the request that was waiting for this timer */ - empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]); - - /* If the request queue was empty then we have to take care of - * scheduling the next one */ - if (empty_queue) { - qemu_mutex_lock(&tg->lock); - schedule_next_request(bs, is_write); - qemu_mutex_unlock(&tg->lock); - } -} - -static void read_timer_cb(void *opaque) -{ - timer_cb(opaque, false); -} - -static void write_timer_cb(void *opaque) -{ - timer_cb(opaque, true); -} - -/* Register a BlockDriverState in the throttling group, also - * initializing its timers and updating its throttle_state pointer to - * point to it. If a throttling group with that name does not exist - * yet, it will be created. 
- * - * @bs: the BlockDriverState to insert - * @groupname: the name of the group - */ -void throttle_group_register_bs(BlockDriverState *bs, const char *groupname) -{ - int i; - ThrottleState *ts = throttle_group_incref(groupname); - ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); - int clock_type = QEMU_CLOCK_REALTIME; - - if (qtest_enabled()) { - /* For testing block IO throttling only */ - clock_type = QEMU_CLOCK_VIRTUAL; - } - - bs->throttle_state = ts; - - qemu_mutex_lock(&tg->lock); - /* If the ThrottleGroup is new set this BlockDriverState as the token */ - for (i = 0; i < 2; i++) { - if (!tg->tokens[i]) { - tg->tokens[i] = bs; - } - } - - QLIST_INSERT_HEAD(&tg->head, bs, round_robin); - - throttle_timers_init(&bs->throttle_timers, - bdrv_get_aio_context(bs), - clock_type, - read_timer_cb, - write_timer_cb, - bs); - - qemu_mutex_unlock(&tg->lock); -} - -/* Unregister a BlockDriverState from its group, removing it from the - * list, destroying the timers and setting the throttle_state pointer - * to NULL. - * - * The BlockDriverState must not have pending throttled requests, so - * the caller has to drain them first. - * - * The group will be destroyed if it's empty after this operation. 
- * - * @bs: the BlockDriverState to remove - */ -void throttle_group_unregister_bs(BlockDriverState *bs) -{ - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); - int i; - - assert(bs->pending_reqs[0] == 0 && bs->pending_reqs[1] == 0); - assert(qemu_co_queue_empty(&bs->throttled_reqs[0])); - assert(qemu_co_queue_empty(&bs->throttled_reqs[1])); - - qemu_mutex_lock(&tg->lock); - for (i = 0; i < 2; i++) { - if (tg->tokens[i] == bs) { - BlockDriverState *token = throttle_group_next_bs(bs); - /* Take care of the case where this is the last bs in the group */ - if (token == bs) { - token = NULL; - } - tg->tokens[i] = token; - } - } - - /* remove the current bs from the list */ - QLIST_REMOVE(bs, round_robin); - throttle_timers_destroy(&bs->throttle_timers); - qemu_mutex_unlock(&tg->lock); - - throttle_group_unref(&tg->ts); - bs->throttle_state = NULL; -} - -static void throttle_groups_init(void) -{ - qemu_mutex_init(&throttle_groups_lock); -} - -block_init(throttle_groups_init); diff --git a/qemu/block/vdi.c b/qemu/block/vdi.c deleted file mode 100644 index 75d4819ed..000000000 --- a/qemu/block/vdi.c +++ /dev/null @@ -1,923 +0,0 @@ -/* - * Block driver for the Virtual Disk Image (VDI) format - * - * Copyright (c) 2009, 2012 Stefan Weil - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 2 of the License, or - * (at your option) version 3 or any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . 
- * - * Reference: - * http://forums.virtualbox.org/viewtopic.php?t=8046 - * - * This driver supports create / read / write operations on VDI images. - * - * Todo (see also TODO in code): - * - * Some features like snapshots are still missing. - * - * Deallocation of zero-filled blocks and shrinking images are missing, too - * (might be added to common block layer). - * - * Allocation of blocks could be optimized (less writes to block map and - * header). - * - * Read and write of adjacent blocks could be done in one operation - * (current code uses one operation per block (1 MiB). - * - * The code is not thread safe (missing locks for changes in header and - * block table, no problem with current QEMU). - * - * Hints: - * - * Blocks (VDI documentation) correspond to clusters (QEMU). - * QEMU's backing files could be implemented using VDI snapshot files (TODO). - * VDI snapshot files may also contain the complete machine state. - * Maybe this machine state can be converted to QEMU PC machine snapshot data. - * - * The driver keeps a block cache (little endian entries) in memory. - * For the standard block size (1 MiB), a 1 TiB disk will use 4 MiB RAM, - * so this seems to be reasonable. - */ - -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "block/block_int.h" -#include "sysemu/block-backend.h" -#include "qemu/module.h" -#include "migration/migration.h" -#include "qemu/coroutine.h" -#include "qemu/cutils.h" - -#if defined(CONFIG_UUID) -#include -#else -/* TODO: move uuid emulation to some central place in QEMU. */ -#include "sysemu/sysemu.h" /* UUID_FMT */ -typedef unsigned char uuid_t[16]; -#endif - -/* Code configuration options. */ - -/* Enable debug messages. */ -//~ #define CONFIG_VDI_DEBUG - -/* Support write operations on VDI images. */ -#define CONFIG_VDI_WRITE - -/* Support non-standard block (cluster) size. This is untested. - * Maybe it will be needed for very large images. 
- */ -//~ #define CONFIG_VDI_BLOCK_SIZE - -/* Support static (fixed, pre-allocated) images. */ -#define CONFIG_VDI_STATIC_IMAGE - -/* Command line option for static images. */ -#define BLOCK_OPT_STATIC "static" - -#define KiB 1024 -#define MiB (KiB * KiB) - -#define SECTOR_SIZE 512 -#define DEFAULT_CLUSTER_SIZE (1 * MiB) - -#if defined(CONFIG_VDI_DEBUG) -#define logout(fmt, ...) \ - fprintf(stderr, "vdi\t%-24s" fmt, __func__, ##__VA_ARGS__) -#else -#define logout(fmt, ...) ((void)0) -#endif - -/* Image signature. */ -#define VDI_SIGNATURE 0xbeda107f - -/* Image version. */ -#define VDI_VERSION_1_1 0x00010001 - -/* Image type. */ -#define VDI_TYPE_DYNAMIC 1 -#define VDI_TYPE_STATIC 2 - -/* Innotek / SUN images use these strings in header.text: - * "<<< innotek VirtualBox Disk Image >>>\n" - * "<<< Sun xVM VirtualBox Disk Image >>>\n" - * "<<< Sun VirtualBox Disk Image >>>\n" - * The value does not matter, so QEMU created images use a different text. - */ -#define VDI_TEXT "<<< QEMU VM Virtual Disk Image >>>\n" - -/* A never-allocated block; semantically arbitrary content. */ -#define VDI_UNALLOCATED 0xffffffffU - -/* A discarded (no longer allocated) block; semantically zero-filled. */ -#define VDI_DISCARDED 0xfffffffeU - -#define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED) - -/* The bmap will take up VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) bytes; since - * the bmap is read and written in a single operation, its size needs to be - * limited to INT_MAX; furthermore, when opening an image, the bmap size is - * rounded up to be aligned on BDRV_SECTOR_SIZE. - * Therefore this should satisfy the following: - * VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) + BDRV_SECTOR_SIZE == INT_MAX + 1 - * (INT_MAX + 1 is the first value not representable as an int) - * This guarantees that any value below or equal to the constant will, when - * multiplied by sizeof(uint32_t) and rounded up to a BDRV_SECTOR_SIZE boundary, - * still be below or equal to INT_MAX. 
*/ -#define VDI_BLOCKS_IN_IMAGE_MAX \ - ((unsigned)((INT_MAX + 1u - BDRV_SECTOR_SIZE) / sizeof(uint32_t))) -#define VDI_DISK_SIZE_MAX ((uint64_t)VDI_BLOCKS_IN_IMAGE_MAX * \ - (uint64_t)DEFAULT_CLUSTER_SIZE) - -#if !defined(CONFIG_UUID) -static inline void uuid_generate(uuid_t out) -{ - memset(out, 0, sizeof(uuid_t)); -} - -static inline int uuid_is_null(const uuid_t uu) -{ - uuid_t null_uuid = { 0 }; - return memcmp(uu, null_uuid, sizeof(uuid_t)) == 0; -} - -# if defined(CONFIG_VDI_DEBUG) -static inline void uuid_unparse(const uuid_t uu, char *out) -{ - snprintf(out, 37, UUID_FMT, - uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7], - uu[8], uu[9], uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]); -} -# endif -#endif - -typedef struct { - char text[0x40]; - uint32_t signature; - uint32_t version; - uint32_t header_size; - uint32_t image_type; - uint32_t image_flags; - char description[256]; - uint32_t offset_bmap; - uint32_t offset_data; - uint32_t cylinders; /* disk geometry, unused here */ - uint32_t heads; /* disk geometry, unused here */ - uint32_t sectors; /* disk geometry, unused here */ - uint32_t sector_size; - uint32_t unused1; - uint64_t disk_size; - uint32_t block_size; - uint32_t block_extra; /* unused here */ - uint32_t blocks_in_image; - uint32_t blocks_allocated; - uuid_t uuid_image; - uuid_t uuid_last_snap; - uuid_t uuid_link; - uuid_t uuid_parent; - uint64_t unused2[7]; -} QEMU_PACKED VdiHeader; - -typedef struct { - /* The block map entries are little endian (even in memory). */ - uint32_t *bmap; - /* Size of block (bytes). */ - uint32_t block_size; - /* Size of block (sectors). */ - uint32_t block_sectors; - /* First sector of block map. */ - uint32_t bmap_sector; - /* VDI header (converted to host endianness). 
*/ - VdiHeader header; - - CoMutex write_lock; - - Error *migration_blocker; -} BDRVVdiState; - -/* Change UUID from little endian (IPRT = VirtualBox format) to big endian - * format (network byte order, standard, see RFC 4122) and vice versa. - */ -static void uuid_convert(uuid_t uuid) -{ - bswap32s((uint32_t *)&uuid[0]); - bswap16s((uint16_t *)&uuid[4]); - bswap16s((uint16_t *)&uuid[6]); -} - -static void vdi_header_to_cpu(VdiHeader *header) -{ - le32_to_cpus(&header->signature); - le32_to_cpus(&header->version); - le32_to_cpus(&header->header_size); - le32_to_cpus(&header->image_type); - le32_to_cpus(&header->image_flags); - le32_to_cpus(&header->offset_bmap); - le32_to_cpus(&header->offset_data); - le32_to_cpus(&header->cylinders); - le32_to_cpus(&header->heads); - le32_to_cpus(&header->sectors); - le32_to_cpus(&header->sector_size); - le64_to_cpus(&header->disk_size); - le32_to_cpus(&header->block_size); - le32_to_cpus(&header->block_extra); - le32_to_cpus(&header->blocks_in_image); - le32_to_cpus(&header->blocks_allocated); - uuid_convert(header->uuid_image); - uuid_convert(header->uuid_last_snap); - uuid_convert(header->uuid_link); - uuid_convert(header->uuid_parent); -} - -static void vdi_header_to_le(VdiHeader *header) -{ - cpu_to_le32s(&header->signature); - cpu_to_le32s(&header->version); - cpu_to_le32s(&header->header_size); - cpu_to_le32s(&header->image_type); - cpu_to_le32s(&header->image_flags); - cpu_to_le32s(&header->offset_bmap); - cpu_to_le32s(&header->offset_data); - cpu_to_le32s(&header->cylinders); - cpu_to_le32s(&header->heads); - cpu_to_le32s(&header->sectors); - cpu_to_le32s(&header->sector_size); - cpu_to_le64s(&header->disk_size); - cpu_to_le32s(&header->block_size); - cpu_to_le32s(&header->block_extra); - cpu_to_le32s(&header->blocks_in_image); - cpu_to_le32s(&header->blocks_allocated); - uuid_convert(header->uuid_image); - uuid_convert(header->uuid_last_snap); - uuid_convert(header->uuid_link); - uuid_convert(header->uuid_parent); -} - 
-#if defined(CONFIG_VDI_DEBUG) -static void vdi_header_print(VdiHeader *header) -{ - char uuid[37]; - logout("text %s", header->text); - logout("signature 0x%08x\n", header->signature); - logout("header size 0x%04x\n", header->header_size); - logout("image type 0x%04x\n", header->image_type); - logout("image flags 0x%04x\n", header->image_flags); - logout("description %s\n", header->description); - logout("offset bmap 0x%04x\n", header->offset_bmap); - logout("offset data 0x%04x\n", header->offset_data); - logout("cylinders 0x%04x\n", header->cylinders); - logout("heads 0x%04x\n", header->heads); - logout("sectors 0x%04x\n", header->sectors); - logout("sector size 0x%04x\n", header->sector_size); - logout("image size 0x%" PRIx64 " B (%" PRIu64 " MiB)\n", - header->disk_size, header->disk_size / MiB); - logout("block size 0x%04x\n", header->block_size); - logout("block extra 0x%04x\n", header->block_extra); - logout("blocks tot. 0x%04x\n", header->blocks_in_image); - logout("blocks all. 0x%04x\n", header->blocks_allocated); - uuid_unparse(header->uuid_image, uuid); - logout("uuid image %s\n", uuid); - uuid_unparse(header->uuid_last_snap, uuid); - logout("uuid snap %s\n", uuid); - uuid_unparse(header->uuid_link, uuid); - logout("uuid link %s\n", uuid); - uuid_unparse(header->uuid_parent, uuid); - logout("uuid parent %s\n", uuid); -} -#endif - -static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res, - BdrvCheckMode fix) -{ - /* TODO: additional checks possible. */ - BDRVVdiState *s = (BDRVVdiState *)bs->opaque; - uint32_t blocks_allocated = 0; - uint32_t block; - uint32_t *bmap; - logout("\n"); - - if (fix) { - return -ENOTSUP; - } - - bmap = g_try_new(uint32_t, s->header.blocks_in_image); - if (s->header.blocks_in_image && bmap == NULL) { - res->check_errors++; - return -ENOMEM; - } - - memset(bmap, 0xff, s->header.blocks_in_image * sizeof(uint32_t)); - - /* Check block map and value of blocks_allocated. 
*/ - for (block = 0; block < s->header.blocks_in_image; block++) { - uint32_t bmap_entry = le32_to_cpu(s->bmap[block]); - if (VDI_IS_ALLOCATED(bmap_entry)) { - if (bmap_entry < s->header.blocks_in_image) { - blocks_allocated++; - if (!VDI_IS_ALLOCATED(bmap[bmap_entry])) { - bmap[bmap_entry] = bmap_entry; - } else { - fprintf(stderr, "ERROR: block index %" PRIu32 - " also used by %" PRIu32 "\n", bmap[bmap_entry], bmap_entry); - res->corruptions++; - } - } else { - fprintf(stderr, "ERROR: block index %" PRIu32 - " too large, is %" PRIu32 "\n", block, bmap_entry); - res->corruptions++; - } - } - } - if (blocks_allocated != s->header.blocks_allocated) { - fprintf(stderr, "ERROR: allocated blocks mismatch, is %" PRIu32 - ", should be %" PRIu32 "\n", - blocks_allocated, s->header.blocks_allocated); - res->corruptions++; - } - - g_free(bmap); - - return 0; -} - -static int vdi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - /* TODO: vdi_get_info would be needed for machine snapshots. - vm_state_offset is still missing. */ - BDRVVdiState *s = (BDRVVdiState *)bs->opaque; - logout("\n"); - bdi->cluster_size = s->block_size; - bdi->vm_state_offset = 0; - bdi->unallocated_blocks_are_zero = true; - return 0; -} - -static int vdi_make_empty(BlockDriverState *bs) -{ - /* TODO: missing code. */ - logout("\n"); - /* The return value for missing code must be 0, see block.c. */ - return 0; -} - -static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename) -{ - const VdiHeader *header = (const VdiHeader *)buf; - int ret = 0; - - logout("\n"); - - if (buf_size < sizeof(*header)) { - /* Header too small, no VDI. 
*/ - } else if (le32_to_cpu(header->signature) == VDI_SIGNATURE) { - ret = 100; - } - - if (ret == 0) { - logout("no vdi image\n"); - } else { - logout("%s", header->text); - } - - return ret; -} - -static int vdi_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVVdiState *s = bs->opaque; - VdiHeader header; - size_t bmap_size; - int ret; - - logout("\n"); - - ret = bdrv_read(bs->file->bs, 0, (uint8_t *)&header, 1); - if (ret < 0) { - goto fail; - } - - vdi_header_to_cpu(&header); -#if defined(CONFIG_VDI_DEBUG) - vdi_header_print(&header); -#endif - - if (header.disk_size > VDI_DISK_SIZE_MAX) { - error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64 - ", max supported is 0x%" PRIx64 ")", - header.disk_size, VDI_DISK_SIZE_MAX); - ret = -ENOTSUP; - goto fail; - } - - if (header.disk_size % SECTOR_SIZE != 0) { - /* 'VBoxManage convertfromraw' can create images with odd disk sizes. - We accept them but round the disk size to the next multiple of - SECTOR_SIZE. */ - logout("odd disk size %" PRIu64 " B, round up\n", header.disk_size); - header.disk_size = ROUND_UP(header.disk_size, SECTOR_SIZE); - } - - if (header.signature != VDI_SIGNATURE) { - error_setg(errp, "Image not in VDI format (bad signature %08" PRIx32 - ")", header.signature); - ret = -EINVAL; - goto fail; - } else if (header.version != VDI_VERSION_1_1) { - error_setg(errp, "unsupported VDI image (version %" PRIu32 ".%" PRIu32 - ")", header.version >> 16, header.version & 0xffff); - ret = -ENOTSUP; - goto fail; - } else if (header.offset_bmap % SECTOR_SIZE != 0) { - /* We only support block maps which start on a sector boundary. */ - error_setg(errp, "unsupported VDI image (unaligned block map offset " - "0x%" PRIx32 ")", header.offset_bmap); - ret = -ENOTSUP; - goto fail; - } else if (header.offset_data % SECTOR_SIZE != 0) { - /* We only support data blocks which start on a sector boundary. 
*/ - error_setg(errp, "unsupported VDI image (unaligned data offset 0x%" - PRIx32 ")", header.offset_data); - ret = -ENOTSUP; - goto fail; - } else if (header.sector_size != SECTOR_SIZE) { - error_setg(errp, "unsupported VDI image (sector size %" PRIu32 - " is not %u)", header.sector_size, SECTOR_SIZE); - ret = -ENOTSUP; - goto fail; - } else if (header.block_size != DEFAULT_CLUSTER_SIZE) { - error_setg(errp, "unsupported VDI image (block size %" PRIu32 - " is not %u)", header.block_size, DEFAULT_CLUSTER_SIZE); - ret = -ENOTSUP; - goto fail; - } else if (header.disk_size > - (uint64_t)header.blocks_in_image * header.block_size) { - error_setg(errp, "unsupported VDI image (disk size %" PRIu64 ", " - "image bitmap has room for %" PRIu64 ")", - header.disk_size, - (uint64_t)header.blocks_in_image * header.block_size); - ret = -ENOTSUP; - goto fail; - } else if (!uuid_is_null(header.uuid_link)) { - error_setg(errp, "unsupported VDI image (non-NULL link UUID)"); - ret = -ENOTSUP; - goto fail; - } else if (!uuid_is_null(header.uuid_parent)) { - error_setg(errp, "unsupported VDI image (non-NULL parent UUID)"); - ret = -ENOTSUP; - goto fail; - } else if (header.blocks_in_image > VDI_BLOCKS_IN_IMAGE_MAX) { - error_setg(errp, "unsupported VDI image " - "(too many blocks %u, max is %u)", - header.blocks_in_image, VDI_BLOCKS_IN_IMAGE_MAX); - ret = -ENOTSUP; - goto fail; - } - - bs->total_sectors = header.disk_size / SECTOR_SIZE; - - s->block_size = header.block_size; - s->block_sectors = header.block_size / SECTOR_SIZE; - s->bmap_sector = header.offset_bmap / SECTOR_SIZE; - s->header = header; - - bmap_size = header.blocks_in_image * sizeof(uint32_t); - bmap_size = DIV_ROUND_UP(bmap_size, SECTOR_SIZE); - s->bmap = qemu_try_blockalign(bs->file->bs, bmap_size * SECTOR_SIZE); - if (s->bmap == NULL) { - ret = -ENOMEM; - goto fail; - } - - ret = bdrv_read(bs->file->bs, s->bmap_sector, (uint8_t *)s->bmap, - bmap_size); - if (ret < 0) { - goto fail_free_bmap; - } - - /* Disable 
migration when vdi images are used */ - error_setg(&s->migration_blocker, "The vdi format used by node '%s' " - "does not support live migration", - bdrv_get_device_or_node_name(bs)); - migrate_add_blocker(s->migration_blocker); - - qemu_co_mutex_init(&s->write_lock); - - return 0; - - fail_free_bmap: - qemu_vfree(s->bmap); - - fail: - return ret; -} - -static int vdi_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) -{ - return 0; -} - -static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) -{ - /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). */ - BDRVVdiState *s = (BDRVVdiState *)bs->opaque; - size_t bmap_index = sector_num / s->block_sectors; - size_t sector_in_block = sector_num % s->block_sectors; - int n_sectors = s->block_sectors - sector_in_block; - uint32_t bmap_entry = le32_to_cpu(s->bmap[bmap_index]); - uint64_t offset; - int result; - - logout("%p, %" PRId64 ", %d, %p\n", bs, sector_num, nb_sectors, pnum); - if (n_sectors > nb_sectors) { - n_sectors = nb_sectors; - } - *pnum = n_sectors; - result = VDI_IS_ALLOCATED(bmap_entry); - if (!result) { - return 0; - } - - offset = s->header.offset_data + - (uint64_t)bmap_entry * s->block_size + - sector_in_block * SECTOR_SIZE; - *file = bs->file->bs; - return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; -} - -static int vdi_co_read(BlockDriverState *bs, - int64_t sector_num, uint8_t *buf, int nb_sectors) -{ - BDRVVdiState *s = bs->opaque; - uint32_t bmap_entry; - uint32_t block_index; - uint32_t sector_in_block; - uint32_t n_sectors; - int ret = 0; - - logout("\n"); - - while (ret >= 0 && nb_sectors > 0) { - block_index = sector_num / s->block_sectors; - sector_in_block = sector_num % s->block_sectors; - n_sectors = s->block_sectors - sector_in_block; - if (n_sectors > nb_sectors) { - n_sectors = nb_sectors; - } - - logout("will read %u sectors starting at 
sector %" PRIu64 "\n", - n_sectors, sector_num); - - /* prepare next AIO request */ - bmap_entry = le32_to_cpu(s->bmap[block_index]); - if (!VDI_IS_ALLOCATED(bmap_entry)) { - /* Block not allocated, return zeros, no need to wait. */ - memset(buf, 0, n_sectors * SECTOR_SIZE); - ret = 0; - } else { - uint64_t offset = s->header.offset_data / SECTOR_SIZE + - (uint64_t)bmap_entry * s->block_sectors + - sector_in_block; - ret = bdrv_read(bs->file->bs, offset, buf, n_sectors); - } - logout("%u sectors read\n", n_sectors); - - nb_sectors -= n_sectors; - sector_num += n_sectors; - buf += n_sectors * SECTOR_SIZE; - } - - return ret; -} - -static int vdi_co_write(BlockDriverState *bs, - int64_t sector_num, const uint8_t *buf, int nb_sectors) -{ - BDRVVdiState *s = bs->opaque; - uint32_t bmap_entry; - uint32_t block_index; - uint32_t sector_in_block; - uint32_t n_sectors; - uint32_t bmap_first = VDI_UNALLOCATED; - uint32_t bmap_last = VDI_UNALLOCATED; - uint8_t *block = NULL; - int ret = 0; - - logout("\n"); - - while (ret >= 0 && nb_sectors > 0) { - block_index = sector_num / s->block_sectors; - sector_in_block = sector_num % s->block_sectors; - n_sectors = s->block_sectors - sector_in_block; - if (n_sectors > nb_sectors) { - n_sectors = nb_sectors; - } - - logout("will write %u sectors starting at sector %" PRIu64 "\n", - n_sectors, sector_num); - - /* prepare next AIO request */ - bmap_entry = le32_to_cpu(s->bmap[block_index]); - if (!VDI_IS_ALLOCATED(bmap_entry)) { - /* Allocate new block and write to it. */ - uint64_t offset; - bmap_entry = s->header.blocks_allocated; - s->bmap[block_index] = cpu_to_le32(bmap_entry); - s->header.blocks_allocated++; - offset = s->header.offset_data / SECTOR_SIZE + - (uint64_t)bmap_entry * s->block_sectors; - if (block == NULL) { - block = g_malloc(s->block_size); - bmap_first = block_index; - } - bmap_last = block_index; - /* Copy data to be written to new block and zero unused parts. 
*/ - memset(block, 0, sector_in_block * SECTOR_SIZE); - memcpy(block + sector_in_block * SECTOR_SIZE, - buf, n_sectors * SECTOR_SIZE); - memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0, - (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE); - - /* Note that this coroutine does not yield anywhere from reading the - * bmap entry until here, so in regards to all the coroutines trying - * to write to this cluster, the one doing the allocation will - * always be the first to try to acquire the lock. - * Therefore, it is also the first that will actually be able to - * acquire the lock and thus the padded cluster is written before - * the other coroutines can write to the affected area. */ - qemu_co_mutex_lock(&s->write_lock); - ret = bdrv_write(bs->file->bs, offset, block, s->block_sectors); - qemu_co_mutex_unlock(&s->write_lock); - } else { - uint64_t offset = s->header.offset_data / SECTOR_SIZE + - (uint64_t)bmap_entry * s->block_sectors + - sector_in_block; - qemu_co_mutex_lock(&s->write_lock); - /* This lock is only used to make sure the following write operation - * is executed after the write issued by the coroutine allocating - * this cluster, therefore we do not need to keep it locked. - * As stated above, the allocating coroutine will always try to lock - * the mutex before all the other concurrent accesses to that - * cluster, therefore at this point we can be absolutely certain - * that that write operation has returned (there may be other writes - * in flight, but they do not concern this very operation). */ - qemu_co_mutex_unlock(&s->write_lock); - ret = bdrv_write(bs->file->bs, offset, buf, n_sectors); - } - - nb_sectors -= n_sectors; - sector_num += n_sectors; - buf += n_sectors * SECTOR_SIZE; - - logout("%u sectors written\n", n_sectors); - } - - logout("finished data write\n"); - if (ret < 0) { - return ret; - } - - if (block) { - /* One or more new blocks were allocated. 
*/ - VdiHeader *header = (VdiHeader *) block; - uint8_t *base; - uint64_t offset; - - logout("now writing modified header\n"); - assert(VDI_IS_ALLOCATED(bmap_first)); - *header = s->header; - vdi_header_to_le(header); - ret = bdrv_write(bs->file->bs, 0, block, 1); - g_free(block); - block = NULL; - - if (ret < 0) { - return ret; - } - - logout("now writing modified block map entry %u...%u\n", - bmap_first, bmap_last); - /* Write modified sectors from block map. */ - bmap_first /= (SECTOR_SIZE / sizeof(uint32_t)); - bmap_last /= (SECTOR_SIZE / sizeof(uint32_t)); - n_sectors = bmap_last - bmap_first + 1; - offset = s->bmap_sector + bmap_first; - base = ((uint8_t *)&s->bmap[0]) + bmap_first * SECTOR_SIZE; - logout("will write %u block map sectors starting from entry %u\n", - n_sectors, bmap_first); - ret = bdrv_write(bs->file->bs, offset, base, n_sectors); - } - - return ret; -} - -static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) -{ - int ret = 0; - uint64_t bytes = 0; - uint32_t blocks; - size_t block_size = DEFAULT_CLUSTER_SIZE; - uint32_t image_type = VDI_TYPE_DYNAMIC; - VdiHeader header; - size_t i; - size_t bmap_size; - int64_t offset = 0; - Error *local_err = NULL; - BlockBackend *blk = NULL; - uint32_t *bmap = NULL; - - logout("\n"); - - /* Read out options. */ - bytes = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); -#if defined(CONFIG_VDI_BLOCK_SIZE) - /* TODO: Additional checks (SECTOR_SIZE * 2^n, ...). 
*/ - block_size = qemu_opt_get_size_del(opts, - BLOCK_OPT_CLUSTER_SIZE, - DEFAULT_CLUSTER_SIZE); -#endif -#if defined(CONFIG_VDI_STATIC_IMAGE) - if (qemu_opt_get_bool_del(opts, BLOCK_OPT_STATIC, false)) { - image_type = VDI_TYPE_STATIC; - } -#endif - - if (bytes > VDI_DISK_SIZE_MAX) { - ret = -ENOTSUP; - error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64 - ", max supported is 0x%" PRIx64 ")", - bytes, VDI_DISK_SIZE_MAX); - goto exit; - } - - ret = bdrv_create_file(filename, opts, &local_err); - if (ret < 0) { - error_propagate(errp, local_err); - goto exit; - } - - blk = blk_new_open(filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); - if (blk == NULL) { - error_propagate(errp, local_err); - ret = -EIO; - goto exit; - } - - blk_set_allow_write_beyond_eof(blk, true); - - /* We need enough blocks to store the given disk size, - so always round up. */ - blocks = DIV_ROUND_UP(bytes, block_size); - - bmap_size = blocks * sizeof(uint32_t); - bmap_size = ROUND_UP(bmap_size, SECTOR_SIZE); - - memset(&header, 0, sizeof(header)); - pstrcpy(header.text, sizeof(header.text), VDI_TEXT); - header.signature = VDI_SIGNATURE; - header.version = VDI_VERSION_1_1; - header.header_size = 0x180; - header.image_type = image_type; - header.offset_bmap = 0x200; - header.offset_data = 0x200 + bmap_size; - header.sector_size = SECTOR_SIZE; - header.disk_size = bytes; - header.block_size = block_size; - header.blocks_in_image = blocks; - if (image_type == VDI_TYPE_STATIC) { - header.blocks_allocated = blocks; - } - uuid_generate(header.uuid_image); - uuid_generate(header.uuid_last_snap); - /* There is no need to set header.uuid_link or header.uuid_parent here. 
*/ -#if defined(CONFIG_VDI_DEBUG) - vdi_header_print(&header); -#endif - vdi_header_to_le(&header); - ret = blk_pwrite(blk, offset, &header, sizeof(header)); - if (ret < 0) { - error_setg(errp, "Error writing header to %s", filename); - goto exit; - } - offset += sizeof(header); - - if (bmap_size > 0) { - bmap = g_try_malloc0(bmap_size); - if (bmap == NULL) { - ret = -ENOMEM; - error_setg(errp, "Could not allocate bmap"); - goto exit; - } - for (i = 0; i < blocks; i++) { - if (image_type == VDI_TYPE_STATIC) { - bmap[i] = i; - } else { - bmap[i] = VDI_UNALLOCATED; - } - } - ret = blk_pwrite(blk, offset, bmap, bmap_size); - if (ret < 0) { - error_setg(errp, "Error writing bmap to %s", filename); - goto exit; - } - offset += bmap_size; - } - - if (image_type == VDI_TYPE_STATIC) { - ret = blk_truncate(blk, offset + blocks * block_size); - if (ret < 0) { - error_setg(errp, "Failed to statically allocate %s", filename); - goto exit; - } - } - -exit: - blk_unref(blk); - g_free(bmap); - return ret; -} - -static void vdi_close(BlockDriverState *bs) -{ - BDRVVdiState *s = bs->opaque; - - qemu_vfree(s->bmap); - - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); -} - -static QemuOptsList vdi_create_opts = { - .name = "vdi-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(vdi_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, -#if defined(CONFIG_VDI_BLOCK_SIZE) - { - .name = BLOCK_OPT_CLUSTER_SIZE, - .type = QEMU_OPT_SIZE, - .help = "VDI cluster (block) size", - .def_value_str = stringify(DEFAULT_CLUSTER_SIZE) - }, -#endif -#if defined(CONFIG_VDI_STATIC_IMAGE) - { - .name = BLOCK_OPT_STATIC, - .type = QEMU_OPT_BOOL, - .help = "VDI static (pre-allocated) image", - .def_value_str = "off" - }, -#endif - /* TODO: An additional option to set UUID values might be useful. 
*/ - { /* end of list */ } - } -}; - -static BlockDriver bdrv_vdi = { - .format_name = "vdi", - .instance_size = sizeof(BDRVVdiState), - .bdrv_probe = vdi_probe, - .bdrv_open = vdi_open, - .bdrv_close = vdi_close, - .bdrv_reopen_prepare = vdi_reopen_prepare, - .bdrv_create = vdi_create, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - .bdrv_co_get_block_status = vdi_co_get_block_status, - .bdrv_make_empty = vdi_make_empty, - - .bdrv_read = vdi_co_read, -#if defined(CONFIG_VDI_WRITE) - .bdrv_write = vdi_co_write, -#endif - - .bdrv_get_info = vdi_get_info, - - .create_opts = &vdi_create_opts, - .bdrv_check = vdi_check, -}; - -static void bdrv_vdi_init(void) -{ - logout("\n"); - bdrv_register(&bdrv_vdi); -} - -block_init(bdrv_vdi_init); diff --git a/qemu/block/vhdx-endian.c b/qemu/block/vhdx-endian.c deleted file mode 100644 index da33cd38e..000000000 --- a/qemu/block/vhdx-endian.c +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Block driver for Hyper-V VHDX Images - * - * Copyright (c) 2013 Red Hat, Inc., - * - * Authors: - * Jeff Cody - * - * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 - * by Microsoft: - * https://www.microsoft.com/en-us/download/details.aspx?id=34750 - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -#include "qemu/osdep.h" -#include "qemu-common.h" -#include "block/block_int.h" -#include "block/vhdx.h" - -#include - - -/* - * All the VHDX formats on disk are little endian - the following - * are helper import/export functions to correctly convert - * endianness from disk read to native cpu format, and back again. 
- */ - - -/* VHDX File Header */ - - -void vhdx_header_le_import(VHDXHeader *h) -{ - assert(h != NULL); - - le32_to_cpus(&h->signature); - le32_to_cpus(&h->checksum); - le64_to_cpus(&h->sequence_number); - - leguid_to_cpus(&h->file_write_guid); - leguid_to_cpus(&h->data_write_guid); - leguid_to_cpus(&h->log_guid); - - le16_to_cpus(&h->log_version); - le16_to_cpus(&h->version); - le32_to_cpus(&h->log_length); - le64_to_cpus(&h->log_offset); -} - -void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h) -{ - assert(orig_h != NULL); - assert(new_h != NULL); - - new_h->signature = cpu_to_le32(orig_h->signature); - new_h->checksum = cpu_to_le32(orig_h->checksum); - new_h->sequence_number = cpu_to_le64(orig_h->sequence_number); - - new_h->file_write_guid = orig_h->file_write_guid; - new_h->data_write_guid = orig_h->data_write_guid; - new_h->log_guid = orig_h->log_guid; - - cpu_to_leguids(&new_h->file_write_guid); - cpu_to_leguids(&new_h->data_write_guid); - cpu_to_leguids(&new_h->log_guid); - - new_h->log_version = cpu_to_le16(orig_h->log_version); - new_h->version = cpu_to_le16(orig_h->version); - new_h->log_length = cpu_to_le32(orig_h->log_length); - new_h->log_offset = cpu_to_le64(orig_h->log_offset); -} - - -/* VHDX Log Headers */ - - -void vhdx_log_desc_le_import(VHDXLogDescriptor *d) -{ - assert(d != NULL); - - le32_to_cpus(&d->signature); - le64_to_cpus(&d->file_offset); - le64_to_cpus(&d->sequence_number); -} - -void vhdx_log_desc_le_export(VHDXLogDescriptor *d) -{ - assert(d != NULL); - - cpu_to_le32s(&d->signature); - cpu_to_le32s(&d->trailing_bytes); - cpu_to_le64s(&d->leading_bytes); - cpu_to_le64s(&d->file_offset); - cpu_to_le64s(&d->sequence_number); -} - -void vhdx_log_data_le_import(VHDXLogDataSector *d) -{ - assert(d != NULL); - - le32_to_cpus(&d->data_signature); - le32_to_cpus(&d->sequence_high); - le32_to_cpus(&d->sequence_low); -} - -void vhdx_log_data_le_export(VHDXLogDataSector *d) -{ - assert(d != NULL); - - 
cpu_to_le32s(&d->data_signature); - cpu_to_le32s(&d->sequence_high); - cpu_to_le32s(&d->sequence_low); -} - -void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr) -{ - assert(hdr != NULL); - - le32_to_cpus(&hdr->signature); - le32_to_cpus(&hdr->checksum); - le32_to_cpus(&hdr->entry_length); - le32_to_cpus(&hdr->tail); - le64_to_cpus(&hdr->sequence_number); - le32_to_cpus(&hdr->descriptor_count); - leguid_to_cpus(&hdr->log_guid); - le64_to_cpus(&hdr->flushed_file_offset); - le64_to_cpus(&hdr->last_file_offset); -} - -void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr) -{ - assert(hdr != NULL); - - cpu_to_le32s(&hdr->signature); - cpu_to_le32s(&hdr->checksum); - cpu_to_le32s(&hdr->entry_length); - cpu_to_le32s(&hdr->tail); - cpu_to_le64s(&hdr->sequence_number); - cpu_to_le32s(&hdr->descriptor_count); - cpu_to_leguids(&hdr->log_guid); - cpu_to_le64s(&hdr->flushed_file_offset); - cpu_to_le64s(&hdr->last_file_offset); -} - - -/* Region table entries */ -void vhdx_region_header_le_import(VHDXRegionTableHeader *hdr) -{ - assert(hdr != NULL); - - le32_to_cpus(&hdr->signature); - le32_to_cpus(&hdr->checksum); - le32_to_cpus(&hdr->entry_count); -} - -void vhdx_region_header_le_export(VHDXRegionTableHeader *hdr) -{ - assert(hdr != NULL); - - cpu_to_le32s(&hdr->signature); - cpu_to_le32s(&hdr->checksum); - cpu_to_le32s(&hdr->entry_count); -} - -void vhdx_region_entry_le_import(VHDXRegionTableEntry *e) -{ - assert(e != NULL); - - leguid_to_cpus(&e->guid); - le64_to_cpus(&e->file_offset); - le32_to_cpus(&e->length); - le32_to_cpus(&e->data_bits); -} - -void vhdx_region_entry_le_export(VHDXRegionTableEntry *e) -{ - assert(e != NULL); - - cpu_to_leguids(&e->guid); - cpu_to_le64s(&e->file_offset); - cpu_to_le32s(&e->length); - cpu_to_le32s(&e->data_bits); -} - - -/* Metadata headers & table */ -void vhdx_metadata_header_le_import(VHDXMetadataTableHeader *hdr) -{ - assert(hdr != NULL); - - le64_to_cpus(&hdr->signature); - le16_to_cpus(&hdr->entry_count); -} - -void 
vhdx_metadata_header_le_export(VHDXMetadataTableHeader *hdr) -{ - assert(hdr != NULL); - - cpu_to_le64s(&hdr->signature); - cpu_to_le16s(&hdr->entry_count); -} - -void vhdx_metadata_entry_le_import(VHDXMetadataTableEntry *e) -{ - assert(e != NULL); - - leguid_to_cpus(&e->item_id); - le32_to_cpus(&e->offset); - le32_to_cpus(&e->length); - le32_to_cpus(&e->data_bits); -} -void vhdx_metadata_entry_le_export(VHDXMetadataTableEntry *e) -{ - assert(e != NULL); - - cpu_to_leguids(&e->item_id); - cpu_to_le32s(&e->offset); - cpu_to_le32s(&e->length); - cpu_to_le32s(&e->data_bits); -} diff --git a/qemu/block/vhdx-log.c b/qemu/block/vhdx-log.c deleted file mode 100644 index 7ea7187fc..000000000 --- a/qemu/block/vhdx-log.c +++ /dev/null @@ -1,1043 +0,0 @@ -/* - * Block driver for Hyper-V VHDX Images - * - * Copyright (c) 2013 Red Hat, Inc., - * - * Authors: - * Jeff Cody - * - * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 - * by Microsoft: - * https://www.microsoft.com/en-us/download/details.aspx?id=34750 - * - * This file covers the functionality of the metadata log writing, parsing, and - * replay. - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu-common.h" -#include "block/block_int.h" -#include "qemu/error-report.h" -#include "qemu/module.h" -#include "block/vhdx.h" - - -typedef struct VHDXLogSequence { - bool valid; - uint32_t count; - VHDXLogEntries log; - VHDXLogEntryHeader hdr; -} VHDXLogSequence; - -typedef struct VHDXLogDescEntries { - VHDXLogEntryHeader hdr; - VHDXLogDescriptor desc[]; -} VHDXLogDescEntries; - -static const MSGUID zero_guid = { 0 }; - -/* The log located on the disk is circular buffer containing - * sectors of 4096 bytes each. 
- * - * It is assumed for the read/write functions below that the - * circular buffer scheme uses a 'one sector open' to indicate - * the buffer is full. Given the validation methods used for each - * sector, this method should be compatible with other methods that - * do not waste a sector. - */ - - -/* Allow peeking at the hdr entry at the beginning of the current - * read index, without advancing the read index */ -static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log, - VHDXLogEntryHeader *hdr) -{ - int ret = 0; - uint64_t offset; - uint32_t read; - - assert(hdr != NULL); - - /* peek is only supported on sector boundaries */ - if (log->read % VHDX_LOG_SECTOR_SIZE) { - ret = -EFAULT; - goto exit; - } - - read = log->read; - /* we are guaranteed that a) log sectors are 4096 bytes, - * and b) the log length is a multiple of 1MB. So, there - * is always a round number of sectors in the buffer */ - if ((read + sizeof(VHDXLogEntryHeader)) > log->length) { - read = 0; - } - - if (read == log->write) { - ret = -EINVAL; - goto exit; - } - - offset = log->offset + read; - - ret = bdrv_pread(bs->file->bs, offset, hdr, sizeof(VHDXLogEntryHeader)); - if (ret < 0) { - goto exit; - } - vhdx_log_entry_hdr_le_import(hdr); - -exit: - return ret; -} - -/* Index increment for log, based on sector boundaries */ -static int vhdx_log_inc_idx(uint32_t idx, uint64_t length) -{ - idx += VHDX_LOG_SECTOR_SIZE; - /* we are guaranteed that a) log sectors are 4096 bytes, - * and b) the log length is a multiple of 1MB. So, there - * is always a round number of sectors in the buffer */ - return idx >= length ? 
0 : idx; -} - - -/* Reset the log to empty */ -static void vhdx_log_reset(BlockDriverState *bs, BDRVVHDXState *s) -{ - MSGUID guid = { 0 }; - s->log.read = s->log.write = 0; - /* a log guid of 0 indicates an empty log to any parser of v0 - * VHDX logs */ - vhdx_update_headers(bs, s, false, &guid); -} - -/* Reads num_sectors from the log (all log sectors are 4096 bytes), - * into buffer 'buffer'. Upon return, *sectors_read will contain - * the number of sectors successfully read. - * - * It is assumed that 'buffer' is already allocated, and of sufficient - * size (i.e. >= 4096*num_sectors). - * - * If 'peek' is true, then the tail (read) pointer for the circular buffer is - * not modified. - * - * 0 is returned on success, -errno otherwise. */ -static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log, - uint32_t *sectors_read, void *buffer, - uint32_t num_sectors, bool peek) -{ - int ret = 0; - uint64_t offset; - uint32_t read; - - read = log->read; - - *sectors_read = 0; - while (num_sectors) { - if (read == log->write) { - /* empty */ - break; - } - offset = log->offset + read; - - ret = bdrv_pread(bs->file->bs, offset, buffer, VHDX_LOG_SECTOR_SIZE); - if (ret < 0) { - goto exit; - } - read = vhdx_log_inc_idx(read, log->length); - - *sectors_read = *sectors_read + 1; - num_sectors--; - } - -exit: - if (!peek) { - log->read = read; - } - return ret; -} - -/* Writes num_sectors to the log (all log sectors are 4096 bytes), - * from buffer 'buffer'. Upon return, *sectors_written will contain - * the number of sectors successfully written. - * - * It is assumed that 'buffer' is at least 4096*num_sectors large. 
- * - * 0 is returned on success, -errno otherwise */ -static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log, - uint32_t *sectors_written, void *buffer, - uint32_t num_sectors) -{ - int ret = 0; - uint64_t offset; - uint32_t write; - void *buffer_tmp; - BDRVVHDXState *s = bs->opaque; - - ret = vhdx_user_visible_write(bs, s); - if (ret < 0) { - goto exit; - } - - write = log->write; - - buffer_tmp = buffer; - while (num_sectors) { - - offset = log->offset + write; - write = vhdx_log_inc_idx(write, log->length); - if (write == log->read) { - /* full */ - break; - } - ret = bdrv_pwrite(bs->file->bs, offset, buffer_tmp, - VHDX_LOG_SECTOR_SIZE); - if (ret < 0) { - goto exit; - } - buffer_tmp += VHDX_LOG_SECTOR_SIZE; - - log->write = write; - *sectors_written = *sectors_written + 1; - num_sectors--; - } - -exit: - return ret; -} - - -/* Validates a log entry header */ -static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr, - BDRVVHDXState *s) -{ - int valid = false; - - if (hdr->signature != VHDX_LOG_SIGNATURE) { - goto exit; - } - - /* if the individual entry length is larger than the whole log - * buffer, that is obviously invalid */ - if (log->length < hdr->entry_length) { - goto exit; - } - - /* length of entire entry must be in units of 4KB (log sector size) */ - if (hdr->entry_length % (VHDX_LOG_SECTOR_SIZE)) { - goto exit; - } - - /* per spec, sequence # must be > 0 */ - if (hdr->sequence_number == 0) { - goto exit; - } - - /* log entries are only valid if they match the file-wide log guid - * found in the active header */ - if (!guid_eq(hdr->log_guid, s->headers[s->curr_header]->log_guid)) { - goto exit; - } - - if (hdr->descriptor_count * sizeof(VHDXLogDescriptor) > hdr->entry_length) { - goto exit; - } - - valid = true; - -exit: - return valid; -} - -/* - * Given a log header, this will validate that the descriptors and the - * corresponding data sectors (if applicable) - * - * Validation consists of: - * 1. 
Making sure the sequence numbers matches the entry header - * 2. Verifying a valid signature ('zero' or 'desc' for descriptors) - * 3. File offset field is a multiple of 4KB - * 4. If a data descriptor, the corresponding data sector - * has its signature ('data') and matching sequence number - * - * @desc: the data buffer containing the descriptor - * @hdr: the log entry header - * - * Returns true if valid - */ -static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc, - VHDXLogEntryHeader *hdr) -{ - bool ret = false; - - if (desc->sequence_number != hdr->sequence_number) { - goto exit; - } - if (desc->file_offset % VHDX_LOG_SECTOR_SIZE) { - goto exit; - } - - if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) { - if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) { - /* valid */ - ret = true; - } - } else if (desc->signature == VHDX_LOG_DESC_SIGNATURE) { - /* valid */ - ret = true; - } - -exit: - return ret; -} - - -/* Prior to sector data for a log entry, there is the header - * and the descriptors referenced in the header: - * - * [] = 4KB sector - * - * [ hdr, desc ][ desc ][ ... ][ data ][ ... ] - * - * The first sector in a log entry has a 64 byte header, and - * up to 126 32-byte descriptors. If more descriptors than - * 126 are required, then subsequent sectors can have up to 128 - * descriptors. Each sector is 4KB. Data follows the descriptor - * sectors. - * - * This will return the number of sectors needed to encompass - * the passed number of descriptors in desc_cnt. - * - * This will never return 0, even if desc_cnt is 0. - */ -static int vhdx_compute_desc_sectors(uint32_t desc_cnt) -{ - uint32_t desc_sectors; - - desc_cnt += 2; /* account for header in first sector */ - desc_sectors = desc_cnt / 128; - if (desc_cnt % 128) { - desc_sectors++; - } - - return desc_sectors; -} - - -/* Reads the log header, and subsequent descriptors (if any). This - * will allocate all the space for buffer, which must be NULL when - * passed into this function. 
Each descriptor will also be validated, - * and error returned if any are invalid. */ -static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s, - VHDXLogEntries *log, VHDXLogDescEntries **buffer, - bool convert_endian) -{ - int ret = 0; - uint32_t desc_sectors; - uint32_t sectors_read; - VHDXLogEntryHeader hdr; - VHDXLogDescEntries *desc_entries = NULL; - VHDXLogDescriptor desc; - int i; - - assert(*buffer == NULL); - - ret = vhdx_log_peek_hdr(bs, log, &hdr); - if (ret < 0) { - goto exit; - } - - if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) { - ret = -EINVAL; - goto exit; - } - - desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); - desc_entries = qemu_try_blockalign(bs->file->bs, - desc_sectors * VHDX_LOG_SECTOR_SIZE); - if (desc_entries == NULL) { - ret = -ENOMEM; - goto exit; - } - - ret = vhdx_log_read_sectors(bs, log, §ors_read, desc_entries, - desc_sectors, false); - if (ret < 0) { - goto free_and_exit; - } - if (sectors_read != desc_sectors) { - ret = -EINVAL; - goto free_and_exit; - } - - /* put in proper endianness, and validate each desc */ - for (i = 0; i < hdr.descriptor_count; i++) { - desc = desc_entries->desc[i]; - vhdx_log_desc_le_import(&desc); - if (convert_endian) { - desc_entries->desc[i] = desc; - } - if (vhdx_log_desc_is_valid(&desc, &hdr) == false) { - ret = -EINVAL; - goto free_and_exit; - } - } - if (convert_endian) { - desc_entries->hdr = hdr; - } - - *buffer = desc_entries; - goto exit; - -free_and_exit: - qemu_vfree(desc_entries); -exit: - return ret; -} - - -/* Flushes the descriptor described by desc to the VHDX image file. - * If the descriptor is a data descriptor, than 'data' must be non-NULL, - * and >= 4096 bytes (VHDX_LOG_SECTOR_SIZE), containing the data to be - * written. - * - * Verification is performed to make sure the sequence numbers of a data - * descriptor match the sequence number in the desc. - * - * For a zero descriptor, it may describe multiple sectors to fill with zeroes. 
- * In this case, it should be noted that zeroes are written to disk, and the - * image file is not extended as a sparse file. */ -static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc, - VHDXLogDataSector *data) -{ - int ret = 0; - uint64_t seq, file_offset; - uint32_t offset = 0; - void *buffer = NULL; - uint64_t count = 1; - int i; - - buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); - - if (desc->signature == VHDX_LOG_DESC_SIGNATURE) { - /* data sector */ - if (data == NULL) { - ret = -EFAULT; - goto exit; - } - - /* The sequence number of the data sector must match that - * in the descriptor */ - seq = data->sequence_high; - seq <<= 32; - seq |= data->sequence_low & 0xffffffff; - - if (seq != desc->sequence_number) { - ret = -EINVAL; - goto exit; - } - - /* Each data sector is in total 4096 bytes, however the first - * 8 bytes, and last 4 bytes, are located in the descriptor */ - memcpy(buffer, &desc->leading_bytes, 8); - offset += 8; - - memcpy(buffer+offset, data->data, 4084); - offset += 4084; - - memcpy(buffer+offset, &desc->trailing_bytes, 4); - - } else if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) { - /* write 'count' sectors of sector */ - memset(buffer, 0, VHDX_LOG_SECTOR_SIZE); - count = desc->zero_length / VHDX_LOG_SECTOR_SIZE; - } else { - error_report("Invalid VHDX log descriptor entry signature 0x%" PRIx32, - desc->signature); - ret = -EINVAL; - goto exit; - } - - file_offset = desc->file_offset; - - /* count is only > 1 if we are writing zeroes */ - for (i = 0; i < count; i++) { - ret = bdrv_pwrite_sync(bs->file->bs, file_offset, buffer, - VHDX_LOG_SECTOR_SIZE); - if (ret < 0) { - goto exit; - } - file_offset += VHDX_LOG_SECTOR_SIZE; - } - -exit: - qemu_vfree(buffer); - return ret; -} - -/* Flush the entire log (as described by 'logs') to the VHDX image - * file, and then set the log to 'empty' status once complete. 
- * - * The log entries should be validate prior to flushing */ -static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, - VHDXLogSequence *logs) -{ - int ret = 0; - int i; - uint32_t cnt, sectors_read; - uint64_t new_file_size; - void *data = NULL; - VHDXLogDescEntries *desc_entries = NULL; - VHDXLogEntryHeader hdr_tmp = { 0 }; - - cnt = logs->count; - - data = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); - - ret = vhdx_user_visible_write(bs, s); - if (ret < 0) { - goto exit; - } - - /* each iteration represents one log sequence, which may span multiple - * sectors */ - while (cnt--) { - ret = vhdx_log_peek_hdr(bs, &logs->log, &hdr_tmp); - if (ret < 0) { - goto exit; - } - /* if the log shows a FlushedFileOffset larger than our current file - * size, then that means the file has been truncated / corrupted, and - * we must refused to open it / use it */ - if (hdr_tmp.flushed_file_offset > bdrv_getlength(bs->file->bs)) { - ret = -EINVAL; - goto exit; - } - - ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries, true); - if (ret < 0) { - goto exit; - } - - for (i = 0; i < desc_entries->hdr.descriptor_count; i++) { - if (desc_entries->desc[i].signature == VHDX_LOG_DESC_SIGNATURE) { - /* data sector, so read a sector to flush */ - ret = vhdx_log_read_sectors(bs, &logs->log, §ors_read, - data, 1, false); - if (ret < 0) { - goto exit; - } - if (sectors_read != 1) { - ret = -EINVAL; - goto exit; - } - vhdx_log_data_le_import(data); - } - - ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data); - if (ret < 0) { - goto exit; - } - } - if (bdrv_getlength(bs->file->bs) < desc_entries->hdr.last_file_offset) { - new_file_size = desc_entries->hdr.last_file_offset; - if (new_file_size % (1024*1024)) { - /* round up to nearest 1MB boundary */ - new_file_size = ((new_file_size >> 20) + 1) << 20; - bdrv_truncate(bs->file->bs, new_file_size); - } - } - qemu_vfree(desc_entries); - desc_entries = NULL; - } - - bdrv_flush(bs); - /* once the log is fully flushed, 
indicate that we have an empty log - * now. This also sets the log guid to 0, to indicate an empty log */ - vhdx_log_reset(bs, s); - -exit: - qemu_vfree(data); - qemu_vfree(desc_entries); - return ret; -} - -static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s, - VHDXLogEntries *log, uint64_t seq, - bool *valid, VHDXLogEntryHeader *entry) -{ - int ret = 0; - VHDXLogEntryHeader hdr; - void *buffer = NULL; - uint32_t i, desc_sectors, total_sectors, crc; - uint32_t sectors_read = 0; - VHDXLogDescEntries *desc_buffer = NULL; - - *valid = false; - - ret = vhdx_log_peek_hdr(bs, log, &hdr); - if (ret < 0) { - goto inc_and_exit; - } - - if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) { - goto inc_and_exit; - } - - if (seq > 0) { - if (hdr.sequence_number != seq + 1) { - goto inc_and_exit; - } - } - - desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); - - /* Read all log sectors, and calculate log checksum */ - - total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE; - - - /* read_desc() will increment the read idx */ - ret = vhdx_log_read_desc(bs, s, log, &desc_buffer, false); - if (ret < 0) { - goto free_and_exit; - } - - crc = vhdx_checksum_calc(0xffffffff, (void *)desc_buffer, - desc_sectors * VHDX_LOG_SECTOR_SIZE, 4); - crc ^= 0xffffffff; - - buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); - if (total_sectors > desc_sectors) { - for (i = 0; i < total_sectors - desc_sectors; i++) { - sectors_read = 0; - ret = vhdx_log_read_sectors(bs, log, §ors_read, buffer, - 1, false); - if (ret < 0 || sectors_read != 1) { - goto free_and_exit; - } - crc = vhdx_checksum_calc(crc, buffer, VHDX_LOG_SECTOR_SIZE, -1); - crc ^= 0xffffffff; - } - } - crc ^= 0xffffffff; - if (crc != hdr.checksum) { - goto free_and_exit; - } - - *valid = true; - *entry = hdr; - goto free_and_exit; - -inc_and_exit: - log->read = vhdx_log_inc_idx(log->read, log->length); - -free_and_exit: - qemu_vfree(buffer); - qemu_vfree(desc_buffer); - return ret; -} - -/* Search 
through the log circular buffer, and find the valid, active - * log sequence, if any exists - * */ -static int vhdx_log_search(BlockDriverState *bs, BDRVVHDXState *s, - VHDXLogSequence *logs) -{ - int ret = 0; - uint32_t tail; - bool seq_valid = false; - VHDXLogSequence candidate = { 0 }; - VHDXLogEntryHeader hdr = { 0 }; - VHDXLogEntries curr_log; - - memcpy(&curr_log, &s->log, sizeof(VHDXLogEntries)); - curr_log.write = curr_log.length; /* assume log is full */ - curr_log.read = 0; - - - /* now we will go through the whole log sector by sector, until - * we find a valid, active log sequence, or reach the end of the - * log buffer */ - for (;;) { - uint64_t curr_seq = 0; - VHDXLogSequence current = { 0 }; - - tail = curr_log.read; - - ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq, - &seq_valid, &hdr); - if (ret < 0) { - goto exit; - } - - if (seq_valid) { - current.valid = true; - current.log = curr_log; - current.log.read = tail; - current.log.write = curr_log.read; - current.count = 1; - current.hdr = hdr; - - - for (;;) { - ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq, - &seq_valid, &hdr); - if (ret < 0) { - goto exit; - } - if (seq_valid == false) { - break; - } - current.log.write = curr_log.read; - current.count++; - - curr_seq = hdr.sequence_number; - } - } - - if (current.valid) { - if (candidate.valid == false || - current.hdr.sequence_number > candidate.hdr.sequence_number) { - candidate = current; - } - } - - if (curr_log.read < tail) { - break; - } - } - - *logs = candidate; - - if (candidate.valid) { - /* this is the next sequence number, for writes */ - s->log.sequence = candidate.hdr.sequence_number + 1; - } - - -exit: - return ret; -} - -/* Parse the replay log. Per the VHDX spec, if the log is present - * it must be replayed prior to opening the file, even read-only. 
- * - * If read-only, we must replay the log in RAM (or refuse to open - * a dirty VHDX file read-only) */ -int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, - Error **errp) -{ - int ret = 0; - VHDXHeader *hdr; - VHDXLogSequence logs = { 0 }; - - hdr = s->headers[s->curr_header]; - - *flushed = false; - - /* s->log.hdr is freed in vhdx_close() */ - if (s->log.hdr == NULL) { - s->log.hdr = qemu_blockalign(bs, sizeof(VHDXLogEntryHeader)); - } - - s->log.offset = hdr->log_offset; - s->log.length = hdr->log_length; - - if (s->log.offset < VHDX_LOG_MIN_SIZE || - s->log.offset % VHDX_LOG_MIN_SIZE) { - ret = -EINVAL; - goto exit; - } - - /* per spec, only log version of 0 is supported */ - if (hdr->log_version != 0) { - ret = -EINVAL; - goto exit; - } - - /* If either the log guid, or log length is zero, - * then a replay log is not present */ - if (guid_eq(hdr->log_guid, zero_guid)) { - goto exit; - } - - if (hdr->log_length == 0) { - goto exit; - } - - if (hdr->log_length % VHDX_LOG_MIN_SIZE) { - ret = -EINVAL; - goto exit; - } - - - /* The log is present, we need to find if and where there is an active - * sequence of valid entries present in the log. 
*/ - - ret = vhdx_log_search(bs, s, &logs); - if (ret < 0) { - goto exit; - } - - if (logs.valid) { - if (bs->read_only) { - ret = -EPERM; - error_setg(errp, - "VHDX image file '%s' opened read-only, but " - "contains a log that needs to be replayed", - bs->filename); - error_append_hint(errp, "To replay the log, run:\n" - "qemu-img check -r all '%s'\n", - bs->filename); - goto exit; - } - /* now flush the log */ - ret = vhdx_log_flush(bs, s, &logs); - if (ret < 0) { - goto exit; - } - *flushed = true; - } - - -exit: - return ret; -} - - - -static void vhdx_log_raw_to_le_sector(VHDXLogDescriptor *desc, - VHDXLogDataSector *sector, void *data, - uint64_t seq) -{ - /* 8 + 4084 + 4 = 4096, 1 log sector */ - memcpy(&desc->leading_bytes, data, 8); - data += 8; - cpu_to_le64s(&desc->leading_bytes); - memcpy(sector->data, data, 4084); - data += 4084; - memcpy(&desc->trailing_bytes, data, 4); - cpu_to_le32s(&desc->trailing_bytes); - data += 4; - - sector->sequence_high = (uint32_t) (seq >> 32); - sector->sequence_low = (uint32_t) (seq & 0xffffffff); - sector->data_signature = VHDX_LOG_DATA_SIGNATURE; - - vhdx_log_desc_le_export(desc); - vhdx_log_data_le_export(sector); -} - - -static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, - void *data, uint32_t length, uint64_t offset) -{ - int ret = 0; - void *buffer = NULL; - void *merged_sector = NULL; - void *data_tmp, *sector_write; - unsigned int i; - int sector_offset; - uint32_t desc_sectors, sectors, total_length; - uint32_t sectors_written = 0; - uint32_t aligned_length; - uint32_t leading_length = 0; - uint32_t trailing_length = 0; - uint32_t partial_sectors = 0; - uint32_t bytes_written = 0; - uint64_t file_offset; - VHDXHeader *header; - VHDXLogEntryHeader new_hdr; - VHDXLogDescriptor *new_desc = NULL; - VHDXLogDataSector *data_sector = NULL; - MSGUID new_guid = { 0 }; - - header = s->headers[s->curr_header]; - - /* need to have offset read data, and be on 4096 byte boundary */ - - if (length > 
header->log_length) { - /* no log present. we could create a log here instead of failing */ - ret = -EINVAL; - goto exit; - } - - if (guid_eq(header->log_guid, zero_guid)) { - vhdx_guid_generate(&new_guid); - vhdx_update_headers(bs, s, false, &new_guid); - } else { - /* currently, we require that the log be flushed after - * every write. */ - ret = -ENOTSUP; - goto exit; - } - - /* 0 is an invalid sequence number, but may also represent the first - * log write (or a wrapped seq) */ - if (s->log.sequence == 0) { - s->log.sequence = 1; - } - - sector_offset = offset % VHDX_LOG_SECTOR_SIZE; - file_offset = (offset / VHDX_LOG_SECTOR_SIZE) * VHDX_LOG_SECTOR_SIZE; - - aligned_length = length; - - /* add in the unaligned head and tail bytes */ - if (sector_offset) { - leading_length = (VHDX_LOG_SECTOR_SIZE - sector_offset); - leading_length = leading_length > length ? length : leading_length; - aligned_length -= leading_length; - partial_sectors++; - } - - sectors = aligned_length / VHDX_LOG_SECTOR_SIZE; - trailing_length = aligned_length - (sectors * VHDX_LOG_SECTOR_SIZE); - if (trailing_length) { - partial_sectors++; - } - - sectors += partial_sectors; - - /* sectors is now how many sectors the data itself takes, not - * including the header and descriptor metadata */ - - new_hdr = (VHDXLogEntryHeader) { - .signature = VHDX_LOG_SIGNATURE, - .tail = s->log.tail, - .sequence_number = s->log.sequence, - .descriptor_count = sectors, - .reserved = 0, - .flushed_file_offset = bdrv_getlength(bs->file->bs), - .last_file_offset = bdrv_getlength(bs->file->bs), - }; - - new_hdr.log_guid = header->log_guid; - - desc_sectors = vhdx_compute_desc_sectors(new_hdr.descriptor_count); - - total_length = (desc_sectors + sectors) * VHDX_LOG_SECTOR_SIZE; - new_hdr.entry_length = total_length; - - vhdx_log_entry_hdr_le_export(&new_hdr); - - buffer = qemu_blockalign(bs, total_length); - memcpy(buffer, &new_hdr, sizeof(new_hdr)); - - new_desc = buffer + sizeof(new_hdr); - data_sector = buffer + 
(desc_sectors * VHDX_LOG_SECTOR_SIZE); - data_tmp = data; - - /* All log sectors are 4KB, so for any partial sectors we must - * merge the data with preexisting data from the final file - * destination */ - merged_sector = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); - - for (i = 0; i < sectors; i++) { - new_desc->signature = VHDX_LOG_DESC_SIGNATURE; - new_desc->sequence_number = s->log.sequence; - new_desc->file_offset = file_offset; - - if (i == 0 && leading_length) { - /* partial sector at the front of the buffer */ - ret = bdrv_pread(bs->file->bs, file_offset, merged_sector, - VHDX_LOG_SECTOR_SIZE); - if (ret < 0) { - goto exit; - } - memcpy(merged_sector + sector_offset, data_tmp, leading_length); - bytes_written = leading_length; - sector_write = merged_sector; - } else if (i == sectors - 1 && trailing_length) { - /* partial sector at the end of the buffer */ - ret = bdrv_pread(bs->file->bs, - file_offset, - merged_sector + trailing_length, - VHDX_LOG_SECTOR_SIZE - trailing_length); - if (ret < 0) { - goto exit; - } - memcpy(merged_sector, data_tmp, trailing_length); - bytes_written = trailing_length; - sector_write = merged_sector; - } else { - bytes_written = VHDX_LOG_SECTOR_SIZE; - sector_write = data_tmp; - } - - /* populate the raw sector data into the proper structures, - * as well as update the descriptor, and convert to proper - * endianness */ - vhdx_log_raw_to_le_sector(new_desc, data_sector, sector_write, - s->log.sequence); - - data_tmp += bytes_written; - data_sector++; - new_desc++; - file_offset += VHDX_LOG_SECTOR_SIZE; - } - - /* checksum covers entire entry, from the log header through the - * last data sector */ - vhdx_update_checksum(buffer, total_length, - offsetof(VHDXLogEntryHeader, checksum)); - - /* now write to the log */ - ret = vhdx_log_write_sectors(bs, &s->log, §ors_written, buffer, - desc_sectors + sectors); - if (ret < 0) { - goto exit; - } - - if (sectors_written != desc_sectors + sectors) { - /* instead of failing, we could 
flush the log here */ - ret = -EINVAL; - goto exit; - } - - s->log.sequence++; - /* write new tail */ - s->log.tail = s->log.write; - -exit: - qemu_vfree(buffer); - qemu_vfree(merged_sector); - return ret; -} - -/* Perform a log write, and then immediately flush the entire log */ -int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, - void *data, uint32_t length, uint64_t offset) -{ - int ret = 0; - VHDXLogSequence logs = { .valid = true, - .count = 1, - .hdr = { 0 } }; - - - /* Make sure data written (new and/or changed blocks) is stable - * on disk, before creating log entry */ - bdrv_flush(bs); - ret = vhdx_log_write(bs, s, data, length, offset); - if (ret < 0) { - goto exit; - } - logs.log = s->log; - - /* Make sure log is stable on disk */ - bdrv_flush(bs); - ret = vhdx_log_flush(bs, s, &logs); - if (ret < 0) { - goto exit; - } - - s->log = logs.log; - -exit: - return ret; -} - diff --git a/qemu/block/vhdx.c b/qemu/block/vhdx.c deleted file mode 100644 index 2b7b33240..000000000 --- a/qemu/block/vhdx.c +++ /dev/null @@ -1,1981 +0,0 @@ -/* - * Block driver for Hyper-V VHDX Images - * - * Copyright (c) 2013 Red Hat, Inc., - * - * Authors: - * Jeff Cody - * - * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 - * by Microsoft: - * https://www.microsoft.com/en-us/download/details.aspx?id=34750 - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. 
- * - */ - -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu-common.h" -#include "block/block_int.h" -#include "sysemu/block-backend.h" -#include "qemu/module.h" -#include "qemu/crc32c.h" -#include "block/vhdx.h" -#include "migration/migration.h" - -#include -#include - -/* Options for VHDX creation */ - -#define VHDX_BLOCK_OPT_LOG_SIZE "log_size" -#define VHDX_BLOCK_OPT_BLOCK_SIZE "block_size" -#define VHDX_BLOCK_OPT_ZERO "block_state_zero" - -typedef enum VHDXImageType { - VHDX_TYPE_DYNAMIC = 0, - VHDX_TYPE_FIXED, - VHDX_TYPE_DIFFERENCING, /* Currently unsupported */ -} VHDXImageType; - -/* Several metadata and region table data entries are identified by - * guids in a MS-specific GUID format. */ - - -/* ------- Known Region Table GUIDs ---------------------- */ -static const MSGUID bat_guid = { .data1 = 0x2dc27766, - .data2 = 0xf623, - .data3 = 0x4200, - .data4 = { 0x9d, 0x64, 0x11, 0x5e, - 0x9b, 0xfd, 0x4a, 0x08} }; - -static const MSGUID metadata_guid = { .data1 = 0x8b7ca206, - .data2 = 0x4790, - .data3 = 0x4b9a, - .data4 = { 0xb8, 0xfe, 0x57, 0x5f, - 0x05, 0x0f, 0x88, 0x6e} }; - - - -/* ------- Known Metadata Entry GUIDs ---------------------- */ -static const MSGUID file_param_guid = { .data1 = 0xcaa16737, - .data2 = 0xfa36, - .data3 = 0x4d43, - .data4 = { 0xb3, 0xb6, 0x33, 0xf0, - 0xaa, 0x44, 0xe7, 0x6b} }; - -static const MSGUID virtual_size_guid = { .data1 = 0x2FA54224, - .data2 = 0xcd1b, - .data3 = 0x4876, - .data4 = { 0xb2, 0x11, 0x5d, 0xbe, - 0xd8, 0x3b, 0xf4, 0xb8} }; - -static const MSGUID page83_guid = { .data1 = 0xbeca12ab, - .data2 = 0xb2e6, - .data3 = 0x4523, - .data4 = { 0x93, 0xef, 0xc3, 0x09, - 0xe0, 0x00, 0xc7, 0x46} }; - - -static const MSGUID phys_sector_guid = { .data1 = 0xcda348c7, - .data2 = 0x445d, - .data3 = 0x4471, - .data4 = { 0x9c, 0xc9, 0xe9, 0x88, - 0x52, 0x51, 0xc5, 0x56} }; - -static const MSGUID parent_locator_guid = { .data1 = 0xa8d35f2d, - .data2 = 0xb30b, - .data3 = 0x454d, - .data4 = { 0xab, 0xf7, 0xd3, - 
0xd8, 0x48, 0x34, - 0xab, 0x0c} }; - -static const MSGUID logical_sector_guid = { .data1 = 0x8141bf1d, - .data2 = 0xa96f, - .data3 = 0x4709, - .data4 = { 0xba, 0x47, 0xf2, - 0x33, 0xa8, 0xfa, - 0xab, 0x5f} }; - -/* Each parent type must have a valid GUID; this is for parent images - * of type 'VHDX'. If we were to allow e.g. a QCOW2 parent, we would - * need to make up our own QCOW2 GUID type */ -static const MSGUID parent_vhdx_guid __attribute__((unused)) - = { .data1 = 0xb04aefb7, - .data2 = 0xd19e, - .data3 = 0x4a81, - .data4 = { 0xb7, 0x89, 0x25, 0xb8, - 0xe9, 0x44, 0x59, 0x13} }; - - -#define META_FILE_PARAMETER_PRESENT 0x01 -#define META_VIRTUAL_DISK_SIZE_PRESENT 0x02 -#define META_PAGE_83_PRESENT 0x04 -#define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08 -#define META_PHYS_SECTOR_SIZE_PRESENT 0x10 -#define META_PARENT_LOCATOR_PRESENT 0x20 - -#define META_ALL_PRESENT \ - (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \ - META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \ - META_PHYS_SECTOR_SIZE_PRESENT) - - -typedef struct VHDXSectorInfo { - uint32_t bat_idx; /* BAT entry index */ - uint32_t sectors_avail; /* sectors available in payload block */ - uint32_t bytes_left; /* bytes left in the block after data to r/w */ - uint32_t bytes_avail; /* bytes available in payload block */ - uint64_t file_offset; /* absolute offset in bytes, in file */ - uint64_t block_offset; /* block offset, in bytes */ -} VHDXSectorInfo; - -/* Calculates new checksum. - * - * Zero is substituted during crc calculation for the original crc field - * crc_offset: byte offset in buf of the buffer crc - * buf: buffer pointer - * size: size of buffer (must be > crc_offset+4) - * - * Note: The buffer should have all multi-byte data in little-endian format, - * and the resulting checksum is in little endian format. 
- */ -uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset) -{ - uint32_t crc; - - assert(buf != NULL); - assert(size > (crc_offset + sizeof(crc))); - - memset(buf + crc_offset, 0, sizeof(crc)); - crc = crc32c(0xffffffff, buf, size); - cpu_to_le32s(&crc); - memcpy(buf + crc_offset, &crc, sizeof(crc)); - - return crc; -} - -uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size, - int crc_offset) -{ - uint32_t crc_new; - uint32_t crc_orig; - assert(buf != NULL); - - if (crc_offset > 0) { - memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig)); - memset(buf + crc_offset, 0, sizeof(crc_orig)); - } - - crc_new = crc32c(crc, buf, size); - if (crc_offset > 0) { - memcpy(buf + crc_offset, &crc_orig, sizeof(crc_orig)); - } - - return crc_new; -} - -/* Validates the checksum of the buffer, with an in-place CRC. - * - * Zero is substituted during crc calculation for the original crc field, - * and the crc field is restored afterwards. But the buffer will be modifed - * during the calculation, so this may not be not suitable for multi-threaded - * use. - * - * crc_offset: byte offset in buf of the buffer crc - * buf: buffer pointer - * size: size of buffer (must be > crc_offset+4) - * - * returns true if checksum is valid, false otherwise - */ -bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset) -{ - uint32_t crc_orig; - uint32_t crc; - - assert(buf != NULL); - assert(size > (crc_offset + 4)); - - memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig)); - crc_orig = le32_to_cpu(crc_orig); - - crc = vhdx_checksum_calc(0xffffffff, buf, size, crc_offset); - - return crc == crc_orig; -} - - -/* - * This generates a UUID that is compliant with the MS GUIDs used - * in the VHDX spec (and elsewhere). 
- */ -void vhdx_guid_generate(MSGUID *guid) -{ - uuid_t uuid; - assert(guid != NULL); - - uuid_generate(uuid); - memcpy(guid, uuid, sizeof(MSGUID)); -} - -/* Check for region overlaps inside the VHDX image */ -static int vhdx_region_check(BDRVVHDXState *s, uint64_t start, uint64_t length) -{ - int ret = 0; - uint64_t end; - VHDXRegionEntry *r; - - end = start + length; - QLIST_FOREACH(r, &s->regions, entries) { - if (!((start >= r->end) || (end <= r->start))) { - ret = -EINVAL; - goto exit; - } - } - -exit: - return ret; -} - -/* Register a region for future checks */ -static void vhdx_region_register(BDRVVHDXState *s, - uint64_t start, uint64_t length) -{ - VHDXRegionEntry *r; - - r = g_malloc0(sizeof(*r)); - - r->start = start; - r->end = start + length; - - QLIST_INSERT_HEAD(&s->regions, r, entries); -} - -/* Free all registered regions */ -static void vhdx_region_unregister_all(BDRVVHDXState *s) -{ - VHDXRegionEntry *r, *r_next; - - QLIST_FOREACH_SAFE(r, &s->regions, entries, r_next) { - QLIST_REMOVE(r, entries); - g_free(r); - } -} - -static void vhdx_set_shift_bits(BDRVVHDXState *s) -{ - s->logical_sector_size_bits = ctz32(s->logical_sector_size); - s->sectors_per_block_bits = ctz32(s->sectors_per_block); - s->chunk_ratio_bits = ctz64(s->chunk_ratio); - s->block_size_bits = ctz32(s->block_size); -} - -/* - * Per the MS VHDX Specification, for every VHDX file: - * - The header section is fixed size - 1 MB - * - The header section is always the first "object" - * - The first 64KB of the header is the File Identifier - * - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile") - * - The following 512 bytes constitute a UTF-16 string identifiying the - * software that created the file, and is optional and diagnostic only. 
- * - * Therefore, we probe by looking for the vhdxfile signature "vhdxfile" - */ -static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename) -{ - if (buf_size >= 8 && !memcmp(buf, "vhdxfile", 8)) { - return 100; - } - return 0; -} - -/* - * Writes the header to the specified offset. - * - * This will optionally read in buffer data from disk (otherwise zero-fill), - * and then update the header checksum. Header is converted to proper - * endianness before being written to the specified file offset - */ -static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, - uint64_t offset, bool read) -{ - uint8_t *buffer = NULL; - int ret; - VHDXHeader *header_le; - - assert(bs_file != NULL); - assert(hdr != NULL); - - /* the header checksum is not over just the packed size of VHDXHeader, - * but rather over the entire 'reserved' range for the header, which is - * 4KB (VHDX_HEADER_SIZE). */ - - buffer = qemu_blockalign(bs_file, VHDX_HEADER_SIZE); - if (read) { - /* if true, we can't assume the extra reserved bytes are 0 */ - ret = bdrv_pread(bs_file, offset, buffer, VHDX_HEADER_SIZE); - if (ret < 0) { - goto exit; - } - } else { - memset(buffer, 0, VHDX_HEADER_SIZE); - } - - /* overwrite the actual VHDXHeader portion */ - header_le = (VHDXHeader *)buffer; - memcpy(header_le, hdr, sizeof(VHDXHeader)); - vhdx_header_le_export(hdr, header_le); - vhdx_update_checksum(buffer, VHDX_HEADER_SIZE, - offsetof(VHDXHeader, checksum)); - ret = bdrv_pwrite_sync(bs_file, offset, header_le, sizeof(VHDXHeader)); - -exit: - qemu_vfree(buffer); - return ret; -} - -/* Update the VHDX headers - * - * This follows the VHDX spec procedures for header updates. 
- * - * - non-current header is updated with largest sequence number - */ -static int vhdx_update_header(BlockDriverState *bs, BDRVVHDXState *s, - bool generate_data_write_guid, MSGUID *log_guid) -{ - int ret = 0; - int hdr_idx = 0; - uint64_t header_offset = VHDX_HEADER1_OFFSET; - - VHDXHeader *active_header; - VHDXHeader *inactive_header; - - /* operate on the non-current header */ - if (s->curr_header == 0) { - hdr_idx = 1; - header_offset = VHDX_HEADER2_OFFSET; - } - - active_header = s->headers[s->curr_header]; - inactive_header = s->headers[hdr_idx]; - - inactive_header->sequence_number = active_header->sequence_number + 1; - - /* a new file guid must be generated before any file write, including - * headers */ - inactive_header->file_write_guid = s->session_guid; - - /* a new data guid only needs to be generated before any guest-visible - * writes (i.e. something observable via virtual disk read) */ - if (generate_data_write_guid) { - vhdx_guid_generate(&inactive_header->data_write_guid); - } - - /* update the log guid if present */ - if (log_guid) { - inactive_header->log_guid = *log_guid; - } - - ret = vhdx_write_header(bs->file->bs, inactive_header, header_offset, true); - if (ret < 0) { - goto exit; - } - s->curr_header = hdr_idx; - -exit: - return ret; -} - -/* - * The VHDX spec calls for header updates to be performed twice, so that both - * the current and non-current header have valid info - */ -int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, - bool generate_data_write_guid, MSGUID *log_guid) -{ - int ret; - - ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid); - if (ret < 0) { - return ret; - } - ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid); - return ret; -} - -/* opens the specified header block from the VHDX file header section */ -static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, - Error **errp) -{ - int ret; - VHDXHeader *header1; - VHDXHeader *header2; - bool h1_valid = 
false; - bool h2_valid = false; - uint64_t h1_seq = 0; - uint64_t h2_seq = 0; - uint8_t *buffer; - - /* header1 & header2 are freed in vhdx_close() */ - header1 = qemu_blockalign(bs, sizeof(VHDXHeader)); - header2 = qemu_blockalign(bs, sizeof(VHDXHeader)); - - buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE); - - s->headers[0] = header1; - s->headers[1] = header2; - - /* We have to read the whole VHDX_HEADER_SIZE instead of - * sizeof(VHDXHeader), because the checksum is over the whole - * region */ - ret = bdrv_pread(bs->file->bs, VHDX_HEADER1_OFFSET, buffer, - VHDX_HEADER_SIZE); - if (ret < 0) { - goto fail; - } - /* copy over just the relevant portion that we need */ - memcpy(header1, buffer, sizeof(VHDXHeader)); - - if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) { - vhdx_header_le_import(header1); - if (header1->signature == VHDX_HEADER_SIGNATURE && - header1->version == 1) { - h1_seq = header1->sequence_number; - h1_valid = true; - } - } - - ret = bdrv_pread(bs->file->bs, VHDX_HEADER2_OFFSET, buffer, - VHDX_HEADER_SIZE); - if (ret < 0) { - goto fail; - } - /* copy over just the relevant portion that we need */ - memcpy(header2, buffer, sizeof(VHDXHeader)); - - if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) { - vhdx_header_le_import(header2); - if (header2->signature == VHDX_HEADER_SIGNATURE && - header2->version == 1) { - h2_seq = header2->sequence_number; - h2_valid = true; - } - } - - /* If there is only 1 valid header (or no valid headers), we - * don't care what the sequence numbers are */ - if (h1_valid && !h2_valid) { - s->curr_header = 0; - } else if (!h1_valid && h2_valid) { - s->curr_header = 1; - } else if (!h1_valid && !h2_valid) { - goto fail; - } else { - /* If both headers are valid, then we choose the active one by the - * highest sequence number. 
If the sequence numbers are equal, that is - * invalid */ - if (h1_seq > h2_seq) { - s->curr_header = 0; - } else if (h2_seq > h1_seq) { - s->curr_header = 1; - } else { - /* The Microsoft Disk2VHD tool will create 2 identical - * headers, with identical sequence numbers. If the headers are - * identical, don't consider the file corrupt */ - if (!memcmp(header1, header2, sizeof(VHDXHeader))) { - s->curr_header = 0; - } else { - goto fail; - } - } - } - - vhdx_region_register(s, s->headers[s->curr_header]->log_offset, - s->headers[s->curr_header]->log_length); - goto exit; - -fail: - error_setg_errno(errp, -ret, "No valid VHDX header found"); - qemu_vfree(header1); - qemu_vfree(header2); - s->headers[0] = NULL; - s->headers[1] = NULL; -exit: - qemu_vfree(buffer); -} - - -static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) -{ - int ret = 0; - uint8_t *buffer; - int offset = 0; - VHDXRegionTableEntry rt_entry; - uint32_t i; - bool bat_rt_found = false; - bool metadata_rt_found = false; - - /* We have to read the whole 64KB block, because the crc32 is over the - * whole block */ - buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE); - - ret = bdrv_pread(bs->file->bs, VHDX_REGION_TABLE_OFFSET, buffer, - VHDX_HEADER_BLOCK_SIZE); - if (ret < 0) { - goto fail; - } - memcpy(&s->rt, buffer, sizeof(s->rt)); - offset += sizeof(s->rt); - - if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4)) { - ret = -EINVAL; - goto fail; - } - - vhdx_region_header_le_import(&s->rt); - - if (s->rt.signature != VHDX_REGION_SIGNATURE) { - ret = -EINVAL; - goto fail; - } - - - /* Per spec, maximum region table entry count is 2047 */ - if (s->rt.entry_count > 2047) { - ret = -EINVAL; - goto fail; - } - - for (i = 0; i < s->rt.entry_count; i++) { - memcpy(&rt_entry, buffer + offset, sizeof(rt_entry)); - offset += sizeof(rt_entry); - - vhdx_region_entry_le_import(&rt_entry); - - /* check for region overlap between these entries, and any - * other memory regions in 
the file */ - ret = vhdx_region_check(s, rt_entry.file_offset, rt_entry.length); - if (ret < 0) { - goto fail; - } - - vhdx_region_register(s, rt_entry.file_offset, rt_entry.length); - - /* see if we recognize the entry */ - if (guid_eq(rt_entry.guid, bat_guid)) { - /* must be unique; if we have already found it this is invalid */ - if (bat_rt_found) { - ret = -EINVAL; - goto fail; - } - bat_rt_found = true; - s->bat_rt = rt_entry; - continue; - } - - if (guid_eq(rt_entry.guid, metadata_guid)) { - /* must be unique; if we have already found it this is invalid */ - if (metadata_rt_found) { - ret = -EINVAL; - goto fail; - } - metadata_rt_found = true; - s->metadata_rt = rt_entry; - continue; - } - - if (rt_entry.data_bits & VHDX_REGION_ENTRY_REQUIRED) { - /* cannot read vhdx file - required region table entry that - * we do not understand. per spec, we must fail to open */ - ret = -ENOTSUP; - goto fail; - } - } - - if (!bat_rt_found || !metadata_rt_found) { - ret = -EINVAL; - goto fail; - } - - ret = 0; - -fail: - qemu_vfree(buffer); - return ret; -} - - - -/* Metadata initial parser - * - * This loads all the metadata entry fields. This may cause additional - * fields to be processed (e.g. parent locator, etc..). - * - * There are 5 Metadata items that are always required: - * - File Parameters (block size, has a parent) - * - Virtual Disk Size (size, in bytes, of the virtual drive) - * - Page 83 Data (scsi page 83 guid) - * - Logical Sector Size (logical sector size in bytes, either 512 or - * 4096. We only support 512 currently) - * - Physical Sector Size (512 or 4096) - * - * Also, if the File Parameters indicate this is a differencing file, - * we must also look for the Parent Locator metadata item. 
- */ -static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) -{ - int ret = 0; - uint8_t *buffer; - int offset = 0; - uint32_t i = 0; - VHDXMetadataTableEntry md_entry; - - buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE); - - ret = bdrv_pread(bs->file->bs, s->metadata_rt.file_offset, buffer, - VHDX_METADATA_TABLE_MAX_SIZE); - if (ret < 0) { - goto exit; - } - memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr)); - offset += sizeof(s->metadata_hdr); - - vhdx_metadata_header_le_import(&s->metadata_hdr); - - if (s->metadata_hdr.signature != VHDX_METADATA_SIGNATURE) { - ret = -EINVAL; - goto exit; - } - - s->metadata_entries.present = 0; - - if ((s->metadata_hdr.entry_count * sizeof(md_entry)) > - (VHDX_METADATA_TABLE_MAX_SIZE - offset)) { - ret = -EINVAL; - goto exit; - } - - for (i = 0; i < s->metadata_hdr.entry_count; i++) { - memcpy(&md_entry, buffer + offset, sizeof(md_entry)); - offset += sizeof(md_entry); - - vhdx_metadata_entry_le_import(&md_entry); - - if (guid_eq(md_entry.item_id, file_param_guid)) { - if (s->metadata_entries.present & META_FILE_PARAMETER_PRESENT) { - ret = -EINVAL; - goto exit; - } - s->metadata_entries.file_parameters_entry = md_entry; - s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT; - continue; - } - - if (guid_eq(md_entry.item_id, virtual_size_guid)) { - if (s->metadata_entries.present & META_VIRTUAL_DISK_SIZE_PRESENT) { - ret = -EINVAL; - goto exit; - } - s->metadata_entries.virtual_disk_size_entry = md_entry; - s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT; - continue; - } - - if (guid_eq(md_entry.item_id, page83_guid)) { - if (s->metadata_entries.present & META_PAGE_83_PRESENT) { - ret = -EINVAL; - goto exit; - } - s->metadata_entries.page83_data_entry = md_entry; - s->metadata_entries.present |= META_PAGE_83_PRESENT; - continue; - } - - if (guid_eq(md_entry.item_id, logical_sector_guid)) { - if (s->metadata_entries.present & - META_LOGICAL_SECTOR_SIZE_PRESENT) { - ret = 
-EINVAL; - goto exit; - } - s->metadata_entries.logical_sector_size_entry = md_entry; - s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT; - continue; - } - - if (guid_eq(md_entry.item_id, phys_sector_guid)) { - if (s->metadata_entries.present & META_PHYS_SECTOR_SIZE_PRESENT) { - ret = -EINVAL; - goto exit; - } - s->metadata_entries.phys_sector_size_entry = md_entry; - s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT; - continue; - } - - if (guid_eq(md_entry.item_id, parent_locator_guid)) { - if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) { - ret = -EINVAL; - goto exit; - } - s->metadata_entries.parent_locator_entry = md_entry; - s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT; - continue; - } - - if (md_entry.data_bits & VHDX_META_FLAGS_IS_REQUIRED) { - /* cannot read vhdx file - required region table entry that - * we do not understand. per spec, we must fail to open */ - ret = -ENOTSUP; - goto exit; - } - } - - if (s->metadata_entries.present != META_ALL_PRESENT) { - ret = -ENOTSUP; - goto exit; - } - - ret = bdrv_pread(bs->file->bs, - s->metadata_entries.file_parameters_entry.offset - + s->metadata_rt.file_offset, - &s->params, - sizeof(s->params)); - - if (ret < 0) { - goto exit; - } - - le32_to_cpus(&s->params.block_size); - le32_to_cpus(&s->params.data_bits); - - - /* We now have the file parameters, so we can tell if this is a - * differencing file (i.e.. 
has_parent), is dynamic or fixed - * sized (leave_blocks_allocated), and the block size */ - - /* The parent locator required iff the file parameters has_parent set */ - if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) { - if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) { - /* TODO: parse parent locator fields */ - ret = -ENOTSUP; /* temp, until differencing files are supported */ - goto exit; - } else { - /* if has_parent is set, but there is not parent locator present, - * then that is an invalid combination */ - ret = -EINVAL; - goto exit; - } - } - - /* determine virtual disk size, logical sector size, - * and phys sector size */ - - ret = bdrv_pread(bs->file->bs, - s->metadata_entries.virtual_disk_size_entry.offset - + s->metadata_rt.file_offset, - &s->virtual_disk_size, - sizeof(uint64_t)); - if (ret < 0) { - goto exit; - } - ret = bdrv_pread(bs->file->bs, - s->metadata_entries.logical_sector_size_entry.offset - + s->metadata_rt.file_offset, - &s->logical_sector_size, - sizeof(uint32_t)); - if (ret < 0) { - goto exit; - } - ret = bdrv_pread(bs->file->bs, - s->metadata_entries.phys_sector_size_entry.offset - + s->metadata_rt.file_offset, - &s->physical_sector_size, - sizeof(uint32_t)); - if (ret < 0) { - goto exit; - } - - le64_to_cpus(&s->virtual_disk_size); - le32_to_cpus(&s->logical_sector_size); - le32_to_cpus(&s->physical_sector_size); - - if (s->params.block_size < VHDX_BLOCK_SIZE_MIN || - s->params.block_size > VHDX_BLOCK_SIZE_MAX) { - ret = -EINVAL; - goto exit; - } - - /* only 2 supported sector sizes */ - if (s->logical_sector_size != 512 && s->logical_sector_size != 4096) { - ret = -EINVAL; - goto exit; - } - - /* Both block_size and sector_size are guaranteed powers of 2, below. 
- Due to range checks above, s->sectors_per_block can never be < 256 */ - s->sectors_per_block = s->params.block_size / s->logical_sector_size; - s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) * - (uint64_t)s->logical_sector_size / - (uint64_t)s->params.block_size; - - /* These values are ones we will want to use for division / multiplication - * later on, and they are all guaranteed (per the spec) to be powers of 2, - * so we can take advantage of that for shift operations during - * reads/writes */ - if (s->logical_sector_size & (s->logical_sector_size - 1)) { - ret = -EINVAL; - goto exit; - } - if (s->sectors_per_block & (s->sectors_per_block - 1)) { - ret = -EINVAL; - goto exit; - } - if (s->chunk_ratio & (s->chunk_ratio - 1)) { - ret = -EINVAL; - goto exit; - } - s->block_size = s->params.block_size; - if (s->block_size & (s->block_size - 1)) { - ret = -EINVAL; - goto exit; - } - - vhdx_set_shift_bits(s); - - ret = 0; - -exit: - qemu_vfree(buffer); - return ret; -} - -/* - * Calculate the number of BAT entries, including sector - * bitmap entries. 
- */ -static void vhdx_calc_bat_entries(BDRVVHDXState *s) -{ - uint32_t data_blocks_cnt, bitmap_blocks_cnt; - - data_blocks_cnt = DIV_ROUND_UP(s->virtual_disk_size, s->block_size); - bitmap_blocks_cnt = DIV_ROUND_UP(data_blocks_cnt, s->chunk_ratio); - - if (s->parent_entries) { - s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1); - } else { - s->bat_entries = data_blocks_cnt + - ((data_blocks_cnt - 1) >> s->chunk_ratio_bits); - } - -} - -static void vhdx_close(BlockDriverState *bs) -{ - BDRVVHDXState *s = bs->opaque; - qemu_vfree(s->headers[0]); - s->headers[0] = NULL; - qemu_vfree(s->headers[1]); - s->headers[1] = NULL; - qemu_vfree(s->bat); - s->bat = NULL; - qemu_vfree(s->parent_entries); - s->parent_entries = NULL; - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); - qemu_vfree(s->log.hdr); - s->log.hdr = NULL; - vhdx_region_unregister_all(s); -} - -static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVVHDXState *s = bs->opaque; - int ret = 0; - uint32_t i; - uint64_t signature; - Error *local_err = NULL; - - s->bat = NULL; - s->first_visible_write = true; - - qemu_co_mutex_init(&s->lock); - QLIST_INIT(&s->regions); - - /* validate the file signature */ - ret = bdrv_pread(bs->file->bs, 0, &signature, sizeof(uint64_t)); - if (ret < 0) { - goto fail; - } - if (memcmp(&signature, "vhdxfile", 8)) { - ret = -EINVAL; - goto fail; - } - - /* This is used for any header updates, for the file_write_guid. 
- * The spec dictates that a new value should be used for the first - * header update */ - vhdx_guid_generate(&s->session_guid); - - vhdx_parse_header(bs, s, &local_err); - if (local_err != NULL) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; - } - - ret = vhdx_parse_log(bs, s, &s->log_replayed_on_open, errp); - if (ret < 0) { - goto fail; - } - - ret = vhdx_open_region_tables(bs, s); - if (ret < 0) { - goto fail; - } - - ret = vhdx_parse_metadata(bs, s); - if (ret < 0) { - goto fail; - } - - s->block_size = s->params.block_size; - - /* the VHDX spec dictates that virtual_disk_size is always a multiple of - * logical_sector_size */ - bs->total_sectors = s->virtual_disk_size >> s->logical_sector_size_bits; - - vhdx_calc_bat_entries(s); - - s->bat_offset = s->bat_rt.file_offset; - - if (s->bat_entries > s->bat_rt.length / sizeof(VHDXBatEntry)) { - /* BAT allocation is not large enough for all entries */ - ret = -EINVAL; - goto fail; - } - - /* s->bat is freed in vhdx_close() */ - s->bat = qemu_try_blockalign(bs->file->bs, s->bat_rt.length); - if (s->bat == NULL) { - ret = -ENOMEM; - goto fail; - } - - ret = bdrv_pread(bs->file->bs, s->bat_offset, s->bat, s->bat_rt.length); - if (ret < 0) { - goto fail; - } - - uint64_t payblocks = s->chunk_ratio; - /* endian convert, and verify populated BAT field file offsets against - * region table and log entries */ - for (i = 0; i < s->bat_entries; i++) { - le64_to_cpus(&s->bat[i]); - if (payblocks--) { - /* payload bat entries */ - if ((s->bat[i] & VHDX_BAT_STATE_BIT_MASK) == - PAYLOAD_BLOCK_FULLY_PRESENT) { - ret = vhdx_region_check(s, s->bat[i] & VHDX_BAT_FILE_OFF_MASK, - s->block_size); - if (ret < 0) { - goto fail; - } - } - } else { - payblocks = s->chunk_ratio; - /* Once differencing files are supported, verify sector bitmap - * blocks here */ - } - } - - if (flags & BDRV_O_RDWR) { - ret = vhdx_update_headers(bs, s, false, NULL); - if (ret < 0) { - goto fail; - } - } - - /* TODO: differencing files */ 
- - /* Disable migration when VHDX images are used */ - error_setg(&s->migration_blocker, "The vhdx format used by node '%s' " - "does not support live migration", - bdrv_get_device_or_node_name(bs)); - migrate_add_blocker(s->migration_blocker); - - return 0; -fail: - vhdx_close(bs); - return ret; -} - -static int vhdx_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) -{ - return 0; -} - - -/* - * Perform sector to block offset translations, to get various - * sector and file offsets into the image. See VHDXSectorInfo - */ -static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num, - int nb_sectors, VHDXSectorInfo *sinfo) -{ - uint32_t block_offset; - - sinfo->bat_idx = sector_num >> s->sectors_per_block_bits; - /* effectively a modulo - this gives us the offset into the block - * (in sector sizes) for our sector number */ - block_offset = sector_num - (sinfo->bat_idx << s->sectors_per_block_bits); - /* the chunk ratio gives us the interleaving of the sector - * bitmaps, so we need to advance our page block index by the - * sector bitmaps entry number */ - sinfo->bat_idx += sinfo->bat_idx >> s->chunk_ratio_bits; - - /* the number of sectors we can read/write in this cycle */ - sinfo->sectors_avail = s->sectors_per_block - block_offset; - - sinfo->bytes_left = sinfo->sectors_avail << s->logical_sector_size_bits; - - if (sinfo->sectors_avail > nb_sectors) { - sinfo->sectors_avail = nb_sectors; - } - - sinfo->bytes_avail = sinfo->sectors_avail << s->logical_sector_size_bits; - - sinfo->file_offset = s->bat[sinfo->bat_idx] & VHDX_BAT_FILE_OFF_MASK; - - sinfo->block_offset = block_offset << s->logical_sector_size_bits; - - /* The file offset must be past the header section, so must be > 0 */ - if (sinfo->file_offset == 0) { - return; - } - - /* block offset is the offset in vhdx logical sectors, in - * the payload data block. 
Convert that to a byte offset - * in the block, and add in the payload data block offset - * in the file, in bytes, to get the final read address */ - - sinfo->file_offset += sinfo->block_offset; -} - - -static int vhdx_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - BDRVVHDXState *s = bs->opaque; - - bdi->cluster_size = s->block_size; - - bdi->unallocated_blocks_are_zero = - (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) == 0; - - return 0; -} - - -static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - BDRVVHDXState *s = bs->opaque; - int ret = 0; - VHDXSectorInfo sinfo; - uint64_t bytes_done = 0; - QEMUIOVector hd_qiov; - - qemu_iovec_init(&hd_qiov, qiov->niov); - - qemu_co_mutex_lock(&s->lock); - - while (nb_sectors > 0) { - /* We are a differencing file, so we need to inspect the sector bitmap - * to see if we have the data or not */ - if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) { - /* not supported yet */ - ret = -ENOTSUP; - goto exit; - } else { - vhdx_block_translate(s, sector_num, nb_sectors, &sinfo); - - qemu_iovec_reset(&hd_qiov); - qemu_iovec_concat(&hd_qiov, qiov, bytes_done, sinfo.bytes_avail); - - /* check the payload block state */ - switch (s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK) { - case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */ - case PAYLOAD_BLOCK_UNDEFINED: - case PAYLOAD_BLOCK_UNMAPPED: - case PAYLOAD_BLOCK_UNMAPPED_v095: - case PAYLOAD_BLOCK_ZERO: - /* return zero */ - qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail); - break; - case PAYLOAD_BLOCK_FULLY_PRESENT: - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file->bs, - sinfo.file_offset >> BDRV_SECTOR_BITS, - sinfo.sectors_avail, &hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - goto exit; - } - break; - case PAYLOAD_BLOCK_PARTIALLY_PRESENT: - /* we don't yet support difference files, fall through - * to error */ - default: - ret = -EIO; - goto exit; - break; - } - 
nb_sectors -= sinfo.sectors_avail; - sector_num += sinfo.sectors_avail; - bytes_done += sinfo.bytes_avail; - } - } - ret = 0; -exit: - qemu_co_mutex_unlock(&s->lock); - qemu_iovec_destroy(&hd_qiov); - return ret; -} - -/* - * Allocate a new payload block at the end of the file. - * - * Allocation will happen at 1MB alignment inside the file - * - * Returns the file offset start of the new payload block - */ -static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s, - uint64_t *new_offset) -{ - *new_offset = bdrv_getlength(bs->file->bs); - - /* per the spec, the address for a block is in units of 1MB */ - *new_offset = ROUND_UP(*new_offset, 1024 * 1024); - - return bdrv_truncate(bs->file->bs, *new_offset + s->block_size); -} - -/* - * Update the BAT table entry with the new file offset, and the new entry - * state */ -static void vhdx_update_bat_table_entry(BlockDriverState *bs, BDRVVHDXState *s, - VHDXSectorInfo *sinfo, - uint64_t *bat_entry_le, - uint64_t *bat_offset, int state) -{ - /* The BAT entry is a uint64, with 44 bits for the file offset in units of - * 1MB, and 3 bits for the block state. */ - if ((state == PAYLOAD_BLOCK_ZERO) || - (state == PAYLOAD_BLOCK_UNDEFINED) || - (state == PAYLOAD_BLOCK_NOT_PRESENT) || - (state == PAYLOAD_BLOCK_UNMAPPED)) { - s->bat[sinfo->bat_idx] = 0; /* For PAYLOAD_BLOCK_ZERO, the - FileOffsetMB field is denoted as - 'reserved' in the v1.0 spec. 
If it is - non-zero, MS Hyper-V will fail to read - the disk image */ - } else { - s->bat[sinfo->bat_idx] = sinfo->file_offset; - } - - s->bat[sinfo->bat_idx] |= state & VHDX_BAT_STATE_BIT_MASK; - - *bat_entry_le = cpu_to_le64(s->bat[sinfo->bat_idx]); - *bat_offset = s->bat_offset + sinfo->bat_idx * sizeof(VHDXBatEntry); - -} - -/* Per the spec, on the first write of guest-visible data to the file the - * data write guid must be updated in the header */ -int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s) -{ - int ret = 0; - if (s->first_visible_write) { - s->first_visible_write = false; - ret = vhdx_update_headers(bs, s, true, NULL); - } - return ret; -} - -static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) -{ - int ret = -ENOTSUP; - BDRVVHDXState *s = bs->opaque; - VHDXSectorInfo sinfo; - uint64_t bytes_done = 0; - uint64_t bat_entry = 0; - uint64_t bat_entry_offset = 0; - QEMUIOVector hd_qiov; - struct iovec iov1 = { 0 }; - struct iovec iov2 = { 0 }; - int sectors_to_write; - int bat_state; - uint64_t bat_prior_offset = 0; - bool bat_update = false; - - qemu_iovec_init(&hd_qiov, qiov->niov); - - qemu_co_mutex_lock(&s->lock); - - ret = vhdx_user_visible_write(bs, s); - if (ret < 0) { - goto exit; - } - - while (nb_sectors > 0) { - bool use_zero_buffers = false; - bat_update = false; - if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) { - /* not supported yet */ - ret = -ENOTSUP; - goto exit; - } else { - vhdx_block_translate(s, sector_num, nb_sectors, &sinfo); - sectors_to_write = sinfo.sectors_avail; - - qemu_iovec_reset(&hd_qiov); - /* check the payload block state */ - bat_state = s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK; - switch (bat_state) { - case PAYLOAD_BLOCK_ZERO: - /* in this case, we need to preserve zero writes for - * data that is not part of this write, so we must pad - * the rest of the buffer to zeroes */ - - /* if we are on a posix system with ftruncate() 
that extends - * a file, then it is zero-filled for us. On Win32, the raw - * layer uses SetFilePointer and SetFileEnd, which does not - * zero fill AFAIK */ - - /* Queue another write of zero buffers if the underlying file - * does not zero-fill on file extension */ - - if (bdrv_has_zero_init(bs->file->bs) == 0) { - use_zero_buffers = true; - - /* zero fill the front, if any */ - if (sinfo.block_offset) { - iov1.iov_len = sinfo.block_offset; - iov1.iov_base = qemu_blockalign(bs, iov1.iov_len); - memset(iov1.iov_base, 0, iov1.iov_len); - qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0, - iov1.iov_len); - sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS; - } - - /* our actual data */ - qemu_iovec_concat(&hd_qiov, qiov, bytes_done, - sinfo.bytes_avail); - - /* zero fill the back, if any */ - if ((sinfo.bytes_avail - sinfo.block_offset) < - s->block_size) { - iov2.iov_len = s->block_size - - (sinfo.bytes_avail + sinfo.block_offset); - iov2.iov_base = qemu_blockalign(bs, iov2.iov_len); - memset(iov2.iov_base, 0, iov2.iov_len); - qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0, - iov2.iov_len); - sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS; - } - } - /* fall through */ - case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */ - case PAYLOAD_BLOCK_UNMAPPED: - case PAYLOAD_BLOCK_UNMAPPED_v095: - case PAYLOAD_BLOCK_UNDEFINED: - bat_prior_offset = sinfo.file_offset; - ret = vhdx_allocate_block(bs, s, &sinfo.file_offset); - if (ret < 0) { - goto exit; - } - /* once we support differencing files, this may also be - * partially present */ - /* update block state to the newly specified state */ - vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry, - &bat_entry_offset, - PAYLOAD_BLOCK_FULLY_PRESENT); - bat_update = true; - /* since we just allocated a block, file_offset is the - * beginning of the payload block. 
It needs to be the - * write address, which includes the offset into the block */ - if (!use_zero_buffers) { - sinfo.file_offset += sinfo.block_offset; - } - /* fall through */ - case PAYLOAD_BLOCK_FULLY_PRESENT: - /* if the file offset address is in the header zone, - * there is a problem */ - if (sinfo.file_offset < (1024 * 1024)) { - ret = -EFAULT; - goto error_bat_restore; - } - - if (!use_zero_buffers) { - qemu_iovec_concat(&hd_qiov, qiov, bytes_done, - sinfo.bytes_avail); - } - /* block exists, so we can just overwrite it */ - qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_writev(bs->file->bs, - sinfo.file_offset >> BDRV_SECTOR_BITS, - sectors_to_write, &hd_qiov); - qemu_co_mutex_lock(&s->lock); - if (ret < 0) { - goto error_bat_restore; - } - break; - case PAYLOAD_BLOCK_PARTIALLY_PRESENT: - /* we don't yet support difference files, fall through - * to error */ - default: - ret = -EIO; - goto exit; - break; - } - - if (bat_update) { - /* this will update the BAT entry into the log journal, and - * then flush the log journal out to disk */ - ret = vhdx_log_write_and_flush(bs, s, &bat_entry, - sizeof(VHDXBatEntry), - bat_entry_offset); - if (ret < 0) { - goto exit; - } - } - - nb_sectors -= sinfo.sectors_avail; - sector_num += sinfo.sectors_avail; - bytes_done += sinfo.bytes_avail; - - } - } - - goto exit; - -error_bat_restore: - if (bat_update) { - /* keep metadata in sync, and restore the bat entry state - * if error. 
*/ - sinfo.file_offset = bat_prior_offset; - vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry, - &bat_entry_offset, bat_state); - } -exit: - qemu_vfree(iov1.iov_base); - qemu_vfree(iov2.iov_base); - qemu_co_mutex_unlock(&s->lock); - qemu_iovec_destroy(&hd_qiov); - return ret; -} - - - -/* - * Create VHDX Headers - * - * There are 2 headers, and the highest sequence number will represent - * the active header - */ -static int vhdx_create_new_headers(BlockDriverState *bs, uint64_t image_size, - uint32_t log_size) -{ - int ret = 0; - VHDXHeader *hdr = NULL; - - hdr = g_new0(VHDXHeader, 1); - - hdr->signature = VHDX_HEADER_SIGNATURE; - hdr->sequence_number = g_random_int(); - hdr->log_version = 0; - hdr->version = 1; - hdr->log_length = log_size; - hdr->log_offset = VHDX_HEADER_SECTION_END; - vhdx_guid_generate(&hdr->file_write_guid); - vhdx_guid_generate(&hdr->data_write_guid); - - ret = vhdx_write_header(bs, hdr, VHDX_HEADER1_OFFSET, false); - if (ret < 0) { - goto exit; - } - hdr->sequence_number++; - ret = vhdx_write_header(bs, hdr, VHDX_HEADER2_OFFSET, false); - if (ret < 0) { - goto exit; - } - -exit: - g_free(hdr); - return ret; -} - -#define VHDX_METADATA_ENTRY_BUFFER_SIZE \ - (sizeof(VHDXFileParameters) +\ - sizeof(VHDXVirtualDiskSize) +\ - sizeof(VHDXPage83Data) +\ - sizeof(VHDXVirtualDiskLogicalSectorSize) +\ - sizeof(VHDXVirtualDiskPhysicalSectorSize)) - -/* - * Create the Metadata entries. - * - * For more details on the entries, see section 3.5 (pg 29) in the - * VHDX 1.00 specification. - * - * We support 5 metadata entries (all required by spec): - * File Parameters, - * Virtual Disk Size, - * Page 83 Data, - * Logical Sector Size, - * Physical Sector Size - * - * The first 64KB of the Metadata section is reserved for the metadata - * header and entries; beyond that, the metadata items themselves reside. 
- */ -static int vhdx_create_new_metadata(BlockDriverState *bs, - uint64_t image_size, - uint32_t block_size, - uint32_t sector_size, - uint64_t metadata_offset, - VHDXImageType type) -{ - int ret = 0; - uint32_t offset = 0; - void *buffer = NULL; - void *entry_buffer; - VHDXMetadataTableHeader *md_table; - VHDXMetadataTableEntry *md_table_entry; - - /* Metadata entries */ - VHDXFileParameters *mt_file_params; - VHDXVirtualDiskSize *mt_virtual_size; - VHDXPage83Data *mt_page83; - VHDXVirtualDiskLogicalSectorSize *mt_log_sector_size; - VHDXVirtualDiskPhysicalSectorSize *mt_phys_sector_size; - - entry_buffer = g_malloc0(VHDX_METADATA_ENTRY_BUFFER_SIZE); - - mt_file_params = entry_buffer; - offset += sizeof(VHDXFileParameters); - mt_virtual_size = entry_buffer + offset; - offset += sizeof(VHDXVirtualDiskSize); - mt_page83 = entry_buffer + offset; - offset += sizeof(VHDXPage83Data); - mt_log_sector_size = entry_buffer + offset; - offset += sizeof(VHDXVirtualDiskLogicalSectorSize); - mt_phys_sector_size = entry_buffer + offset; - - mt_file_params->block_size = cpu_to_le32(block_size); - if (type == VHDX_TYPE_FIXED) { - mt_file_params->data_bits |= VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED; - cpu_to_le32s(&mt_file_params->data_bits); - } - - vhdx_guid_generate(&mt_page83->page_83_data); - cpu_to_leguids(&mt_page83->page_83_data); - mt_virtual_size->virtual_disk_size = cpu_to_le64(image_size); - mt_log_sector_size->logical_sector_size = cpu_to_le32(sector_size); - mt_phys_sector_size->physical_sector_size = cpu_to_le32(sector_size); - - buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE); - md_table = buffer; - - md_table->signature = VHDX_METADATA_SIGNATURE; - md_table->entry_count = 5; - vhdx_metadata_header_le_export(md_table); - - - /* This will reference beyond the reserved table portion */ - offset = 64 * KiB; - - md_table_entry = buffer + sizeof(VHDXMetadataTableHeader); - - md_table_entry[0].item_id = file_param_guid; - md_table_entry[0].offset = offset; - md_table_entry[0].length 
= sizeof(VHDXFileParameters); - md_table_entry[0].data_bits |= VHDX_META_FLAGS_IS_REQUIRED; - offset += md_table_entry[0].length; - vhdx_metadata_entry_le_export(&md_table_entry[0]); - - md_table_entry[1].item_id = virtual_size_guid; - md_table_entry[1].offset = offset; - md_table_entry[1].length = sizeof(VHDXVirtualDiskSize); - md_table_entry[1].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | - VHDX_META_FLAGS_IS_VIRTUAL_DISK; - offset += md_table_entry[1].length; - vhdx_metadata_entry_le_export(&md_table_entry[1]); - - md_table_entry[2].item_id = page83_guid; - md_table_entry[2].offset = offset; - md_table_entry[2].length = sizeof(VHDXPage83Data); - md_table_entry[2].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | - VHDX_META_FLAGS_IS_VIRTUAL_DISK; - offset += md_table_entry[2].length; - vhdx_metadata_entry_le_export(&md_table_entry[2]); - - md_table_entry[3].item_id = logical_sector_guid; - md_table_entry[3].offset = offset; - md_table_entry[3].length = sizeof(VHDXVirtualDiskLogicalSectorSize); - md_table_entry[3].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | - VHDX_META_FLAGS_IS_VIRTUAL_DISK; - offset += md_table_entry[3].length; - vhdx_metadata_entry_le_export(&md_table_entry[3]); - - md_table_entry[4].item_id = phys_sector_guid; - md_table_entry[4].offset = offset; - md_table_entry[4].length = sizeof(VHDXVirtualDiskPhysicalSectorSize); - md_table_entry[4].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | - VHDX_META_FLAGS_IS_VIRTUAL_DISK; - vhdx_metadata_entry_le_export(&md_table_entry[4]); - - ret = bdrv_pwrite(bs, metadata_offset, buffer, VHDX_HEADER_BLOCK_SIZE); - if (ret < 0) { - goto exit; - } - - ret = bdrv_pwrite(bs, metadata_offset + (64 * KiB), entry_buffer, - VHDX_METADATA_ENTRY_BUFFER_SIZE); - if (ret < 0) { - goto exit; - } - - -exit: - g_free(buffer); - g_free(entry_buffer); - return ret; -} - -/* This create the actual BAT itself. We currently only support - * 'Dynamic' and 'Fixed' image types. - * - * Dynamic images: default state of the BAT is all zeroes. 
- * - * Fixed images: default state of the BAT is fully populated, with - * file offsets and state PAYLOAD_BLOCK_FULLY_PRESENT. - */ -static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, - uint64_t image_size, VHDXImageType type, - bool use_zero_blocks, uint64_t file_offset, - uint32_t length) -{ - int ret = 0; - uint64_t data_file_offset; - uint64_t total_sectors = 0; - uint64_t sector_num = 0; - uint64_t unused; - int block_state; - VHDXSectorInfo sinfo; - - assert(s->bat == NULL); - - /* this gives a data start after BAT/bitmap entries, and well - * past any metadata entries (with a 4 MB buffer for future - * expansion */ - data_file_offset = file_offset + length + 5 * MiB; - total_sectors = image_size >> s->logical_sector_size_bits; - - if (type == VHDX_TYPE_DYNAMIC) { - /* All zeroes, so we can just extend the file - the end of the BAT - * is the furthest thing we have written yet */ - ret = bdrv_truncate(bs, data_file_offset); - if (ret < 0) { - goto exit; - } - } else if (type == VHDX_TYPE_FIXED) { - ret = bdrv_truncate(bs, data_file_offset + image_size); - if (ret < 0) { - goto exit; - } - } else { - ret = -ENOTSUP; - goto exit; - } - - if (type == VHDX_TYPE_FIXED || - use_zero_blocks || - bdrv_has_zero_init(bs) == 0) { - /* for a fixed file, the default BAT entry is not zero */ - s->bat = g_try_malloc0(length); - if (length && s->bat == NULL) { - ret = -ENOMEM; - goto exit; - } - block_state = type == VHDX_TYPE_FIXED ? PAYLOAD_BLOCK_FULLY_PRESENT : - PAYLOAD_BLOCK_NOT_PRESENT; - block_state = use_zero_blocks ? 
PAYLOAD_BLOCK_ZERO : block_state; - /* fill the BAT by emulating sector writes of sectors_per_block size */ - while (sector_num < total_sectors) { - vhdx_block_translate(s, sector_num, s->sectors_per_block, &sinfo); - sinfo.file_offset = data_file_offset + - (sector_num << s->logical_sector_size_bits); - sinfo.file_offset = ROUND_UP(sinfo.file_offset, MiB); - vhdx_update_bat_table_entry(bs, s, &sinfo, &unused, &unused, - block_state); - cpu_to_le64s(&s->bat[sinfo.bat_idx]); - sector_num += s->sectors_per_block; - } - ret = bdrv_pwrite(bs, file_offset, s->bat, length); - if (ret < 0) { - goto exit; - } - } - - - -exit: - g_free(s->bat); - return ret; -} - -/* Creates the region table header, and region table entries. - * There are 2 supported region table entries: BAT, and Metadata/ - * - * As the calculations for the BAT region table are also needed - * to create the BAT itself, we will also cause the BAT to be - * created. - */ -static int vhdx_create_new_region_table(BlockDriverState *bs, - uint64_t image_size, - uint32_t block_size, - uint32_t sector_size, - uint32_t log_size, - bool use_zero_blocks, - VHDXImageType type, - uint64_t *metadata_offset) -{ - int ret = 0; - uint32_t offset = 0; - void *buffer = NULL; - uint64_t bat_file_offset; - uint32_t bat_length; - BDRVVHDXState *s = NULL; - VHDXRegionTableHeader *region_table; - VHDXRegionTableEntry *rt_bat; - VHDXRegionTableEntry *rt_metadata; - - assert(metadata_offset != NULL); - - /* Populate enough of the BDRVVHDXState to be able to use the - * pre-existing BAT calculation, translation, and update functions */ - s = g_new0(BDRVVHDXState, 1); - - s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) * - (uint64_t) sector_size / (uint64_t) block_size; - - s->sectors_per_block = block_size / sector_size; - s->virtual_disk_size = image_size; - s->block_size = block_size; - s->logical_sector_size = sector_size; - - vhdx_set_shift_bits(s); - - vhdx_calc_bat_entries(s); - - /* At this point the VHDX state is populated 
enough for creation */ - - /* a single buffer is used so we can calculate the checksum over the - * entire 64KB block */ - buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE); - region_table = buffer; - offset += sizeof(VHDXRegionTableHeader); - rt_bat = buffer + offset; - offset += sizeof(VHDXRegionTableEntry); - rt_metadata = buffer + offset; - - region_table->signature = VHDX_REGION_SIGNATURE; - region_table->entry_count = 2; /* BAT and Metadata */ - - rt_bat->guid = bat_guid; - rt_bat->length = ROUND_UP(s->bat_entries * sizeof(VHDXBatEntry), MiB); - rt_bat->file_offset = ROUND_UP(VHDX_HEADER_SECTION_END + log_size, MiB); - s->bat_offset = rt_bat->file_offset; - - rt_metadata->guid = metadata_guid; - rt_metadata->file_offset = ROUND_UP(rt_bat->file_offset + rt_bat->length, - MiB); - rt_metadata->length = 1 * MiB; /* min size, and more than enough */ - *metadata_offset = rt_metadata->file_offset; - - bat_file_offset = rt_bat->file_offset; - bat_length = rt_bat->length; - - vhdx_region_header_le_export(region_table); - vhdx_region_entry_le_export(rt_bat); - vhdx_region_entry_le_export(rt_metadata); - - vhdx_update_checksum(buffer, VHDX_HEADER_BLOCK_SIZE, - offsetof(VHDXRegionTableHeader, checksum)); - - - /* The region table gives us the data we need to create the BAT, - * so do that now */ - ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, - bat_file_offset, bat_length); - if (ret < 0) { - goto exit; - } - - /* Now write out the region headers to disk */ - ret = bdrv_pwrite(bs, VHDX_REGION_TABLE_OFFSET, buffer, - VHDX_HEADER_BLOCK_SIZE); - if (ret < 0) { - goto exit; - } - - ret = bdrv_pwrite(bs, VHDX_REGION_TABLE2_OFFSET, buffer, - VHDX_HEADER_BLOCK_SIZE); - if (ret < 0) { - goto exit; - } - -exit: - g_free(s); - g_free(buffer); - return ret; -} - -/* We need to create the following elements: - * - * .-----------------------------------------------------------------. 
- * | (A) | (B) | (C) | (D) | (E) | - * | File ID | Header1 | Header 2 | Region Tbl 1 | Region Tbl 2 | - * | | | | | | - * .-----------------------------------------------------------------. - * 0 64KB 128KB 192KB 256KB 320KB - * - * - * .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------. - * | (F) | (G) | (H) | | - * | Journal Log | BAT / Bitmap | Metadata | .... data ...... | - * | | | | | - * .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------. - * 1MB - */ -static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) -{ - int ret = 0; - uint64_t image_size = (uint64_t) 2 * GiB; - uint32_t log_size = 1 * MiB; - uint32_t block_size = 0; - uint64_t signature; - uint64_t metadata_offset; - bool use_zero_blocks = false; - - gunichar2 *creator = NULL; - glong creator_items; - BlockBackend *blk; - char *type = NULL; - VHDXImageType image_type; - Error *local_err = NULL; - - image_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - log_size = qemu_opt_get_size_del(opts, VHDX_BLOCK_OPT_LOG_SIZE, 0); - block_size = qemu_opt_get_size_del(opts, VHDX_BLOCK_OPT_BLOCK_SIZE, 0); - type = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); - use_zero_blocks = qemu_opt_get_bool_del(opts, VHDX_BLOCK_OPT_ZERO, true); - - if (image_size > VHDX_MAX_IMAGE_SIZE) { - error_setg_errno(errp, EINVAL, "Image size too large; max of 64TB"); - ret = -EINVAL; - goto exit; - } - - if (type == NULL) { - type = g_strdup("dynamic"); - } - - if (!strcmp(type, "dynamic")) { - image_type = VHDX_TYPE_DYNAMIC; - } else if (!strcmp(type, "fixed")) { - image_type = VHDX_TYPE_FIXED; - } else if (!strcmp(type, "differencing")) { - error_setg_errno(errp, ENOTSUP, - "Differencing files not yet supported"); - ret = -ENOTSUP; - goto exit; - } else { - ret = -EINVAL; - goto exit; - } - - /* These are pretty arbitrary, and mainly designed to keep the BAT - * size reasonable to load into RAM */ - if (block_size == 0) { - if (image_size > 32 * 
TiB) { - block_size = 64 * MiB; - } else if (image_size > (uint64_t) 100 * GiB) { - block_size = 32 * MiB; - } else if (image_size > 1 * GiB) { - block_size = 16 * MiB; - } else { - block_size = 8 * MiB; - } - } - - - /* make the log size close to what was specified, but must be - * min 1MB, and multiple of 1MB */ - log_size = ROUND_UP(log_size, MiB); - - block_size = ROUND_UP(block_size, MiB); - block_size = block_size > VHDX_BLOCK_SIZE_MAX ? VHDX_BLOCK_SIZE_MAX : - block_size; - - ret = bdrv_create_file(filename, opts, &local_err); - if (ret < 0) { - error_propagate(errp, local_err); - goto exit; - } - - blk = blk_new_open(filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); - if (blk == NULL) { - error_propagate(errp, local_err); - ret = -EIO; - goto exit; - } - - blk_set_allow_write_beyond_eof(blk, true); - - /* Create (A) */ - - /* The creator field is optional, but may be useful for - * debugging / diagnostics */ - creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL, - &creator_items, NULL); - signature = cpu_to_le64(VHDX_FILE_SIGNATURE); - ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature)); - if (ret < 0) { - goto delete_and_exit; - } - if (creator) { - ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET + sizeof(signature), - creator, creator_items * sizeof(gunichar2)); - if (ret < 0) { - goto delete_and_exit; - } - } - - - /* Creates (B),(C) */ - ret = vhdx_create_new_headers(blk_bs(blk), image_size, log_size); - if (ret < 0) { - goto delete_and_exit; - } - - /* Creates (D),(E),(G) explicitly. 
(F) created as by-product */ - ret = vhdx_create_new_region_table(blk_bs(blk), image_size, block_size, 512, - log_size, use_zero_blocks, image_type, - &metadata_offset); - if (ret < 0) { - goto delete_and_exit; - } - - /* Creates (H) */ - ret = vhdx_create_new_metadata(blk_bs(blk), image_size, block_size, 512, - metadata_offset, image_type); - if (ret < 0) { - goto delete_and_exit; - } - - -delete_and_exit: - blk_unref(blk); -exit: - g_free(type); - g_free(creator); - return ret; -} - -/* If opened r/w, the VHDX driver will automatically replay the log, - * if one is present, inside the vhdx_open() call. - * - * If qemu-img check -r all is called, the image is automatically opened - * r/w and any log has already been replayed, so there is nothing (currently) - * for us to do here - */ -static int vhdx_check(BlockDriverState *bs, BdrvCheckResult *result, - BdrvCheckMode fix) -{ - BDRVVHDXState *s = bs->opaque; - - if (s->log_replayed_on_open) { - result->corruptions_fixed++; - } - return 0; -} - -static QemuOptsList vhdx_create_opts = { - .name = "vhdx-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(vhdx_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size; max of 64TB." - }, - { - .name = VHDX_BLOCK_OPT_LOG_SIZE, - .type = QEMU_OPT_SIZE, - .def_value_str = stringify(DEFAULT_LOG_SIZE), - .help = "Log size; min 1MB." - }, - { - .name = VHDX_BLOCK_OPT_BLOCK_SIZE, - .type = QEMU_OPT_SIZE, - .def_value_str = stringify(0), - .help = "Block Size; min 1MB, max 256MB. " \ - "0 means auto-calculate based on image size." - }, - { - .name = BLOCK_OPT_SUBFMT, - .type = QEMU_OPT_STRING, - .help = "VHDX format type, can be either 'dynamic' or 'fixed'. "\ - "Default is 'dynamic'." - }, - { - .name = VHDX_BLOCK_OPT_ZERO, - .type = QEMU_OPT_BOOL, - .help = "Force use of payload blocks of type 'ZERO'. "\ - "Non-standard, but default. Do not set to 'off' when "\ - "using 'qemu-img convert' with subformat=dynamic." 
- }, - { NULL } - } -}; - -static BlockDriver bdrv_vhdx = { - .format_name = "vhdx", - .instance_size = sizeof(BDRVVHDXState), - .bdrv_probe = vhdx_probe, - .bdrv_open = vhdx_open, - .bdrv_close = vhdx_close, - .bdrv_reopen_prepare = vhdx_reopen_prepare, - .bdrv_co_readv = vhdx_co_readv, - .bdrv_co_writev = vhdx_co_writev, - .bdrv_create = vhdx_create, - .bdrv_get_info = vhdx_get_info, - .bdrv_check = vhdx_check, - .bdrv_has_zero_init = bdrv_has_zero_init_1, - - .create_opts = &vhdx_create_opts, -}; - -static void bdrv_vhdx_init(void) -{ - bdrv_register(&bdrv_vhdx); -} - -block_init(bdrv_vhdx_init); diff --git a/qemu/block/vhdx.h b/qemu/block/vhdx.h deleted file mode 100644 index 7003ab7a7..000000000 --- a/qemu/block/vhdx.h +++ /dev/null @@ -1,453 +0,0 @@ -/* - * Block driver for Hyper-V VHDX Images - * - * Copyright (c) 2013 Red Hat, Inc., - * - * Authors: - * Jeff Cody - * - * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 - * by Microsoft: - * https://www.microsoft.com/en-us/download/details.aspx?id=34750 - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. - * - */ - -#ifndef BLOCK_VHDX_H -#define BLOCK_VHDX_H - -#define KiB (1 * 1024) -#define MiB (KiB * 1024) -#define GiB (MiB * 1024) -#define TiB ((uint64_t) GiB * 1024) - -#define DEFAULT_LOG_SIZE 1048576 /* 1MiB */ -/* Structures and fields present in the VHDX file */ - -/* The header section has the following blocks, - * each block is 64KB: - * - * _____________________________________________________________________________ - * | File Id. 
| Header 1 | Header 2 | Region Table | Reserved (768KB) | - * |----------|---------------|------------|--------------|--------------------| - * | | | | | | - * 0.........64KB...........128KB........192KB..........256KB................1MB - */ - -#define VHDX_HEADER_BLOCK_SIZE (64 * 1024) - -#define VHDX_FILE_ID_OFFSET 0 -#define VHDX_HEADER1_OFFSET (VHDX_HEADER_BLOCK_SIZE * 1) -#define VHDX_HEADER2_OFFSET (VHDX_HEADER_BLOCK_SIZE * 2) -#define VHDX_REGION_TABLE_OFFSET (VHDX_HEADER_BLOCK_SIZE * 3) -#define VHDX_REGION_TABLE2_OFFSET (VHDX_HEADER_BLOCK_SIZE * 4) - -#define VHDX_HEADER_SECTION_END (1 * MiB) -/* - * A note on the use of MS-GUID fields. For more details on the GUID, - * please see: https://en.wikipedia.org/wiki/Globally_unique_identifier. - * - * The VHDX specification only states that these are MS GUIDs, and which - * bytes are data1-data4. It makes no mention of what algorithm should be used - * to generate the GUID, nor what standard. However, looking at the specified - * known GUID fields, it appears the GUIDs are: - * Standard/DCE GUID type (noted by 10b in the MSB of byte 0 of .data4) - * Random algorithm (noted by 0x4XXX for .data3) - */ - -/* ---- HEADER SECTION STRUCTURES ---- */ - -/* These structures are ones that are defined in the VHDX specification - * document */ - -#define VHDX_FILE_SIGNATURE 0x656C696678646876ULL /* "vhdxfile" in ASCII */ -typedef struct VHDXFileIdentifier { - uint64_t signature; /* "vhdxfile" in ASCII */ - uint16_t creator[256]; /* optional; utf-16 string to identify - the vhdx file creator. 
Diagnostic - only */ -} VHDXFileIdentifier; - - -/* the guid is a 16 byte unique ID - the definition for this used by - * Microsoft is not just 16 bytes though - it is a structure that is defined, - * so we need to follow it here so that endianness does not trip us up */ - -typedef struct QEMU_PACKED MSGUID { - uint32_t data1; - uint16_t data2; - uint16_t data3; - uint8_t data4[8]; -} MSGUID; - -#define guid_eq(a, b) \ - (memcmp(&(a), &(b), sizeof(MSGUID)) == 0) - -#define VHDX_HEADER_SIZE (4 * 1024) /* although the vhdx_header struct in disk - is only 582 bytes, for purposes of crc - the header is the first 4KB of the 64KB - block */ - -/* The full header is 4KB, although the actual header data is much smaller. - * But for the checksum calculation, it is over the entire 4KB structure, - * not just the defined portion of it */ -#define VHDX_HEADER_SIGNATURE 0x64616568 -typedef struct QEMU_PACKED VHDXHeader { - uint32_t signature; /* "head" in ASCII */ - uint32_t checksum; /* CRC-32C hash of the whole header */ - uint64_t sequence_number; /* Seq number of this header. Each - VHDX file has 2 of these headers, - and only the header with the highest - sequence number is valid */ - MSGUID file_write_guid; /* 128 bit unique identifier. Must be - updated to new, unique value before - the first modification is made to - file */ - MSGUID data_write_guid; /* 128 bit unique identifier. Must be - updated to new, unique value before - the first modification is made to - visible data. Visbile data is - defined as: - - system & user metadata - - raw block data - - disk size - - any change that will - cause the virtual disk - sector read to differ - - This does not need to change if - blocks are re-arranged */ - MSGUID log_guid; /* 128 bit unique identifier. If zero, - there is no valid log. If non-zero, - log entries with this guid are - valid. */ - uint16_t log_version; /* version of the log format. Must be - set to zero */ - uint16_t version; /* version of the vhdx file. 
Currently, - only supported version is "1" */ - uint32_t log_length; /* length of the log. Must be multiple - of 1MB */ - uint64_t log_offset; /* byte offset in the file of the log. - Must also be a multiple of 1MB */ -} VHDXHeader; - -/* Header for the region table block */ -#define VHDX_REGION_SIGNATURE 0x69676572 /* "regi" in ASCII */ -typedef struct QEMU_PACKED VHDXRegionTableHeader { - uint32_t signature; /* "regi" in ASCII */ - uint32_t checksum; /* CRC-32C hash of the 64KB table */ - uint32_t entry_count; /* number of valid entries */ - uint32_t reserved; -} VHDXRegionTableHeader; - -/* Individual region table entry. There may be a maximum of 2047 of these - * - * There are two known region table properties. Both are required. - * BAT (block allocation table): 2DC27766F62342009D64115E9BFD4A08 - * Metadata: 8B7CA20647904B9AB8FE575F050F886E - */ -#define VHDX_REGION_ENTRY_REQUIRED 0x01 /* if set, parser must understand - this entry in order to open - file */ -typedef struct QEMU_PACKED VHDXRegionTableEntry { - MSGUID guid; /* 128-bit unique identifier */ - uint64_t file_offset; /* offset of the object in the file. - Must be multiple of 1MB */ - uint32_t length; /* length, in bytes, of the object */ - uint32_t data_bits; -} VHDXRegionTableEntry; - - -/* ---- LOG ENTRY STRUCTURES ---- */ -#define VHDX_LOG_MIN_SIZE (1024 * 1024) -#define VHDX_LOG_SECTOR_SIZE 4096 -#define VHDX_LOG_HDR_SIZE 64 -#define VHDX_LOG_SIGNATURE 0x65676f6c -typedef struct QEMU_PACKED VHDXLogEntryHeader { - uint32_t signature; /* "loge" in ASCII */ - uint32_t checksum; /* CRC-32C hash of the 64KB table */ - uint32_t entry_length; /* length in bytes, multiple of 1MB */ - uint32_t tail; /* byte offset of first log entry of a - seq, where this entry is the last - entry */ - uint64_t sequence_number; /* incremented with each log entry. - May not be zero. 
*/ - uint32_t descriptor_count; /* number of descriptors in this log - entry, must be >= 0 */ - uint32_t reserved; - MSGUID log_guid; /* value of the log_guid from - vhdx_header. If not found in - vhdx_header, it is invalid */ - uint64_t flushed_file_offset; /* see spec for full details - this - should be vhdx file size in bytes */ - uint64_t last_file_offset; /* size in bytes that all allocated - file structures fit into */ -} VHDXLogEntryHeader; - -#define VHDX_LOG_DESC_SIZE 32 -#define VHDX_LOG_DESC_SIGNATURE 0x63736564 -#define VHDX_LOG_ZERO_SIGNATURE 0x6f72657a -typedef struct QEMU_PACKED VHDXLogDescriptor { - uint32_t signature; /* "zero" or "desc" in ASCII */ - union { - uint32_t reserved; /* zero desc */ - uint32_t trailing_bytes; /* data desc: bytes 4092-4096 of the - data sector */ - }; - union { - uint64_t zero_length; /* zero desc: length of the section to - zero */ - uint64_t leading_bytes; /* data desc: bytes 0-7 of the data - sector */ - }; - uint64_t file_offset; /* file offset to write zeros - multiple - of 4kB */ - uint64_t sequence_number; /* must match same field in - vhdx_log_entry_header */ -} VHDXLogDescriptor; - -#define VHDX_LOG_DATA_SIGNATURE 0x61746164 -typedef struct QEMU_PACKED VHDXLogDataSector { - uint32_t data_signature; /* "data" in ASCII */ - uint32_t sequence_high; /* 4 MSB of 8 byte sequence_number */ - uint8_t data[4084]; /* raw data, bytes 8-4091 (inclusive). - see the data descriptor field for the - other mising bytes */ - uint32_t sequence_low; /* 4 LSB of 8 byte sequence_number */ -} VHDXLogDataSector; - - - -/* block states - different state values depending on whether it is a - * payload block, or a sector block. 
*/ - -#define PAYLOAD_BLOCK_NOT_PRESENT 0 -#define PAYLOAD_BLOCK_UNDEFINED 1 -#define PAYLOAD_BLOCK_ZERO 2 -#define PAYLOAD_BLOCK_UNMAPPED 3 -#define PAYLOAD_BLOCK_UNMAPPED_v095 5 -#define PAYLOAD_BLOCK_FULLY_PRESENT 6 -#define PAYLOAD_BLOCK_PARTIALLY_PRESENT 7 - -#define SB_BLOCK_NOT_PRESENT 0 -#define SB_BLOCK_PRESENT 6 - -/* per the spec */ -#define VHDX_MAX_SECTORS_PER_BLOCK (1 << 23) - -/* upper 44 bits are the file offset in 1MB units lower 3 bits are the state - other bits are reserved */ -#define VHDX_BAT_STATE_BIT_MASK 0x07 -#define VHDX_BAT_FILE_OFF_MASK 0xFFFFFFFFFFF00000ULL /* upper 44 bits */ -typedef uint64_t VHDXBatEntry; - -/* ---- METADATA REGION STRUCTURES ---- */ - -#define VHDX_METADATA_ENTRY_SIZE 32 -#define VHDX_METADATA_MAX_ENTRIES 2047 /* not including the header */ -#define VHDX_METADATA_TABLE_MAX_SIZE \ - (VHDX_METADATA_ENTRY_SIZE * (VHDX_METADATA_MAX_ENTRIES+1)) -#define VHDX_METADATA_SIGNATURE 0x617461646174656DULL /* "metadata" in ASCII */ -typedef struct QEMU_PACKED VHDXMetadataTableHeader { - uint64_t signature; /* "metadata" in ASCII */ - uint16_t reserved; - uint16_t entry_count; /* number table entries. <= 2047 */ - uint32_t reserved2[5]; -} VHDXMetadataTableHeader; - -#define VHDX_META_FLAGS_IS_USER 0x01 /* max 1024 entries */ -#define VHDX_META_FLAGS_IS_VIRTUAL_DISK 0x02 /* virtual disk metadata if set, - otherwise file metdata */ -#define VHDX_META_FLAGS_IS_REQUIRED 0x04 /* parse must understand this - entry to open the file */ -typedef struct QEMU_PACKED VHDXMetadataTableEntry { - MSGUID item_id; /* 128-bit identifier for metadata */ - uint32_t offset; /* byte offset of the metadata. At - least 64kB. Relative to start of - metadata region */ - /* note: if length = 0, so is offset */ - uint32_t length; /* length of metadata. <= 1MB. 
*/ - uint32_t data_bits; /* least-significant 3 bits are flags, - the rest are reserved (see above) */ - uint32_t reserved2; -} VHDXMetadataTableEntry; - -#define VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED 0x01 /* Do not change any blocks to - be BLOCK_NOT_PRESENT. - If set indicates a fixed - size VHDX file */ -#define VHDX_PARAMS_HAS_PARENT 0x02 /* has parent / backing file */ -#define VHDX_BLOCK_SIZE_MIN (1 * MiB) -#define VHDX_BLOCK_SIZE_MAX (256 * MiB) -typedef struct QEMU_PACKED VHDXFileParameters { - uint32_t block_size; /* size of each payload block, always - power of 2, <= 256MB and >= 1MB. */ - uint32_t data_bits; /* least-significant 2 bits are flags, - the rest are reserved (see above) */ -} VHDXFileParameters; - -#define VHDX_MAX_IMAGE_SIZE ((uint64_t) 64 * TiB) -typedef struct QEMU_PACKED VHDXVirtualDiskSize { - uint64_t virtual_disk_size; /* Size of the virtual disk, in bytes. - Must be multiple of the sector size, - max of 64TB */ -} VHDXVirtualDiskSize; - -typedef struct QEMU_PACKED VHDXPage83Data { - MSGUID page_83_data; /* unique id for scsi devices that - support page 0x83 */ -} VHDXPage83Data; - -typedef struct QEMU_PACKED VHDXVirtualDiskLogicalSectorSize { - uint32_t logical_sector_size; /* virtual disk sector size (in bytes). - Can only be 512 or 4096 bytes */ -} VHDXVirtualDiskLogicalSectorSize; - -typedef struct QEMU_PACKED VHDXVirtualDiskPhysicalSectorSize { - uint32_t physical_sector_size; /* physical sector size (in bytes). - Can only be 512 or 4096 bytes */ -} VHDXVirtualDiskPhysicalSectorSize; - -typedef struct QEMU_PACKED VHDXParentLocatorHeader { - MSGUID locator_type; /* type of the parent virtual disk. 
*/ - uint16_t reserved; - uint16_t key_value_count; /* number of key/value pairs for this - locator */ -} VHDXParentLocatorHeader; - -/* key and value strings are UNICODE strings, UTF-16 LE encoding, no NULs */ -typedef struct QEMU_PACKED VHDXParentLocatorEntry { - uint32_t key_offset; /* offset in metadata for key, > 0 */ - uint32_t value_offset; /* offset in metadata for value, >0 */ - uint16_t key_length; /* length of entry key, > 0 */ - uint16_t value_length; /* length of entry value, > 0 */ -} VHDXParentLocatorEntry; - - -/* ----- END VHDX SPECIFICATION STRUCTURES ---- */ - -typedef struct VHDXMetadataEntries { - VHDXMetadataTableEntry file_parameters_entry; - VHDXMetadataTableEntry virtual_disk_size_entry; - VHDXMetadataTableEntry page83_data_entry; - VHDXMetadataTableEntry logical_sector_size_entry; - VHDXMetadataTableEntry phys_sector_size_entry; - VHDXMetadataTableEntry parent_locator_entry; - uint16_t present; -} VHDXMetadataEntries; - -typedef struct VHDXLogEntries { - uint64_t offset; - uint64_t length; - uint32_t write; - uint32_t read; - VHDXLogEntryHeader *hdr; - void *desc_buffer; - uint64_t sequence; - uint32_t tail; -} VHDXLogEntries; - -typedef struct VHDXRegionEntry { - uint64_t start; - uint64_t end; - QLIST_ENTRY(VHDXRegionEntry) entries; -} VHDXRegionEntry; - -typedef struct BDRVVHDXState { - CoMutex lock; - - int curr_header; - VHDXHeader *headers[2]; - - VHDXRegionTableHeader rt; - VHDXRegionTableEntry bat_rt; /* region table for the BAT */ - VHDXRegionTableEntry metadata_rt; /* region table for the metadata */ - - VHDXMetadataTableHeader metadata_hdr; - VHDXMetadataEntries metadata_entries; - - VHDXFileParameters params; - uint32_t block_size; - uint32_t block_size_bits; - uint32_t sectors_per_block; - uint32_t sectors_per_block_bits; - - uint64_t virtual_disk_size; - uint32_t logical_sector_size; - uint32_t physical_sector_size; - - uint64_t chunk_ratio; - uint32_t chunk_ratio_bits; - uint32_t logical_sector_size_bits; - - uint32_t 
bat_entries; - VHDXBatEntry *bat; - uint64_t bat_offset; - - bool first_visible_write; - MSGUID session_guid; - - VHDXLogEntries log; - - VHDXParentLocatorHeader parent_header; - VHDXParentLocatorEntry *parent_entries; - - Error *migration_blocker; - - bool log_replayed_on_open; - - QLIST_HEAD(VHDXRegionHead, VHDXRegionEntry) regions; -} BDRVVHDXState; - -void vhdx_guid_generate(MSGUID *guid); - -int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, bool rw, - MSGUID *log_guid); - -uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset); -uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size, - int crc_offset); - -bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset); - -int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, - Error **errp); - -int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, - void *data, uint32_t length, uint64_t offset); - -static inline void leguid_to_cpus(MSGUID *guid) -{ - le32_to_cpus(&guid->data1); - le16_to_cpus(&guid->data2); - le16_to_cpus(&guid->data3); -} - -static inline void cpu_to_leguids(MSGUID *guid) -{ - cpu_to_le32s(&guid->data1); - cpu_to_le16s(&guid->data2); - cpu_to_le16s(&guid->data3); -} - -void vhdx_header_le_import(VHDXHeader *h); -void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h); -void vhdx_log_desc_le_import(VHDXLogDescriptor *d); -void vhdx_log_desc_le_export(VHDXLogDescriptor *d); -void vhdx_log_data_le_import(VHDXLogDataSector *d); -void vhdx_log_data_le_export(VHDXLogDataSector *d); -void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr); -void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr); -void vhdx_region_header_le_import(VHDXRegionTableHeader *hdr); -void vhdx_region_header_le_export(VHDXRegionTableHeader *hdr); -void vhdx_region_entry_le_import(VHDXRegionTableEntry *e); -void vhdx_region_entry_le_export(VHDXRegionTableEntry *e); -void 
vhdx_metadata_header_le_import(VHDXMetadataTableHeader *hdr); -void vhdx_metadata_header_le_export(VHDXMetadataTableHeader *hdr); -void vhdx_metadata_entry_le_import(VHDXMetadataTableEntry *e); -void vhdx_metadata_entry_le_export(VHDXMetadataTableEntry *e); -int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s); - -#endif diff --git a/qemu/block/vmdk.c b/qemu/block/vmdk.c deleted file mode 100644 index 45f9d3c5b..000000000 --- a/qemu/block/vmdk.c +++ /dev/null @@ -1,2349 +0,0 @@ -/* - * Block driver for the VMDK format - * - * Copyright (c) 2004 Fabrice Bellard - * Copyright (c) 2005 Filip Navara - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ - -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "block/block_int.h" -#include "sysemu/block-backend.h" -#include "qapi/qmp/qerror.h" -#include "qemu/error-report.h" -#include "qemu/module.h" -#include "migration/migration.h" -#include "qemu/cutils.h" -#include -#include - -#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D') -#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V') -#define VMDK4_COMPRESSION_DEFLATE 1 -#define VMDK4_FLAG_NL_DETECT (1 << 0) -#define VMDK4_FLAG_RGD (1 << 1) -/* Zeroed-grain enable bit */ -#define VMDK4_FLAG_ZERO_GRAIN (1 << 2) -#define VMDK4_FLAG_COMPRESS (1 << 16) -#define VMDK4_FLAG_MARKER (1 << 17) -#define VMDK4_GD_AT_END 0xffffffffffffffffULL - -#define VMDK_GTE_ZEROED 0x1 - -/* VMDK internal error codes */ -#define VMDK_OK 0 -#define VMDK_ERROR (-1) -/* Cluster not allocated */ -#define VMDK_UNALLOC (-2) -#define VMDK_ZEROED (-3) - -#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain" - -typedef struct { - uint32_t version; - uint32_t flags; - uint32_t disk_sectors; - uint32_t granularity; - uint32_t l1dir_offset; - uint32_t l1dir_size; - uint32_t file_sectors; - uint32_t cylinders; - uint32_t heads; - uint32_t sectors_per_track; -} QEMU_PACKED VMDK3Header; - -typedef struct { - uint32_t version; - uint32_t flags; - uint64_t capacity; - uint64_t granularity; - uint64_t desc_offset; - uint64_t desc_size; - /* Number of GrainTableEntries per GrainTable */ - uint32_t num_gtes_per_gt; - uint64_t rgd_offset; - uint64_t gd_offset; - uint64_t grain_offset; - char filler[1]; - char check_bytes[4]; - uint16_t compressAlgorithm; -} QEMU_PACKED VMDK4Header; - -#define L2_CACHE_SIZE 16 - -typedef struct VmdkExtent { - BdrvChild *file; - bool flat; - bool compressed; - bool has_marker; - bool has_zero_grain; - int version; - int64_t sectors; - int64_t end_sector; - int64_t flat_start_offset; - int64_t l1_table_offset; - int64_t l1_backup_table_offset; - uint32_t *l1_table; - uint32_t *l1_backup_table; - 
unsigned int l1_size; - uint32_t l1_entry_sectors; - - unsigned int l2_size; - uint32_t *l2_cache; - uint32_t l2_cache_offsets[L2_CACHE_SIZE]; - uint32_t l2_cache_counts[L2_CACHE_SIZE]; - - int64_t cluster_sectors; - int64_t next_cluster_sector; - char *type; -} VmdkExtent; - -typedef struct BDRVVmdkState { - CoMutex lock; - uint64_t desc_offset; - bool cid_updated; - bool cid_checked; - uint32_t cid; - uint32_t parent_cid; - int num_extents; - /* Extent array with num_extents entries, ascend ordered by address */ - VmdkExtent *extents; - Error *migration_blocker; - char *create_type; -} BDRVVmdkState; - -typedef struct VmdkMetaData { - unsigned int l1_index; - unsigned int l2_index; - unsigned int l2_offset; - int valid; - uint32_t *l2_cache_entry; -} VmdkMetaData; - -typedef struct VmdkGrainMarker { - uint64_t lba; - uint32_t size; - uint8_t data[0]; -} QEMU_PACKED VmdkGrainMarker; - -enum { - MARKER_END_OF_STREAM = 0, - MARKER_GRAIN_TABLE = 1, - MARKER_GRAIN_DIRECTORY = 2, - MARKER_FOOTER = 3, -}; - -static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename) -{ - uint32_t magic; - - if (buf_size < 4) { - return 0; - } - magic = be32_to_cpu(*(uint32_t *)buf); - if (magic == VMDK3_MAGIC || - magic == VMDK4_MAGIC) { - return 100; - } else { - const char *p = (const char *)buf; - const char *end = p + buf_size; - while (p < end) { - if (*p == '#') { - /* skip comment line */ - while (p < end && *p != '\n') { - p++; - } - p++; - continue; - } - if (*p == ' ') { - while (p < end && *p == ' ') { - p++; - } - /* skip '\r' if windows line endings used. 
*/ - if (p < end && *p == '\r') { - p++; - } - /* only accept blank lines before 'version=' line */ - if (p == end || *p != '\n') { - return 0; - } - p++; - continue; - } - if (end - p >= strlen("version=X\n")) { - if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 || - strncmp("version=2\n", p, strlen("version=2\n")) == 0) { - return 100; - } - } - if (end - p >= strlen("version=X\r\n")) { - if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 || - strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) { - return 100; - } - } - return 0; - } - return 0; - } -} - -#define SECTOR_SIZE 512 -#define DESC_SIZE (20 * SECTOR_SIZE) /* 20 sectors of 512 bytes each */ -#define BUF_SIZE 4096 -#define HEADER_SIZE 512 /* first sector of 512 bytes */ - -static void vmdk_free_extents(BlockDriverState *bs) -{ - int i; - BDRVVmdkState *s = bs->opaque; - VmdkExtent *e; - - for (i = 0; i < s->num_extents; i++) { - e = &s->extents[i]; - g_free(e->l1_table); - g_free(e->l2_cache); - g_free(e->l1_backup_table); - g_free(e->type); - if (e->file != bs->file) { - bdrv_unref_child(bs, e->file); - } - } - g_free(s->extents); -} - -static void vmdk_free_last_extent(BlockDriverState *bs) -{ - BDRVVmdkState *s = bs->opaque; - - if (s->num_extents == 0) { - return; - } - s->num_extents--; - s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); -} - -static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) -{ - char *desc; - uint32_t cid = 0xffffffff; - const char *p_name, *cid_str; - size_t cid_str_size; - BDRVVmdkState *s = bs->opaque; - int ret; - - desc = g_malloc0(DESC_SIZE); - ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); - if (ret < 0) { - g_free(desc); - return 0; - } - - if (parent) { - cid_str = "parentCID"; - cid_str_size = sizeof("parentCID"); - } else { - cid_str = "CID"; - cid_str_size = sizeof("CID"); - } - - desc[DESC_SIZE - 1] = '\0'; - p_name = strstr(desc, cid_str); - if (p_name != NULL) { - p_name += cid_str_size; - 
sscanf(p_name, "%" SCNx32, &cid); - } - - g_free(desc); - return cid; -} - -static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) -{ - char *desc, *tmp_desc; - char *p_name, *tmp_str; - BDRVVmdkState *s = bs->opaque; - int ret = 0; - - desc = g_malloc0(DESC_SIZE); - tmp_desc = g_malloc0(DESC_SIZE); - ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); - if (ret < 0) { - goto out; - } - - desc[DESC_SIZE - 1] = '\0'; - tmp_str = strstr(desc, "parentCID"); - if (tmp_str == NULL) { - ret = -EINVAL; - goto out; - } - - pstrcpy(tmp_desc, DESC_SIZE, tmp_str); - p_name = strstr(desc, "CID"); - if (p_name != NULL) { - p_name += sizeof("CID"); - snprintf(p_name, DESC_SIZE - (p_name - desc), "%" PRIx32 "\n", cid); - pstrcat(desc, DESC_SIZE, tmp_desc); - } - - ret = bdrv_pwrite_sync(bs->file->bs, s->desc_offset, desc, DESC_SIZE); - -out: - g_free(desc); - g_free(tmp_desc); - return ret; -} - -static int vmdk_is_cid_valid(BlockDriverState *bs) -{ - BDRVVmdkState *s = bs->opaque; - uint32_t cur_pcid; - - if (!s->cid_checked && bs->backing) { - BlockDriverState *p_bs = bs->backing->bs; - - cur_pcid = vmdk_read_cid(p_bs, 0); - if (s->parent_cid != cur_pcid) { - /* CID not valid */ - return 0; - } - } - s->cid_checked = true; - /* CID valid */ - return 1; -} - -/* We have nothing to do for VMDK reopen, stubs just return success */ -static int vmdk_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) -{ - assert(state != NULL); - assert(state->bs != NULL); - return 0; -} - -static int vmdk_parent_open(BlockDriverState *bs) -{ - char *p_name; - char *desc; - BDRVVmdkState *s = bs->opaque; - int ret; - - desc = g_malloc0(DESC_SIZE + 1); - ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); - if (ret < 0) { - goto out; - } - ret = 0; - - p_name = strstr(desc, "parentFileNameHint"); - if (p_name != NULL) { - char *end_name; - - p_name += sizeof("parentFileNameHint") + 1; - end_name = strchr(p_name, '\"'); - if (end_name == 
NULL) { - ret = -EINVAL; - goto out; - } - if ((end_name - p_name) > sizeof(bs->backing_file) - 1) { - ret = -EINVAL; - goto out; - } - - pstrcpy(bs->backing_file, end_name - p_name + 1, p_name); - } - -out: - g_free(desc); - return ret; -} - -/* Create and append extent to the extent array. Return the added VmdkExtent - * address. return NULL if allocation failed. */ -static int vmdk_add_extent(BlockDriverState *bs, - BdrvChild *file, bool flat, int64_t sectors, - int64_t l1_offset, int64_t l1_backup_offset, - uint32_t l1_size, - int l2_size, uint64_t cluster_sectors, - VmdkExtent **new_extent, - Error **errp) -{ - VmdkExtent *extent; - BDRVVmdkState *s = bs->opaque; - int64_t nb_sectors; - - if (cluster_sectors > 0x200000) { - /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */ - error_setg(errp, "Invalid granularity, image may be corrupt"); - return -EFBIG; - } - if (l1_size > 512 * 1024 * 1024) { - /* Although with big capacity and small l1_entry_sectors, we can get a - * big l1_size, we don't want unbounded value to allocate the table. - * Limit it to 512M, which is 16PB for default cluster and L2 table - * size */ - error_setg(errp, "L1 size too big"); - return -EFBIG; - } - - nb_sectors = bdrv_nb_sectors(file->bs); - if (nb_sectors < 0) { - return nb_sectors; - } - - s->extents = g_renew(VmdkExtent, s->extents, s->num_extents + 1); - extent = &s->extents[s->num_extents]; - s->num_extents++; - - memset(extent, 0, sizeof(VmdkExtent)); - extent->file = file; - extent->flat = flat; - extent->sectors = sectors; - extent->l1_table_offset = l1_offset; - extent->l1_backup_table_offset = l1_backup_offset; - extent->l1_size = l1_size; - extent->l1_entry_sectors = l2_size * cluster_sectors; - extent->l2_size = l2_size; - extent->cluster_sectors = flat ? 
sectors : cluster_sectors; - extent->next_cluster_sector = ROUND_UP(nb_sectors, cluster_sectors); - - if (s->num_extents > 1) { - extent->end_sector = (*(extent - 1)).end_sector + extent->sectors; - } else { - extent->end_sector = extent->sectors; - } - bs->total_sectors = extent->end_sector; - if (new_extent) { - *new_extent = extent; - } - return 0; -} - -static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, - Error **errp) -{ - int ret; - size_t l1_size; - int i; - - /* read the L1 table */ - l1_size = extent->l1_size * sizeof(uint32_t); - extent->l1_table = g_try_malloc(l1_size); - if (l1_size && extent->l1_table == NULL) { - return -ENOMEM; - } - - ret = bdrv_pread(extent->file->bs, - extent->l1_table_offset, - extent->l1_table, - l1_size); - if (ret < 0) { - error_setg_errno(errp, -ret, - "Could not read l1 table from extent '%s'", - extent->file->bs->filename); - goto fail_l1; - } - for (i = 0; i < extent->l1_size; i++) { - le32_to_cpus(&extent->l1_table[i]); - } - - if (extent->l1_backup_table_offset) { - extent->l1_backup_table = g_try_malloc(l1_size); - if (l1_size && extent->l1_backup_table == NULL) { - ret = -ENOMEM; - goto fail_l1; - } - ret = bdrv_pread(extent->file->bs, - extent->l1_backup_table_offset, - extent->l1_backup_table, - l1_size); - if (ret < 0) { - error_setg_errno(errp, -ret, - "Could not read l1 backup table from extent '%s'", - extent->file->bs->filename); - goto fail_l1b; - } - for (i = 0; i < extent->l1_size; i++) { - le32_to_cpus(&extent->l1_backup_table[i]); - } - } - - extent->l2_cache = - g_new(uint32_t, extent->l2_size * L2_CACHE_SIZE); - return 0; - fail_l1b: - g_free(extent->l1_backup_table); - fail_l1: - g_free(extent->l1_table); - return ret; -} - -static int vmdk_open_vmfs_sparse(BlockDriverState *bs, - BdrvChild *file, - int flags, Error **errp) -{ - int ret; - uint32_t magic; - VMDK3Header header; - VmdkExtent *extent; - - ret = bdrv_pread(file->bs, sizeof(magic), &header, sizeof(header)); - if (ret < 0) { 
- error_setg_errno(errp, -ret, - "Could not read header from file '%s'", - file->bs->filename); - return ret; - } - ret = vmdk_add_extent(bs, file, false, - le32_to_cpu(header.disk_sectors), - (int64_t)le32_to_cpu(header.l1dir_offset) << 9, - 0, - le32_to_cpu(header.l1dir_size), - 4096, - le32_to_cpu(header.granularity), - &extent, - errp); - if (ret < 0) { - return ret; - } - ret = vmdk_init_tables(bs, extent, errp); - if (ret) { - /* free extent allocated by vmdk_add_extent */ - vmdk_free_last_extent(bs); - } - return ret; -} - -static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, - QDict *options, Error **errp); - -static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset, - Error **errp) -{ - int64_t size; - char *buf; - int ret; - - size = bdrv_getlength(file); - if (size < 0) { - error_setg_errno(errp, -size, "Could not access file"); - return NULL; - } - - if (size < 4) { - /* Both descriptor file and sparse image must be much larger than 4 - * bytes, also callers of vmdk_read_desc want to compare the first 4 - * bytes with VMDK4_MAGIC, let's error out if less is read. 
*/ - error_setg(errp, "File is too small, not a valid image"); - return NULL; - } - - size = MIN(size, (1 << 20) - 1); /* avoid unbounded allocation */ - buf = g_malloc(size + 1); - - ret = bdrv_pread(file, desc_offset, buf, size); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not read from file"); - g_free(buf); - return NULL; - } - buf[ret] = 0; - - return buf; -} - -static int vmdk_open_vmdk4(BlockDriverState *bs, - BdrvChild *file, - int flags, QDict *options, Error **errp) -{ - int ret; - uint32_t magic; - uint32_t l1_size, l1_entry_sectors; - VMDK4Header header; - VmdkExtent *extent; - BDRVVmdkState *s = bs->opaque; - int64_t l1_backup_offset = 0; - bool compressed; - - ret = bdrv_pread(file->bs, sizeof(magic), &header, sizeof(header)); - if (ret < 0) { - error_setg_errno(errp, -ret, - "Could not read header from file '%s'", - file->bs->filename); - return -EINVAL; - } - if (header.capacity == 0) { - uint64_t desc_offset = le64_to_cpu(header.desc_offset); - if (desc_offset) { - char *buf = vmdk_read_desc(file->bs, desc_offset << 9, errp); - if (!buf) { - return -EINVAL; - } - ret = vmdk_open_desc_file(bs, flags, buf, options, errp); - g_free(buf); - return ret; - } - } - - if (!s->create_type) { - s->create_type = g_strdup("monolithicSparse"); - } - - if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) { - /* - * The footer takes precedence over the header, so read it in. The - * footer starts at offset -1024 from the end: One sector for the - * footer, and another one for the end-of-stream marker. 
- */ - struct { - struct { - uint64_t val; - uint32_t size; - uint32_t type; - uint8_t pad[512 - 16]; - } QEMU_PACKED footer_marker; - - uint32_t magic; - VMDK4Header header; - uint8_t pad[512 - 4 - sizeof(VMDK4Header)]; - - struct { - uint64_t val; - uint32_t size; - uint32_t type; - uint8_t pad[512 - 16]; - } QEMU_PACKED eos_marker; - } QEMU_PACKED footer; - - ret = bdrv_pread(file->bs, - bs->file->bs->total_sectors * 512 - 1536, - &footer, sizeof(footer)); - if (ret < 0) { - error_setg_errno(errp, -ret, "Failed to read footer"); - return ret; - } - - /* Some sanity checks for the footer */ - if (be32_to_cpu(footer.magic) != VMDK4_MAGIC || - le32_to_cpu(footer.footer_marker.size) != 0 || - le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER || - le64_to_cpu(footer.eos_marker.val) != 0 || - le32_to_cpu(footer.eos_marker.size) != 0 || - le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM) - { - error_setg(errp, "Invalid footer"); - return -EINVAL; - } - - header = footer.header; - } - - compressed = - le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; - if (le32_to_cpu(header.version) > 3) { - error_setg(errp, "Unsupported VMDK version %" PRIu32, - le32_to_cpu(header.version)); - return -ENOTSUP; - } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR) && - !compressed) { - /* VMware KB 2064959 explains that version 3 added support for - * persistent changed block tracking (CBT), and backup software can - * read it as version=1 if it doesn't care about the changed area - * information. So we are safe to enable read only. 
*/ - error_setg(errp, "VMDK version 3 must be read only"); - return -EINVAL; - } - - if (le32_to_cpu(header.num_gtes_per_gt) > 512) { - error_setg(errp, "L2 table size too big"); - return -EINVAL; - } - - l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt) - * le64_to_cpu(header.granularity); - if (l1_entry_sectors == 0) { - error_setg(errp, "L1 entry size is invalid"); - return -EINVAL; - } - l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1) - / l1_entry_sectors; - if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) { - l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9; - } - if (bdrv_nb_sectors(file->bs) < le64_to_cpu(header.grain_offset)) { - error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes", - (int64_t)(le64_to_cpu(header.grain_offset) - * BDRV_SECTOR_SIZE)); - return -EINVAL; - } - - ret = vmdk_add_extent(bs, file, false, - le64_to_cpu(header.capacity), - le64_to_cpu(header.gd_offset) << 9, - l1_backup_offset, - l1_size, - le32_to_cpu(header.num_gtes_per_gt), - le64_to_cpu(header.granularity), - &extent, - errp); - if (ret < 0) { - return ret; - } - extent->compressed = - le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; - if (extent->compressed) { - g_free(s->create_type); - s->create_type = g_strdup("streamOptimized"); - } - extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER; - extent->version = le32_to_cpu(header.version); - extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN; - ret = vmdk_init_tables(bs, extent, errp); - if (ret) { - /* free extent allocated by vmdk_add_extent */ - vmdk_free_last_extent(bs); - } - return ret; -} - -/* find an option value out of descriptor file */ -static int vmdk_parse_description(const char *desc, const char *opt_name, - char *buf, int buf_size) -{ - char *opt_pos, *opt_end; - const char *end = desc + strlen(desc); - - opt_pos = strstr(desc, opt_name); - if (!opt_pos) { - return VMDK_ERROR; - } - /* Skip "=\"" following 
opt_name */ - opt_pos += strlen(opt_name) + 2; - if (opt_pos >= end) { - return VMDK_ERROR; - } - opt_end = opt_pos; - while (opt_end < end && *opt_end != '"') { - opt_end++; - } - if (opt_end == end || buf_size < opt_end - opt_pos + 1) { - return VMDK_ERROR; - } - pstrcpy(buf, opt_end - opt_pos + 1, opt_pos); - return VMDK_OK; -} - -/* Open an extent file and append to bs array */ -static int vmdk_open_sparse(BlockDriverState *bs, BdrvChild *file, int flags, - char *buf, QDict *options, Error **errp) -{ - uint32_t magic; - - magic = ldl_be_p(buf); - switch (magic) { - case VMDK3_MAGIC: - return vmdk_open_vmfs_sparse(bs, file, flags, errp); - break; - case VMDK4_MAGIC: - return vmdk_open_vmdk4(bs, file, flags, options, errp); - break; - default: - error_setg(errp, "Image not in VMDK format"); - return -EINVAL; - break; - } -} - -static const char *next_line(const char *s) -{ - while (*s) { - if (*s == '\n') { - return s + 1; - } - s++; - } - return s; -} - -static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, - const char *desc_file_path, QDict *options, - Error **errp) -{ - int ret; - int matches; - char access[11]; - char type[11]; - char fname[512]; - const char *p, *np; - int64_t sectors = 0; - int64_t flat_offset; - char *extent_path; - BdrvChild *extent_file; - BDRVVmdkState *s = bs->opaque; - VmdkExtent *extent; - char extent_opt_prefix[32]; - Error *local_err = NULL; - - for (p = desc; *p; p = next_line(p)) { - /* parse extent line in one of below formats: - * - * RW [size in sectors] FLAT "file-name.vmdk" OFFSET - * RW [size in sectors] SPARSE "file-name.vmdk" - * RW [size in sectors] VMFS "file-name.vmdk" - * RW [size in sectors] VMFSSPARSE "file-name.vmdk" - */ - flat_offset = -1; - matches = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64, - access, §ors, type, fname, &flat_offset); - if (matches < 4 || strcmp(access, "RW")) { - continue; - } else if (!strcmp(type, "FLAT")) { - if (matches != 5 || flat_offset < 0) { - goto 
invalid; - } - } else if (!strcmp(type, "VMFS")) { - if (matches == 4) { - flat_offset = 0; - } else { - goto invalid; - } - } else if (matches != 4) { - goto invalid; - } - - if (sectors <= 0 || - (strcmp(type, "FLAT") && strcmp(type, "SPARSE") && - strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE")) || - (strcmp(access, "RW"))) { - continue; - } - - if (!path_is_absolute(fname) && !path_has_protocol(fname) && - !desc_file_path[0]) - { - error_setg(errp, "Cannot use relative extent paths with VMDK " - "descriptor file '%s'", bs->file->bs->filename); - return -EINVAL; - } - - extent_path = g_malloc0(PATH_MAX); - path_combine(extent_path, PATH_MAX, desc_file_path, fname); - - ret = snprintf(extent_opt_prefix, 32, "extents.%d", s->num_extents); - assert(ret < 32); - - extent_file = bdrv_open_child(extent_path, options, extent_opt_prefix, - bs, &child_file, false, &local_err); - g_free(extent_path); - if (local_err) { - error_propagate(errp, local_err); - return -EINVAL; - } - - /* save to extents array */ - if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) { - /* FLAT extent */ - - ret = vmdk_add_extent(bs, extent_file, true, sectors, - 0, 0, 0, 0, 0, &extent, errp); - if (ret < 0) { - bdrv_unref_child(bs, extent_file); - return ret; - } - extent->flat_start_offset = flat_offset << 9; - } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) { - /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/ - char *buf = vmdk_read_desc(extent_file->bs, 0, errp); - if (!buf) { - ret = -EINVAL; - } else { - ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf, - options, errp); - } - g_free(buf); - if (ret) { - bdrv_unref_child(bs, extent_file); - return ret; - } - extent = &s->extents[s->num_extents - 1]; - } else { - error_setg(errp, "Unsupported extent type '%s'", type); - bdrv_unref_child(bs, extent_file); - return -ENOTSUP; - } - extent->type = g_strdup(type); - } - return 0; - -invalid: - np = next_line(p); - assert(np != p); - if 
(np[-1] == '\n') { - np--; - } - error_setg(errp, "Invalid extent line: %.*s", (int)(np - p), p); - return -EINVAL; -} - -static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, - QDict *options, Error **errp) -{ - int ret; - char ct[128]; - BDRVVmdkState *s = bs->opaque; - - if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) { - error_setg(errp, "invalid VMDK image descriptor"); - ret = -EINVAL; - goto exit; - } - if (strcmp(ct, "monolithicFlat") && - strcmp(ct, "vmfs") && - strcmp(ct, "vmfsSparse") && - strcmp(ct, "twoGbMaxExtentSparse") && - strcmp(ct, "twoGbMaxExtentFlat")) { - error_setg(errp, "Unsupported image type '%s'", ct); - ret = -ENOTSUP; - goto exit; - } - s->create_type = g_strdup(ct); - s->desc_offset = 0; - ret = vmdk_parse_extents(buf, bs, bs->file->bs->exact_filename, options, - errp); -exit: - return ret; -} - -static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - char *buf; - int ret; - BDRVVmdkState *s = bs->opaque; - uint32_t magic; - - buf = vmdk_read_desc(bs->file->bs, 0, errp); - if (!buf) { - return -EINVAL; - } - - magic = ldl_be_p(buf); - switch (magic) { - case VMDK3_MAGIC: - case VMDK4_MAGIC: - ret = vmdk_open_sparse(bs, bs->file, flags, buf, options, - errp); - s->desc_offset = 0x200; - break; - default: - ret = vmdk_open_desc_file(bs, flags, buf, options, errp); - break; - } - if (ret) { - goto fail; - } - - /* try to open parent images, if exist */ - ret = vmdk_parent_open(bs); - if (ret) { - goto fail; - } - s->cid = vmdk_read_cid(bs, 0); - s->parent_cid = vmdk_read_cid(bs, 1); - qemu_co_mutex_init(&s->lock); - - /* Disable migration when VMDK images are used */ - error_setg(&s->migration_blocker, "The vmdk format used by node '%s' " - "does not support live migration", - bdrv_get_device_or_node_name(bs)); - migrate_add_blocker(s->migration_blocker); - g_free(buf); - return 0; - -fail: - g_free(buf); - g_free(s->create_type); - s->create_type = NULL; - 
vmdk_free_extents(bs); - return ret; -} - - -static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) -{ - BDRVVmdkState *s = bs->opaque; - int i; - - for (i = 0; i < s->num_extents; i++) { - if (!s->extents[i].flat) { - bs->bl.write_zeroes_alignment = - MAX(bs->bl.write_zeroes_alignment, - s->extents[i].cluster_sectors); - } - } -} - -/** - * get_whole_cluster - * - * Copy backing file's cluster that covers @sector_num, otherwise write zero, - * to the cluster at @cluster_sector_num. - * - * If @skip_start_sector < @skip_end_sector, the relative range - * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave - * it for call to write user data in the request. - */ -static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_sector_num, - uint64_t sector_num, - uint64_t skip_start_sector, - uint64_t skip_end_sector) -{ - int ret = VMDK_OK; - int64_t cluster_bytes; - uint8_t *whole_grain; - - /* For COW, align request sector_num to cluster start */ - sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors); - cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS; - whole_grain = qemu_blockalign(bs, cluster_bytes); - - if (!bs->backing) { - memset(whole_grain, 0, skip_start_sector << BDRV_SECTOR_BITS); - memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0, - cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS)); - } - - assert(skip_end_sector <= extent->cluster_sectors); - /* we will be here if it's first write on non-exist grain(cluster). 
- * try to read from parent image, if exist */ - if (bs->backing && !vmdk_is_cid_valid(bs)) { - ret = VMDK_ERROR; - goto exit; - } - - /* Read backing data before skip range */ - if (skip_start_sector > 0) { - if (bs->backing) { - ret = bdrv_read(bs->backing->bs, sector_num, - whole_grain, skip_start_sector); - if (ret < 0) { - ret = VMDK_ERROR; - goto exit; - } - } - ret = bdrv_write(extent->file->bs, cluster_sector_num, whole_grain, - skip_start_sector); - if (ret < 0) { - ret = VMDK_ERROR; - goto exit; - } - } - /* Read backing data after skip range */ - if (skip_end_sector < extent->cluster_sectors) { - if (bs->backing) { - ret = bdrv_read(bs->backing->bs, sector_num + skip_end_sector, - whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), - extent->cluster_sectors - skip_end_sector); - if (ret < 0) { - ret = VMDK_ERROR; - goto exit; - } - } - ret = bdrv_write(extent->file->bs, cluster_sector_num + skip_end_sector, - whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), - extent->cluster_sectors - skip_end_sector); - if (ret < 0) { - ret = VMDK_ERROR; - goto exit; - } - } - -exit: - qemu_vfree(whole_grain); - return ret; -} - -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, - uint32_t offset) -{ - offset = cpu_to_le32(offset); - /* update L2 table */ - if (bdrv_pwrite_sync( - extent->file->bs, - ((int64_t)m_data->l2_offset * 512) - + (m_data->l2_index * sizeof(offset)), - &offset, sizeof(offset)) < 0) { - return VMDK_ERROR; - } - /* update backup L2 table */ - if (extent->l1_backup_table_offset != 0) { - m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; - if (bdrv_pwrite_sync( - extent->file->bs, - ((int64_t)m_data->l2_offset * 512) - + (m_data->l2_index * sizeof(offset)), - &offset, sizeof(offset)) < 0) { - return VMDK_ERROR; - } - } - if (m_data->l2_cache_entry) { - *m_data->l2_cache_entry = offset; - } - - return VMDK_OK; -} - -/** - * get_cluster_offset - * - * Look up cluster offset in extent file by sector number, and store 
in - * @cluster_offset. - * - * For flat extents, the start offset as parsed from the description file is - * returned. - * - * For sparse extents, look up in L1, L2 table. If allocate is true, return an - * offset for a new cluster and update L2 cache. If there is a backing file, - * COW is done before returning; otherwise, zeroes are written to the allocated - * cluster. Both COW and zero writing skips the sector range - * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller - * has new data to write there. - * - * Returns: VMDK_OK if cluster exists and mapped in the image. - * VMDK_UNALLOC if cluster is not mapped and @allocate is false. - * VMDK_ERROR if failed. - */ -static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_sector, - uint64_t skip_end_sector) -{ - unsigned int l1_index, l2_offset, l2_index; - int min_index, i, j; - uint32_t min_count, *l2_table; - bool zeroed = false; - int64_t ret; - int64_t cluster_sector; - - if (m_data) { - m_data->valid = 0; - } - if (extent->flat) { - *cluster_offset = extent->flat_start_offset; - return VMDK_OK; - } - - offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; - l1_index = (offset >> 9) / extent->l1_entry_sectors; - if (l1_index >= extent->l1_size) { - return VMDK_ERROR; - } - l2_offset = extent->l1_table[l1_index]; - if (!l2_offset) { - return VMDK_UNALLOC; - } - for (i = 0; i < L2_CACHE_SIZE; i++) { - if (l2_offset == extent->l2_cache_offsets[i]) { - /* increment the hit count */ - if (++extent->l2_cache_counts[i] == 0xffffffff) { - for (j = 0; j < L2_CACHE_SIZE; j++) { - extent->l2_cache_counts[j] >>= 1; - } - } - l2_table = extent->l2_cache + (i * extent->l2_size); - goto found; - } - } - /* not found: load a new entry in the least used one */ - min_index = 0; - min_count = 0xffffffff; - for (i = 0; i < L2_CACHE_SIZE; i++) { - if 
(extent->l2_cache_counts[i] < min_count) { - min_count = extent->l2_cache_counts[i]; - min_index = i; - } - } - l2_table = extent->l2_cache + (min_index * extent->l2_size); - if (bdrv_pread( - extent->file->bs, - (int64_t)l2_offset * 512, - l2_table, - extent->l2_size * sizeof(uint32_t) - ) != extent->l2_size * sizeof(uint32_t)) { - return VMDK_ERROR; - } - - extent->l2_cache_offsets[min_index] = l2_offset; - extent->l2_cache_counts[min_index] = 1; - found: - l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; - cluster_sector = le32_to_cpu(l2_table[l2_index]); - - if (m_data) { - m_data->valid = 1; - m_data->l1_index = l1_index; - m_data->l2_index = l2_index; - m_data->l2_offset = l2_offset; - m_data->l2_cache_entry = &l2_table[l2_index]; - } - if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { - zeroed = true; - } - - if (!cluster_sector || zeroed) { - if (!allocate) { - return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; - } - - cluster_sector = extent->next_cluster_sector; - extent->next_cluster_sector += extent->cluster_sectors; - - /* First of all we write grain itself, to avoid race condition - * that may to corrupt the image. - * This problem may occur because of insufficient space on host disk - * or inappropriate VM shutdown. 
- */ - ret = get_whole_cluster(bs, extent, - cluster_sector, - offset >> BDRV_SECTOR_BITS, - skip_start_sector, skip_end_sector); - if (ret) { - return ret; - } - } - *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; - return VMDK_OK; -} - -static VmdkExtent *find_extent(BDRVVmdkState *s, - int64_t sector_num, VmdkExtent *start_hint) -{ - VmdkExtent *extent = start_hint; - - if (!extent) { - extent = &s->extents[0]; - } - while (extent < &s->extents[s->num_extents]) { - if (sector_num < extent->end_sector) { - return extent; - } - extent++; - } - return NULL; -} - -static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, - int64_t sector_num) -{ - uint64_t index_in_cluster, extent_begin_sector, extent_relative_sector_num; - - extent_begin_sector = extent->end_sector - extent->sectors; - extent_relative_sector_num = sector_num - extent_begin_sector; - index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; - return index_in_cluster; -} - -static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) -{ - BDRVVmdkState *s = bs->opaque; - int64_t index_in_cluster, n, ret; - uint64_t offset; - VmdkExtent *extent; - - extent = find_extent(s, sector_num, NULL); - if (!extent) { - return 0; - } - qemu_co_mutex_lock(&s->lock); - ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); - qemu_co_mutex_unlock(&s->lock); - - index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); - switch (ret) { - case VMDK_ERROR: - ret = -EIO; - break; - case VMDK_UNALLOC: - ret = 0; - break; - case VMDK_ZEROED: - ret = BDRV_BLOCK_ZERO; - break; - case VMDK_OK: - ret = BDRV_BLOCK_DATA; - if (!extent->compressed) { - ret |= BDRV_BLOCK_OFFSET_VALID; - ret |= (offset + (index_in_cluster << BDRV_SECTOR_BITS)) - & BDRV_BLOCK_OFFSET_MASK; - } - *file = extent->file->bs; - break; - } - - n = extent->cluster_sectors - index_in_cluster; - 
if (n > nb_sectors) { - n = nb_sectors; - } - *pnum = n; - return ret; -} - -static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset, - int64_t offset_in_cluster, const uint8_t *buf, - int nb_sectors, int64_t sector_num) -{ - int ret; - VmdkGrainMarker *data = NULL; - uLongf buf_len; - const uint8_t *write_buf = buf; - int write_len = nb_sectors * 512; - int64_t write_offset; - int64_t write_end_sector; - - if (extent->compressed) { - if (!extent->has_marker) { - ret = -EINVAL; - goto out; - } - buf_len = (extent->cluster_sectors << 9) * 2; - data = g_malloc(buf_len + sizeof(VmdkGrainMarker)); - if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK || - buf_len == 0) { - ret = -EINVAL; - goto out; - } - data->lba = sector_num; - data->size = buf_len; - write_buf = (uint8_t *)data; - write_len = buf_len + sizeof(VmdkGrainMarker); - } - write_offset = cluster_offset + offset_in_cluster, - ret = bdrv_pwrite(extent->file->bs, write_offset, write_buf, write_len); - - write_end_sector = DIV_ROUND_UP(write_offset + write_len, BDRV_SECTOR_SIZE); - - if (extent->compressed) { - extent->next_cluster_sector = write_end_sector; - } else { - extent->next_cluster_sector = MAX(extent->next_cluster_sector, - write_end_sector); - } - - if (ret != write_len) { - ret = ret < 0 ? 
ret : -EIO; - goto out; - } - ret = 0; - out: - g_free(data); - return ret; -} - -static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, - int64_t offset_in_cluster, uint8_t *buf, - int nb_sectors) -{ - int ret; - int cluster_bytes, buf_bytes; - uint8_t *cluster_buf, *compressed_data; - uint8_t *uncomp_buf; - uint32_t data_len; - VmdkGrainMarker *marker; - uLongf buf_len; - - - if (!extent->compressed) { - ret = bdrv_pread(extent->file->bs, - cluster_offset + offset_in_cluster, - buf, nb_sectors * 512); - if (ret == nb_sectors * 512) { - return 0; - } else { - return -EIO; - } - } - cluster_bytes = extent->cluster_sectors * 512; - /* Read two clusters in case GrainMarker + compressed data > one cluster */ - buf_bytes = cluster_bytes * 2; - cluster_buf = g_malloc(buf_bytes); - uncomp_buf = g_malloc(cluster_bytes); - ret = bdrv_pread(extent->file->bs, - cluster_offset, - cluster_buf, buf_bytes); - if (ret < 0) { - goto out; - } - compressed_data = cluster_buf; - buf_len = cluster_bytes; - data_len = cluster_bytes; - if (extent->has_marker) { - marker = (VmdkGrainMarker *)cluster_buf; - compressed_data = marker->data; - data_len = le32_to_cpu(marker->size); - } - if (!data_len || data_len > buf_bytes) { - ret = -EINVAL; - goto out; - } - ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len); - if (ret != Z_OK) { - ret = -EINVAL; - goto out; - - } - if (offset_in_cluster < 0 || - offset_in_cluster + nb_sectors * 512 > buf_len) { - ret = -EINVAL; - goto out; - } - memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512); - ret = 0; - - out: - g_free(uncomp_buf); - g_free(cluster_buf); - return ret; -} - -static int vmdk_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - BDRVVmdkState *s = bs->opaque; - int ret; - uint64_t n, index_in_cluster; - VmdkExtent *extent = NULL; - uint64_t cluster_offset; - - while (nb_sectors > 0) { - extent = find_extent(s, sector_num, extent); - if (!extent) { - return -EIO; 
- } - ret = get_cluster_offset(bs, extent, NULL, - sector_num << 9, false, &cluster_offset, - 0, 0); - index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); - n = extent->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } - if (ret != VMDK_OK) { - /* if not allocated, try to read from parent image, if exist */ - if (bs->backing && ret != VMDK_ZEROED) { - if (!vmdk_is_cid_valid(bs)) { - return -EINVAL; - } - ret = bdrv_read(bs->backing->bs, sector_num, buf, n); - if (ret < 0) { - return ret; - } - } else { - memset(buf, 0, 512 * n); - } - } else { - ret = vmdk_read_extent(extent, - cluster_offset, index_in_cluster * 512, - buf, n); - if (ret) { - return ret; - } - } - nb_sectors -= n; - sector_num += n; - buf += n * 512; - } - return 0; -} - -static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVVmdkState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = vmdk_read(bs, sector_num, buf, nb_sectors); - qemu_co_mutex_unlock(&s->lock); - return ret; -} - -/** - * vmdk_write: - * @zeroed: buf is ignored (data is zero), use zeroed_grain GTE feature - * if possible, otherwise return -ENOTSUP. - * @zero_dry_run: used for zeroed == true only, don't update L2 table, just try - * with each cluster. By dry run we can find if the zero write - * is possible without modifying image data. - * - * Returns: error code with 0 for success. 
- */ -static int vmdk_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors, - bool zeroed, bool zero_dry_run) -{ - BDRVVmdkState *s = bs->opaque; - VmdkExtent *extent = NULL; - int ret; - int64_t index_in_cluster, n; - uint64_t cluster_offset; - VmdkMetaData m_data; - - if (sector_num > bs->total_sectors) { - error_report("Wrong offset: sector_num=0x%" PRIx64 - " total_sectors=0x%" PRIx64, - sector_num, bs->total_sectors); - return -EIO; - } - - while (nb_sectors > 0) { - extent = find_extent(s, sector_num, extent); - if (!extent) { - return -EIO; - } - index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); - n = extent->cluster_sectors - index_in_cluster; - if (n > nb_sectors) { - n = nb_sectors; - } - ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, - !(extent->compressed || zeroed), - &cluster_offset, - index_in_cluster, index_in_cluster + n); - if (extent->compressed) { - if (ret == VMDK_OK) { - /* Refuse write to allocated cluster for streamOptimized */ - error_report("Could not write to allocated cluster" - " for streamOptimized"); - return -EIO; - } else { - /* allocate */ - ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, - true, &cluster_offset, 0, 0); - } - } - if (ret == VMDK_ERROR) { - return -EINVAL; - } - if (zeroed) { - /* Do zeroed write, buf is ignored */ - if (extent->has_zero_grain && - index_in_cluster == 0 && - n >= extent->cluster_sectors) { - n = extent->cluster_sectors; - if (!zero_dry_run) { - /* update L2 tables */ - if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED) - != VMDK_OK) { - return -EIO; - } - } - } else { - return -ENOTSUP; - } - } else { - ret = vmdk_write_extent(extent, - cluster_offset, index_in_cluster * 512, - buf, n, sector_num); - if (ret) { - return ret; - } - if (m_data.valid) { - /* update L2 tables */ - if (vmdk_L2update(extent, &m_data, - cluster_offset >> BDRV_SECTOR_BITS) - != VMDK_OK) { - return -EIO; - } - } - } - nb_sectors -= n; - 
sector_num += n; - buf += n * 512; - - /* update CID on the first write every time the virtual disk is - * opened */ - if (!s->cid_updated) { - ret = vmdk_write_cid(bs, g_random_int()); - if (ret < 0) { - return ret; - } - s->cid_updated = true; - } - } - return 0; -} - -static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVVmdkState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false); - qemu_co_mutex_unlock(&s->lock); - return ret; -} - -static int vmdk_write_compressed(BlockDriverState *bs, - int64_t sector_num, - const uint8_t *buf, - int nb_sectors) -{ - BDRVVmdkState *s = bs->opaque; - if (s->num_extents == 1 && s->extents[0].compressed) { - return vmdk_write(bs, sector_num, buf, nb_sectors, false, false); - } else { - return -ENOTSUP; - } -} - -static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, - int64_t sector_num, - int nb_sectors, - BdrvRequestFlags flags) -{ - int ret; - BDRVVmdkState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - /* write zeroes could fail if sectors not aligned to cluster, test it with - * dry_run == true before really updating image */ - ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true); - if (!ret) { - ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false); - } - qemu_co_mutex_unlock(&s->lock); - return ret; -} - -static int vmdk_create_extent(const char *filename, int64_t filesize, - bool flat, bool compress, bool zeroed_grain, - QemuOpts *opts, Error **errp) -{ - int ret, i; - BlockBackend *blk = NULL; - VMDK4Header header; - Error *local_err = NULL; - uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count; - uint32_t *gd_buf = NULL; - int gd_buf_size; - - ret = bdrv_create_file(filename, opts, &local_err); - if (ret < 0) { - error_propagate(errp, local_err); - goto exit; - } - - blk = blk_new_open(filename, NULL, NULL, - BDRV_O_RDWR | 
BDRV_O_PROTOCOL, &local_err); - if (blk == NULL) { - error_propagate(errp, local_err); - ret = -EIO; - goto exit; - } - - blk_set_allow_write_beyond_eof(blk, true); - - if (flat) { - ret = blk_truncate(blk, filesize); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not truncate file"); - } - goto exit; - } - magic = cpu_to_be32(VMDK4_MAGIC); - memset(&header, 0, sizeof(header)); - if (compress) { - header.version = 3; - } else if (zeroed_grain) { - header.version = 2; - } else { - header.version = 1; - } - header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT - | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0) - | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0); - header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0; - header.capacity = filesize / BDRV_SECTOR_SIZE; - header.granularity = 128; - header.num_gtes_per_gt = BDRV_SECTOR_SIZE; - - grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity); - gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t), - BDRV_SECTOR_SIZE); - gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt); - gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE); - - header.desc_offset = 1; - header.desc_size = 20; - header.rgd_offset = header.desc_offset + header.desc_size; - header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count); - header.grain_offset = - ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count), - header.granularity); - /* swap endianness for all header fields */ - header.version = cpu_to_le32(header.version); - header.flags = cpu_to_le32(header.flags); - header.capacity = cpu_to_le64(header.capacity); - header.granularity = cpu_to_le64(header.granularity); - header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt); - header.desc_offset = cpu_to_le64(header.desc_offset); - header.desc_size = cpu_to_le64(header.desc_size); - header.rgd_offset = cpu_to_le64(header.rgd_offset); - header.gd_offset = cpu_to_le64(header.gd_offset); - 
header.grain_offset = cpu_to_le64(header.grain_offset); - header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm); - - header.check_bytes[0] = 0xa; - header.check_bytes[1] = 0x20; - header.check_bytes[2] = 0xd; - header.check_bytes[3] = 0xa; - - /* write all the data */ - ret = blk_pwrite(blk, 0, &magic, sizeof(magic)); - if (ret < 0) { - error_setg(errp, QERR_IO_ERROR); - goto exit; - } - ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header)); - if (ret < 0) { - error_setg(errp, QERR_IO_ERROR); - goto exit; - } - - ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not truncate file"); - goto exit; - } - - /* write grain directory */ - gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE; - gd_buf = g_malloc0(gd_buf_size); - for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors; - i < gt_count; i++, tmp += gt_size) { - gd_buf[i] = cpu_to_le32(tmp); - } - ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, - gd_buf, gd_buf_size); - if (ret < 0) { - error_setg(errp, QERR_IO_ERROR); - goto exit; - } - - /* write backup grain directory */ - for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors; - i < gt_count; i++, tmp += gt_size) { - gd_buf[i] = cpu_to_le32(tmp); - } - ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, - gd_buf, gd_buf_size); - if (ret < 0) { - error_setg(errp, QERR_IO_ERROR); - goto exit; - } - - ret = 0; -exit: - if (blk) { - blk_unref(blk); - } - g_free(gd_buf); - return ret; -} - -static int filename_decompose(const char *filename, char *path, char *prefix, - char *postfix, size_t buf_len, Error **errp) -{ - const char *p, *q; - - if (filename == NULL || !strlen(filename)) { - error_setg(errp, "No filename provided"); - return VMDK_ERROR; - } - p = strrchr(filename, '/'); - if (p == NULL) { - p = strrchr(filename, '\\'); - } - if (p == NULL) { - p = strrchr(filename, ':'); - } - if (p != NULL) { - p++; - if (p - 
filename >= buf_len) { - return VMDK_ERROR; - } - pstrcpy(path, p - filename + 1, filename); - } else { - p = filename; - path[0] = '\0'; - } - q = strrchr(p, '.'); - if (q == NULL) { - pstrcpy(prefix, buf_len, p); - postfix[0] = '\0'; - } else { - if (q - p >= buf_len) { - return VMDK_ERROR; - } - pstrcpy(prefix, q - p + 1, p); - pstrcpy(postfix, buf_len, q); - } - return VMDK_OK; -} - -static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) -{ - int idx = 0; - BlockBackend *new_blk = NULL; - Error *local_err = NULL; - char *desc = NULL; - int64_t total_size = 0, filesize; - char *adapter_type = NULL; - char *backing_file = NULL; - char *fmt = NULL; - int flags = 0; - int ret = 0; - bool flat, split, compress; - GString *ext_desc_lines; - char *path = g_malloc0(PATH_MAX); - char *prefix = g_malloc0(PATH_MAX); - char *postfix = g_malloc0(PATH_MAX); - char *desc_line = g_malloc0(BUF_SIZE); - char *ext_filename = g_malloc0(PATH_MAX); - char *desc_filename = g_malloc0(PATH_MAX); - const int64_t split_size = 0x80000000; /* VMDK has constant split size */ - const char *desc_extent_line; - char *parent_desc_line = g_malloc0(BUF_SIZE); - uint32_t parent_cid = 0xffffffff; - uint32_t number_heads = 16; - bool zeroed_grain = false; - uint32_t desc_offset = 0, desc_len; - const char desc_template[] = - "# Disk DescriptorFile\n" - "version=1\n" - "CID=%" PRIx32 "\n" - "parentCID=%" PRIx32 "\n" - "createType=\"%s\"\n" - "%s" - "\n" - "# Extent description\n" - "%s" - "\n" - "# The Disk Data Base\n" - "#DDB\n" - "\n" - "ddb.virtualHWVersion = \"%d\"\n" - "ddb.geometry.cylinders = \"%" PRId64 "\"\n" - "ddb.geometry.heads = \"%" PRIu32 "\"\n" - "ddb.geometry.sectors = \"63\"\n" - "ddb.adapterType = \"%s\"\n"; - - ext_desc_lines = g_string_new(NULL); - - if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) { - ret = -EINVAL; - goto exit; - } - /* Read out options */ - total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - 
BDRV_SECTOR_SIZE); - adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE); - backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); - if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) { - flags |= BLOCK_FLAG_COMPAT6; - } - fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); - if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) { - zeroed_grain = true; - } - - if (!adapter_type) { - adapter_type = g_strdup("ide"); - } else if (strcmp(adapter_type, "ide") && - strcmp(adapter_type, "buslogic") && - strcmp(adapter_type, "lsilogic") && - strcmp(adapter_type, "legacyESX")) { - error_setg(errp, "Unknown adapter type: '%s'", adapter_type); - ret = -EINVAL; - goto exit; - } - if (strcmp(adapter_type, "ide") != 0) { - /* that's the number of heads with which vmware operates when - creating, exporting, etc. vmdk files with a non-ide adapter type */ - number_heads = 255; - } - if (!fmt) { - /* Default format to monolithicSparse */ - fmt = g_strdup("monolithicSparse"); - } else if (strcmp(fmt, "monolithicFlat") && - strcmp(fmt, "monolithicSparse") && - strcmp(fmt, "twoGbMaxExtentSparse") && - strcmp(fmt, "twoGbMaxExtentFlat") && - strcmp(fmt, "streamOptimized")) { - error_setg(errp, "Unknown subformat: '%s'", fmt); - ret = -EINVAL; - goto exit; - } - split = !(strcmp(fmt, "twoGbMaxExtentFlat") && - strcmp(fmt, "twoGbMaxExtentSparse")); - flat = !(strcmp(fmt, "monolithicFlat") && - strcmp(fmt, "twoGbMaxExtentFlat")); - compress = !strcmp(fmt, "streamOptimized"); - if (flat) { - desc_extent_line = "RW %" PRId64 " FLAT \"%s\" 0\n"; - } else { - desc_extent_line = "RW %" PRId64 " SPARSE \"%s\"\n"; - } - if (flat && backing_file) { - error_setg(errp, "Flat image can't have backing file"); - ret = -ENOTSUP; - goto exit; - } - if (flat && zeroed_grain) { - error_setg(errp, "Flat image can't enable zeroed grain"); - ret = -ENOTSUP; - goto exit; - } - if (backing_file) { - BlockBackend *blk; - char *full_backing = g_new0(char, PATH_MAX); - 
bdrv_get_full_backing_filename_from_filename(filename, backing_file, - full_backing, PATH_MAX, - &local_err); - if (local_err) { - g_free(full_backing); - error_propagate(errp, local_err); - ret = -ENOENT; - goto exit; - } - - blk = blk_new_open(full_backing, NULL, NULL, - BDRV_O_NO_BACKING, errp); - g_free(full_backing); - if (blk == NULL) { - ret = -EIO; - goto exit; - } - if (strcmp(blk_bs(blk)->drv->format_name, "vmdk")) { - blk_unref(blk); - ret = -EINVAL; - goto exit; - } - parent_cid = vmdk_read_cid(blk_bs(blk), 0); - blk_unref(blk); - snprintf(parent_desc_line, BUF_SIZE, - "parentFileNameHint=\"%s\"", backing_file); - } - - /* Create extents */ - filesize = total_size; - while (filesize > 0) { - int64_t size = filesize; - - if (split && size > split_size) { - size = split_size; - } - if (split) { - snprintf(desc_filename, PATH_MAX, "%s-%c%03d%s", - prefix, flat ? 'f' : 's', ++idx, postfix); - } else if (flat) { - snprintf(desc_filename, PATH_MAX, "%s-flat%s", prefix, postfix); - } else { - snprintf(desc_filename, PATH_MAX, "%s%s", prefix, postfix); - } - snprintf(ext_filename, PATH_MAX, "%s%s", path, desc_filename); - - if (vmdk_create_extent(ext_filename, size, - flat, compress, zeroed_grain, opts, errp)) { - ret = -EINVAL; - goto exit; - } - filesize -= size; - - /* Format description line */ - snprintf(desc_line, BUF_SIZE, - desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename); - g_string_append(ext_desc_lines, desc_line); - } - /* generate descriptor file */ - desc = g_strdup_printf(desc_template, - g_random_int(), - parent_cid, - fmt, - parent_desc_line, - ext_desc_lines->str, - (flags & BLOCK_FLAG_COMPAT6 ? 
6 : 4), - total_size / - (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE), - number_heads, - adapter_type); - desc_len = strlen(desc); - /* the descriptor offset = 0x200 */ - if (!split && !flat) { - desc_offset = 0x200; - } else { - ret = bdrv_create_file(filename, opts, &local_err); - if (ret < 0) { - error_propagate(errp, local_err); - goto exit; - } - } - - new_blk = blk_new_open(filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); - if (new_blk == NULL) { - error_propagate(errp, local_err); - ret = -EIO; - goto exit; - } - - blk_set_allow_write_beyond_eof(new_blk, true); - - ret = blk_pwrite(new_blk, desc_offset, desc, desc_len); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not write description"); - goto exit; - } - /* bdrv_pwrite write padding zeros to align to sector, we don't need that - * for description file */ - if (desc_offset == 0) { - ret = blk_truncate(new_blk, desc_len); - if (ret < 0) { - error_setg_errno(errp, -ret, "Could not truncate file"); - } - } -exit: - if (new_blk) { - blk_unref(new_blk); - } - g_free(adapter_type); - g_free(backing_file); - g_free(fmt); - g_free(desc); - g_free(path); - g_free(prefix); - g_free(postfix); - g_free(desc_line); - g_free(ext_filename); - g_free(desc_filename); - g_free(parent_desc_line); - g_string_free(ext_desc_lines, true); - return ret; -} - -static void vmdk_close(BlockDriverState *bs) -{ - BDRVVmdkState *s = bs->opaque; - - vmdk_free_extents(bs); - g_free(s->create_type); - - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); -} - -static coroutine_fn int vmdk_co_flush(BlockDriverState *bs) -{ - BDRVVmdkState *s = bs->opaque; - int i, err; - int ret = 0; - - for (i = 0; i < s->num_extents; i++) { - err = bdrv_co_flush(s->extents[i].file->bs); - if (err < 0) { - ret = err; - } - } - return ret; -} - -static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs) -{ - int i; - int64_t ret = 0; - int64_t r; - BDRVVmdkState *s = bs->opaque; - - ret = 
bdrv_get_allocated_file_size(bs->file->bs); - if (ret < 0) { - return ret; - } - for (i = 0; i < s->num_extents; i++) { - if (s->extents[i].file == bs->file) { - continue; - } - r = bdrv_get_allocated_file_size(s->extents[i].file->bs); - if (r < 0) { - return r; - } - ret += r; - } - return ret; -} - -static int vmdk_has_zero_init(BlockDriverState *bs) -{ - int i; - BDRVVmdkState *s = bs->opaque; - - /* If has a flat extent and its underlying storage doesn't have zero init, - * return 0. */ - for (i = 0; i < s->num_extents; i++) { - if (s->extents[i].flat) { - if (!bdrv_has_zero_init(s->extents[i].file->bs)) { - return 0; - } - } - } - return 1; -} - -static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent) -{ - ImageInfo *info = g_new0(ImageInfo, 1); - - *info = (ImageInfo){ - .filename = g_strdup(extent->file->bs->filename), - .format = g_strdup(extent->type), - .virtual_size = extent->sectors * BDRV_SECTOR_SIZE, - .compressed = extent->compressed, - .has_compressed = extent->compressed, - .cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE, - .has_cluster_size = !extent->flat, - }; - - return info; -} - -static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, - BdrvCheckMode fix) -{ - BDRVVmdkState *s = bs->opaque; - VmdkExtent *extent = NULL; - int64_t sector_num = 0; - int64_t total_sectors = bdrv_nb_sectors(bs); - int ret; - uint64_t cluster_offset; - - if (fix) { - return -ENOTSUP; - } - - for (;;) { - if (sector_num >= total_sectors) { - return 0; - } - extent = find_extent(s, sector_num, extent); - if (!extent) { - fprintf(stderr, - "ERROR: could not find extent for sector %" PRId64 "\n", - sector_num); - break; - } - ret = get_cluster_offset(bs, extent, NULL, - sector_num << BDRV_SECTOR_BITS, - false, &cluster_offset, 0, 0); - if (ret == VMDK_ERROR) { - fprintf(stderr, - "ERROR: could not get cluster_offset for sector %" - PRId64 "\n", sector_num); - break; - } - if (ret == VMDK_OK && - cluster_offset >= 
bdrv_getlength(extent->file->bs)) - { - fprintf(stderr, - "ERROR: cluster offset for sector %" - PRId64 " points after EOF\n", sector_num); - break; - } - sector_num += extent->cluster_sectors; - } - - result->corruptions++; - return 0; -} - -static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs) -{ - int i; - BDRVVmdkState *s = bs->opaque; - ImageInfoSpecific *spec_info = g_new0(ImageInfoSpecific, 1); - ImageInfoList **next; - - *spec_info = (ImageInfoSpecific){ - .type = IMAGE_INFO_SPECIFIC_KIND_VMDK, - .u = { - .vmdk.data = g_new0(ImageInfoSpecificVmdk, 1), - }, - }; - - *spec_info->u.vmdk.data = (ImageInfoSpecificVmdk) { - .create_type = g_strdup(s->create_type), - .cid = s->cid, - .parent_cid = s->parent_cid, - }; - - next = &spec_info->u.vmdk.data->extents; - for (i = 0; i < s->num_extents; i++) { - *next = g_new0(ImageInfoList, 1); - (*next)->value = vmdk_get_extent_info(&s->extents[i]); - (*next)->next = NULL; - next = &(*next)->next; - } - - return spec_info; -} - -static bool vmdk_extents_type_eq(const VmdkExtent *a, const VmdkExtent *b) -{ - return a->flat == b->flat && - a->compressed == b->compressed && - (a->flat || a->cluster_sectors == b->cluster_sectors); -} - -static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - int i; - BDRVVmdkState *s = bs->opaque; - assert(s->num_extents); - - /* See if we have multiple extents but they have different cases */ - for (i = 1; i < s->num_extents; i++) { - if (!vmdk_extents_type_eq(&s->extents[0], &s->extents[i])) { - return -ENOTSUP; - } - } - bdi->needs_compressed_writes = s->extents[0].compressed; - if (!s->extents[0].flat) { - bdi->cluster_size = s->extents[0].cluster_sectors << BDRV_SECTOR_BITS; - } - return 0; -} - -static void vmdk_detach_aio_context(BlockDriverState *bs) -{ - BDRVVmdkState *s = bs->opaque; - int i; - - for (i = 0; i < s->num_extents; i++) { - bdrv_detach_aio_context(s->extents[i].file->bs); - } -} - -static void vmdk_attach_aio_context(BlockDriverState 
*bs, - AioContext *new_context) -{ - BDRVVmdkState *s = bs->opaque; - int i; - - for (i = 0; i < s->num_extents; i++) { - bdrv_attach_aio_context(s->extents[i].file->bs, new_context); - } -} - -static QemuOptsList vmdk_create_opts = { - .name = "vmdk-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_ADAPTER_TYPE, - .type = QEMU_OPT_STRING, - .help = "Virtual adapter type, can be one of " - "ide (default), lsilogic, buslogic or legacyESX" - }, - { - .name = BLOCK_OPT_BACKING_FILE, - .type = QEMU_OPT_STRING, - .help = "File name of a base image" - }, - { - .name = BLOCK_OPT_COMPAT6, - .type = QEMU_OPT_BOOL, - .help = "VMDK version 6 image", - .def_value_str = "off" - }, - { - .name = BLOCK_OPT_SUBFMT, - .type = QEMU_OPT_STRING, - .help = - "VMDK flat extent format, can be one of " - "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} " - }, - { - .name = BLOCK_OPT_ZEROED_GRAIN, - .type = QEMU_OPT_BOOL, - .help = "Enable efficient zero writes " - "using the zeroed-grain GTE feature" - }, - { /* end of list */ } - } -}; - -static BlockDriver bdrv_vmdk = { - .format_name = "vmdk", - .instance_size = sizeof(BDRVVmdkState), - .bdrv_probe = vmdk_probe, - .bdrv_open = vmdk_open, - .bdrv_check = vmdk_check, - .bdrv_reopen_prepare = vmdk_reopen_prepare, - .bdrv_read = vmdk_co_read, - .bdrv_write = vmdk_co_write, - .bdrv_write_compressed = vmdk_write_compressed, - .bdrv_co_write_zeroes = vmdk_co_write_zeroes, - .bdrv_close = vmdk_close, - .bdrv_create = vmdk_create, - .bdrv_co_flush_to_disk = vmdk_co_flush, - .bdrv_co_get_block_status = vmdk_co_get_block_status, - .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, - .bdrv_has_zero_init = vmdk_has_zero_init, - .bdrv_get_specific_info = vmdk_get_specific_info, - .bdrv_refresh_limits = vmdk_refresh_limits, - 
.bdrv_get_info = vmdk_get_info, - .bdrv_detach_aio_context = vmdk_detach_aio_context, - .bdrv_attach_aio_context = vmdk_attach_aio_context, - - .supports_backing = true, - .create_opts = &vmdk_create_opts, -}; - -static void bdrv_vmdk_init(void) -{ - bdrv_register(&bdrv_vmdk); -} - -block_init(bdrv_vmdk_init); diff --git a/qemu/block/vpc.c b/qemu/block/vpc.c deleted file mode 100644 index 3e2ea698d..000000000 --- a/qemu/block/vpc.c +++ /dev/null @@ -1,1074 +0,0 @@ -/* - * Block driver for Connectix / Microsoft Virtual PC images - * - * Copyright (c) 2005 Alex Beregszaszi - * Copyright (c) 2009 Kevin Wolf - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu-common.h" -#include "block/block_int.h" -#include "sysemu/block-backend.h" -#include "qemu/module.h" -#include "migration/migration.h" -#if defined(CONFIG_UUID) -#include -#endif - -/**************************************************************/ - -#define HEADER_SIZE 512 - -//#define CACHE - -enum vhd_type { - VHD_FIXED = 2, - VHD_DYNAMIC = 3, - VHD_DIFFERENCING = 4, -}; - -/* Seconds since Jan 1, 2000 0:00:00 (UTC) */ -#define VHD_TIMESTAMP_BASE 946684800 - -#define VHD_CHS_MAX_C 65535LL -#define VHD_CHS_MAX_H 16 -#define VHD_CHS_MAX_S 255 - -#define VHD_MAX_SECTORS 0xff000000 /* 2040 GiB max image size */ -#define VHD_MAX_GEOMETRY (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S) - -#define VPC_OPT_FORCE_SIZE "force_size" - -/* always big-endian */ -typedef struct vhd_footer { - char creator[8]; /* "conectix" */ - uint32_t features; - uint32_t version; - - /* Offset of next header structure, 0xFFFFFFFF if none */ - uint64_t data_offset; - - /* Seconds since Jan 1, 2000 0:00:00 (UTC) */ - uint32_t timestamp; - - char creator_app[4]; /* e.g., "vpc " */ - uint16_t major; - uint16_t minor; - char creator_os[4]; /* "Wi2k" */ - - uint64_t orig_size; - uint64_t current_size; - - uint16_t cyls; - uint8_t heads; - uint8_t secs_per_cyl; - - uint32_t type; - - /* Checksum of the Hard Disk Footer ("one's complement of the sum of all - the bytes in the footer without the checksum field") */ - uint32_t checksum; - - /* UUID used to identify a parent hard disk (backing file) */ - uint8_t uuid[16]; - - uint8_t in_saved_state; -} QEMU_PACKED VHDFooter; - -typedef struct vhd_dyndisk_header { - char magic[8]; /* "cxsparse" */ - - /* Offset of next header structure, 0xFFFFFFFF if none */ - uint64_t data_offset; - - /* Offset of the Block Allocation Table (BAT) */ - uint64_t table_offset; - - uint32_t version; - uint32_t max_table_entries; /* 32bit/entry */ - - /* 2 MB by default, must be a power of two */ - 
uint32_t block_size; - - uint32_t checksum; - uint8_t parent_uuid[16]; - uint32_t parent_timestamp; - uint32_t reserved; - - /* Backing file name (in UTF-16) */ - uint8_t parent_name[512]; - - struct { - uint32_t platform; - uint32_t data_space; - uint32_t data_length; - uint32_t reserved; - uint64_t data_offset; - } parent_locator[8]; -} QEMU_PACKED VHDDynDiskHeader; - -typedef struct BDRVVPCState { - CoMutex lock; - uint8_t footer_buf[HEADER_SIZE]; - uint64_t free_data_block_offset; - int max_table_entries; - uint32_t *pagetable; - uint64_t bat_offset; - uint64_t last_bitmap_offset; - - uint32_t block_size; - uint32_t bitmap_size; - bool force_use_chs; - bool force_use_sz; - -#ifdef CACHE - uint8_t *pageentry_u8; - uint32_t *pageentry_u32; - uint16_t *pageentry_u16; - - uint64_t last_bitmap; -#endif - - Error *migration_blocker; -} BDRVVPCState; - -#define VPC_OPT_SIZE_CALC "force_size_calc" -static QemuOptsList vpc_runtime_opts = { - .name = "vpc-runtime-opts", - .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head), - .desc = { - { - .name = VPC_OPT_SIZE_CALC, - .type = QEMU_OPT_STRING, - .help = "Force disk size calculation to use either CHS geometry, " - "or use the disk current_size specified in the VHD footer. 
" - "{chs, current_size}" - }, - { /* end of list */ } - } -}; - -static uint32_t vpc_checksum(uint8_t* buf, size_t size) -{ - uint32_t res = 0; - int i; - - for (i = 0; i < size; i++) - res += buf[i]; - - return ~res; -} - - -static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename) -{ - if (buf_size >= 8 && !strncmp((char *)buf, "conectix", 8)) - return 100; - return 0; -} - -static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts, - Error **errp) -{ - BDRVVPCState *s = bs->opaque; - const char *size_calc; - - size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC); - - if (!size_calc) { - /* no override, use autodetect only */ - } else if (!strcmp(size_calc, "current_size")) { - s->force_use_sz = true; - } else if (!strcmp(size_calc, "chs")) { - s->force_use_chs = true; - } else { - error_setg(errp, "Invalid size calculation mode: '%s'", size_calc); - } -} - -static int vpc_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVVPCState *s = bs->opaque; - int i; - VHDFooter *footer; - VHDDynDiskHeader *dyndisk_header; - QemuOpts *opts = NULL; - Error *local_err = NULL; - bool use_chs; - uint8_t buf[HEADER_SIZE]; - uint32_t checksum; - uint64_t computed_size; - uint64_t pagetable_size; - int disk_type = VHD_DYNAMIC; - int ret; - - opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; - } - - vpc_parse_options(bs, opts, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; - } - - ret = bdrv_pread(bs->file->bs, 0, s->footer_buf, HEADER_SIZE); - if (ret < 0) { - error_setg(errp, "Unable to read VHD header"); - goto fail; - } - - footer = (VHDFooter *) s->footer_buf; - if (strncmp(footer->creator, "conectix", 8)) { - int64_t offset = bdrv_getlength(bs->file->bs); - if (offset < 0) { - ret = offset; - error_setg(errp, "Invalid 
file size"); - goto fail; - } else if (offset < HEADER_SIZE) { - ret = -EINVAL; - error_setg(errp, "File too small for a VHD header"); - goto fail; - } - - /* If a fixed disk, the footer is found only at the end of the file */ - ret = bdrv_pread(bs->file->bs, offset-HEADER_SIZE, s->footer_buf, - HEADER_SIZE); - if (ret < 0) { - goto fail; - } - if (strncmp(footer->creator, "conectix", 8)) { - error_setg(errp, "invalid VPC image"); - ret = -EINVAL; - goto fail; - } - disk_type = VHD_FIXED; - } - - checksum = be32_to_cpu(footer->checksum); - footer->checksum = 0; - if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum) - fprintf(stderr, "block-vpc: The header checksum of '%s' is " - "incorrect.\n", bs->filename); - - /* Write 'checksum' back to footer, or else will leave it with zero. */ - footer->checksum = cpu_to_be32(checksum); - - /* The visible size of a image in Virtual PC depends on the geometry - rather than on the size stored in the footer (the size in the footer - is too large usually) */ - bs->total_sectors = (int64_t) - be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl; - - /* Microsoft Virtual PC and Microsoft Hyper-V produce and read - * VHD image sizes differently. VPC will rely on CHS geometry, - * while Hyper-V and disk2vhd use the size specified in the footer. - * - * We use a couple of approaches to try and determine the correct method: - * look at the Creator App field, and look for images that have CHS - * geometry that is the maximum value. - * - * If the CHS geometry is the maximum CHS geometry, then we assume that - * the size is the footer->current_size to avoid truncation. 
Otherwise, - * we follow the table based on footer->creator_app: - * - * Known creator apps: - * 'vpc ' : CHS Virtual PC (uses disk geometry) - * 'qemu' : CHS QEMU (uses disk geometry) - * 'qem2' : current_size QEMU (uses current_size) - * 'win ' : current_size Hyper-V - * 'd2v ' : current_size Disk2vhd - * 'tap\0' : current_size XenServer - * 'CTXS' : current_size XenConverter - * - * The user can override the table values via drive options, however - * even with an override we will still use current_size for images - * that have CHS geometry of the maximum size. - */ - use_chs = (!!strncmp(footer->creator_app, "win ", 4) && - !!strncmp(footer->creator_app, "qem2", 4) && - !!strncmp(footer->creator_app, "d2v ", 4) && - !!strncmp(footer->creator_app, "CTXS", 4) && - !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs; - - if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) { - bs->total_sectors = be64_to_cpu(footer->current_size) / - BDRV_SECTOR_SIZE; - } - - /* Allow a maximum disk size of 2040 GiB */ - if (bs->total_sectors > VHD_MAX_SECTORS) { - ret = -EFBIG; - goto fail; - } - - if (disk_type == VHD_DYNAMIC) { - ret = bdrv_pread(bs->file->bs, be64_to_cpu(footer->data_offset), buf, - HEADER_SIZE); - if (ret < 0) { - error_setg(errp, "Error reading dynamic VHD header"); - goto fail; - } - - dyndisk_header = (VHDDynDiskHeader *) buf; - - if (strncmp(dyndisk_header->magic, "cxsparse", 8)) { - error_setg(errp, "Invalid header magic"); - ret = -EINVAL; - goto fail; - } - - s->block_size = be32_to_cpu(dyndisk_header->block_size); - if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) { - error_setg(errp, "Invalid block size %" PRIu32, s->block_size); - ret = -EINVAL; - goto fail; - } - s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511; - - s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries); - - if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) { - error_setg(errp, "Too many 
blocks"); - ret = -EINVAL; - goto fail; - } - - computed_size = (uint64_t) s->max_table_entries * s->block_size; - if (computed_size < bs->total_sectors * 512) { - error_setg(errp, "Page table too small"); - ret = -EINVAL; - goto fail; - } - - if (s->max_table_entries > SIZE_MAX / 4 || - s->max_table_entries > (int) INT_MAX / 4) { - error_setg(errp, "Max Table Entries too large (%" PRId32 ")", - s->max_table_entries); - ret = -EINVAL; - goto fail; - } - - pagetable_size = (uint64_t) s->max_table_entries * 4; - - s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size); - if (s->pagetable == NULL) { - error_setg(errp, "Unable to allocate memory for page table"); - ret = -ENOMEM; - goto fail; - } - - s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); - - ret = bdrv_pread(bs->file->bs, s->bat_offset, s->pagetable, - pagetable_size); - if (ret < 0) { - error_setg(errp, "Error reading pagetable"); - goto fail; - } - - s->free_data_block_offset = - ROUND_UP(s->bat_offset + pagetable_size, 512); - - for (i = 0; i < s->max_table_entries; i++) { - be32_to_cpus(&s->pagetable[i]); - if (s->pagetable[i] != 0xFFFFFFFF) { - int64_t next = (512 * (int64_t) s->pagetable[i]) + - s->bitmap_size + s->block_size; - - if (next > s->free_data_block_offset) { - s->free_data_block_offset = next; - } - } - } - - if (s->free_data_block_offset > bdrv_getlength(bs->file->bs)) { - error_setg(errp, "block-vpc: free_data_block_offset points after " - "the end of file. 
The image has been truncated."); - ret = -EINVAL; - goto fail; - } - - s->last_bitmap_offset = (int64_t) -1; - -#ifdef CACHE - s->pageentry_u8 = g_malloc(512); - s->pageentry_u32 = s->pageentry_u8; - s->pageentry_u16 = s->pageentry_u8; - s->last_pagetable = -1; -#endif - } - - qemu_co_mutex_init(&s->lock); - - /* Disable migration when VHD images are used */ - error_setg(&s->migration_blocker, "The vpc format used by node '%s' " - "does not support live migration", - bdrv_get_device_or_node_name(bs)); - migrate_add_blocker(s->migration_blocker); - - return 0; - -fail: - qemu_vfree(s->pagetable); -#ifdef CACHE - g_free(s->pageentry_u8); -#endif - return ret; -} - -static int vpc_reopen_prepare(BDRVReopenState *state, - BlockReopenQueue *queue, Error **errp) -{ - return 0; -} - -/* - * Returns the absolute byte offset of the given sector in the image file. - * If the sector is not allocated, -1 is returned instead. - * - * The parameter write must be 1 if the offset will be used for a write - * operation (the block bitmaps is updated then), 0 otherwise. - */ -static inline int64_t get_sector_offset(BlockDriverState *bs, - int64_t sector_num, int write) -{ - BDRVVPCState *s = bs->opaque; - uint64_t offset = sector_num * 512; - uint64_t bitmap_offset, block_offset; - uint32_t pagetable_index, pageentry_index; - - pagetable_index = offset / s->block_size; - pageentry_index = (offset % s->block_size) / 512; - - if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff) - return -1; /* not allocated */ - - bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index]; - block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index); - - /* We must ensure that we don't write to any sectors which are marked as - unused in the bitmap. We get away with setting all bits in the block - bitmap each time we write to a new block. 
This might cause Virtual PC to - miss sparse read optimization, but it's not a problem in terms of - correctness. */ - if (write && (s->last_bitmap_offset != bitmap_offset)) { - uint8_t bitmap[s->bitmap_size]; - - s->last_bitmap_offset = bitmap_offset; - memset(bitmap, 0xff, s->bitmap_size); - bdrv_pwrite_sync(bs->file->bs, bitmap_offset, bitmap, s->bitmap_size); - } - - return block_offset; -} - -/* - * Writes the footer to the end of the image file. This is needed when the - * file grows as it overwrites the old footer - * - * Returns 0 on success and < 0 on error - */ -static int rewrite_footer(BlockDriverState* bs) -{ - int ret; - BDRVVPCState *s = bs->opaque; - int64_t offset = s->free_data_block_offset; - - ret = bdrv_pwrite_sync(bs->file->bs, offset, s->footer_buf, HEADER_SIZE); - if (ret < 0) - return ret; - - return 0; -} - -/* - * Allocates a new block. This involves writing a new footer and updating - * the Block Allocation Table to use the space at the old end of the image - * file (overwriting the old footer) - * - * Returns the sectors' offset in the image file on success and < 0 on error - */ -static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) -{ - BDRVVPCState *s = bs->opaque; - int64_t bat_offset; - uint32_t index, bat_value; - int ret; - uint8_t bitmap[s->bitmap_size]; - - /* Check if sector_num is valid */ - if ((sector_num < 0) || (sector_num > bs->total_sectors)) - return -1; - - /* Write entry into in-memory BAT */ - index = (sector_num * 512) / s->block_size; - if (s->pagetable[index] != 0xFFFFFFFF) - return -1; - - s->pagetable[index] = s->free_data_block_offset / 512; - - /* Initialize the block's bitmap */ - memset(bitmap, 0xff, s->bitmap_size); - ret = bdrv_pwrite_sync(bs->file->bs, s->free_data_block_offset, bitmap, - s->bitmap_size); - if (ret < 0) { - return ret; - } - - /* Write new footer (the old one will be overwritten) */ - s->free_data_block_offset += s->block_size + s->bitmap_size; - ret = rewrite_footer(bs); - 
if (ret < 0) - goto fail; - - /* Write BAT entry to disk */ - bat_offset = s->bat_offset + (4 * index); - bat_value = cpu_to_be32(s->pagetable[index]); - ret = bdrv_pwrite_sync(bs->file->bs, bat_offset, &bat_value, 4); - if (ret < 0) - goto fail; - - return get_sector_offset(bs, sector_num, 0); - -fail: - s->free_data_block_offset -= (s->block_size + s->bitmap_size); - return -1; -} - -static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) -{ - BDRVVPCState *s = (BDRVVPCState *)bs->opaque; - VHDFooter *footer = (VHDFooter *) s->footer_buf; - - if (be32_to_cpu(footer->type) != VHD_FIXED) { - bdi->cluster_size = s->block_size; - } - - bdi->unallocated_blocks_are_zero = true; - return 0; -} - -static int vpc_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - BDRVVPCState *s = bs->opaque; - int ret; - int64_t offset; - int64_t sectors, sectors_per_block; - VHDFooter *footer = (VHDFooter *) s->footer_buf; - - if (be32_to_cpu(footer->type) == VHD_FIXED) { - return bdrv_read(bs->file->bs, sector_num, buf, nb_sectors); - } - while (nb_sectors > 0) { - offset = get_sector_offset(bs, sector_num, 0); - - sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; - sectors = sectors_per_block - (sector_num % sectors_per_block); - if (sectors > nb_sectors) { - sectors = nb_sectors; - } - - if (offset == -1) { - memset(buf, 0, sectors * BDRV_SECTOR_SIZE); - } else { - ret = bdrv_pread(bs->file->bs, offset, buf, - sectors * BDRV_SECTOR_SIZE); - if (ret != sectors * BDRV_SECTOR_SIZE) { - return -1; - } - } - - nb_sectors -= sectors; - sector_num += sectors; - buf += sectors * BDRV_SECTOR_SIZE; - } - return 0; -} - -static coroutine_fn int vpc_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVVPCState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = vpc_read(bs, sector_num, buf, nb_sectors); - qemu_co_mutex_unlock(&s->lock); - return ret; -} - -static int vpc_write(BlockDriverState 
*bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - BDRVVPCState *s = bs->opaque; - int64_t offset; - int64_t sectors, sectors_per_block; - int ret; - VHDFooter *footer = (VHDFooter *) s->footer_buf; - - if (be32_to_cpu(footer->type) == VHD_FIXED) { - return bdrv_write(bs->file->bs, sector_num, buf, nb_sectors); - } - while (nb_sectors > 0) { - offset = get_sector_offset(bs, sector_num, 1); - - sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; - sectors = sectors_per_block - (sector_num % sectors_per_block); - if (sectors > nb_sectors) { - sectors = nb_sectors; - } - - if (offset == -1) { - offset = alloc_block(bs, sector_num); - if (offset < 0) - return -1; - } - - ret = bdrv_pwrite(bs->file->bs, offset, buf, - sectors * BDRV_SECTOR_SIZE); - if (ret != sectors * BDRV_SECTOR_SIZE) { - return -1; - } - - nb_sectors -= sectors; - sector_num += sectors; - buf += sectors * BDRV_SECTOR_SIZE; - } - - return 0; -} - -static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVVPCState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = vpc_write(bs, sector_num, buf, nb_sectors); - qemu_co_mutex_unlock(&s->lock); - return ret; -} - -static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) -{ - BDRVVPCState *s = bs->opaque; - VHDFooter *footer = (VHDFooter*) s->footer_buf; - int64_t start, offset; - bool allocated; - int n; - - if (be32_to_cpu(footer->type) == VHD_FIXED) { - *pnum = nb_sectors; - *file = bs->file->bs; - return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA | - (sector_num << BDRV_SECTOR_BITS); - } - - offset = get_sector_offset(bs, sector_num, 0); - start = offset; - allocated = (offset != -1); - *pnum = 0; - - do { - /* All sectors in a block are contiguous (without using the bitmap) */ - n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE) - - 
sector_num; - n = MIN(n, nb_sectors); - - *pnum += n; - sector_num += n; - nb_sectors -= n; - /* *pnum can't be greater than one block for allocated - * sectors since there is always a bitmap in between. */ - if (allocated) { - *file = bs->file->bs; - return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; - } - if (nb_sectors == 0) { - break; - } - offset = get_sector_offset(bs, sector_num, 0); - } while (offset == -1); - - return 0; -} - -/* - * Calculates the number of cylinders, heads and sectors per cylinder - * based on a given number of sectors. This is the algorithm described - * in the VHD specification. - * - * Note that the geometry doesn't always exactly match total_sectors but - * may round it down. - * - * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override - * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB) - * and instead allow up to 255 heads. - */ -static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, - uint8_t* heads, uint8_t* secs_per_cyl) -{ - uint32_t cyls_times_heads; - - total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY); - - if (total_sectors >= 65535LL * 16 * 63) { - *secs_per_cyl = 255; - *heads = 16; - cyls_times_heads = total_sectors / *secs_per_cyl; - } else { - *secs_per_cyl = 17; - cyls_times_heads = total_sectors / *secs_per_cyl; - *heads = (cyls_times_heads + 1023) / 1024; - - if (*heads < 4) { - *heads = 4; - } - - if (cyls_times_heads >= (*heads * 1024) || *heads > 16) { - *secs_per_cyl = 31; - *heads = 16; - cyls_times_heads = total_sectors / *secs_per_cyl; - } - - if (cyls_times_heads >= (*heads * 1024)) { - *secs_per_cyl = 63; - *heads = 16; - cyls_times_heads = total_sectors / *secs_per_cyl; - } - } - - *cyls = cyls_times_heads / *heads; - - return 0; -} - -static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf, - int64_t total_sectors) -{ - VHDDynDiskHeader *dyndisk_header = - (VHDDynDiskHeader *) buf; - size_t block_size, num_bat_entries; - int i; - 
int ret; - int64_t offset = 0; - - /* Write the footer (twice: at the beginning and at the end) */ - block_size = 0x200000; - num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512); - - ret = blk_pwrite(blk, offset, buf, HEADER_SIZE); - if (ret < 0) { - goto fail; - } - - offset = 1536 + ((num_bat_entries * 4 + 511) & ~511); - ret = blk_pwrite(blk, offset, buf, HEADER_SIZE); - if (ret < 0) { - goto fail; - } - - /* Write the initial BAT */ - offset = 3 * 512; - - memset(buf, 0xFF, 512); - for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) { - ret = blk_pwrite(blk, offset, buf, 512); - if (ret < 0) { - goto fail; - } - offset += 512; - } - - /* Prepare the Dynamic Disk Header */ - memset(buf, 0, 1024); - - memcpy(dyndisk_header->magic, "cxsparse", 8); - - /* - * Note: The spec is actually wrong here for data_offset, it says - * 0xFFFFFFFF, but MS tools expect all 64 bits to be set. - */ - dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL); - dyndisk_header->table_offset = cpu_to_be64(3 * 512); - dyndisk_header->version = cpu_to_be32(0x00010000); - dyndisk_header->block_size = cpu_to_be32(block_size); - dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries); - - dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024)); - - /* Write the header */ - offset = 512; - - ret = blk_pwrite(blk, offset, buf, 1024); - if (ret < 0) { - goto fail; - } - - fail: - return ret; -} - -static int create_fixed_disk(BlockBackend *blk, uint8_t *buf, - int64_t total_size) -{ - int ret; - - /* Add footer to total size */ - total_size += HEADER_SIZE; - - ret = blk_truncate(blk, total_size); - if (ret < 0) { - return ret; - } - - ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE); - if (ret < 0) { - return ret; - } - - return ret; -} - -static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) -{ - uint8_t buf[1024]; - VHDFooter *footer = (VHDFooter *) buf; - char *disk_type_param; - int i; - uint16_t cyls 
= 0; - uint8_t heads = 0; - uint8_t secs_per_cyl = 0; - int64_t total_sectors; - int64_t total_size; - int disk_type; - int ret = -EIO; - bool force_size; - Error *local_err = NULL; - BlockBackend *blk = NULL; - - /* Read out options */ - total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), - BDRV_SECTOR_SIZE); - disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); - if (disk_type_param) { - if (!strcmp(disk_type_param, "dynamic")) { - disk_type = VHD_DYNAMIC; - } else if (!strcmp(disk_type_param, "fixed")) { - disk_type = VHD_FIXED; - } else { - error_setg(errp, "Invalid disk type, %s", disk_type_param); - ret = -EINVAL; - goto out; - } - } else { - disk_type = VHD_DYNAMIC; - } - - force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false); - - ret = bdrv_create_file(filename, opts, &local_err); - if (ret < 0) { - error_propagate(errp, local_err); - goto out; - } - - blk = blk_new_open(filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); - if (blk == NULL) { - error_propagate(errp, local_err); - ret = -EIO; - goto out; - } - - blk_set_allow_write_beyond_eof(blk, true); - - /* - * Calculate matching total_size and geometry. Increase the number of - * sectors requested until we get enough (or fail). This ensures that - * qemu-img convert doesn't truncate images, but rather rounds up. - * - * If the image size can't be represented by a spec conformant CHS geometry, - * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use - * the image size from the VHD footer to calculate total_sectors. 
- */ - if (force_size) { - /* This will force the use of total_size for sector count, below */ - cyls = VHD_CHS_MAX_C; - heads = VHD_CHS_MAX_H; - secs_per_cyl = VHD_CHS_MAX_S; - } else { - total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE); - for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) { - calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl); - } - } - - if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) { - total_sectors = total_size / BDRV_SECTOR_SIZE; - /* Allow a maximum disk size of 2040 GiB */ - if (total_sectors > VHD_MAX_SECTORS) { - error_setg(errp, "Disk size is too large, max size is 2040 GiB"); - ret = -EFBIG; - goto out; - } - } else { - total_sectors = (int64_t)cyls * heads * secs_per_cyl; - total_size = total_sectors * BDRV_SECTOR_SIZE; - } - - /* Prepare the Hard Disk Footer */ - memset(buf, 0, 1024); - - memcpy(footer->creator, "conectix", 8); - if (force_size) { - memcpy(footer->creator_app, "qem2", 4); - } else { - memcpy(footer->creator_app, "qemu", 4); - } - memcpy(footer->creator_os, "Wi2k", 4); - - footer->features = cpu_to_be32(0x02); - footer->version = cpu_to_be32(0x00010000); - if (disk_type == VHD_DYNAMIC) { - footer->data_offset = cpu_to_be64(HEADER_SIZE); - } else { - footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL); - } - footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE); - - /* Version of Virtual PC 2007 */ - footer->major = cpu_to_be16(0x0005); - footer->minor = cpu_to_be16(0x0003); - footer->orig_size = cpu_to_be64(total_size); - footer->current_size = cpu_to_be64(total_size); - footer->cyls = cpu_to_be16(cyls); - footer->heads = heads; - footer->secs_per_cyl = secs_per_cyl; - - footer->type = cpu_to_be32(disk_type); - -#if defined(CONFIG_UUID) - uuid_generate(footer->uuid); -#endif - - footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE)); - - if (disk_type == VHD_DYNAMIC) { - ret = create_dynamic_disk(blk, buf, total_sectors); - 
} else { - ret = create_fixed_disk(blk, buf, total_size); - } - if (ret < 0) { - error_setg(errp, "Unable to create or write VHD header"); - } - -out: - blk_unref(blk); - g_free(disk_type_param); - return ret; -} - -static int vpc_has_zero_init(BlockDriverState *bs) -{ - BDRVVPCState *s = bs->opaque; - VHDFooter *footer = (VHDFooter *) s->footer_buf; - - if (be32_to_cpu(footer->type) == VHD_FIXED) { - return bdrv_has_zero_init(bs->file->bs); - } else { - return 1; - } -} - -static void vpc_close(BlockDriverState *bs) -{ - BDRVVPCState *s = bs->opaque; - qemu_vfree(s->pagetable); -#ifdef CACHE - g_free(s->pageentry_u8); -#endif - - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); -} - -static QemuOptsList vpc_create_opts = { - .name = "vpc-create-opts", - .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head), - .desc = { - { - .name = BLOCK_OPT_SIZE, - .type = QEMU_OPT_SIZE, - .help = "Virtual disk size" - }, - { - .name = BLOCK_OPT_SUBFMT, - .type = QEMU_OPT_STRING, - .help = - "Type of virtual hard disk format. 
Supported formats are " - "{dynamic (default) | fixed} " - }, - { - .name = VPC_OPT_FORCE_SIZE, - .type = QEMU_OPT_BOOL, - .help = "Force disk size calculation to use the actual size " - "specified, rather than using the nearest CHS-based " - "calculation" - }, - { /* end of list */ } - } -}; - -static BlockDriver bdrv_vpc = { - .format_name = "vpc", - .instance_size = sizeof(BDRVVPCState), - - .bdrv_probe = vpc_probe, - .bdrv_open = vpc_open, - .bdrv_close = vpc_close, - .bdrv_reopen_prepare = vpc_reopen_prepare, - .bdrv_create = vpc_create, - - .bdrv_read = vpc_co_read, - .bdrv_write = vpc_co_write, - .bdrv_co_get_block_status = vpc_co_get_block_status, - - .bdrv_get_info = vpc_get_info, - - .create_opts = &vpc_create_opts, - .bdrv_has_zero_init = vpc_has_zero_init, -}; - -static void bdrv_vpc_init(void) -{ - bdrv_register(&bdrv_vpc); -} - -block_init(bdrv_vpc_init); diff --git a/qemu/block/vvfat.c b/qemu/block/vvfat.c deleted file mode 100644 index 183fc4f04..000000000 --- a/qemu/block/vvfat.c +++ /dev/null @@ -1,3050 +0,0 @@ -/* vim:set shiftwidth=4 ts=4: */ -/* - * QEMU Block driver for virtual VFAT (shadows a local directory) - * - * Copyright (c) 2004,2005 Johannes E. Schindelin - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu/osdep.h" -#include -#include "qapi/error.h" -#include "block/block_int.h" -#include "qemu/module.h" -#include "migration/migration.h" -#include "qapi/qmp/qint.h" -#include "qapi/qmp/qbool.h" -#include "qapi/qmp/qstring.h" -#include "qemu/cutils.h" - -#ifndef S_IWGRP -#define S_IWGRP 0 -#endif -#ifndef S_IWOTH -#define S_IWOTH 0 -#endif - -/* TODO: add ":bootsector=blabla.img:" */ -/* LATER TODO: add automatic boot sector generation from - BOOTEASY.ASM and Ranish Partition Manager - Note that DOS assumes the system files to be the first files in the - file system (test if the boot sector still relies on that fact)! */ -/* MAYBE TODO: write block-visofs.c */ -/* TODO: call try_commit() only after a timeout */ - -/* #define DEBUG */ - -#ifdef DEBUG - -#define DLOG(a) a - -static void checkpoint(void); - -#ifdef __MINGW32__ -void nonono(const char* file, int line, const char* msg) { - fprintf(stderr, "Nonono! 
%s:%d %s\n", file, line, msg); - exit(-5); -} -#undef assert -#define assert(a) do {if (!(a)) nonono(__FILE__, __LINE__, #a);}while(0) -#endif - -#else - -#define DLOG(a) - -#endif - -/* dynamic array functions */ -typedef struct array_t { - char* pointer; - unsigned int size,next,item_size; -} array_t; - -static inline void array_init(array_t* array,unsigned int item_size) -{ - array->pointer = NULL; - array->size=0; - array->next=0; - array->item_size=item_size; -} - -static inline void array_free(array_t* array) -{ - g_free(array->pointer); - array->size=array->next=0; -} - -/* does not automatically grow */ -static inline void* array_get(array_t* array,unsigned int index) { - assert(index < array->next); - return array->pointer + index * array->item_size; -} - -static inline int array_ensure_allocated(array_t* array, int index) -{ - if((index + 1) * array->item_size > array->size) { - int new_size = (index + 32) * array->item_size; - array->pointer = g_realloc(array->pointer, new_size); - if (!array->pointer) - return -1; - array->size = new_size; - array->next = index + 1; - } - - return 0; -} - -static inline void* array_get_next(array_t* array) { - unsigned int next = array->next; - void* result; - - if (array_ensure_allocated(array, next) < 0) - return NULL; - - array->next = next + 1; - result = array_get(array, next); - - return result; -} - -static inline void* array_insert(array_t* array,unsigned int index,unsigned int count) { - if((array->next+count)*array->item_size>array->size) { - int increment=count*array->item_size; - array->pointer=g_realloc(array->pointer,array->size+increment); - if(!array->pointer) - return NULL; - array->size+=increment; - } - memmove(array->pointer+(index+count)*array->item_size, - array->pointer+index*array->item_size, - (array->next-index)*array->item_size); - array->next+=count; - return array->pointer+index*array->item_size; -} - -/* this performs a "roll", so that the element which was at index_from becomes - * 
index_to, but the order of all other elements is preserved. */ -static inline int array_roll(array_t* array,int index_to,int index_from,int count) -{ - char* buf; - char* from; - char* to; - int is; - - if(!array || - index_to<0 || index_to>=array->next || - index_from<0 || index_from>=array->next) - return -1; - - if(index_to==index_from) - return 0; - - is=array->item_size; - from=array->pointer+index_from*is; - to=array->pointer+index_to*is; - buf=g_malloc(is*count); - memcpy(buf,from,is*count); - - if(index_to=0); - assert(count > 0); - assert(index + count <= array->next); - if(array_roll(array,array->next-1,index,count)) - return -1; - array->next -= count; - return 0; -} - -static int array_remove(array_t* array,int index) -{ - return array_remove_slice(array, index, 1); -} - -/* return the index for a given member */ -static int array_index(array_t* array, void* pointer) -{ - size_t offset = (char*)pointer - array->pointer; - assert((offset % array->item_size) == 0); - assert(offset/array->item_size < array->next); - return offset/array->item_size; -} - -/* These structures are used to fake a disk and the VFAT filesystem. - * For this reason we need to use QEMU_PACKED. 
*/ - -typedef struct bootsector_t { - uint8_t jump[3]; - uint8_t name[8]; - uint16_t sector_size; - uint8_t sectors_per_cluster; - uint16_t reserved_sectors; - uint8_t number_of_fats; - uint16_t root_entries; - uint16_t total_sectors16; - uint8_t media_type; - uint16_t sectors_per_fat; - uint16_t sectors_per_track; - uint16_t number_of_heads; - uint32_t hidden_sectors; - uint32_t total_sectors; - union { - struct { - uint8_t drive_number; - uint8_t current_head; - uint8_t signature; - uint32_t id; - uint8_t volume_label[11]; - } QEMU_PACKED fat16; - struct { - uint32_t sectors_per_fat; - uint16_t flags; - uint8_t major,minor; - uint32_t first_cluster_of_root_directory; - uint16_t info_sector; - uint16_t backup_boot_sector; - uint16_t ignored; - } QEMU_PACKED fat32; - } u; - uint8_t fat_type[8]; - uint8_t ignored[0x1c0]; - uint8_t magic[2]; -} QEMU_PACKED bootsector_t; - -typedef struct { - uint8_t head; - uint8_t sector; - uint8_t cylinder; -} mbr_chs_t; - -typedef struct partition_t { - uint8_t attributes; /* 0x80 = bootable */ - mbr_chs_t start_CHS; - uint8_t fs_type; /* 0x1 = FAT12, 0x6 = FAT16, 0xe = FAT16_LBA, 0xb = FAT32, 0xc = FAT32_LBA */ - mbr_chs_t end_CHS; - uint32_t start_sector_long; - uint32_t length_sector_long; -} QEMU_PACKED partition_t; - -typedef struct mbr_t { - uint8_t ignored[0x1b8]; - uint32_t nt_id; - uint8_t ignored2[2]; - partition_t partition[4]; - uint8_t magic[2]; -} QEMU_PACKED mbr_t; - -typedef struct direntry_t { - uint8_t name[8 + 3]; - uint8_t attributes; - uint8_t reserved[2]; - uint16_t ctime; - uint16_t cdate; - uint16_t adate; - uint16_t begin_hi; - uint16_t mtime; - uint16_t mdate; - uint16_t begin; - uint32_t size; -} QEMU_PACKED direntry_t; - -/* this structure are used to transparently access the files */ - -typedef struct mapping_t { - /* begin is the first cluster, end is the last+1 */ - uint32_t begin,end; - /* as s->directory is growable, no pointer may be used here */ - unsigned int dir_index; - /* the clusters of a 
file may be in any order; this points to the first */ - int first_mapping_index; - union { - /* offset is - * - the offset in the file (in clusters) for a file, or - * - the next cluster of the directory for a directory, and - * - the address of the buffer for a faked entry - */ - struct { - uint32_t offset; - } file; - struct { - int parent_mapping_index; - int first_dir_index; - } dir; - } info; - /* path contains the full path, i.e. it always starts with s->path */ - char* path; - - enum { MODE_UNDEFINED = 0, MODE_NORMAL = 1, MODE_MODIFIED = 2, - MODE_DIRECTORY = 4, MODE_FAKED = 8, - MODE_DELETED = 16, MODE_RENAMED = 32 } mode; - int read_only; -} mapping_t; - -#ifdef DEBUG -static void print_direntry(const struct direntry_t*); -static void print_mapping(const struct mapping_t* mapping); -#endif - -/* here begins the real VVFAT driver */ - -typedef struct BDRVVVFATState { - CoMutex lock; - BlockDriverState* bs; /* pointer to parent */ - unsigned int first_sectors_number; /* 1 for a single partition, 0x40 for a disk with partition table */ - unsigned char first_sectors[0x40*0x200]; - - int fat_type; /* 16 or 32 */ - array_t fat,directory,mapping; - char volume_label[11]; - - unsigned int cluster_size; - unsigned int sectors_per_cluster; - unsigned int sectors_per_fat; - unsigned int sectors_of_root_directory; - uint32_t last_cluster_of_root_directory; - unsigned int faked_sectors; /* how many sectors are faked before file data */ - uint32_t sector_count; /* total number of sectors of the partition */ - uint32_t cluster_count; /* total number of clusters of this partition */ - uint32_t max_fat_value; - - int current_fd; - mapping_t* current_mapping; - unsigned char* cluster; /* points to current cluster */ - unsigned char* cluster_buffer; /* points to a buffer to hold temp data */ - unsigned int current_cluster; - - /* write support */ - BlockDriverState* write_target; - char* qcow_filename; - BlockDriverState* qcow; - void* fat2; - char* used_clusters; - array_t 
commits; - const char* path; - int downcase_short_names; - - Error *migration_blocker; -} BDRVVVFATState; - -/* take the sector position spos and convert it to Cylinder/Head/Sector position - * if the position is outside the specified geometry, fill maximum value for CHS - * and return 1 to signal overflow. - */ -static int sector2CHS(mbr_chs_t *chs, int spos, int cyls, int heads, int secs) -{ - int head,sector; - sector = spos % secs; spos /= secs; - head = spos % heads; spos /= heads; - if (spos >= cyls) { - /* Overflow, - it happens if 32bit sector positions are used, while CHS is only 24bit. - Windows/Dos is said to take 1023/255/63 as nonrepresentable CHS */ - chs->head = 0xFF; - chs->sector = 0xFF; - chs->cylinder = 0xFF; - return 1; - } - chs->head = (uint8_t)head; - chs->sector = (uint8_t)( (sector+1) | ((spos>>8)<<6) ); - chs->cylinder = (uint8_t)spos; - return 0; -} - -static void init_mbr(BDRVVVFATState *s, int cyls, int heads, int secs) -{ - /* TODO: if the files mbr.img and bootsect.img exist, use them */ - mbr_t* real_mbr=(mbr_t*)s->first_sectors; - partition_t* partition = &(real_mbr->partition[0]); - int lba; - - memset(s->first_sectors,0,512); - - /* Win NT Disk Signature */ - real_mbr->nt_id= cpu_to_le32(0xbe1afdfa); - - partition->attributes=0x80; /* bootable */ - - /* LBA is used when partition is outside the CHS geometry */ - lba = sector2CHS(&partition->start_CHS, s->first_sectors_number - 1, - cyls, heads, secs); - lba |= sector2CHS(&partition->end_CHS, s->bs->total_sectors - 1, - cyls, heads, secs); - - /*LBA partitions are identified only by start/length_sector_long not by CHS*/ - partition->start_sector_long = cpu_to_le32(s->first_sectors_number - 1); - partition->length_sector_long = cpu_to_le32(s->bs->total_sectors - - s->first_sectors_number + 1); - - /* FAT12/FAT16/FAT32 */ - /* DOS uses different types when partition is LBA, - probably to prevent older versions from using CHS on them */ - partition->fs_type= s->fat_type==12 ? 
0x1: - s->fat_type==16 ? (lba?0xe:0x06): - /*fat_tyoe==32*/ (lba?0xc:0x0b); - - real_mbr->magic[0]=0x55; real_mbr->magic[1]=0xaa; -} - -/* direntry functions */ - -/* dest is assumed to hold 258 bytes, and pads with 0xffff up to next multiple of 26 */ -static inline int short2long_name(char* dest,const char* src) -{ - int i; - int len; - for(i=0;i<129 && src[i];i++) { - dest[2*i]=src[i]; - dest[2*i+1]=0; - } - len=2*i; - dest[2*i]=dest[2*i+1]=0; - for(i=2*i+2;(i%26);i++) - dest[i]=0xff; - return len; -} - -static inline direntry_t* create_long_filename(BDRVVVFATState* s,const char* filename) -{ - char buffer[258]; - int length=short2long_name(buffer,filename), - number_of_entries=(length+25)/26,i; - direntry_t* entry; - - for(i=0;idirectory)); - entry->attributes=0xf; - entry->reserved[0]=0; - entry->begin=0; - entry->name[0]=(number_of_entries-i)|(i==0?0x40:0); - } - for(i=0;i<26*number_of_entries;i++) { - int offset=(i%26); - if(offset<10) offset=1+offset; - else if(offset<22) offset=14+offset-10; - else offset=28+offset-22; - entry=array_get(&(s->directory),s->directory.next-1-(i/26)); - entry->name[offset]=buffer[i]; - } - return array_get(&(s->directory),s->directory.next-number_of_entries); -} - -static char is_free(const direntry_t* direntry) -{ - return direntry->name[0]==0xe5 || direntry->name[0]==0x00; -} - -static char is_volume_label(const direntry_t* direntry) -{ - return direntry->attributes == 0x28; -} - -static char is_long_name(const direntry_t* direntry) -{ - return direntry->attributes == 0xf; -} - -static char is_short_name(const direntry_t* direntry) -{ - return !is_volume_label(direntry) && !is_long_name(direntry) - && !is_free(direntry); -} - -static char is_directory(const direntry_t* direntry) -{ - return direntry->attributes & 0x10 && direntry->name[0] != 0xe5; -} - -static inline char is_dot(const direntry_t* direntry) -{ - return is_short_name(direntry) && direntry->name[0] == '.'; -} - -static char is_file(const direntry_t* direntry) -{ 
- return is_short_name(direntry) && !is_directory(direntry); -} - -static inline uint32_t begin_of_direntry(const direntry_t* direntry) -{ - return le16_to_cpu(direntry->begin)|(le16_to_cpu(direntry->begin_hi)<<16); -} - -static inline uint32_t filesize_of_direntry(const direntry_t* direntry) -{ - return le32_to_cpu(direntry->size); -} - -static void set_begin_of_direntry(direntry_t* direntry, uint32_t begin) -{ - direntry->begin = cpu_to_le16(begin & 0xffff); - direntry->begin_hi = cpu_to_le16((begin >> 16) & 0xffff); -} - -/* fat functions */ - -static inline uint8_t fat_chksum(const direntry_t* entry) -{ - uint8_t chksum=0; - int i; - - for (i = 0; i < ARRAY_SIZE(entry->name); i++) { - chksum = (((chksum & 0xfe) >> 1) | - ((chksum & 0x01) ? 0x80 : 0)) + entry->name[i]; - } - - return chksum; -} - -/* if return_time==0, this returns the fat_date, else the fat_time */ -static uint16_t fat_datetime(time_t time,int return_time) { - struct tm* t; - struct tm t1; - t = &t1; - localtime_r(&time,t); - if(return_time) - return cpu_to_le16((t->tm_sec/2)|(t->tm_min<<5)|(t->tm_hour<<11)); - return cpu_to_le16((t->tm_mday)|((t->tm_mon+1)<<5)|((t->tm_year-80)<<9)); -} - -static inline void fat_set(BDRVVVFATState* s,unsigned int cluster,uint32_t value) -{ - if(s->fat_type==32) { - uint32_t* entry=array_get(&(s->fat),cluster); - *entry=cpu_to_le32(value); - } else if(s->fat_type==16) { - uint16_t* entry=array_get(&(s->fat),cluster); - *entry=cpu_to_le16(value&0xffff); - } else { - int offset = (cluster*3/2); - unsigned char* p = array_get(&(s->fat), offset); - switch (cluster&1) { - case 0: - p[0] = value&0xff; - p[1] = (p[1]&0xf0) | ((value>>8)&0xf); - break; - case 1: - p[0] = (p[0]&0xf) | ((value&0xf)<<4); - p[1] = (value>>4); - break; - } - } -} - -static inline uint32_t fat_get(BDRVVVFATState* s,unsigned int cluster) -{ - if(s->fat_type==32) { - uint32_t* entry=array_get(&(s->fat),cluster); - return le32_to_cpu(*entry); - } else if(s->fat_type==16) { - uint16_t* 
entry=array_get(&(s->fat),cluster); - return le16_to_cpu(*entry); - } else { - const uint8_t* x=(uint8_t*)(s->fat.pointer)+cluster*3/2; - return ((x[0]|(x[1]<<8))>>(cluster&1?4:0))&0x0fff; - } -} - -static inline int fat_eof(BDRVVVFATState* s,uint32_t fat_entry) -{ - if(fat_entry>s->max_fat_value-8) - return -1; - return 0; -} - -static inline void init_fat(BDRVVVFATState* s) -{ - if (s->fat_type == 12) { - array_init(&(s->fat),1); - array_ensure_allocated(&(s->fat), - s->sectors_per_fat * 0x200 * 3 / 2 - 1); - } else { - array_init(&(s->fat),(s->fat_type==32?4:2)); - array_ensure_allocated(&(s->fat), - s->sectors_per_fat * 0x200 / s->fat.item_size - 1); - } - memset(s->fat.pointer,0,s->fat.size); - - switch(s->fat_type) { - case 12: s->max_fat_value=0xfff; break; - case 16: s->max_fat_value=0xffff; break; - case 32: s->max_fat_value=0x0fffffff; break; - default: s->max_fat_value=0; /* error... */ - } - -} - -/* TODO: in create_short_filename, 0xe5->0x05 is not yet handled! */ -/* TODO: in parse_short_filename, 0x05->0xe5 is not yet handled! */ -static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s, - unsigned int directory_start, const char* filename, int is_dot) -{ - int i,j,long_index=s->directory.next; - direntry_t* entry = NULL; - direntry_t* entry_long = NULL; - - if(is_dot) { - entry=array_get_next(&(s->directory)); - memset(entry->name, 0x20, sizeof(entry->name)); - memcpy(entry->name,filename,strlen(filename)); - return entry; - } - - entry_long=create_long_filename(s,filename); - - i = strlen(filename); - for(j = i - 1; j>0 && filename[j]!='.';j--); - if (j > 0) - i = (j > 8 ? 
8 : j); - else if (i > 8) - i = 8; - - entry=array_get_next(&(s->directory)); - memset(entry->name, 0x20, sizeof(entry->name)); - memcpy(entry->name, filename, i); - - if (j > 0) { - for (i = 0; i < 3 && filename[j + 1 + i]; i++) { - entry->name[8 + i] = filename[j + 1 + i]; - } - } - - /* upcase & remove unwanted characters */ - for(i=10;i>=0;i--) { - if(i==10 || i==7) for(;i>0 && entry->name[i]==' ';i--); - if(entry->name[i]<=' ' || entry->name[i]>0x7f - || strchr(".*?<>|\":/\\[];,+='",entry->name[i])) - entry->name[i]='_'; - else if(entry->name[i]>='a' && entry->name[i]<='z') - entry->name[i]+='A'-'a'; - } - - /* mangle duplicates */ - while(1) { - direntry_t* entry1=array_get(&(s->directory),directory_start); - int j; - - for(;entry1name,entry->name,11)) - break; /* found dupe */ - if(entry1==entry) /* no dupe found */ - break; - - /* use all 8 characters of name */ - if(entry->name[7]==' ') { - int j; - for(j=6;j>0 && entry->name[j]==' ';j--) - entry->name[j]='~'; - } - - /* increment number */ - for(j=7;j>0 && entry->name[j]=='9';j--) - entry->name[j]='0'; - if(j>0) { - if(entry->name[j]<'0' || entry->name[j]>'9') - entry->name[j]='0'; - else - entry->name[j]++; - } - } - - /* calculate checksum; propagate to long name */ - if(entry_long) { - uint8_t chksum=fat_chksum(entry); - - /* calculate anew, because realloc could have taken place */ - entry_long=array_get(&(s->directory),long_index); - while(entry_longreserved[1]=chksum; - entry_long++; - } - } - - return entry; -} - -/* - * Read a directory. (the index of the corresponding mapping must be passed). - */ -static int read_directory(BDRVVVFATState* s, int mapping_index) -{ - mapping_t* mapping = array_get(&(s->mapping), mapping_index); - direntry_t* direntry; - const char* dirname = mapping->path; - int first_cluster = mapping->begin; - int parent_index = mapping->info.dir.parent_mapping_index; - mapping_t* parent_mapping = (mapping_t*) - (parent_index >= 0 ? 
array_get(&(s->mapping), parent_index) : NULL); - int first_cluster_of_parent = parent_mapping ? parent_mapping->begin : -1; - - DIR* dir=opendir(dirname); - struct dirent* entry; - int i; - - assert(mapping->mode & MODE_DIRECTORY); - - if(!dir) { - mapping->end = mapping->begin; - return -1; - } - - i = mapping->info.dir.first_dir_index = - first_cluster == 0 ? 0 : s->directory.next; - - /* actually read the directory, and allocate the mappings */ - while((entry=readdir(dir))) { - unsigned int length=strlen(dirname)+2+strlen(entry->d_name); - char* buffer; - direntry_t* direntry; - struct stat st; - int is_dot=!strcmp(entry->d_name,"."); - int is_dotdot=!strcmp(entry->d_name,".."); - - if(first_cluster == 0 && (is_dotdot || is_dot)) - continue; - - buffer = g_malloc(length); - snprintf(buffer,length,"%s/%s",dirname,entry->d_name); - - if(stat(buffer,&st)<0) { - g_free(buffer); - continue; - } - - /* create directory entry for this file */ - direntry=create_short_and_long_name(s, i, entry->d_name, - is_dot || is_dotdot); - direntry->attributes=(S_ISDIR(st.st_mode)?0x10:0x20); - direntry->reserved[0]=direntry->reserved[1]=0; - direntry->ctime=fat_datetime(st.st_ctime,1); - direntry->cdate=fat_datetime(st.st_ctime,0); - direntry->adate=fat_datetime(st.st_atime,0); - direntry->begin_hi=0; - direntry->mtime=fat_datetime(st.st_mtime,1); - direntry->mdate=fat_datetime(st.st_mtime,0); - if(is_dotdot) - set_begin_of_direntry(direntry, first_cluster_of_parent); - else if(is_dot) - set_begin_of_direntry(direntry, first_cluster); - else - direntry->begin=0; /* do that later */ - if (st.st_size > 0x7fffffff) { - fprintf(stderr, "File %s is larger than 2GB\n", buffer); - g_free(buffer); - closedir(dir); - return -2; - } - direntry->size=cpu_to_le32(S_ISDIR(st.st_mode)?0:st.st_size); - - /* create mapping for this file */ - if(!is_dot && !is_dotdot && (S_ISDIR(st.st_mode) || st.st_size)) { - s->current_mapping = array_get_next(&(s->mapping)); - s->current_mapping->begin=0; - 
s->current_mapping->end=st.st_size; - /* - * we get the direntry of the most recent direntry, which - * contains the short name and all the relevant information. - */ - s->current_mapping->dir_index=s->directory.next-1; - s->current_mapping->first_mapping_index = -1; - if (S_ISDIR(st.st_mode)) { - s->current_mapping->mode = MODE_DIRECTORY; - s->current_mapping->info.dir.parent_mapping_index = - mapping_index; - } else { - s->current_mapping->mode = MODE_UNDEFINED; - s->current_mapping->info.file.offset = 0; - } - s->current_mapping->path=buffer; - s->current_mapping->read_only = - (st.st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) == 0; - } else { - g_free(buffer); - } - } - closedir(dir); - - /* fill with zeroes up to the end of the cluster */ - while(s->directory.next%(0x10*s->sectors_per_cluster)) { - direntry_t* direntry=array_get_next(&(s->directory)); - memset(direntry,0,sizeof(direntry_t)); - } - -/* TODO: if there are more entries, bootsector has to be adjusted! */ -#define ROOT_ENTRIES (0x02 * 0x10 * s->sectors_per_cluster) - if (mapping_index == 0 && s->directory.next < ROOT_ENTRIES) { - /* root directory */ - int cur = s->directory.next; - array_ensure_allocated(&(s->directory), ROOT_ENTRIES - 1); - s->directory.next = ROOT_ENTRIES; - memset(array_get(&(s->directory), cur), 0, - (ROOT_ENTRIES - cur) * sizeof(direntry_t)); - } - - /* reget the mapping, since s->mapping was possibly realloc()ed */ - mapping = array_get(&(s->mapping), mapping_index); - first_cluster += (s->directory.next - mapping->info.dir.first_dir_index) - * 0x20 / s->cluster_size; - mapping->end = first_cluster; - - direntry = array_get(&(s->directory), mapping->dir_index); - set_begin_of_direntry(direntry, mapping->begin); - - return 0; -} - -static inline uint32_t sector2cluster(BDRVVVFATState* s,off_t sector_num) -{ - return (sector_num-s->faked_sectors)/s->sectors_per_cluster; -} - -static inline off_t cluster2sector(BDRVVVFATState* s, uint32_t cluster_num) -{ - return s->faked_sectors + 
s->sectors_per_cluster * cluster_num; -} - -static int init_directories(BDRVVVFATState* s, - const char *dirname, int heads, int secs, - Error **errp) -{ - bootsector_t* bootsector; - mapping_t* mapping; - unsigned int i; - unsigned int cluster; - - memset(&(s->first_sectors[0]),0,0x40*0x200); - - s->cluster_size=s->sectors_per_cluster*0x200; - s->cluster_buffer=g_malloc(s->cluster_size); - - /* - * The formula: sc = spf+1+spf*spc*(512*8/fat_type), - * where sc is sector_count, - * spf is sectors_per_fat, - * spc is sectors_per_clusters, and - * fat_type = 12, 16 or 32. - */ - i = 1+s->sectors_per_cluster*0x200*8/s->fat_type; - s->sectors_per_fat=(s->sector_count+i)/i; /* round up */ - - array_init(&(s->mapping),sizeof(mapping_t)); - array_init(&(s->directory),sizeof(direntry_t)); - - /* add volume label */ - { - direntry_t* entry=array_get_next(&(s->directory)); - entry->attributes=0x28; /* archive | volume label */ - memcpy(entry->name, s->volume_label, sizeof(entry->name)); - } - - /* Now build FAT, and write back information into directory */ - init_fat(s); - - s->faked_sectors=s->first_sectors_number+s->sectors_per_fat*2; - s->cluster_count=sector2cluster(s, s->sector_count); - - mapping = array_get_next(&(s->mapping)); - mapping->begin = 0; - mapping->dir_index = 0; - mapping->info.dir.parent_mapping_index = -1; - mapping->first_mapping_index = -1; - mapping->path = g_strdup(dirname); - i = strlen(mapping->path); - if (i > 0 && mapping->path[i - 1] == '/') - mapping->path[i - 1] = '\0'; - mapping->mode = MODE_DIRECTORY; - mapping->read_only = 0; - s->path = mapping->path; - - for (i = 0, cluster = 0; i < s->mapping.next; i++) { - /* MS-DOS expects the FAT to be 0 for the root directory - * (except for the media byte). */ - /* LATER TODO: still true for FAT32? 
*/ - int fix_fat = (i != 0); - mapping = array_get(&(s->mapping), i); - - if (mapping->mode & MODE_DIRECTORY) { - mapping->begin = cluster; - if(read_directory(s, i)) { - error_setg(errp, "Could not read directory %s", - mapping->path); - return -1; - } - mapping = array_get(&(s->mapping), i); - } else { - assert(mapping->mode == MODE_UNDEFINED); - mapping->mode=MODE_NORMAL; - mapping->begin = cluster; - if (mapping->end > 0) { - direntry_t* direntry = array_get(&(s->directory), - mapping->dir_index); - - mapping->end = cluster + 1 + (mapping->end-1)/s->cluster_size; - set_begin_of_direntry(direntry, mapping->begin); - } else { - mapping->end = cluster + 1; - fix_fat = 0; - } - } - - assert(mapping->begin < mapping->end); - - /* next free cluster */ - cluster = mapping->end; - - if(cluster > s->cluster_count) { - error_setg(errp, - "Directory does not fit in FAT%d (capacity %.2f MB)", - s->fat_type, s->sector_count / 2000.0); - return -1; - } - - /* fix fat for entry */ - if (fix_fat) { - int j; - for(j = mapping->begin; j < mapping->end - 1; j++) - fat_set(s, j, j+1); - fat_set(s, mapping->end - 1, s->max_fat_value); - } - } - - mapping = array_get(&(s->mapping), 0); - s->sectors_of_root_directory = mapping->end * s->sectors_per_cluster; - s->last_cluster_of_root_directory = mapping->end; - - /* the FAT signature */ - fat_set(s,0,s->max_fat_value); - fat_set(s,1,s->max_fat_value); - - s->current_mapping = NULL; - - bootsector=(bootsector_t*)(s->first_sectors+(s->first_sectors_number-1)*0x200); - bootsector->jump[0]=0xeb; - bootsector->jump[1]=0x3e; - bootsector->jump[2]=0x90; - memcpy(bootsector->name,"QEMU ",8); - bootsector->sector_size=cpu_to_le16(0x200); - bootsector->sectors_per_cluster=s->sectors_per_cluster; - bootsector->reserved_sectors=cpu_to_le16(1); - bootsector->number_of_fats=0x2; /* number of FATs */ - bootsector->root_entries=cpu_to_le16(s->sectors_of_root_directory*0x10); - 
bootsector->total_sectors16=s->sector_count>0xffff?0:cpu_to_le16(s->sector_count); - bootsector->media_type=(s->first_sectors_number>1?0xf8:0xf0); /* media descriptor (f8=hd, f0=3.5 fd)*/ - s->fat.pointer[0] = bootsector->media_type; - bootsector->sectors_per_fat=cpu_to_le16(s->sectors_per_fat); - bootsector->sectors_per_track = cpu_to_le16(secs); - bootsector->number_of_heads = cpu_to_le16(heads); - bootsector->hidden_sectors=cpu_to_le32(s->first_sectors_number==1?0:0x3f); - bootsector->total_sectors=cpu_to_le32(s->sector_count>0xffff?s->sector_count:0); - - /* LATER TODO: if FAT32, this is wrong */ - bootsector->u.fat16.drive_number=s->first_sectors_number==1?0:0x80; /* fda=0, hda=0x80 */ - bootsector->u.fat16.current_head=0; - bootsector->u.fat16.signature=0x29; - bootsector->u.fat16.id=cpu_to_le32(0xfabe1afd); - - memcpy(bootsector->u.fat16.volume_label, s->volume_label, - sizeof(bootsector->u.fat16.volume_label)); - memcpy(bootsector->fat_type,(s->fat_type==12?"FAT12 ":s->fat_type==16?"FAT16 ":"FAT32 "),8); - bootsector->magic[0]=0x55; bootsector->magic[1]=0xaa; - - return 0; -} - -#ifdef DEBUG -static BDRVVVFATState *vvv = NULL; -#endif - -static int enable_write_target(BDRVVVFATState *s, Error **errp); -static int is_consistent(BDRVVVFATState *s); - -static QemuOptsList runtime_opts = { - .name = "vvfat", - .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), - .desc = { - { - .name = "dir", - .type = QEMU_OPT_STRING, - .help = "Host directory to map to the vvfat device", - }, - { - .name = "fat-type", - .type = QEMU_OPT_NUMBER, - .help = "FAT type (12, 16 or 32)", - }, - { - .name = "floppy", - .type = QEMU_OPT_BOOL, - .help = "Create a floppy rather than a hard disk image", - }, - { - .name = "label", - .type = QEMU_OPT_STRING, - .help = "Use a volume label other than QEMU VVFAT", - }, - { - .name = "rw", - .type = QEMU_OPT_BOOL, - .help = "Make the image writable", - }, - { /* end of list */ } - }, -}; - -static void vvfat_parse_filename(const char 
*filename, QDict *options, - Error **errp) -{ - int fat_type = 0; - bool floppy = false; - bool rw = false; - int i; - - if (!strstart(filename, "fat:", NULL)) { - error_setg(errp, "File name string must start with 'fat:'"); - return; - } - - /* Parse options */ - if (strstr(filename, ":32:")) { - fat_type = 32; - } else if (strstr(filename, ":16:")) { - fat_type = 16; - } else if (strstr(filename, ":12:")) { - fat_type = 12; - } - - if (strstr(filename, ":floppy:")) { - floppy = true; - } - - if (strstr(filename, ":rw:")) { - rw = true; - } - - /* Get the directory name without options */ - i = strrchr(filename, ':') - filename; - assert(i >= 3); - if (filename[i - 2] == ':' && qemu_isalpha(filename[i - 1])) { - /* workaround for DOS drive names */ - filename += i - 1; - } else { - filename += i + 1; - } - - /* Fill in the options QDict */ - qdict_put(options, "dir", qstring_from_str(filename)); - qdict_put(options, "fat-type", qint_from_int(fat_type)); - qdict_put(options, "floppy", qbool_from_bool(floppy)); - qdict_put(options, "rw", qbool_from_bool(rw)); -} - -static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVVVFATState *s = bs->opaque; - int cyls, heads, secs; - bool floppy; - const char *dirname, *label; - QemuOpts *opts; - Error *local_err = NULL; - int ret; - -#ifdef DEBUG - vvv = s; -#endif - - opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; - } - - dirname = qemu_opt_get(opts, "dir"); - if (!dirname) { - error_setg(errp, "vvfat block driver requires a 'dir' option"); - ret = -EINVAL; - goto fail; - } - - s->fat_type = qemu_opt_get_number(opts, "fat-type", 0); - floppy = qemu_opt_get_bool(opts, "floppy", false); - - memset(s->volume_label, ' ', sizeof(s->volume_label)); - label = qemu_opt_get(opts, "label"); - if (label) { - size_t label_length = strlen(label); 
- if (label_length > 11) { - error_setg(errp, "vvfat label cannot be longer than 11 bytes"); - ret = -EINVAL; - goto fail; - } - memcpy(s->volume_label, label, label_length); - } else { - memcpy(s->volume_label, "QEMU VVFAT", 10); - } - - if (floppy) { - /* 1.44MB or 2.88MB floppy. 2.88MB can be FAT12 (default) or FAT16. */ - if (!s->fat_type) { - s->fat_type = 12; - secs = 36; - s->sectors_per_cluster = 2; - } else { - secs = s->fat_type == 12 ? 18 : 36; - s->sectors_per_cluster = 1; - } - s->first_sectors_number = 1; - cyls = 80; - heads = 2; - } else { - /* 32MB or 504MB disk*/ - if (!s->fat_type) { - s->fat_type = 16; - } - s->first_sectors_number = 0x40; - cyls = s->fat_type == 12 ? 64 : 1024; - heads = 16; - secs = 63; - } - - switch (s->fat_type) { - case 32: - fprintf(stderr, "Big fat greek warning: FAT32 has not been tested. " - "You are welcome to do so!\n"); - break; - case 16: - case 12: - break; - default: - error_setg(errp, "Valid FAT types are only 12, 16 and 32"); - ret = -EINVAL; - goto fail; - } - - - s->bs = bs; - - /* LATER TODO: if FAT32, adjust */ - s->sectors_per_cluster=0x10; - - s->current_cluster=0xffffffff; - - /* read only is the default for safety */ - bs->read_only = 1; - s->qcow = s->write_target = NULL; - s->qcow_filename = NULL; - s->fat2 = NULL; - s->downcase_short_names = 1; - - fprintf(stderr, "vvfat %s chs %d,%d,%d\n", - dirname, cyls, heads, secs); - - s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1); - - if (qemu_opt_get_bool(opts, "rw", false)) { - ret = enable_write_target(s, errp); - if (ret < 0) { - goto fail; - } - bs->read_only = 0; - } - - bs->total_sectors = cyls * heads * secs; - - if (init_directories(s, dirname, heads, secs, errp)) { - ret = -EIO; - goto fail; - } - - s->sector_count = s->faked_sectors + s->sectors_per_cluster*s->cluster_count; - - if (s->first_sectors_number == 0x40) { - init_mbr(s, cyls, heads, secs); - } - - // assert(is_consistent(s)); - qemu_co_mutex_init(&s->lock); - - /* 
Disable migration when vvfat is used rw */ - if (s->qcow) { - error_setg(&s->migration_blocker, - "The vvfat (rw) format used by node '%s' " - "does not support live migration", - bdrv_get_device_or_node_name(bs)); - migrate_add_blocker(s->migration_blocker); - } - - ret = 0; -fail: - qemu_opts_del(opts); - return ret; -} - -static inline void vvfat_close_current_file(BDRVVVFATState *s) -{ - if(s->current_mapping) { - s->current_mapping = NULL; - if (s->current_fd) { - qemu_close(s->current_fd); - s->current_fd = 0; - } - } - s->current_cluster = -1; -} - -/* mappings between index1 and index2-1 are supposed to be ordered - * return value is the index of the last mapping for which end>cluster_num - */ -static inline int find_mapping_for_cluster_aux(BDRVVVFATState* s,int cluster_num,int index1,int index2) -{ - while(1) { - int index3; - mapping_t* mapping; - index3=(index1+index2)/2; - mapping=array_get(&(s->mapping),index3); - assert(mapping->begin < mapping->end); - if(mapping->begin>=cluster_num) { - assert(index2!=index3 || index2==0); - if(index2==index3) - return index1; - index2=index3; - } else { - if(index1==index3) - return mapping->end<=cluster_num ? 
index2 : index1; - index1=index3; - } - assert(index1<=index2); - DLOG(mapping=array_get(&(s->mapping),index1); - assert(mapping->begin<=cluster_num); - assert(index2 >= s->mapping.next || - ((mapping = array_get(&(s->mapping),index2)) && - mapping->end>cluster_num))); - } -} - -static inline mapping_t* find_mapping_for_cluster(BDRVVVFATState* s,int cluster_num) -{ - int index=find_mapping_for_cluster_aux(s,cluster_num,0,s->mapping.next); - mapping_t* mapping; - if(index>=s->mapping.next) - return NULL; - mapping=array_get(&(s->mapping),index); - if(mapping->begin>cluster_num) - return NULL; - assert(mapping->begin<=cluster_num && mapping->end>cluster_num); - return mapping; -} - -static int open_file(BDRVVVFATState* s,mapping_t* mapping) -{ - if(!mapping) - return -1; - if(!s->current_mapping || - strcmp(s->current_mapping->path,mapping->path)) { - /* open file */ - int fd = qemu_open(mapping->path, O_RDONLY | O_BINARY | O_LARGEFILE); - if(fd<0) - return -1; - vvfat_close_current_file(s); - s->current_fd = fd; - s->current_mapping = mapping; - } - return 0; -} - -static inline int read_cluster(BDRVVVFATState *s,int cluster_num) -{ - if(s->current_cluster != cluster_num) { - int result=0; - off_t offset; - assert(!s->current_mapping || s->current_fd || (s->current_mapping->mode & MODE_DIRECTORY)); - if(!s->current_mapping - || s->current_mapping->begin>cluster_num - || s->current_mapping->end<=cluster_num) { - /* binary search of mappings for file */ - mapping_t* mapping=find_mapping_for_cluster(s,cluster_num); - - assert(!mapping || (cluster_num>=mapping->begin && cluster_numend)); - - if (mapping && mapping->mode & MODE_DIRECTORY) { - vvfat_close_current_file(s); - s->current_mapping = mapping; -read_cluster_directory: - offset = s->cluster_size*(cluster_num-s->current_mapping->begin); - s->cluster = (unsigned char*)s->directory.pointer+offset - + 0x20*s->current_mapping->info.dir.first_dir_index; - assert(((s->cluster-(unsigned 
char*)s->directory.pointer)%s->cluster_size)==0); - assert((char*)s->cluster+s->cluster_size <= s->directory.pointer+s->directory.next*s->directory.item_size); - s->current_cluster = cluster_num; - return 0; - } - - if(open_file(s,mapping)) - return -2; - } else if (s->current_mapping->mode & MODE_DIRECTORY) - goto read_cluster_directory; - - assert(s->current_fd); - - offset=s->cluster_size*(cluster_num-s->current_mapping->begin)+s->current_mapping->info.file.offset; - if(lseek(s->current_fd, offset, SEEK_SET)!=offset) - return -3; - s->cluster=s->cluster_buffer; - result=read(s->current_fd,s->cluster,s->cluster_size); - if(result<0) { - s->current_cluster = -1; - return -1; - } - s->current_cluster = cluster_num; - } - return 0; -} - -#ifdef DEBUG -static void print_direntry(const direntry_t* direntry) -{ - int j = 0; - char buffer[1024]; - - fprintf(stderr, "direntry %p: ", direntry); - if(!direntry) - return; - if(is_long_name(direntry)) { - unsigned char* c=(unsigned char*)direntry; - int i; - for(i=1;i<11 && c[i] && c[i]!=0xff;i+=2) -#define ADD_CHAR(c) {buffer[j] = (c); if (buffer[j] < ' ') buffer[j] = 0xb0; j++;} - ADD_CHAR(c[i]); - for(i=14;i<26 && c[i] && c[i]!=0xff;i+=2) - ADD_CHAR(c[i]); - for(i=28;i<32 && c[i] && c[i]!=0xff;i+=2) - ADD_CHAR(c[i]); - buffer[j] = 0; - fprintf(stderr, "%s\n", buffer); - } else { - int i; - for(i=0;i<11;i++) - ADD_CHAR(direntry->name[i]); - buffer[j] = 0; - fprintf(stderr,"%s attributes=0x%02x begin=%d size=%d\n", - buffer, - direntry->attributes, - begin_of_direntry(direntry),le32_to_cpu(direntry->size)); - } -} - -static void print_mapping(const mapping_t* mapping) -{ - fprintf(stderr, "mapping (%p): begin, end = %d, %d, dir_index = %d, " - "first_mapping_index = %d, name = %s, mode = 0x%x, " , - mapping, mapping->begin, mapping->end, mapping->dir_index, - mapping->first_mapping_index, mapping->path, mapping->mode); - - if (mapping->mode & MODE_DIRECTORY) - fprintf(stderr, "parent_mapping_index = %d, first_dir_index = 
%d\n", mapping->info.dir.parent_mapping_index, mapping->info.dir.first_dir_index); - else - fprintf(stderr, "offset = %d\n", mapping->info.file.offset); -} -#endif - -static int vvfat_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - BDRVVVFATState *s = bs->opaque; - int i; - - for(i=0;i= bs->total_sectors) - return -1; - if (s->qcow) { - int n; - if (bdrv_is_allocated(s->qcow, sector_num, nb_sectors-i, &n)) { -DLOG(fprintf(stderr, "sectors %d+%d allocated\n", (int)sector_num, n)); - if (bdrv_read(s->qcow, sector_num, buf + i*0x200, n)) { - return -1; - } - i += n - 1; - sector_num += n - 1; - continue; - } -DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num)); - } - if(sector_numfaked_sectors) { - if(sector_numfirst_sectors_number) - memcpy(buf+i*0x200,&(s->first_sectors[sector_num*0x200]),0x200); - else if(sector_num-s->first_sectors_numbersectors_per_fat) - memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number)*0x200]),0x200); - else if(sector_num-s->first_sectors_number-s->sectors_per_fatsectors_per_fat) - memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number-s->sectors_per_fat)*0x200]),0x200); - } else { - uint32_t sector=sector_num-s->faked_sectors, - sector_offset_in_cluster=(sector%s->sectors_per_cluster), - cluster_num=sector/s->sectors_per_cluster; - if(cluster_num > s->cluster_count || read_cluster(s, cluster_num) != 0) { - /* LATER TODO: strict: return -1; */ - memset(buf+i*0x200,0,0x200); - continue; - } - memcpy(buf+i*0x200,s->cluster+sector_offset_in_cluster*0x200,0x200); - } - } - return 0; -} - -static coroutine_fn int vvfat_co_read(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVVVFATState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = vvfat_read(bs, sector_num, buf, nb_sectors); - qemu_co_mutex_unlock(&s->lock); - return ret; -} - -/* LATER TODO: statify all functions */ - -/* - * Idea of the write support 
(use snapshot): - * - * 1. check if all data is consistent, recording renames, modifications, - * new files and directories (in s->commits). - * - * 2. if the data is not consistent, stop committing - * - * 3. handle renames, and create new files and directories (do not yet - * write their contents) - * - * 4. walk the directories, fixing the mapping and direntries, and marking - * the handled mappings as not deleted - * - * 5. commit the contents of the files - * - * 6. handle deleted files and directories - * - */ - -typedef struct commit_t { - char* path; - union { - struct { uint32_t cluster; } rename; - struct { int dir_index; uint32_t modified_offset; } writeout; - struct { uint32_t first_cluster; } new_file; - struct { uint32_t cluster; } mkdir; - } param; - /* DELETEs and RMDIRs are handled differently: see handle_deletes() */ - enum { - ACTION_RENAME, ACTION_WRITEOUT, ACTION_NEW_FILE, ACTION_MKDIR - } action; -} commit_t; - -static void clear_commits(BDRVVVFATState* s) -{ - int i; -DLOG(fprintf(stderr, "clear_commits (%d commits)\n", s->commits.next)); - for (i = 0; i < s->commits.next; i++) { - commit_t* commit = array_get(&(s->commits), i); - assert(commit->path || commit->action == ACTION_WRITEOUT); - if (commit->action != ACTION_WRITEOUT) { - assert(commit->path); - g_free(commit->path); - } else - assert(commit->path == NULL); - } - s->commits.next = 0; -} - -static void schedule_rename(BDRVVVFATState* s, - uint32_t cluster, char* new_path) -{ - commit_t* commit = array_get_next(&(s->commits)); - commit->path = new_path; - commit->param.rename.cluster = cluster; - commit->action = ACTION_RENAME; -} - -static void schedule_writeout(BDRVVVFATState* s, - int dir_index, uint32_t modified_offset) -{ - commit_t* commit = array_get_next(&(s->commits)); - commit->path = NULL; - commit->param.writeout.dir_index = dir_index; - commit->param.writeout.modified_offset = modified_offset; - commit->action = ACTION_WRITEOUT; -} - -static void 
schedule_new_file(BDRVVVFATState* s, - char* path, uint32_t first_cluster) -{ - commit_t* commit = array_get_next(&(s->commits)); - commit->path = path; - commit->param.new_file.first_cluster = first_cluster; - commit->action = ACTION_NEW_FILE; -} - -static void schedule_mkdir(BDRVVVFATState* s, uint32_t cluster, char* path) -{ - commit_t* commit = array_get_next(&(s->commits)); - commit->path = path; - commit->param.mkdir.cluster = cluster; - commit->action = ACTION_MKDIR; -} - -typedef struct { - /* - * Since the sequence number is at most 0x3f, and the filename - * length is at most 13 times the sequence number, the maximal - * filename length is 0x3f * 13 bytes. - */ - unsigned char name[0x3f * 13 + 1]; - int checksum, len; - int sequence_number; -} long_file_name; - -static void lfn_init(long_file_name* lfn) -{ - lfn->sequence_number = lfn->len = 0; - lfn->checksum = 0x100; -} - -/* return 0 if parsed successfully, > 0 if no long name, < 0 if error */ -static int parse_long_name(long_file_name* lfn, - const direntry_t* direntry) -{ - int i, j, offset; - const unsigned char* pointer = (const unsigned char*)direntry; - - if (!is_long_name(direntry)) - return 1; - - if (pointer[0] & 0x40) { - lfn->sequence_number = pointer[0] & 0x3f; - lfn->checksum = pointer[13]; - lfn->name[0] = 0; - lfn->name[lfn->sequence_number * 13] = 0; - } else if ((pointer[0] & 0x3f) != --lfn->sequence_number) - return -1; - else if (pointer[13] != lfn->checksum) - return -2; - else if (pointer[12] || pointer[26] || pointer[27]) - return -3; - - offset = 13 * (lfn->sequence_number - 1); - for (i = 0, j = 1; i < 13; i++, j+=2) { - if (j == 11) - j = 14; - else if (j == 26) - j = 28; - - if (pointer[j+1] == 0) - lfn->name[offset + i] = pointer[j]; - else if (pointer[j+1] != 0xff || (pointer[0] & 0x40) == 0) - return -4; - else - lfn->name[offset + i] = 0; - } - - if (pointer[0] & 0x40) - lfn->len = offset + strlen((char*)lfn->name + offset); - - return 0; -} - -/* returns 0 if successful, 
>0 if no short_name, and <0 on error */ -static int parse_short_name(BDRVVVFATState* s, - long_file_name* lfn, direntry_t* direntry) -{ - int i, j; - - if (!is_short_name(direntry)) - return 1; - - for (j = 7; j >= 0 && direntry->name[j] == ' '; j--); - for (i = 0; i <= j; i++) { - if (direntry->name[i] <= ' ' || direntry->name[i] > 0x7f) - return -1; - else if (s->downcase_short_names) - lfn->name[i] = qemu_tolower(direntry->name[i]); - else - lfn->name[i] = direntry->name[i]; - } - - for (j = 2; j >= 0 && direntry->name[8 + j] == ' '; j--) { - } - if (j >= 0) { - lfn->name[i++] = '.'; - lfn->name[i + j + 1] = '\0'; - for (;j >= 0; j--) { - uint8_t c = direntry->name[8 + j]; - if (c <= ' ' || c > 0x7f) { - return -2; - } else if (s->downcase_short_names) { - lfn->name[i + j] = qemu_tolower(c); - } else { - lfn->name[i + j] = c; - } - } - } else - lfn->name[i + j + 1] = '\0'; - - lfn->len = strlen((char*)lfn->name); - - return 0; -} - -static inline uint32_t modified_fat_get(BDRVVVFATState* s, - unsigned int cluster) -{ - if (cluster < s->last_cluster_of_root_directory) { - if (cluster + 1 == s->last_cluster_of_root_directory) - return s->max_fat_value; - else - return cluster + 1; - } - - if (s->fat_type==32) { - uint32_t* entry=((uint32_t*)s->fat2)+cluster; - return le32_to_cpu(*entry); - } else if (s->fat_type==16) { - uint16_t* entry=((uint16_t*)s->fat2)+cluster; - return le16_to_cpu(*entry); - } else { - const uint8_t* x=s->fat2+cluster*3/2; - return ((x[0]|(x[1]<<8))>>(cluster&1?4:0))&0x0fff; - } -} - -static inline int cluster_was_modified(BDRVVVFATState* s, uint32_t cluster_num) -{ - int was_modified = 0; - int i, dummy; - - if (s->qcow == NULL) - return 0; - - for (i = 0; !was_modified && i < s->sectors_per_cluster; i++) - was_modified = bdrv_is_allocated(s->qcow, - cluster2sector(s, cluster_num) + i, 1, &dummy); - - return was_modified; -} - -static const char* get_basename(const char* path) -{ - char* basename = strrchr(path, '/'); - if (basename == 
NULL) - return path; - else - return basename + 1; /* strip '/' */ -} - -/* - * The array s->used_clusters holds the states of the clusters. If it is - * part of a file, it has bit 2 set, in case of a directory, bit 1. If it - * was modified, bit 3 is set. - * If any cluster is allocated, but not part of a file or directory, this - * driver refuses to commit. - */ -typedef enum { - USED_DIRECTORY = 1, USED_FILE = 2, USED_ANY = 3, USED_ALLOCATED = 4 -} used_t; - -/* - * get_cluster_count_for_direntry() not only determines how many clusters - * are occupied by direntry, but also if it was renamed or modified. - * - * A file is thought to be renamed *only* if there already was a file with - * exactly the same first cluster, but a different name. - * - * Further, the files/directories handled by this function are - * assumed to be *not* deleted (and *only* those). - */ -static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s, - direntry_t* direntry, const char* path) -{ - /* - * This is a little bit tricky: - * IF the guest OS just inserts a cluster into the file chain, - * and leaves the rest alone, (i.e. the original file had clusters - * 15 -> 16, but now has 15 -> 32 -> 16), then the following happens: - * - * - do_commit will write the cluster into the file at the given - * offset, but - * - * - the cluster which is overwritten should be moved to a later - * position in the file. - * - * I am not aware that any OS does something as braindead, but this - * situation could happen anyway when not committing for a long time. - * Just to be sure that this does not bite us, detect it, and copy the - * contents of the clusters to-be-overwritten into the qcow. 
- */ - int copy_it = 0; - int was_modified = 0; - int32_t ret = 0; - - uint32_t cluster_num = begin_of_direntry(direntry); - uint32_t offset = 0; - int first_mapping_index = -1; - mapping_t* mapping = NULL; - const char* basename2 = NULL; - - vvfat_close_current_file(s); - - /* the root directory */ - if (cluster_num == 0) - return 0; - - /* write support */ - if (s->qcow) { - basename2 = get_basename(path); - - mapping = find_mapping_for_cluster(s, cluster_num); - - if (mapping) { - const char* basename; - - assert(mapping->mode & MODE_DELETED); - mapping->mode &= ~MODE_DELETED; - - basename = get_basename(mapping->path); - - assert(mapping->mode & MODE_NORMAL); - - /* rename */ - if (strcmp(basename, basename2)) - schedule_rename(s, cluster_num, g_strdup(path)); - } else if (is_file(direntry)) - /* new file */ - schedule_new_file(s, g_strdup(path), cluster_num); - else { - abort(); - return 0; - } - } - - while(1) { - if (s->qcow) { - if (!copy_it && cluster_was_modified(s, cluster_num)) { - if (mapping == NULL || - mapping->begin > cluster_num || - mapping->end <= cluster_num) - mapping = find_mapping_for_cluster(s, cluster_num); - - - if (mapping && - (mapping->mode & MODE_DIRECTORY) == 0) { - - /* was modified in qcow */ - if (offset != mapping->info.file.offset + s->cluster_size - * (cluster_num - mapping->begin)) { - /* offset of this cluster in file chain has changed */ - abort(); - copy_it = 1; - } else if (offset == 0) { - const char* basename = get_basename(mapping->path); - - if (strcmp(basename, basename2)) - copy_it = 1; - first_mapping_index = array_index(&(s->mapping), mapping); - } - - if (mapping->first_mapping_index != first_mapping_index - && mapping->info.file.offset > 0) { - abort(); - copy_it = 1; - } - - /* need to write out? 
*/ - if (!was_modified && is_file(direntry)) { - was_modified = 1; - schedule_writeout(s, mapping->dir_index, offset); - } - } - } - - if (copy_it) { - int i, dummy; - /* - * This is horribly inefficient, but that is okay, since - * it is rarely executed, if at all. - */ - int64_t offset = cluster2sector(s, cluster_num); - - vvfat_close_current_file(s); - for (i = 0; i < s->sectors_per_cluster; i++) { - if (!bdrv_is_allocated(s->qcow, offset + i, 1, &dummy)) { - if (vvfat_read(s->bs, offset, s->cluster_buffer, 1)) { - return -1; - } - if (bdrv_write(s->qcow, offset, s->cluster_buffer, 1)) { - return -2; - } - } - } - } - } - - ret++; - if (s->used_clusters[cluster_num] & USED_ANY) - return 0; - s->used_clusters[cluster_num] = USED_FILE; - - cluster_num = modified_fat_get(s, cluster_num); - - if (fat_eof(s, cluster_num)) - return ret; - else if (cluster_num < 2 || cluster_num > s->max_fat_value - 16) - return -1; - - offset += s->cluster_size; - } -} - -/* - * This function looks at the modified data (qcow). - * It returns 0 upon inconsistency or error, and the number of clusters - * used by the directory, its subdirectories and their files. - */ -static int check_directory_consistency(BDRVVVFATState *s, - int cluster_num, const char* path) -{ - int ret = 0; - unsigned char* cluster = g_malloc(s->cluster_size); - direntry_t* direntries = (direntry_t*)cluster; - mapping_t* mapping = find_mapping_for_cluster(s, cluster_num); - - long_file_name lfn; - int path_len = strlen(path); - char path2[PATH_MAX + 1]; - - assert(path_len < PATH_MAX); /* len was tested before! 
*/ - pstrcpy(path2, sizeof(path2), path); - path2[path_len] = '/'; - path2[path_len + 1] = '\0'; - - if (mapping) { - const char* basename = get_basename(mapping->path); - const char* basename2 = get_basename(path); - - assert(mapping->mode & MODE_DIRECTORY); - - assert(mapping->mode & MODE_DELETED); - mapping->mode &= ~MODE_DELETED; - - if (strcmp(basename, basename2)) - schedule_rename(s, cluster_num, g_strdup(path)); - } else - /* new directory */ - schedule_mkdir(s, cluster_num, g_strdup(path)); - - lfn_init(&lfn); - do { - int i; - int subret = 0; - - ret++; - - if (s->used_clusters[cluster_num] & USED_ANY) { - fprintf(stderr, "cluster %d used more than once\n", (int)cluster_num); - goto fail; - } - s->used_clusters[cluster_num] = USED_DIRECTORY; - -DLOG(fprintf(stderr, "read cluster %d (sector %d)\n", (int)cluster_num, (int)cluster2sector(s, cluster_num))); - subret = vvfat_read(s->bs, cluster2sector(s, cluster_num), cluster, - s->sectors_per_cluster); - if (subret) { - fprintf(stderr, "Error fetching direntries\n"); - fail: - g_free(cluster); - return 0; - } - - for (i = 0; i < 0x10 * s->sectors_per_cluster; i++) { - int cluster_count = 0; - -DLOG(fprintf(stderr, "check direntry %d:\n", i); print_direntry(direntries + i)); - if (is_volume_label(direntries + i) || is_dot(direntries + i) || - is_free(direntries + i)) - continue; - - subret = parse_long_name(&lfn, direntries + i); - if (subret < 0) { - fprintf(stderr, "Error in long name\n"); - goto fail; - } - if (subret == 0 || is_free(direntries + i)) - continue; - - if (fat_chksum(direntries+i) != lfn.checksum) { - subret = parse_short_name(s, &lfn, direntries + i); - if (subret < 0) { - fprintf(stderr, "Error in short name (%d)\n", subret); - goto fail; - } - if (subret > 0 || !strcmp((char*)lfn.name, ".") - || !strcmp((char*)lfn.name, "..")) - continue; - } - lfn.checksum = 0x100; /* cannot use long name twice */ - - if (path_len + 1 + lfn.len >= PATH_MAX) { - fprintf(stderr, "Name too long: %s/%s\n", 
path, lfn.name); - goto fail; - } - pstrcpy(path2 + path_len + 1, sizeof(path2) - path_len - 1, - (char*)lfn.name); - - if (is_directory(direntries + i)) { - if (begin_of_direntry(direntries + i) == 0) { - DLOG(fprintf(stderr, "invalid begin for directory: %s\n", path2); print_direntry(direntries + i)); - goto fail; - } - cluster_count = check_directory_consistency(s, - begin_of_direntry(direntries + i), path2); - if (cluster_count == 0) { - DLOG(fprintf(stderr, "problem in directory %s:\n", path2); print_direntry(direntries + i)); - goto fail; - } - } else if (is_file(direntries + i)) { - /* check file size with FAT */ - cluster_count = get_cluster_count_for_direntry(s, direntries + i, path2); - if (cluster_count != - (le32_to_cpu(direntries[i].size) + s->cluster_size - - 1) / s->cluster_size) { - DLOG(fprintf(stderr, "Cluster count mismatch\n")); - goto fail; - } - } else - abort(); /* cluster_count = 0; */ - - ret += cluster_count; - } - - cluster_num = modified_fat_get(s, cluster_num); - } while(!fat_eof(s, cluster_num)); - - g_free(cluster); - return ret; -} - -/* returns 1 on success */ -static int is_consistent(BDRVVVFATState* s) -{ - int i, check; - int used_clusters_count = 0; - -DLOG(checkpoint()); - /* - * - get modified FAT - * - compare the two FATs (TODO) - * - get buffer for marking used clusters - * - recurse direntries from root (using bs->bdrv_read to make - * sure to get the new data) - * - check that the FAT agrees with the size - * - count the number of clusters occupied by this directory and - * its files - * - check that the cumulative used cluster count agrees with the - * FAT - * - if all is fine, return number of used clusters - */ - if (s->fat2 == NULL) { - int size = 0x200 * s->sectors_per_fat; - s->fat2 = g_malloc(size); - memcpy(s->fat2, s->fat.pointer, size); - } - check = vvfat_read(s->bs, - s->first_sectors_number, s->fat2, s->sectors_per_fat); - if (check) { - fprintf(stderr, "Could not copy fat\n"); - return 0; - } - assert 
(s->used_clusters); - for (i = 0; i < sector2cluster(s, s->sector_count); i++) - s->used_clusters[i] &= ~USED_ANY; - - clear_commits(s); - - /* mark every mapped file/directory as deleted. - * (check_directory_consistency() will unmark those still present). */ - if (s->qcow) - for (i = 0; i < s->mapping.next; i++) { - mapping_t* mapping = array_get(&(s->mapping), i); - if (mapping->first_mapping_index < 0) - mapping->mode |= MODE_DELETED; - } - - used_clusters_count = check_directory_consistency(s, 0, s->path); - if (used_clusters_count <= 0) { - DLOG(fprintf(stderr, "problem in directory\n")); - return 0; - } - - check = s->last_cluster_of_root_directory; - for (i = check; i < sector2cluster(s, s->sector_count); i++) { - if (modified_fat_get(s, i)) { - if(!s->used_clusters[i]) { - DLOG(fprintf(stderr, "FAT was modified (%d), but cluster is not used?\n", i)); - return 0; - } - check++; - } - - if (s->used_clusters[i] == USED_ALLOCATED) { - /* allocated, but not used... */ - DLOG(fprintf(stderr, "unused, modified cluster: %d\n", i)); - return 0; - } - } - - if (check != used_clusters_count) - return 0; - - return used_clusters_count; -} - -static inline void adjust_mapping_indices(BDRVVVFATState* s, - int offset, int adjust) -{ - int i; - - for (i = 0; i < s->mapping.next; i++) { - mapping_t* mapping = array_get(&(s->mapping), i); - -#define ADJUST_MAPPING_INDEX(name) \ - if (mapping->name >= offset) \ - mapping->name += adjust - - ADJUST_MAPPING_INDEX(first_mapping_index); - if (mapping->mode & MODE_DIRECTORY) - ADJUST_MAPPING_INDEX(info.dir.parent_mapping_index); - } -} - -/* insert or update mapping */ -static mapping_t* insert_mapping(BDRVVVFATState* s, - uint32_t begin, uint32_t end) -{ - /* - * - find mapping where mapping->begin >= begin, - * - if mapping->begin > begin: insert - * - adjust all references to mappings! 
- * - else: adjust - * - replace name - */ - int index = find_mapping_for_cluster_aux(s, begin, 0, s->mapping.next); - mapping_t* mapping = NULL; - mapping_t* first_mapping = array_get(&(s->mapping), 0); - - if (index < s->mapping.next && (mapping = array_get(&(s->mapping), index)) - && mapping->begin < begin) { - mapping->end = begin; - index++; - mapping = array_get(&(s->mapping), index); - } - if (index >= s->mapping.next || mapping->begin > begin) { - mapping = array_insert(&(s->mapping), index, 1); - mapping->path = NULL; - adjust_mapping_indices(s, index, +1); - } - - mapping->begin = begin; - mapping->end = end; - -DLOG(mapping_t* next_mapping; -assert(index + 1 >= s->mapping.next || -((next_mapping = array_get(&(s->mapping), index + 1)) && - next_mapping->begin >= end))); - - if (s->current_mapping && first_mapping != (mapping_t*)s->mapping.pointer) - s->current_mapping = array_get(&(s->mapping), - s->current_mapping - first_mapping); - - return mapping; -} - -static int remove_mapping(BDRVVVFATState* s, int mapping_index) -{ - mapping_t* mapping = array_get(&(s->mapping), mapping_index); - mapping_t* first_mapping = array_get(&(s->mapping), 0); - - /* free mapping */ - if (mapping->first_mapping_index < 0) { - g_free(mapping->path); - } - - /* remove from s->mapping */ - array_remove(&(s->mapping), mapping_index); - - /* adjust all references to mappings */ - adjust_mapping_indices(s, mapping_index, -1); - - if (s->current_mapping && first_mapping != (mapping_t*)s->mapping.pointer) - s->current_mapping = array_get(&(s->mapping), - s->current_mapping - first_mapping); - - return 0; -} - -static void adjust_dirindices(BDRVVVFATState* s, int offset, int adjust) -{ - int i; - for (i = 0; i < s->mapping.next; i++) { - mapping_t* mapping = array_get(&(s->mapping), i); - if (mapping->dir_index >= offset) - mapping->dir_index += adjust; - if ((mapping->mode & MODE_DIRECTORY) && - mapping->info.dir.first_dir_index >= offset) - mapping->info.dir.first_dir_index += 
adjust; - } -} - -static direntry_t* insert_direntries(BDRVVVFATState* s, - int dir_index, int count) -{ - /* - * make room in s->directory, - * adjust_dirindices - */ - direntry_t* result = array_insert(&(s->directory), dir_index, count); - if (result == NULL) - return NULL; - adjust_dirindices(s, dir_index, count); - return result; -} - -static int remove_direntries(BDRVVVFATState* s, int dir_index, int count) -{ - int ret = array_remove_slice(&(s->directory), dir_index, count); - if (ret) - return ret; - adjust_dirindices(s, dir_index, -count); - return 0; -} - -/* - * Adapt the mappings of the cluster chain starting at first cluster - * (i.e. if a file starts at first_cluster, the chain is followed according - * to the modified fat, and the corresponding entries in s->mapping are - * adjusted) - */ -static int commit_mappings(BDRVVVFATState* s, - uint32_t first_cluster, int dir_index) -{ - mapping_t* mapping = find_mapping_for_cluster(s, first_cluster); - direntry_t* direntry = array_get(&(s->directory), dir_index); - uint32_t cluster = first_cluster; - - vvfat_close_current_file(s); - - assert(mapping); - assert(mapping->begin == first_cluster); - mapping->first_mapping_index = -1; - mapping->dir_index = dir_index; - mapping->mode = (dir_index <= 0 || is_directory(direntry)) ? - MODE_DIRECTORY : MODE_NORMAL; - - while (!fat_eof(s, cluster)) { - uint32_t c, c1; - - for (c = cluster, c1 = modified_fat_get(s, c); c + 1 == c1; - c = c1, c1 = modified_fat_get(s, c1)); - - c++; - if (c > mapping->end) { - int index = array_index(&(s->mapping), mapping); - int i, max_i = s->mapping.next - index; - for (i = 1; i < max_i && mapping[i].begin < c; i++); - while (--i > 0) - remove_mapping(s, index + 1); - } - assert(mapping == array_get(&(s->mapping), s->mapping.next - 1) - || mapping[1].begin >= c); - mapping->end = c; - - if (!fat_eof(s, c1)) { - int i = find_mapping_for_cluster_aux(s, c1, 0, s->mapping.next); - mapping_t* next_mapping = i >= s->mapping.next ? 
NULL : - array_get(&(s->mapping), i); - - if (next_mapping == NULL || next_mapping->begin > c1) { - int i1 = array_index(&(s->mapping), mapping); - - next_mapping = insert_mapping(s, c1, c1+1); - - if (c1 < c) - i1++; - mapping = array_get(&(s->mapping), i1); - } - - next_mapping->dir_index = mapping->dir_index; - next_mapping->first_mapping_index = - mapping->first_mapping_index < 0 ? - array_index(&(s->mapping), mapping) : - mapping->first_mapping_index; - next_mapping->path = mapping->path; - next_mapping->mode = mapping->mode; - next_mapping->read_only = mapping->read_only; - if (mapping->mode & MODE_DIRECTORY) { - next_mapping->info.dir.parent_mapping_index = - mapping->info.dir.parent_mapping_index; - next_mapping->info.dir.first_dir_index = - mapping->info.dir.first_dir_index + - 0x10 * s->sectors_per_cluster * - (mapping->end - mapping->begin); - } else - next_mapping->info.file.offset = mapping->info.file.offset + - mapping->end - mapping->begin; - - mapping = next_mapping; - } - - cluster = c1; - } - - return 0; -} - -static int commit_direntries(BDRVVVFATState* s, - int dir_index, int parent_mapping_index) -{ - direntry_t* direntry = array_get(&(s->directory), dir_index); - uint32_t first_cluster = dir_index == 0 ? 
0 : begin_of_direntry(direntry); - mapping_t* mapping = find_mapping_for_cluster(s, first_cluster); - - int factor = 0x10 * s->sectors_per_cluster; - int old_cluster_count, new_cluster_count; - int current_dir_index = mapping->info.dir.first_dir_index; - int first_dir_index = current_dir_index; - int ret, i; - uint32_t c; - -DLOG(fprintf(stderr, "commit_direntries for %s, parent_mapping_index %d\n", mapping->path, parent_mapping_index)); - - assert(direntry); - assert(mapping); - assert(mapping->begin == first_cluster); - assert(mapping->info.dir.first_dir_index < s->directory.next); - assert(mapping->mode & MODE_DIRECTORY); - assert(dir_index == 0 || is_directory(direntry)); - - mapping->info.dir.parent_mapping_index = parent_mapping_index; - - if (first_cluster == 0) { - old_cluster_count = new_cluster_count = - s->last_cluster_of_root_directory; - } else { - for (old_cluster_count = 0, c = first_cluster; !fat_eof(s, c); - c = fat_get(s, c)) - old_cluster_count++; - - for (new_cluster_count = 0, c = first_cluster; !fat_eof(s, c); - c = modified_fat_get(s, c)) - new_cluster_count++; - } - - if (new_cluster_count > old_cluster_count) { - if (insert_direntries(s, - current_dir_index + factor * old_cluster_count, - factor * (new_cluster_count - old_cluster_count)) == NULL) - return -1; - } else if (new_cluster_count < old_cluster_count) - remove_direntries(s, - current_dir_index + factor * new_cluster_count, - factor * (old_cluster_count - new_cluster_count)); - - for (c = first_cluster; !fat_eof(s, c); c = modified_fat_get(s, c)) { - direntry_t *first_direntry; - void* direntry = array_get(&(s->directory), current_dir_index); - int ret = vvfat_read(s->bs, cluster2sector(s, c), direntry, - s->sectors_per_cluster); - if (ret) - return ret; - - /* The first directory entry on the filesystem is the volume name */ - first_direntry = (direntry_t*) s->directory.pointer; - assert(!memcmp(first_direntry->name, s->volume_label, 11)); - - current_dir_index += factor; - } - - 
ret = commit_mappings(s, first_cluster, dir_index); - if (ret) - return ret; - - /* recurse */ - for (i = 0; i < factor * new_cluster_count; i++) { - direntry = array_get(&(s->directory), first_dir_index + i); - if (is_directory(direntry) && !is_dot(direntry)) { - mapping = find_mapping_for_cluster(s, first_cluster); - assert(mapping->mode & MODE_DIRECTORY); - ret = commit_direntries(s, first_dir_index + i, - array_index(&(s->mapping), mapping)); - if (ret) - return ret; - } - } - - return 0; -} - -/* commit one file (adjust contents, adjust mapping), - return first_mapping_index */ -static int commit_one_file(BDRVVVFATState* s, - int dir_index, uint32_t offset) -{ - direntry_t* direntry = array_get(&(s->directory), dir_index); - uint32_t c = begin_of_direntry(direntry); - uint32_t first_cluster = c; - mapping_t* mapping = find_mapping_for_cluster(s, c); - uint32_t size = filesize_of_direntry(direntry); - char* cluster = g_malloc(s->cluster_size); - uint32_t i; - int fd = 0; - - assert(offset < size); - assert((offset % s->cluster_size) == 0); - - for (i = s->cluster_size; i < offset; i += s->cluster_size) - c = modified_fat_get(s, c); - - fd = qemu_open(mapping->path, O_RDWR | O_CREAT | O_BINARY, 0666); - if (fd < 0) { - fprintf(stderr, "Could not open %s... (%s, %d)\n", mapping->path, - strerror(errno), errno); - g_free(cluster); - return fd; - } - if (offset > 0) { - if (lseek(fd, offset, SEEK_SET) != offset) { - qemu_close(fd); - g_free(cluster); - return -3; - } - } - - while (offset < size) { - uint32_t c1; - int rest_size = (size - offset > s->cluster_size ? 
- s->cluster_size : size - offset); - int ret; - - c1 = modified_fat_get(s, c); - - assert((size - offset == 0 && fat_eof(s, c)) || - (size > offset && c >=2 && !fat_eof(s, c))); - - ret = vvfat_read(s->bs, cluster2sector(s, c), - (uint8_t*)cluster, (rest_size + 0x1ff) / 0x200); - - if (ret < 0) { - qemu_close(fd); - g_free(cluster); - return ret; - } - - if (write(fd, cluster, rest_size) < 0) { - qemu_close(fd); - g_free(cluster); - return -2; - } - - offset += rest_size; - c = c1; - } - - if (ftruncate(fd, size)) { - perror("ftruncate()"); - qemu_close(fd); - g_free(cluster); - return -4; - } - qemu_close(fd); - g_free(cluster); - - return commit_mappings(s, first_cluster, dir_index); -} - -#ifdef DEBUG -/* test, if all mappings point to valid direntries */ -static void check1(BDRVVVFATState* s) -{ - int i; - for (i = 0; i < s->mapping.next; i++) { - mapping_t* mapping = array_get(&(s->mapping), i); - if (mapping->mode & MODE_DELETED) { - fprintf(stderr, "deleted\n"); - continue; - } - assert(mapping->dir_index < s->directory.next); - direntry_t* direntry = array_get(&(s->directory), mapping->dir_index); - assert(mapping->begin == begin_of_direntry(direntry) || mapping->first_mapping_index >= 0); - if (mapping->mode & MODE_DIRECTORY) { - assert(mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster * (mapping->end - mapping->begin) <= s->directory.next); - assert((mapping->info.dir.first_dir_index % (0x10 * s->sectors_per_cluster)) == 0); - } - } -} - -/* test, if all direntries have mappings */ -static void check2(BDRVVVFATState* s) -{ - int i; - int first_mapping = -1; - - for (i = 0; i < s->directory.next; i++) { - direntry_t* direntry = array_get(&(s->directory), i); - - if (is_short_name(direntry) && begin_of_direntry(direntry)) { - mapping_t* mapping = find_mapping_for_cluster(s, begin_of_direntry(direntry)); - assert(mapping); - assert(mapping->dir_index == i || is_dot(direntry)); - assert(mapping->begin == begin_of_direntry(direntry) || 
is_dot(direntry)); - } - - if ((i % (0x10 * s->sectors_per_cluster)) == 0) { - /* cluster start */ - int j, count = 0; - - for (j = 0; j < s->mapping.next; j++) { - mapping_t* mapping = array_get(&(s->mapping), j); - if (mapping->mode & MODE_DELETED) - continue; - if (mapping->mode & MODE_DIRECTORY) { - if (mapping->info.dir.first_dir_index <= i && mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster > i) { - assert(++count == 1); - if (mapping->first_mapping_index == -1) - first_mapping = array_index(&(s->mapping), mapping); - else - assert(first_mapping == mapping->first_mapping_index); - if (mapping->info.dir.parent_mapping_index < 0) - assert(j == 0); - else { - mapping_t* parent = array_get(&(s->mapping), mapping->info.dir.parent_mapping_index); - assert(parent->mode & MODE_DIRECTORY); - assert(parent->info.dir.first_dir_index < mapping->info.dir.first_dir_index); - } - } - } - } - if (count == 0) - first_mapping = -1; - } - } -} -#endif - -static int handle_renames_and_mkdirs(BDRVVVFATState* s) -{ - int i; - -#ifdef DEBUG - fprintf(stderr, "handle_renames\n"); - for (i = 0; i < s->commits.next; i++) { - commit_t* commit = array_get(&(s->commits), i); - fprintf(stderr, "%d, %s (%d, %d)\n", i, commit->path ? 
commit->path : "(null)", commit->param.rename.cluster, commit->action); - } -#endif - - for (i = 0; i < s->commits.next;) { - commit_t* commit = array_get(&(s->commits), i); - if (commit->action == ACTION_RENAME) { - mapping_t* mapping = find_mapping_for_cluster(s, - commit->param.rename.cluster); - char* old_path = mapping->path; - - assert(commit->path); - mapping->path = commit->path; - if (rename(old_path, mapping->path)) - return -2; - - if (mapping->mode & MODE_DIRECTORY) { - int l1 = strlen(mapping->path); - int l2 = strlen(old_path); - int diff = l1 - l2; - direntry_t* direntry = array_get(&(s->directory), - mapping->info.dir.first_dir_index); - uint32_t c = mapping->begin; - int i = 0; - - /* recurse */ - while (!fat_eof(s, c)) { - do { - direntry_t* d = direntry + i; - - if (is_file(d) || (is_directory(d) && !is_dot(d))) { - mapping_t* m = find_mapping_for_cluster(s, - begin_of_direntry(d)); - int l = strlen(m->path); - char* new_path = g_malloc(l + diff + 1); - - assert(!strncmp(m->path, mapping->path, l2)); - - pstrcpy(new_path, l + diff + 1, mapping->path); - pstrcpy(new_path + l1, l + diff + 1 - l1, - m->path + l2); - - schedule_rename(s, m->begin, new_path); - } - i++; - } while((i % (0x10 * s->sectors_per_cluster)) != 0); - c = fat_get(s, c); - } - } - - g_free(old_path); - array_remove(&(s->commits), i); - continue; - } else if (commit->action == ACTION_MKDIR) { - mapping_t* mapping; - int j, parent_path_len; - -#ifdef __MINGW32__ - if (mkdir(commit->path)) - return -5; -#else - if (mkdir(commit->path, 0755)) - return -5; -#endif - - mapping = insert_mapping(s, commit->param.mkdir.cluster, - commit->param.mkdir.cluster + 1); - if (mapping == NULL) - return -6; - - mapping->mode = MODE_DIRECTORY; - mapping->read_only = 0; - mapping->path = commit->path; - j = s->directory.next; - assert(j); - insert_direntries(s, s->directory.next, - 0x10 * s->sectors_per_cluster); - mapping->info.dir.first_dir_index = j; - - parent_path_len = strlen(commit->path) - 
- strlen(get_basename(commit->path)) - 1; - for (j = 0; j < s->mapping.next; j++) { - mapping_t* m = array_get(&(s->mapping), j); - if (m->first_mapping_index < 0 && m != mapping && - !strncmp(m->path, mapping->path, parent_path_len) && - strlen(m->path) == parent_path_len) - break; - } - assert(j < s->mapping.next); - mapping->info.dir.parent_mapping_index = j; - - array_remove(&(s->commits), i); - continue; - } - - i++; - } - return 0; -} - -/* - * TODO: make sure that the short name is not matching *another* file - */ -static int handle_commits(BDRVVVFATState* s) -{ - int i, fail = 0; - - vvfat_close_current_file(s); - - for (i = 0; !fail && i < s->commits.next; i++) { - commit_t* commit = array_get(&(s->commits), i); - switch(commit->action) { - case ACTION_RENAME: case ACTION_MKDIR: - abort(); - fail = -2; - break; - case ACTION_WRITEOUT: { -#ifndef NDEBUG - /* these variables are only used by assert() below */ - direntry_t* entry = array_get(&(s->directory), - commit->param.writeout.dir_index); - uint32_t begin = begin_of_direntry(entry); - mapping_t* mapping = find_mapping_for_cluster(s, begin); -#endif - - assert(mapping); - assert(mapping->begin == begin); - assert(commit->path == NULL); - - if (commit_one_file(s, commit->param.writeout.dir_index, - commit->param.writeout.modified_offset)) - fail = -3; - - break; - } - case ACTION_NEW_FILE: { - int begin = commit->param.new_file.first_cluster; - mapping_t* mapping = find_mapping_for_cluster(s, begin); - direntry_t* entry; - int i; - - /* find direntry */ - for (i = 0; i < s->directory.next; i++) { - entry = array_get(&(s->directory), i); - if (is_file(entry) && begin_of_direntry(entry) == begin) - break; - } - - if (i >= s->directory.next) { - fail = -6; - continue; - } - - /* make sure there exists an initial mapping */ - if (mapping && mapping->begin != begin) { - mapping->end = begin; - mapping = NULL; - } - if (mapping == NULL) { - mapping = insert_mapping(s, begin, begin+1); - } - /* most members will 
be fixed in commit_mappings() */ - assert(commit->path); - mapping->path = commit->path; - mapping->read_only = 0; - mapping->mode = MODE_NORMAL; - mapping->info.file.offset = 0; - - if (commit_one_file(s, i, 0)) - fail = -7; - - break; - } - default: - abort(); - } - } - if (i > 0 && array_remove_slice(&(s->commits), 0, i)) - return -1; - return fail; -} - -static int handle_deletes(BDRVVVFATState* s) -{ - int i, deferred = 1, deleted = 1; - - /* delete files corresponding to mappings marked as deleted */ - /* handle DELETEs and unused mappings (modified_fat_get(s, mapping->begin) == 0) */ - while (deferred && deleted) { - deferred = 0; - deleted = 0; - - for (i = 1; i < s->mapping.next; i++) { - mapping_t* mapping = array_get(&(s->mapping), i); - if (mapping->mode & MODE_DELETED) { - direntry_t* entry = array_get(&(s->directory), - mapping->dir_index); - - if (is_free(entry)) { - /* remove file/directory */ - if (mapping->mode & MODE_DIRECTORY) { - int j, next_dir_index = s->directory.next, - first_dir_index = mapping->info.dir.first_dir_index; - - if (rmdir(mapping->path) < 0) { - if (errno == ENOTEMPTY) { - deferred++; - continue; - } else - return -5; - } - - for (j = 1; j < s->mapping.next; j++) { - mapping_t* m = array_get(&(s->mapping), j); - if (m->mode & MODE_DIRECTORY && - m->info.dir.first_dir_index > - first_dir_index && - m->info.dir.first_dir_index < - next_dir_index) - next_dir_index = - m->info.dir.first_dir_index; - } - remove_direntries(s, first_dir_index, - next_dir_index - first_dir_index); - - deleted++; - } - } else { - if (unlink(mapping->path)) - return -4; - deleted++; - } - DLOG(fprintf(stderr, "DELETE (%d)\n", i); print_mapping(mapping); print_direntry(entry)); - remove_mapping(s, i); - } - } - } - - return 0; -} - -/* - * synchronize mapping with new state: - * - * - copy FAT (with bdrv_read) - * - mark all filenames corresponding to mappings as deleted - * - recurse direntries from root (using bs->bdrv_read) - * - delete files 
corresponding to mappings marked as deleted - */ -static int do_commit(BDRVVVFATState* s) -{ - int ret = 0; - - /* the real meat are the commits. Nothing to do? Move along! */ - if (s->commits.next == 0) - return 0; - - vvfat_close_current_file(s); - - ret = handle_renames_and_mkdirs(s); - if (ret) { - fprintf(stderr, "Error handling renames (%d)\n", ret); - abort(); - return ret; - } - - /* copy FAT (with bdrv_read) */ - memcpy(s->fat.pointer, s->fat2, 0x200 * s->sectors_per_fat); - - /* recurse direntries from root (using bs->bdrv_read) */ - ret = commit_direntries(s, 0, -1); - if (ret) { - fprintf(stderr, "Fatal: error while committing (%d)\n", ret); - abort(); - return ret; - } - - ret = handle_commits(s); - if (ret) { - fprintf(stderr, "Error handling commits (%d)\n", ret); - abort(); - return ret; - } - - ret = handle_deletes(s); - if (ret) { - fprintf(stderr, "Error deleting\n"); - abort(); - return ret; - } - - if (s->qcow->drv->bdrv_make_empty) { - s->qcow->drv->bdrv_make_empty(s->qcow); - } - - memset(s->used_clusters, 0, sector2cluster(s, s->sector_count)); - -DLOG(checkpoint()); - return 0; -} - -static int try_commit(BDRVVVFATState* s) -{ - vvfat_close_current_file(s); -DLOG(checkpoint()); - if(!is_consistent(s)) - return -1; - return do_commit(s); -} - -static int vvfat_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - BDRVVVFATState *s = bs->opaque; - int i, ret; - -DLOG(checkpoint()); - - /* Check if we're operating in read-only mode */ - if (s->qcow == NULL) { - return -EACCES; - } - - vvfat_close_current_file(s); - - /* - * Some sanity checks: - * - do not allow writing to the boot sector - * - do not allow to write non-ASCII filenames - */ - - if (sector_num < s->first_sectors_number) - return -1; - - for (i = sector2cluster(s, sector_num); - i <= sector2cluster(s, sector_num + nb_sectors - 1);) { - mapping_t* mapping = find_mapping_for_cluster(s, i); - if (mapping) { - if (mapping->read_only) { - 
fprintf(stderr, "Tried to write to write-protected file %s\n", - mapping->path); - return -1; - } - - if (mapping->mode & MODE_DIRECTORY) { - int begin = cluster2sector(s, i); - int end = begin + s->sectors_per_cluster, k; - int dir_index; - const direntry_t* direntries; - long_file_name lfn; - - lfn_init(&lfn); - - if (begin < sector_num) - begin = sector_num; - if (end > sector_num + nb_sectors) - end = sector_num + nb_sectors; - dir_index = mapping->dir_index + - 0x10 * (begin - mapping->begin * s->sectors_per_cluster); - direntries = (direntry_t*)(buf + 0x200 * (begin - sector_num)); - - for (k = 0; k < (end - begin) * 0x10; k++) { - /* do not allow non-ASCII filenames */ - if (parse_long_name(&lfn, direntries + k) < 0) { - fprintf(stderr, "Warning: non-ASCII filename\n"); - return -1; - } - /* no access to the direntry of a read-only file */ - else if (is_short_name(direntries+k) && - (direntries[k].attributes & 1)) { - if (memcmp(direntries + k, - array_get(&(s->directory), dir_index + k), - sizeof(direntry_t))) { - fprintf(stderr, "Warning: tried to write to write-protected file\n"); - return -1; - } - } - } - } - i = mapping->end; - } else - i++; - } - - /* - * Use qcow backend. Commit later. 
- */ -DLOG(fprintf(stderr, "Write to qcow backend: %d + %d\n", (int)sector_num, nb_sectors)); - ret = bdrv_write(s->qcow, sector_num, buf, nb_sectors); - if (ret < 0) { - fprintf(stderr, "Error writing to qcow backend\n"); - return ret; - } - - for (i = sector2cluster(s, sector_num); - i <= sector2cluster(s, sector_num + nb_sectors - 1); i++) - if (i >= 0) - s->used_clusters[i] |= USED_ALLOCATED; - -DLOG(checkpoint()); - /* TODO: add timeout */ - try_commit(s); - -DLOG(checkpoint()); - return 0; -} - -static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num, - const uint8_t *buf, int nb_sectors) -{ - int ret; - BDRVVVFATState *s = bs->opaque; - qemu_co_mutex_lock(&s->lock); - ret = vvfat_write(bs, sector_num, buf, nb_sectors); - qemu_co_mutex_unlock(&s->lock); - return ret; -} - -static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *n, BlockDriverState **file) -{ - BDRVVVFATState* s = bs->opaque; - *n = s->sector_count - sector_num; - if (*n > nb_sectors) { - *n = nb_sectors; - } else if (*n < 0) { - return 0; - } - return BDRV_BLOCK_DATA; -} - -static int write_target_commit(BlockDriverState *bs, int64_t sector_num, - const uint8_t* buffer, int nb_sectors) { - BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); - return try_commit(s); -} - -static void write_target_close(BlockDriverState *bs) { - BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); - bdrv_unref(s->qcow); - g_free(s->qcow_filename); -} - -static BlockDriver vvfat_write_target = { - .format_name = "vvfat_write_target", - .bdrv_write = write_target_commit, - .bdrv_close = write_target_close, -}; - -static int enable_write_target(BDRVVVFATState *s, Error **errp) -{ - BlockDriver *bdrv_qcow = NULL; - BlockDriverState *backing; - QemuOpts *opts = NULL; - int ret; - int size = sector2cluster(s, s->sector_count); - QDict *options; - - s->used_clusters = calloc(size, 1); - - array_init(&(s->commits), 
sizeof(commit_t)); - - s->qcow_filename = g_malloc(PATH_MAX); - ret = get_tmp_filename(s->qcow_filename, PATH_MAX); - if (ret < 0) { - error_setg_errno(errp, -ret, "can't create temporary file"); - goto err; - } - - bdrv_qcow = bdrv_find_format("qcow"); - if (!bdrv_qcow) { - error_setg(errp, "Failed to locate qcow driver"); - ret = -ENOENT; - goto err; - } - - opts = qemu_opts_create(bdrv_qcow->create_opts, NULL, 0, &error_abort); - qemu_opt_set_number(opts, BLOCK_OPT_SIZE, s->sector_count * 512, - &error_abort); - qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, "fat:", &error_abort); - - ret = bdrv_create(bdrv_qcow, s->qcow_filename, opts, errp); - qemu_opts_del(opts); - if (ret < 0) { - goto err; - } - - s->qcow = NULL; - options = qdict_new(); - qdict_put(options, "driver", qstring_from_str("qcow")); - ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, options, - BDRV_O_RDWR | BDRV_O_NO_FLUSH, errp); - if (ret < 0) { - goto err; - } - -#ifndef _WIN32 - unlink(s->qcow_filename); -#endif - - backing = bdrv_new(); - bdrv_set_backing_hd(s->bs, backing); - bdrv_unref(backing); - - s->bs->backing->bs->drv = &vvfat_write_target; - s->bs->backing->bs->opaque = g_new(void *, 1); - *(void**)s->bs->backing->bs->opaque = s; - - return 0; - -err: - g_free(s->qcow_filename); - s->qcow_filename = NULL; - return ret; -} - -static void vvfat_close(BlockDriverState *bs) -{ - BDRVVVFATState *s = bs->opaque; - - vvfat_close_current_file(s); - array_free(&(s->fat)); - array_free(&(s->directory)); - array_free(&(s->mapping)); - g_free(s->cluster_buffer); - - if (s->qcow) { - migrate_del_blocker(s->migration_blocker); - error_free(s->migration_blocker); - } -} - -static BlockDriver bdrv_vvfat = { - .format_name = "vvfat", - .protocol_name = "fat", - .instance_size = sizeof(BDRVVVFATState), - - .bdrv_parse_filename = vvfat_parse_filename, - .bdrv_file_open = vvfat_open, - .bdrv_close = vvfat_close, - - .bdrv_read = vvfat_co_read, - .bdrv_write = vvfat_co_write, - .bdrv_co_get_block_status = 
vvfat_co_get_block_status, -}; - -static void bdrv_vvfat_init(void) -{ - bdrv_register(&bdrv_vvfat); -} - -block_init(bdrv_vvfat_init); - -#ifdef DEBUG -static void checkpoint(void) { - assert(((mapping_t*)array_get(&(vvv->mapping), 0))->end == 2); - check1(vvv); - check2(vvv); - assert(!vvv->current_mapping || vvv->current_fd || (vvv->current_mapping->mode & MODE_DIRECTORY)); -#if 0 - if (((direntry_t*)vvv->directory.pointer)[1].attributes != 0xf) - fprintf(stderr, "Nonono!\n"); - mapping_t* mapping; - direntry_t* direntry; - assert(vvv->mapping.size >= vvv->mapping.item_size * vvv->mapping.next); - assert(vvv->directory.size >= vvv->directory.item_size * vvv->directory.next); - if (vvv->mapping.next<47) - return; - assert((mapping = array_get(&(vvv->mapping), 47))); - assert(mapping->dir_index < vvv->directory.next); - direntry = array_get(&(vvv->directory), mapping->dir_index); - assert(!memcmp(direntry->name, "USB H ", 11) || direntry->name[0]==0); -#endif -} -#endif diff --git a/qemu/block/win32-aio.c b/qemu/block/win32-aio.c deleted file mode 100644 index 2d509a9a7..000000000 --- a/qemu/block/win32-aio.c +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Block driver for RAW files (win32) - * - * Copyright (c) 2006 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu/osdep.h" -#include "qemu-common.h" -#include "qemu/timer.h" -#include "block/block_int.h" -#include "qemu/module.h" -#include "block/aio.h" -#include "raw-aio.h" -#include "qemu/event_notifier.h" -#include "qemu/iov.h" -#include -#include - -#define FTYPE_FILE 0 -#define FTYPE_CD 1 -#define FTYPE_HARDDISK 2 - -struct QEMUWin32AIOState { - HANDLE hIOCP; - EventNotifier e; - int count; - bool is_aio_context_attached; -}; - -typedef struct QEMUWin32AIOCB { - BlockAIOCB common; - struct QEMUWin32AIOState *ctx; - int nbytes; - OVERLAPPED ov; - QEMUIOVector *qiov; - void *buf; - bool is_read; - bool is_linear; -} QEMUWin32AIOCB; - -/* - * Completes an AIO request (calls the callback and frees the ACB). - */ -static void win32_aio_process_completion(QEMUWin32AIOState *s, - QEMUWin32AIOCB *waiocb, DWORD count) -{ - int ret; - s->count--; - - if (waiocb->ov.Internal != 0) { - ret = -EIO; - } else { - ret = 0; - if (count < waiocb->nbytes) { - /* Short reads mean EOF, pad with zeros. 
*/ - if (waiocb->is_read) { - qemu_iovec_memset(waiocb->qiov, count, 0, - waiocb->qiov->size - count); - } else { - ret = -EINVAL; - } - } - } - - if (!waiocb->is_linear) { - if (ret == 0 && waiocb->is_read) { - QEMUIOVector *qiov = waiocb->qiov; - iov_from_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size); - } - qemu_vfree(waiocb->buf); - } - - - waiocb->common.cb(waiocb->common.opaque, ret); - qemu_aio_unref(waiocb); -} - -static void win32_aio_completion_cb(EventNotifier *e) -{ - QEMUWin32AIOState *s = container_of(e, QEMUWin32AIOState, e); - DWORD count; - ULONG_PTR key; - OVERLAPPED *ov; - - event_notifier_test_and_clear(&s->e); - while (GetQueuedCompletionStatus(s->hIOCP, &count, &key, &ov, 0)) { - QEMUWin32AIOCB *waiocb = container_of(ov, QEMUWin32AIOCB, ov); - - win32_aio_process_completion(s, waiocb, count); - } -} - -static const AIOCBInfo win32_aiocb_info = { - .aiocb_size = sizeof(QEMUWin32AIOCB), -}; - -BlockAIOCB *win32_aio_submit(BlockDriverState *bs, - QEMUWin32AIOState *aio, HANDLE hfile, - int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, - BlockCompletionFunc *cb, void *opaque, int type) -{ - struct QEMUWin32AIOCB *waiocb; - uint64_t offset = sector_num * 512; - DWORD rc; - - waiocb = qemu_aio_get(&win32_aiocb_info, bs, cb, opaque); - waiocb->nbytes = nb_sectors * 512; - waiocb->qiov = qiov; - waiocb->is_read = (type == QEMU_AIO_READ); - - if (qiov->niov > 1) { - waiocb->buf = qemu_try_blockalign(bs, qiov->size); - if (waiocb->buf == NULL) { - goto out; - } - if (type & QEMU_AIO_WRITE) { - iov_to_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size); - } - waiocb->is_linear = false; - } else { - waiocb->buf = qiov->iov[0].iov_base; - waiocb->is_linear = true; - } - - memset(&waiocb->ov, 0, sizeof(waiocb->ov)); - waiocb->ov.Offset = (DWORD)offset; - waiocb->ov.OffsetHigh = (DWORD)(offset >> 32); - waiocb->ov.hEvent = event_notifier_get_handle(&aio->e); - - aio->count++; - - if (type & QEMU_AIO_READ) { - rc = ReadFile(hfile, 
waiocb->buf, waiocb->nbytes, NULL, &waiocb->ov); - } else { - rc = WriteFile(hfile, waiocb->buf, waiocb->nbytes, NULL, &waiocb->ov); - } - if(rc == 0 && GetLastError() != ERROR_IO_PENDING) { - goto out_dec_count; - } - return &waiocb->common; - -out_dec_count: - aio->count--; -out: - qemu_aio_unref(waiocb); - return NULL; -} - -int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile) -{ - if (CreateIoCompletionPort(hfile, aio->hIOCP, (ULONG_PTR) 0, 0) == NULL) { - return -EINVAL; - } else { - return 0; - } -} - -void win32_aio_detach_aio_context(QEMUWin32AIOState *aio, - AioContext *old_context) -{ - aio_set_event_notifier(old_context, &aio->e, false, NULL); - aio->is_aio_context_attached = false; -} - -void win32_aio_attach_aio_context(QEMUWin32AIOState *aio, - AioContext *new_context) -{ - aio->is_aio_context_attached = true; - aio_set_event_notifier(new_context, &aio->e, false, - win32_aio_completion_cb); -} - -QEMUWin32AIOState *win32_aio_init(void) -{ - QEMUWin32AIOState *s; - - s = g_malloc0(sizeof(*s)); - if (event_notifier_init(&s->e, false) < 0) { - goto out_free_state; - } - - s->hIOCP = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0); - if (s->hIOCP == NULL) { - goto out_close_efd; - } - - return s; - -out_close_efd: - event_notifier_cleanup(&s->e); -out_free_state: - g_free(s); - return NULL; -} - -void win32_aio_cleanup(QEMUWin32AIOState *aio) -{ - assert(!aio->is_aio_context_attached); - CloseHandle(aio->hIOCP); - event_notifier_cleanup(&aio->e); - g_free(aio); -} diff --git a/qemu/block/write-threshold.c b/qemu/block/write-threshold.c deleted file mode 100644 index cc2ca7183..000000000 --- a/qemu/block/write-threshold.c +++ /dev/null @@ -1,126 +0,0 @@ -/* - * QEMU System Emulator block write threshold notification - * - * Copyright Red Hat, Inc. 2014 - * - * Authors: - * Francesco Romani - * - * This work is licensed under the terms of the GNU LGPL, version 2 or later. - * See the COPYING.LIB file in the top-level directory. 
- */ - -#include "qemu/osdep.h" -#include "block/block_int.h" -#include "qemu/coroutine.h" -#include "block/write-threshold.h" -#include "qemu/notify.h" -#include "qapi-event.h" -#include "qmp-commands.h" - - -uint64_t bdrv_write_threshold_get(const BlockDriverState *bs) -{ - return bs->write_threshold_offset; -} - -bool bdrv_write_threshold_is_set(const BlockDriverState *bs) -{ - return bs->write_threshold_offset > 0; -} - -static void write_threshold_disable(BlockDriverState *bs) -{ - if (bdrv_write_threshold_is_set(bs)) { - notifier_with_return_remove(&bs->write_threshold_notifier); - bs->write_threshold_offset = 0; - } -} - -uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs, - const BdrvTrackedRequest *req) -{ - if (bdrv_write_threshold_is_set(bs)) { - if (req->offset > bs->write_threshold_offset) { - return (req->offset - bs->write_threshold_offset) + req->bytes; - } - if ((req->offset + req->bytes) > bs->write_threshold_offset) { - return (req->offset + req->bytes) - bs->write_threshold_offset; - } - } - return 0; -} - -static int coroutine_fn before_write_notify(NotifierWithReturn *notifier, - void *opaque) -{ - BdrvTrackedRequest *req = opaque; - BlockDriverState *bs = req->bs; - uint64_t amount = 0; - - amount = bdrv_write_threshold_exceeded(bs, req); - if (amount > 0) { - qapi_event_send_block_write_threshold( - bs->node_name, - amount, - bs->write_threshold_offset, - &error_abort); - - /* autodisable to avoid flooding the monitor */ - write_threshold_disable(bs); - } - - return 0; /* should always let other notifiers run */ -} - -static void write_threshold_register_notifier(BlockDriverState *bs) -{ - bs->write_threshold_notifier.notify = before_write_notify; - notifier_with_return_list_add(&bs->before_write_notifiers, - &bs->write_threshold_notifier); -} - -static void write_threshold_update(BlockDriverState *bs, - int64_t threshold_bytes) -{ - bs->write_threshold_offset = threshold_bytes; -} - -void 
bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes) -{ - if (bdrv_write_threshold_is_set(bs)) { - if (threshold_bytes > 0) { - write_threshold_update(bs, threshold_bytes); - } else { - write_threshold_disable(bs); - } - } else { - if (threshold_bytes > 0) { - /* avoid multiple registration */ - write_threshold_register_notifier(bs); - write_threshold_update(bs, threshold_bytes); - } - /* discard bogus disable request */ - } -} - -void qmp_block_set_write_threshold(const char *node_name, - uint64_t threshold_bytes, - Error **errp) -{ - BlockDriverState *bs; - AioContext *aio_context; - - bs = bdrv_find_node(node_name); - if (!bs) { - error_setg(errp, "Device '%s' not found", node_name); - return; - } - - aio_context = bdrv_get_aio_context(bs); - aio_context_acquire(aio_context); - - bdrv_write_threshold_set(bs, threshold_bytes); - - aio_context_release(aio_context); -} -- cgit 1.2.3-korg