diff options
author | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-05-18 13:18:31 +0300 |
---|---|---|
committer | José Pekkarinen <jose.pekkarinen@nokia.com> | 2016-05-18 13:42:15 +0300 |
commit | 437fd90c0250dee670290f9b714253671a990160 (patch) | |
tree | b871786c360704244a07411c69fb58da9ead4a06 /qemu/block | |
parent | 5bbd6fe9b8bab2a93e548c5a53b032d1939eec05 (diff) |
These changes are the raw update to qemu-2.6.
Collission happened in the following patches:
migration: do cleanup operation after completion(738df5b9)
Bug fix.(1750c932f86)
kvmclock: add a new function to update env->tsc.(b52baab2)
The code provided by the patches was already in the upstreamed
version.
Change-Id: I3cc11841a6a76ae20887b2e245710199e1ea7f9a
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'qemu/block')
61 files changed, 6908 insertions, 2566 deletions
diff --git a/qemu/block/Makefile.objs b/qemu/block/Makefile.objs index 58ef2ef3f..44a541622 100644 --- a/qemu/block/Makefile.objs +++ b/qemu/block/Makefile.objs @@ -4,7 +4,7 @@ block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o block-obj-y += qed-check.o block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o block-obj-y += quorum.o -block-obj-y += parallels.o blkdebug.o blkverify.o +block-obj-y += parallels.o blkdebug.o blkverify.o blkreplay.o block-obj-y += block-backend.o snapshot.o qapi.o block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o block-obj-$(CONFIG_POSIX) += raw-posix.o @@ -20,9 +20,11 @@ block-obj-$(CONFIG_RBD) += rbd.o block-obj-$(CONFIG_GLUSTERFS) += gluster.o block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o block-obj-$(CONFIG_LIBSSH2) += ssh.o -block-obj-y += accounting.o +block-obj-y += accounting.o dirty-bitmap.o block-obj-y += write-threshold.o +block-obj-y += crypto.o + common-obj-y += stream.o common-obj-y += commit.o common-obj-y += backup.o diff --git a/qemu/block/accounting.c b/qemu/block/accounting.c index 01d594ffd..3f457c4e7 100644 --- a/qemu/block/accounting.c +++ b/qemu/block/accounting.c @@ -2,6 +2,7 @@ * QEMU System Emulator block accounting * * Copyright (c) 2011 Christoph Hellwig + * Copyright (c) 2015 Igalia, S.L. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal @@ -22,9 +23,58 @@ * THE SOFTWARE. */ +#include "qemu/osdep.h" #include "block/accounting.h" #include "block/block_int.h" #include "qemu/timer.h" +#include "sysemu/qtest.h" + +static QEMUClockType clock_type = QEMU_CLOCK_REALTIME; +static const int qtest_latency_ns = NANOSECONDS_PER_SECOND / 1000; + +void block_acct_init(BlockAcctStats *stats, bool account_invalid, + bool account_failed) +{ + stats->account_invalid = account_invalid; + stats->account_failed = account_failed; + + if (qtest_enabled()) { + clock_type = QEMU_CLOCK_VIRTUAL; + } +} + +void block_acct_cleanup(BlockAcctStats *stats) +{ + BlockAcctTimedStats *s, *next; + QSLIST_FOREACH_SAFE(s, &stats->intervals, entries, next) { + g_free(s); + } +} + +void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length) +{ + BlockAcctTimedStats *s; + unsigned i; + + s = g_new0(BlockAcctTimedStats, 1); + s->interval_length = interval_length; + QSLIST_INSERT_HEAD(&stats->intervals, s, entries); + + for (i = 0; i < BLOCK_MAX_IOTYPE; i++) { + timed_average_init(&s->latency[i], clock_type, + (uint64_t) interval_length * NANOSECONDS_PER_SECOND); + } +} + +BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats, + BlockAcctTimedStats *s) +{ + if (s == NULL) { + return QSLIST_FIRST(&stats->intervals); + } else { + return QSLIST_NEXT(s, entries); + } +} void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie, int64_t bytes, enum BlockAcctType type) @@ -32,26 +82,69 @@ void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie, assert(type < BLOCK_MAX_IOTYPE); cookie->bytes = bytes; - cookie->start_time_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + cookie->start_time_ns = qemu_clock_get_ns(clock_type); cookie->type = type; } void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie) { + BlockAcctTimedStats *s; + int64_t time_ns = qemu_clock_get_ns(clock_type); + int64_t latency_ns = time_ns - cookie->start_time_ns; + + if (qtest_enabled()) { + latency_ns = qtest_latency_ns; + } + assert(cookie->type < BLOCK_MAX_IOTYPE); stats->nr_bytes[cookie->type] += cookie->bytes; stats->nr_ops[cookie->type]++; - stats->total_time_ns[cookie->type] += - qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - cookie->start_time_ns; + stats->total_time_ns[cookie->type] += latency_ns; + stats->last_access_time_ns = time_ns; + + QSLIST_FOREACH(s, &stats->intervals, entries) { + timed_average_account(&s->latency[cookie->type], latency_ns); + } } +void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie) +{ + assert(cookie->type < BLOCK_MAX_IOTYPE); + + stats->failed_ops[cookie->type]++; + + if (stats->account_failed) { + BlockAcctTimedStats *s; + int64_t time_ns = qemu_clock_get_ns(clock_type); + int64_t latency_ns = time_ns - cookie->start_time_ns; + + if (qtest_enabled()) { + latency_ns = qtest_latency_ns; + } -void block_acct_highest_sector(BlockAcctStats *stats, int64_t sector_num, - unsigned int nb_sectors) + stats->total_time_ns[cookie->type] += latency_ns; + stats->last_access_time_ns = time_ns; + + QSLIST_FOREACH(s, &stats->intervals, entries) { + timed_average_account(&s->latency[cookie->type], latency_ns); + } + } +} + +void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type) { - if (stats->wr_highest_sector < sector_num + nb_sectors - 1) { - stats->wr_highest_sector = sector_num + nb_sectors - 1; + assert(type < BLOCK_MAX_IOTYPE); + + /* block_acct_done() and block_acct_failed() update + * total_time_ns[], but this one does not. The reason is that + * invalid requests are accounted during their submission, + * therefore there's no actual I/O involved. */ + + stats->invalid_ops[type]++; + + if (stats->account_invalid) { + stats->last_access_time_ns = qemu_clock_get_ns(clock_type); } } @@ -61,3 +154,20 @@ void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type, assert(type < BLOCK_MAX_IOTYPE); stats->merged[type] += num_requests; } + +int64_t block_acct_idle_time_ns(BlockAcctStats *stats) +{ + return qemu_clock_get_ns(clock_type) - stats->last_access_time_ns; +} + +double block_acct_queue_depth(BlockAcctTimedStats *stats, + enum BlockAcctType type) +{ + uint64_t sum, elapsed; + + assert(type < BLOCK_MAX_IOTYPE); + + sum = timed_average_sum(&stats->latency[type], &elapsed); + + return (double) sum / elapsed; +} diff --git a/qemu/block/archipelago.c b/qemu/block/archipelago.c index 855655c6b..b9f5e69d4 100644 --- a/qemu/block/archipelago.c +++ b/qemu/block/archipelago.c @@ -50,7 +50,8 @@ * */ -#include "qemu-common.h" +#include "qemu/osdep.h" +#include "qemu/cutils.h" #include "block/block_int.h" #include "qemu/error-report.h" #include "qemu/thread.h" @@ -59,7 +60,6 @@ #include "qapi/qmp/qjson.h" #include "qemu/atomic.h" -#include <inttypes.h> #include <xseg/xseg.h> #include <xseg/protocol.h> diff --git a/qemu/block/backup.c b/qemu/block/backup.c index 965654d52..491fd1406 100644 --- a/qemu/block/backup.c +++ b/qemu/block/backup.c @@ -11,21 +11,20 @@ * */ -#include <stdio.h> -#include <errno.h> -#include <unistd.h> +#include "qemu/osdep.h" #include "trace.h" #include "block/block.h" #include "block/block_int.h" #include "block/blockjob.h" +#include "qapi/error.h" #include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" +#include "qemu/cutils.h" +#include "sysemu/block-backend.h" +#include "qemu/bitmap.h" -#define BACKUP_CLUSTER_BITS 16 -#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS) -#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE) - +#define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16) #define SLICE_TIME 100000000ULL /* ns */ typedef struct CowRequest { @@ -46,10 +45,17 @@ typedef struct BackupBlockJob { BlockdevOnError on_target_error; CoRwlock flush_rwlock; uint64_t sectors_read; - HBitmap *bitmap; + unsigned long *done_bitmap; + int64_t cluster_size; QLIST_HEAD(, CowRequest) inflight_reqs; } BackupBlockJob; +/* Size of a cluster in sectors, instead of bytes. */ +static inline int64_t cluster_size_sectors(BackupBlockJob *job) +{ + return job->cluster_size / BDRV_SECTOR_SIZE; +} + /* See if in-flight requests overlap and wait for them to complete */ static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job, int64_t start, @@ -89,7 +95,8 @@ static void cow_request_end(CowRequest *req) static int coroutine_fn backup_do_cow(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - bool *error_is_read) + bool *error_is_read, + bool is_write_notifier) { BackupBlockJob *job = (BackupBlockJob *)bs->job; CowRequest cow_request; @@ -97,13 +104,14 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, QEMUIOVector bounce_qiov; void *bounce_buffer = NULL; int ret = 0; + int64_t sectors_per_cluster = cluster_size_sectors(job); int64_t start, end; int n; qemu_co_rwlock_rdlock(&job->flush_rwlock); - start = sector_num / BACKUP_SECTORS_PER_CLUSTER; - end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER); + start = sector_num / sectors_per_cluster; + end = DIV_ROUND_UP(sector_num + nb_sectors, sectors_per_cluster); trace_backup_do_cow_enter(job, start, sector_num, nb_sectors); @@ -111,26 +119,32 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, cow_request_begin(&cow_request, job, start, end); for (; start < end; start++) { - if (hbitmap_get(job->bitmap, start)) { + if (test_bit(start, job->done_bitmap)) { trace_backup_do_cow_skip(job, start); continue; /* already copied */ } trace_backup_do_cow_process(job, start); - n = MIN(BACKUP_SECTORS_PER_CLUSTER, + n = MIN(sectors_per_cluster, job->common.len / BDRV_SECTOR_SIZE - - start * BACKUP_SECTORS_PER_CLUSTER); + start * sectors_per_cluster); if (!bounce_buffer) { - bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE); + bounce_buffer = qemu_blockalign(bs, job->cluster_size); } iov.iov_base = bounce_buffer; iov.iov_len = n * BDRV_SECTOR_SIZE; qemu_iovec_init_external(&bounce_qiov, &iov, 1); - ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n, - &bounce_qiov); + if (is_write_notifier) { + ret = bdrv_co_readv_no_serialising(bs, + start * sectors_per_cluster, + n, &bounce_qiov); + } else { + ret = bdrv_co_readv(bs, start * sectors_per_cluster, n, + &bounce_qiov); + } if (ret < 0) { trace_backup_do_cow_read_fail(job, start, ret); if (error_is_read) { @@ -141,11 +155,11 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, if (buffer_is_zero(iov.iov_base, iov.iov_len)) { ret = bdrv_co_write_zeroes(job->target, - start * BACKUP_SECTORS_PER_CLUSTER, + start * sectors_per_cluster, n, BDRV_REQ_MAY_UNMAP); } else { ret = bdrv_co_writev(job->target, - start * BACKUP_SECTORS_PER_CLUSTER, n, + start * sectors_per_cluster, n, &bounce_qiov); } if (ret < 0) { @@ -156,7 +170,7 @@ static int coroutine_fn backup_do_cow(BlockDriverState *bs, goto out; } - hbitmap_set(job->bitmap, start, 1); + set_bit(start, job->done_bitmap); /* Publish progress, guest I/O counts as progress too. Note that the * offset field is an opaque progress value, it is not a disk offset. @@ -190,7 +204,7 @@ static int coroutine_fn backup_before_write_notify( assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0); assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0); - return backup_do_cow(req->bs, sector_num, nb_sectors, NULL); + return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true); } static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) @@ -208,7 +222,41 @@ static void backup_iostatus_reset(BlockJob *job) { BackupBlockJob *s = container_of(job, BackupBlockJob, common); - bdrv_iostatus_reset(s->target); + if (s->target->blk) { + blk_iostatus_reset(s->target->blk); + } +} + +static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret) +{ + BdrvDirtyBitmap *bm; + BlockDriverState *bs = job->common.bs; + + if (ret < 0 || block_job_is_cancelled(&job->common)) { + /* Merge the successor back into the parent, delete nothing. */ + bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL); + assert(bm); + } else { + /* Everything is fine, delete this bitmap and install the backup. */ + bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL); + assert(bm); + } +} + +static void backup_commit(BlockJob *job) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common); + if (s->sync_bitmap) { + backup_cleanup_sync_bitmap(s, 0); + } +} + +static void backup_abort(BlockJob *job) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common); + if (s->sync_bitmap) { + backup_cleanup_sync_bitmap(s, -1); + } } static const BlockJobDriver backup_job_driver = { @@ -216,6 +264,8 @@ static const BlockJobDriver backup_job_driver = { .job_type = BLOCK_JOB_TYPE_BACKUP, .set_speed = backup_set_speed, .iostatus_reset = backup_iostatus_reset, + .commit = backup_commit, + .abort = backup_abort, }; static BlockErrorAction backup_error_action(BackupBlockJob *job, @@ -280,21 +330,22 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) int64_t cluster; int64_t end; int64_t last_cluster = -1; + int64_t sectors_per_cluster = cluster_size_sectors(job); BlockDriverState *bs = job->common.bs; HBitmapIter hbi; granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap); - clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1); + clusters_per_iter = MAX((granularity / job->cluster_size), 1); bdrv_dirty_iter_init(job->sync_bitmap, &hbi); /* Find the next dirty sector(s) */ while ((sector = hbitmap_iter_next(&hbi)) != -1) { - cluster = sector / BACKUP_SECTORS_PER_CLUSTER; + cluster = sector / sectors_per_cluster; /* Fake progress updates for any clusters we skipped */ if (cluster != last_cluster + 1) { job->common.offset += ((cluster - last_cluster - 1) * - BACKUP_CLUSTER_SIZE); + job->cluster_size); } for (end = cluster + clusters_per_iter; cluster < end; cluster++) { @@ -302,8 +353,9 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) if (yield_and_check(job)) { return ret; } - ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER, - BACKUP_SECTORS_PER_CLUSTER, &error_is_read); + ret = backup_do_cow(bs, cluster * sectors_per_cluster, + sectors_per_cluster, &error_is_read, + false); if ((ret < 0) && backup_error_action(job, error_is_read, -ret) == BLOCK_ERROR_ACTION_REPORT) { @@ -314,17 +366,17 @@ static int coroutine_fn backup_run_incremental(BackupBlockJob *job) /* If the bitmap granularity is smaller than the backup granularity, * we need to advance the iterator pointer to the next cluster. */ - if (granularity < BACKUP_CLUSTER_SIZE) { - bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER); + if (granularity < job->cluster_size) { + bdrv_set_dirty_iter(&hbi, cluster * sectors_per_cluster); } last_cluster = cluster - 1; } /* Play some final catchup with the progress meter */ - end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); + end = DIV_ROUND_UP(job->common.len, job->cluster_size); if (last_cluster + 1 < end) { - job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE); + job->common.offset += ((end - last_cluster - 1) * job->cluster_size); } return ret; @@ -341,19 +393,21 @@ static void coroutine_fn backup_run(void *opaque) .notify = backup_before_write_notify, }; int64_t start, end; + int64_t sectors_per_cluster = cluster_size_sectors(job); int ret = 0; QLIST_INIT(&job->inflight_reqs); qemu_co_rwlock_init(&job->flush_rwlock); start = 0; - end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); + end = DIV_ROUND_UP(job->common.len, job->cluster_size); - job->bitmap = hbitmap_alloc(end, 0); + job->done_bitmap = bitmap_new(end); - bdrv_set_enable_write_cache(target, true); - bdrv_set_on_error(target, on_target_error, on_target_error); - bdrv_iostatus_enable(target); + if (target->blk) { + blk_set_on_error(target->blk, on_target_error, on_target_error); + blk_iostatus_enable(target->blk); + } bdrv_add_before_write_notifier(bs, &before_write); @@ -382,7 +436,7 @@ static void coroutine_fn backup_run(void *opaque) /* Check to see if these blocks are already in the * backing file. */ - for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) { + for (i = 0; i < sectors_per_cluster;) { /* bdrv_is_allocated() only returns true/false based * on the first set of sectors it comes across that * are are all in the same state. @@ -391,8 +445,8 @@ static void coroutine_fn backup_run(void *opaque) * needed but at some point that is always the case. */ alloced = bdrv_is_allocated(bs, - start * BACKUP_SECTORS_PER_CLUSTER + i, - BACKUP_SECTORS_PER_CLUSTER - i, &n); + start * sectors_per_cluster + i, + sectors_per_cluster - i, &n); i += n; if (alloced == 1 || n == 0) { @@ -407,8 +461,8 @@ static void coroutine_fn backup_run(void *opaque) } } /* FULL sync mode we copy the whole drive. */ - ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER, - BACKUP_SECTORS_PER_CLUSTER, &error_is_read); + ret = backup_do_cow(bs, start * sectors_per_cluster, + sectors_per_cluster, &error_is_read, false); if (ret < 0) { /* Depending on error action, fail now or retry cluster */ BlockErrorAction action = @@ -428,22 +482,11 @@ static void coroutine_fn backup_run(void *opaque) /* wait until pending backup_do_cow() calls have completed */ qemu_co_rwlock_wrlock(&job->flush_rwlock); qemu_co_rwlock_unlock(&job->flush_rwlock); + g_free(job->done_bitmap); - if (job->sync_bitmap) { - BdrvDirtyBitmap *bm; - if (ret < 0 || block_job_is_cancelled(&job->common)) { - /* Merge the successor back into the parent, delete nothing. */ - bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL); - assert(bm); - } else { - /* Everything is fine, delete this bitmap and install the backup. */ - bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL); - assert(bm); - } + if (target->blk) { + blk_iostatus_disable(target->blk); } - hbitmap_free(job->bitmap); - - bdrv_iostatus_disable(target); bdrv_op_unblock_all(target, job->common.blocker); data = g_malloc(sizeof(*data)); @@ -457,9 +500,11 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, BlockdevOnError on_source_error, BlockdevOnError on_target_error, BlockCompletionFunc *cb, void *opaque, - Error **errp) + BlockJobTxn *txn, Error **errp) { int64_t len; + BlockDriverInfo bdi; + int ret; assert(bs); assert(target); @@ -472,7 +517,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && - !bdrv_iostatus_is_enabled(bs)) { + (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); return; } @@ -529,16 +574,35 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target, goto error; } - bdrv_op_block_all(target, job->common.blocker); - job->on_source_error = on_source_error; job->on_target_error = on_target_error; job->target = target; job->sync_mode = sync_mode; job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ? sync_bitmap : NULL; + + /* If there is no backing file on the target, we cannot rely on COW if our + * backup cluster size is smaller than the target cluster size. Even for + * targets with a backing file, try to avoid COW if possible. */ + ret = bdrv_get_info(job->target, &bdi); + if (ret < 0 && !target->backing) { + error_setg_errno(errp, -ret, + "Couldn't determine the cluster size of the target image, " + "which has no backing file"); + error_append_hint(errp, + "Aborting, since this may create an unusable destination image\n"); + goto error; + } else if (ret < 0 && target->backing) { + /* Not fatal; just trudge on ahead. */ + job->cluster_size = BACKUP_CLUSTER_SIZE_DEFAULT; + } else { + job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size); + } + + bdrv_op_block_all(target, job->common.blocker); job->common.len = len; job->common.co = qemu_coroutine_create(backup_run); + block_job_txn_add_job(txn, &job->common); qemu_coroutine_enter(job->common.co, job); return; diff --git a/qemu/block/blkdebug.c b/qemu/block/blkdebug.c index bc247f46f..20d25bda6 100644 --- a/qemu/block/blkdebug.c +++ b/qemu/block/blkdebug.c @@ -22,7 +22,9 @@ * THE SOFTWARE. */ -#include "qemu-common.h" +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/cutils.h" #include "qemu/config-file.h" #include "block/block_int.h" #include "qemu/module.h" @@ -30,12 +32,13 @@ #include "qapi/qmp/qdict.h" #include "qapi/qmp/qint.h" #include "qapi/qmp/qstring.h" +#include "sysemu/qtest.h" typedef struct BDRVBlkdebugState { int state; int new_state; - QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX]; + QLIST_HEAD(, BlkdebugRule) rules[BLKDBG__MAX]; QSIMPLEQ_HEAD(, BlkdebugRule) active_rules; QLIST_HEAD(, BlkdebugSuspendedReq) suspended_reqs; } BDRVBlkdebugState; @@ -63,7 +66,7 @@ enum { }; typedef struct BlkdebugRule { - BlkDebugEvent event; + BlkdebugEvent event; int action; int state; union { @@ -142,69 +145,12 @@ static QemuOptsList *config_groups[] = { NULL }; -static const char *event_names[BLKDBG_EVENT_MAX] = { - [BLKDBG_L1_UPDATE] = "l1_update", - [BLKDBG_L1_GROW_ALLOC_TABLE] = "l1_grow.alloc_table", - [BLKDBG_L1_GROW_WRITE_TABLE] = "l1_grow.write_table", - [BLKDBG_L1_GROW_ACTIVATE_TABLE] = "l1_grow.activate_table", - - [BLKDBG_L2_LOAD] = "l2_load", - [BLKDBG_L2_UPDATE] = "l2_update", - [BLKDBG_L2_UPDATE_COMPRESSED] = "l2_update_compressed", - [BLKDBG_L2_ALLOC_COW_READ] = "l2_alloc.cow_read", - [BLKDBG_L2_ALLOC_WRITE] = "l2_alloc.write", - - [BLKDBG_READ_AIO] = "read_aio", - [BLKDBG_READ_BACKING_AIO] = "read_backing_aio", - [BLKDBG_READ_COMPRESSED] = "read_compressed", - - [BLKDBG_WRITE_AIO] = "write_aio", - [BLKDBG_WRITE_COMPRESSED] = "write_compressed", - - [BLKDBG_VMSTATE_LOAD] = "vmstate_load", - [BLKDBG_VMSTATE_SAVE] = "vmstate_save", - - [BLKDBG_COW_READ] = "cow_read", - [BLKDBG_COW_WRITE] = "cow_write", - - [BLKDBG_REFTABLE_LOAD] = "reftable_load", - [BLKDBG_REFTABLE_GROW] = "reftable_grow", - [BLKDBG_REFTABLE_UPDATE] = "reftable_update", - - [BLKDBG_REFBLOCK_LOAD] = "refblock_load", - [BLKDBG_REFBLOCK_UPDATE] = "refblock_update", - [BLKDBG_REFBLOCK_UPDATE_PART] = "refblock_update_part", - [BLKDBG_REFBLOCK_ALLOC] = "refblock_alloc", - [BLKDBG_REFBLOCK_ALLOC_HOOKUP] = "refblock_alloc.hookup", - [BLKDBG_REFBLOCK_ALLOC_WRITE] = "refblock_alloc.write", - [BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS] = "refblock_alloc.write_blocks", - [BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE] = "refblock_alloc.write_table", - [BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE] = "refblock_alloc.switch_table", - - [BLKDBG_CLUSTER_ALLOC] = "cluster_alloc", - [BLKDBG_CLUSTER_ALLOC_BYTES] = "cluster_alloc_bytes", - [BLKDBG_CLUSTER_FREE] = "cluster_free", - - [BLKDBG_FLUSH_TO_OS] = "flush_to_os", - [BLKDBG_FLUSH_TO_DISK] = "flush_to_disk", - - [BLKDBG_PWRITEV_RMW_HEAD] = "pwritev_rmw.head", - [BLKDBG_PWRITEV_RMW_AFTER_HEAD] = "pwritev_rmw.after_head", - [BLKDBG_PWRITEV_RMW_TAIL] = "pwritev_rmw.tail", - [BLKDBG_PWRITEV_RMW_AFTER_TAIL] = "pwritev_rmw.after_tail", - [BLKDBG_PWRITEV] = "pwritev", - [BLKDBG_PWRITEV_ZERO] = "pwritev_zero", - [BLKDBG_PWRITEV_DONE] = "pwritev_done", - - [BLKDBG_EMPTY_IMAGE_PREPARE] = "empty_image_prepare", -}; - -static int get_event_by_name(const char *name, BlkDebugEvent *event) +static int get_event_by_name(const char *name, BlkdebugEvent *event) { int i; - for (i = 0; i < BLKDBG_EVENT_MAX; i++) { - if (!strcmp(event_names[i], name)) { + for (i = 0; i < BLKDBG__MAX; i++) { + if (!strcmp(BlkdebugEvent_lookup[i], name)) { *event = i; return 0; } @@ -223,7 +169,7 @@ static int add_rule(void *opaque, QemuOpts *opts, Error **errp) struct add_rule_data *d = opaque; BDRVBlkdebugState *s = d->s; const char* event_name; - BlkDebugEvent event; + BlkdebugEvent event; struct BlkdebugRule *rule; /* Find the right event for the rule */ @@ -426,11 +372,11 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, /* Set initial state */ s->state = 1; - /* Open the backing file */ - assert(bs->file == NULL); - ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-image"), options, "image", - bs, &child_file, false, &local_err); - if (ret < 0) { + /* Open the image file */ + bs->file = bdrv_open_child(qemu_opt_get(opts, "x-image"), options, "image", + bs, &child_file, false, &local_err); + if (local_err) { + ret = -EINVAL; error_propagate(errp, local_err); goto out; } @@ -449,7 +395,7 @@ static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, goto out; fail_unref: - bdrv_unref(bs->file); + bdrv_unref_child(bs, bs->file); out: qemu_opts_del(opts); return ret; @@ -510,7 +456,8 @@ static BlockAIOCB *blkdebug_aio_readv(BlockDriverState *bs, return inject_error(bs, cb, opaque, rule); } - return bdrv_aio_readv(bs->file, sector_num, qiov, nb_sectors, cb, opaque); + return bdrv_aio_readv(bs->file->bs, sector_num, qiov, nb_sectors, + cb, opaque); } static BlockAIOCB *blkdebug_aio_writev(BlockDriverState *bs, @@ -532,7 +479,8 @@ static BlockAIOCB *blkdebug_aio_writev(BlockDriverState *bs, return inject_error(bs, cb, opaque, rule); } - return bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, cb, opaque); + return bdrv_aio_writev(bs->file->bs, sector_num, qiov, nb_sectors, + cb, opaque); } static BlockAIOCB *blkdebug_aio_flush(BlockDriverState *bs, @@ -551,7 +499,7 @@ static BlockAIOCB *blkdebug_aio_flush(BlockDriverState *bs, return inject_error(bs, cb, opaque, rule); } - return bdrv_aio_flush(bs->file, cb, opaque); + return bdrv_aio_flush(bs->file->bs, cb, opaque); } @@ -561,7 +509,7 @@ static void blkdebug_close(BlockDriverState *bs) BlkdebugRule *rule, *next; int i; - for (i = 0; i < BLKDBG_EVENT_MAX; i++) { + for (i = 0; i < BLKDBG__MAX; i++) { QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) { remove_rule(rule); } @@ -581,9 +529,13 @@ static void suspend_request(BlockDriverState *bs, BlkdebugRule *rule) remove_rule(rule); QLIST_INSERT_HEAD(&s->suspended_reqs, &r, next); - printf("blkdebug: Suspended request '%s'\n", r.tag); + if (!qtest_enabled()) { + printf("blkdebug: Suspended request '%s'\n", r.tag); + } qemu_coroutine_yield(); - printf("blkdebug: Resuming request '%s'\n", r.tag); + if (!qtest_enabled()) { + printf("blkdebug: Resuming request '%s'\n", r.tag); + } QLIST_REMOVE(&r, next); g_free(r.tag); @@ -620,13 +572,13 @@ static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule, return injected; } -static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event) +static void blkdebug_debug_event(BlockDriverState *bs, BlkdebugEvent event) { BDRVBlkdebugState *s = bs->opaque; struct BlkdebugRule *rule, *next; bool injected; - assert((int)event >= 0 && event < BLKDBG_EVENT_MAX); + assert((int)event >= 0 && event < BLKDBG__MAX); injected = false; s->new_state = s->state; @@ -641,7 +593,7 @@ static int blkdebug_debug_breakpoint(BlockDriverState *bs, const char *event, { BDRVBlkdebugState *s = bs->opaque; struct BlkdebugRule *rule; - BlkDebugEvent blkdebug_event; + BlkdebugEvent blkdebug_event; if (get_event_by_name(event, &blkdebug_event) < 0) { return -ENOENT; @@ -683,7 +635,7 @@ static int blkdebug_debug_remove_breakpoint(BlockDriverState *bs, BlkdebugRule *rule, *next; int i, ret = -ENOENT; - for (i = 0; i < BLKDBG_EVENT_MAX; i++) { + for (i = 0; i < BLKDBG__MAX; i++) { QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) { if (rule->action == ACTION_SUSPEND && !strcmp(rule->options.suspend.tag, tag)) { @@ -716,55 +668,50 @@ static bool blkdebug_debug_is_suspended(BlockDriverState *bs, const char *tag) static int64_t blkdebug_getlength(BlockDriverState *bs) { - return bdrv_getlength(bs->file); + return bdrv_getlength(bs->file->bs); } static int blkdebug_truncate(BlockDriverState *bs, int64_t offset) { - return bdrv_truncate(bs->file, offset); + return bdrv_truncate(bs->file->bs, offset); } -static void blkdebug_refresh_filename(BlockDriverState *bs) +static void blkdebug_refresh_filename(BlockDriverState *bs, QDict *options) { QDict *opts; const QDictEntry *e; bool force_json = false; - for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) { + for (e = qdict_first(options); e; e = qdict_next(options, e)) { if (strcmp(qdict_entry_key(e), "config") && - strcmp(qdict_entry_key(e), "x-image") && - strcmp(qdict_entry_key(e), "image") && - strncmp(qdict_entry_key(e), "image.", strlen("image."))) + strcmp(qdict_entry_key(e), "x-image")) { force_json = true; break; } } - if (force_json && !bs->file->full_open_options) { + if (force_json && !bs->file->bs->full_open_options) { /* The config file cannot be recreated, so creating a plain filename * is impossible */ return; } - if (!force_json && bs->file->exact_filename[0]) { + if (!force_json && bs->file->bs->exact_filename[0]) { snprintf(bs->exact_filename, sizeof(bs->exact_filename), "blkdebug:%s:%s", - qdict_get_try_str(bs->options, "config") ?: "", - bs->file->exact_filename); + qdict_get_try_str(options, "config") ?: "", + bs->file->bs->exact_filename); } opts = qdict_new(); qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("blkdebug"))); - QINCREF(bs->file->full_open_options); - qdict_put_obj(opts, "image", QOBJECT(bs->file->full_open_options)); + QINCREF(bs->file->bs->full_open_options); + qdict_put_obj(opts, "image", QOBJECT(bs->file->bs->full_open_options)); - for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) { - if (strcmp(qdict_entry_key(e), "x-image") && - strcmp(qdict_entry_key(e), "image") && - strncmp(qdict_entry_key(e), "image.", strlen("image."))) - { + for (e = qdict_first(options); e; e = qdict_next(options, e)) { + if (strcmp(qdict_entry_key(e), "x-image")) { qobject_incref(qdict_entry_value(e)); qdict_put_obj(opts, qdict_entry_key(e), qdict_entry_value(e)); } @@ -773,6 +720,12 @@ static void blkdebug_refresh_filename(BlockDriverState *bs) bs->full_open_options = opts; } +static int blkdebug_reopen_prepare(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + static BlockDriver bdrv_blkdebug = { .format_name = "blkdebug", .protocol_name = "blkdebug", @@ -781,6 +734,7 @@ static BlockDriver bdrv_blkdebug = { .bdrv_parse_filename = blkdebug_parse_filename, .bdrv_file_open = blkdebug_open, .bdrv_close = blkdebug_close, + .bdrv_reopen_prepare = blkdebug_reopen_prepare, .bdrv_getlength = blkdebug_getlength, .bdrv_truncate = blkdebug_truncate, .bdrv_refresh_filename = blkdebug_refresh_filename, diff --git a/qemu/block/blkreplay.c b/qemu/block/blkreplay.c new file mode 100755 index 000000000..42f1813af --- /dev/null +++ b/qemu/block/blkreplay.c @@ -0,0 +1,160 @@ +/* + * Block protocol for record/replay + * + * Copyright (c) 2010-2016 Institute for System Programming + * of the Russian Academy of Sciences. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include "qemu/osdep.h" +#include "qemu-common.h" +#include "block/block_int.h" +#include "sysemu/replay.h" +#include "qapi/error.h" + +typedef struct Request { + Coroutine *co; + QEMUBH *bh; +} Request; + +/* Next request id. + This counter is global, because requests from different + block devices should not get overlapping ids. */ +static uint64_t request_id; + +static int blkreplay_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + Error *local_err = NULL; + int ret; + + /* Open the image file */ + bs->file = bdrv_open_child(NULL, options, "image", + bs, &child_file, false, &local_err); + if (local_err) { + ret = -EINVAL; + error_propagate(errp, local_err); + goto fail; + } + + ret = 0; +fail: + if (ret < 0) { + bdrv_unref_child(bs, bs->file); + } + return ret; +} + +static void blkreplay_close(BlockDriverState *bs) +{ +} + +static int64_t blkreplay_getlength(BlockDriverState *bs) +{ + return bdrv_getlength(bs->file->bs); +} + +/* This bh is used for synchronization of return from coroutines. + It continues yielded coroutine which then finishes its execution. + BH is called adjusted to some replay checkpoint, therefore + record and replay will always finish coroutines deterministically. +*/ +static void blkreplay_bh_cb(void *opaque) +{ + Request *req = opaque; + qemu_coroutine_enter(req->co, NULL); + qemu_bh_delete(req->bh); + g_free(req); +} + +static void block_request_create(uint64_t reqid, BlockDriverState *bs, + Coroutine *co) +{ + Request *req = g_new(Request, 1); + *req = (Request) { + .co = co, + .bh = aio_bh_new(bdrv_get_aio_context(bs), blkreplay_bh_cb, req), + }; + replay_block_event(req->bh, reqid); +} + +static int coroutine_fn blkreplay_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + uint64_t reqid = request_id++; + int ret = bdrv_co_readv(bs->file->bs, sector_num, nb_sectors, qiov); + block_request_create(reqid, bs, qemu_coroutine_self()); + qemu_coroutine_yield(); + + return ret; +} + +static int coroutine_fn blkreplay_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + uint64_t reqid = request_id++; + int ret = bdrv_co_writev(bs->file->bs, sector_num, nb_sectors, qiov); + block_request_create(reqid, bs, qemu_coroutine_self()); + qemu_coroutine_yield(); + + return ret; +} + +static int coroutine_fn blkreplay_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + uint64_t reqid = request_id++; + int ret = bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags); + block_request_create(reqid, bs, qemu_coroutine_self()); + qemu_coroutine_yield(); + + return ret; +} + +static int coroutine_fn blkreplay_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + uint64_t reqid = request_id++; + int ret = bdrv_co_discard(bs->file->bs, sector_num, nb_sectors); + block_request_create(reqid, bs, qemu_coroutine_self()); + qemu_coroutine_yield(); + + return ret; +} + +static int coroutine_fn blkreplay_co_flush(BlockDriverState *bs) +{ + uint64_t reqid = request_id++; + int ret = bdrv_co_flush(bs->file->bs); + block_request_create(reqid, bs, qemu_coroutine_self()); + qemu_coroutine_yield(); + + return ret; +} + +static BlockDriver bdrv_blkreplay = { + .format_name = "blkreplay", + .protocol_name = "blkreplay", + .instance_size = 0, + + .bdrv_file_open = blkreplay_open, + .bdrv_close = blkreplay_close, + .bdrv_getlength = blkreplay_getlength, + + .bdrv_co_readv = blkreplay_co_readv, + .bdrv_co_writev = blkreplay_co_writev, + + .bdrv_co_write_zeroes = blkreplay_co_write_zeroes, + .bdrv_co_discard = blkreplay_co_discard, + .bdrv_co_flush = blkreplay_co_flush, +}; + +static void bdrv_blkreplay_init(void) +{ + bdrv_register(&bdrv_blkreplay); +} + +block_init(bdrv_blkreplay_init); diff --git a/qemu/block/blkverify.c b/qemu/block/blkverify.c index d277e6322..9414b7a84 100644 --- a/qemu/block/blkverify.c +++ b/qemu/block/blkverify.c @@ -7,14 +7,16 @@ * See the COPYING file in the top-level directory. */ -#include <stdarg.h> +#include "qemu/osdep.h" +#include "qapi/error.h" #include "qemu/sockets.h" /* for EINPROGRESS on Windows */ #include "block/block_int.h" #include "qapi/qmp/qdict.h" #include "qapi/qmp/qstring.h" +#include "qemu/cutils.h" typedef struct { - BlockDriverState *test_file; + BdrvChild *test_file; } BDRVBlkverifyState; typedef struct BlkverifyAIOCB BlkverifyAIOCB; @@ -123,26 +125,29 @@ static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, } /* Open the raw file */ - assert(bs->file == NULL); - ret = bdrv_open_image(&bs->file, qemu_opt_get(opts, "x-raw"), options, - "raw", bs, &child_file, false, &local_err); - if (ret < 0) { + bs->file = bdrv_open_child(qemu_opt_get(opts, "x-raw"), options, "raw", + bs, &child_file, false, &local_err); + if (local_err) { + ret = -EINVAL; error_propagate(errp, local_err); goto fail; } /* Open the test file */ - assert(s->test_file == NULL); - ret = bdrv_open_image(&s->test_file, qemu_opt_get(opts, "x-image"), options, - "test", bs, &child_format, false, &local_err); - if (ret < 0) { + s->test_file = bdrv_open_child(qemu_opt_get(opts, "x-image"), options, + "test", bs, &child_format, false, + &local_err); + if (local_err) { + ret = -EINVAL; error_propagate(errp, local_err); - s->test_file = NULL; goto fail; } ret = 0; fail: + if (ret < 0) { + bdrv_unref_child(bs, bs->file); + } qemu_opts_del(opts); return ret; } @@ -151,7 +156,7 @@ static void blkverify_close(BlockDriverState *bs) { BDRVBlkverifyState *s = bs->opaque; - bdrv_unref(s->test_file); + bdrv_unref_child(bs, s->test_file); s->test_file = NULL; } @@ -159,7 +164,7 @@ static int64_t blkverify_getlength(BlockDriverState *bs) { BDRVBlkverifyState *s = bs->opaque; - return bdrv_getlength(s->test_file); + return bdrv_getlength(s->test_file->bs); } static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write, @@ -238,13 +243,13 @@ static BlockAIOCB *blkverify_aio_readv(BlockDriverState *bs, nb_sectors, cb, opaque); acb->verify = blkverify_verify_readv; - acb->buf = qemu_blockalign(bs->file, qiov->size); + acb->buf = qemu_blockalign(bs->file->bs, qiov->size); qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov); qemu_iovec_clone(&acb->raw_qiov, qiov, acb->buf); - bdrv_aio_readv(s->test_file, sector_num, qiov, nb_sectors, + bdrv_aio_readv(s->test_file->bs, sector_num, qiov, nb_sectors, blkverify_aio_cb, acb); - bdrv_aio_readv(bs->file, sector_num, &acb->raw_qiov, nb_sectors, + bdrv_aio_readv(bs->file->bs, sector_num, &acb->raw_qiov, nb_sectors, blkverify_aio_cb, acb); return &acb->common; } @@ -257,9 +262,9 @@ static BlockAIOCB *blkverify_aio_writev(BlockDriverState *bs, BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov, nb_sectors, cb, opaque); - bdrv_aio_writev(s->test_file, sector_num, qiov, nb_sectors, + bdrv_aio_writev(s->test_file->bs, sector_num, qiov, nb_sectors, blkverify_aio_cb, acb); - bdrv_aio_writev(bs->file, sector_num, qiov, nb_sectors, + bdrv_aio_writev(bs->file->bs, sector_num, qiov, nb_sectors, blkverify_aio_cb, acb); return &acb->common; } @@ -271,7 +276,7 @@ static BlockAIOCB *blkverify_aio_flush(BlockDriverState *bs, BDRVBlkverifyState *s = bs->opaque; /* Only flush test file, the raw file is not important */ - return bdrv_aio_flush(s->test_file, cb, opaque); + return bdrv_aio_flush(s->test_file->bs, cb, opaque); } static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs, @@ -279,13 +284,13 @@ static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs, { BDRVBlkverifyState *s = bs->opaque; - bool perm = bdrv_recurse_is_first_non_filter(bs->file, candidate); + bool perm = bdrv_recurse_is_first_non_filter(bs->file->bs, candidate); if (perm) { return true; } - return bdrv_recurse_is_first_non_filter(s->test_file, candidate); + return bdrv_recurse_is_first_non_filter(s->test_file->bs, candidate); } /* Propagate AioContext changes to ->test_file */ @@ -293,7 +298,7 @@ static void blkverify_detach_aio_context(BlockDriverState *bs) { BDRVBlkverifyState *s = bs->opaque; - bdrv_detach_aio_context(s->test_file); + bdrv_detach_aio_context(s->test_file->bs); } static void blkverify_attach_aio_context(BlockDriverState *bs, @@ -301,32 +306,38 @@ static void blkverify_attach_aio_context(BlockDriverState *bs, { BDRVBlkverifyState *s = bs->opaque; - bdrv_attach_aio_context(s->test_file, new_context); + bdrv_attach_aio_context(s->test_file->bs, new_context); } -static void blkverify_refresh_filename(BlockDriverState *bs) +static void blkverify_refresh_filename(BlockDriverState *bs, QDict *options) { BDRVBlkverifyState *s = bs->opaque; - /* bs->file has already been refreshed */ - bdrv_refresh_filename(s->test_file); + /* bs->file->bs has already been refreshed */ + bdrv_refresh_filename(s->test_file->bs); - if (bs->file->full_open_options && s->test_file->full_open_options) { + if (bs->file->bs->full_open_options + && s->test_file->bs->full_open_options) + { QDict *opts = qdict_new(); qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("blkverify"))); - QINCREF(bs->file->full_open_options); - qdict_put_obj(opts, "raw", QOBJECT(bs->file->full_open_options)); - QINCREF(s->test_file->full_open_options); - qdict_put_obj(opts, "test", QOBJECT(s->test_file->full_open_options)); + QINCREF(bs->file->bs->full_open_options); + qdict_put_obj(opts, "raw", QOBJECT(bs->file->bs->full_open_options)); + QINCREF(s->test_file->bs->full_open_options); + qdict_put_obj(opts, "test", + QOBJECT(s->test_file->bs->full_open_options)); bs->full_open_options = opts; } - if (bs->file->exact_filename[0] && s->test_file->exact_filename[0]) { + if (bs->file->bs->exact_filename[0] + && s->test_file->bs->exact_filename[0]) + { snprintf(bs->exact_filename, sizeof(bs->exact_filename), "blkverify:%s:%s", - bs->file->exact_filename, s->test_file->exact_filename); + bs->file->bs->exact_filename, + s->test_file->bs->exact_filename); } } diff --git a/qemu/block/block-backend.c b/qemu/block/block-backend.c index aee8a1202..16c9d5e0f 100644 --- a/qemu/block/block-backend.c +++ b/qemu/block/block-backend.c @@ -10,74 +10,105 @@ * or later. See the COPYING.LIB file in the top-level directory. */ +#include "qemu/osdep.h" #include "sysemu/block-backend.h" #include "block/block_int.h" +#include "block/blockjob.h" +#include "block/throttle-groups.h" #include "sysemu/blockdev.h" +#include "sysemu/sysemu.h" #include "qapi-event.h" +#include "qemu/id.h" /* Number of coroutines to reserve per attached device model */ #define COROUTINE_POOL_RESERVATION 64 +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ + +static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb); + struct BlockBackend { char *name; int refcnt; - BlockDriverState *bs; + BdrvChild *root; DriveInfo *legacy_dinfo; /* null unless created by drive_new() */ - QTAILQ_ENTRY(BlockBackend) link; /* for blk_backends */ + QTAILQ_ENTRY(BlockBackend) link; /* for block_backends */ + QTAILQ_ENTRY(BlockBackend) monitor_link; /* for monitor_block_backends */ void *dev; /* attached device model, if any */ /* TODO change to DeviceState when all users are qdevified */ const BlockDevOps *dev_ops; void *dev_opaque; + + /* the block size for which the guest device expects atomicity */ + int guest_block_size; + + /* If the BDS tree is removed, some of its options are stored here (which + * can be used to restore those options in the new BDS on insert) */ + BlockBackendRootState root_state; + + bool enable_write_cache; + + /* I/O stats (display with "info blockstats"). */ + BlockAcctStats stats; + + BlockdevOnError on_read_error, on_write_error; + bool iostatus_enabled; + BlockDeviceIoStatus iostatus; + + bool allow_write_beyond_eof; + + NotifierList remove_bs_notifiers, insert_bs_notifiers; }; typedef struct BlockBackendAIOCB { BlockAIOCB common; QEMUBH *bh; + BlockBackend *blk; int ret; } BlockBackendAIOCB; static const AIOCBInfo block_backend_aiocb_info = { + .get_aio_context = blk_aiocb_get_aio_context, .aiocb_size = sizeof(BlockBackendAIOCB), }; static void drive_info_del(DriveInfo *dinfo); -/* All the BlockBackends (except for hidden ones) */ -static QTAILQ_HEAD(, BlockBackend) blk_backends = - QTAILQ_HEAD_INITIALIZER(blk_backends); +/* All BlockBackends */ +static QTAILQ_HEAD(, BlockBackend) block_backends = + QTAILQ_HEAD_INITIALIZER(block_backends); + +/* All BlockBackends referenced by the monitor and which are iterated through by + * blk_next() */ +static QTAILQ_HEAD(, BlockBackend) monitor_block_backends = + QTAILQ_HEAD_INITIALIZER(monitor_block_backends); + +static void blk_root_inherit_options(int *child_flags, QDict *child_options, + int parent_flags, QDict *parent_options) +{ + /* We're not supposed to call this function for root nodes */ + abort(); +} + +static const BdrvChildRole child_root = { + .inherit_options = blk_root_inherit_options, +}; /* - * Create a new BlockBackend with @name, with a reference count of one. - * @name must not be null or empty. - * Fail if a BlockBackend with this name already exists. + * Create a new BlockBackend with a reference count of one. * Store an error through @errp on failure, unless it's null. * Return the new BlockBackend on success, null on failure. */ -BlockBackend *blk_new(const char *name, Error **errp) +BlockBackend *blk_new(Error **errp) { BlockBackend *blk; - assert(name && name[0]); - if (!id_wellformed(name)) { - error_setg(errp, "Invalid device name"); - return NULL; - } - if (blk_by_name(name)) { - error_setg(errp, "Device with id '%s' already exists", name); - return NULL; - } - if (bdrv_find_node(name)) { - error_setg(errp, - "Device name '%s' conflicts with an existing node name", - name); - return NULL; - } - blk = g_new0(BlockBackend, 1); - blk->name = g_strdup(name); blk->refcnt = 1; - QTAILQ_INSERT_TAIL(&blk_backends, blk, link); + notifier_list_init(&blk->remove_bs_notifiers); + notifier_list_init(&blk->insert_bs_notifiers); + QTAILQ_INSERT_TAIL(&block_backends, blk, link); return blk; } @@ -85,18 +116,18 @@ BlockBackend *blk_new(const char *name, Error **errp) * Create a new BlockBackend with a new BlockDriverState attached. * Otherwise just like blk_new(), which see. */ -BlockBackend *blk_new_with_bs(const char *name, Error **errp) +BlockBackend *blk_new_with_bs(Error **errp) { BlockBackend *blk; BlockDriverState *bs; - blk = blk_new(name, errp); + blk = blk_new(errp); if (!blk) { return NULL; } bs = bdrv_new_root(); - blk->bs = bs; + blk->root = bdrv_root_attach_child(bs, "root", &child_root); bs->blk = blk; return blk; } @@ -113,44 +144,46 @@ BlockBackend *blk_new_with_bs(const char *name, Error **errp) * though, so callers of this function have to be able to specify @filename and * @flags. */ -BlockBackend *blk_new_open(const char *name, const char *filename, - const char *reference, QDict *options, int flags, - Error **errp) +BlockBackend *blk_new_open(const char *filename, const char *reference, + QDict *options, int flags, Error **errp) { BlockBackend *blk; int ret; - blk = blk_new_with_bs(name, errp); + blk = blk_new_with_bs(errp); if (!blk) { QDECREF(options); return NULL; } - ret = bdrv_open(&blk->bs, filename, reference, options, flags, NULL, errp); + ret = bdrv_open(&blk->root->bs, filename, reference, options, flags, errp); if (ret < 0) { blk_unref(blk); return NULL; } + blk_set_enable_write_cache(blk, true); + return blk; } static void blk_delete(BlockBackend *blk) { assert(!blk->refcnt); + assert(!blk->name); assert(!blk->dev); - if (blk->bs) { - assert(blk->bs->blk == blk); - blk->bs->blk = NULL; - bdrv_unref(blk->bs); - blk->bs = NULL; + if (blk->root) { + blk_remove_bs(blk); } - /* Avoid double-remove after blk_hide_on_behalf_of_hmp_drive_del() */ - if (blk->name[0]) { - QTAILQ_REMOVE(&blk_backends, blk, link); + assert(QLIST_EMPTY(&blk->remove_bs_notifiers.notifiers)); + assert(QLIST_EMPTY(&blk->insert_bs_notifiers.notifiers)); + if (blk->root_state.throttle_state) { + g_free(blk->root_state.throttle_group); + throttle_group_unref(blk->root_state.throttle_state); } - g_free(blk->name); + QTAILQ_REMOVE(&block_backends, blk, link); drive_info_del(blk->legacy_dinfo); + block_acct_cleanup(&blk->stats); g_free(blk); } @@ -164,6 +197,11 @@ static void drive_info_del(DriveInfo *dinfo) g_free(dinfo); } +int blk_get_refcnt(BlockBackend *blk) +{ + return blk ? blk->refcnt : 0; +} + /* * Increment @blk's reference count. * @blk must not be null. @@ -189,7 +227,32 @@ void blk_unref(BlockBackend *blk) } /* - * Return the BlockBackend after @blk. + * Behaves similarly to blk_next() but iterates over all BlockBackends, even the + * ones which are hidden (i.e. are not referenced by the monitor). + */ +static BlockBackend *blk_all_next(BlockBackend *blk) +{ + return blk ? QTAILQ_NEXT(blk, link) + : QTAILQ_FIRST(&block_backends); +} + +void blk_remove_all_bs(void) +{ + BlockBackend *blk = NULL; + + while ((blk = blk_all_next(blk)) != NULL) { + AioContext *ctx = blk_get_aio_context(blk); + + aio_context_acquire(ctx); + if (blk->root) { + blk_remove_bs(blk); + } + aio_context_release(ctx); + } +} + +/* + * Return the monitor-owned BlockBackend after @blk. * If @blk is null, return the first one. * Else, return @blk's next sibling, which may be null. * @@ -200,17 +263,91 @@ void blk_unref(BlockBackend *blk) */ BlockBackend *blk_next(BlockBackend *blk) { - return blk ? QTAILQ_NEXT(blk, link) : QTAILQ_FIRST(&blk_backends); + return blk ? QTAILQ_NEXT(blk, monitor_link) + : QTAILQ_FIRST(&monitor_block_backends); +} + +/* + * Iterates over all BlockDriverStates which are attached to a BlockBackend. + * This function is for use by bdrv_next(). + * + * @bs must be NULL or a BDS that is attached to a BB. + */ +BlockDriverState *blk_next_root_bs(BlockDriverState *bs) +{ + BlockBackend *blk; + + if (bs) { + assert(bs->blk); + blk = bs->blk; + } else { + blk = NULL; + } + + do { + blk = blk_all_next(blk); + } while (blk && !blk->root); + + return blk ? blk->root->bs : NULL; +} + +/* + * Add a BlockBackend into the list of backends referenced by the monitor, with + * the given @name acting as the handle for the monitor. + * Strictly for use by blockdev.c. + * + * @name must not be null or empty. + * + * Returns true on success and false on failure. In the latter case, an Error + * object is returned through @errp. + */ +bool monitor_add_blk(BlockBackend *blk, const char *name, Error **errp) +{ + assert(!blk->name); + assert(name && name[0]); + + if (!id_wellformed(name)) { + error_setg(errp, "Invalid device name"); + return false; + } + if (blk_by_name(name)) { + error_setg(errp, "Device with id '%s' already exists", name); + return false; + } + if (bdrv_find_node(name)) { + error_setg(errp, + "Device name '%s' conflicts with an existing node name", + name); + return false; + } + + blk->name = g_strdup(name); + QTAILQ_INSERT_TAIL(&monitor_block_backends, blk, monitor_link); + return true; +} + +/* + * Remove a BlockBackend from the list of backends referenced by the monitor. + * Strictly for use by blockdev.c. + */ +void monitor_remove_blk(BlockBackend *blk) +{ + if (!blk->name) { + return; + } + + QTAILQ_REMOVE(&monitor_block_backends, blk, monitor_link); + g_free(blk->name); + blk->name = NULL; } /* * Return @blk's name, a non-null string. - * Wart: the name is empty iff @blk has been hidden with - * blk_hide_on_behalf_of_hmp_drive_del(). + * Returns an empty string iff @blk is not referenced by the monitor. */ const char *blk_name(BlockBackend *blk) { - return blk->name; + return blk->name ?: ""; } /* @@ -219,10 +356,10 @@ const char *blk_name(BlockBackend *blk) */ BlockBackend *blk_by_name(const char *name) { - BlockBackend *blk; + BlockBackend *blk = NULL; assert(name); - QTAILQ_FOREACH(blk, &blk_backends, link) { + while ((blk = blk_next(blk)) != NULL) { if (!strcmp(name, blk->name)) { return blk; } @@ -235,7 +372,7 @@ BlockBackend *blk_by_name(const char *name) */ BlockDriverState *blk_bs(BlockBackend *blk) { - return blk->bs; + return blk->root ? blk->root->bs : NULL; } /* @@ -263,9 +400,9 @@ DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo) */ BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo) { - BlockBackend *blk; + BlockBackend *blk = NULL; - QTAILQ_FOREACH(blk, &blk_backends, link) { + while ((blk = blk_next(blk)) != NULL) { if (blk->legacy_dinfo == dinfo) { return blk; } @@ -274,21 +411,32 @@ BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo) } /* - * Hide @blk. - * @blk must not have been hidden already. - * Make attached BlockDriverState, if any, anonymous. - * Once hidden, @blk is invisible to all functions that don't receive - * it as argument. For example, blk_by_name() won't return it. - * Strictly for use by do_drive_del(). - * TODO get rid of it! + * Disassociates the currently associated BlockDriverState from @blk. */ -void blk_hide_on_behalf_of_hmp_drive_del(BlockBackend *blk) +void blk_remove_bs(BlockBackend *blk) { - QTAILQ_REMOVE(&blk_backends, blk, link); - blk->name[0] = 0; - if (blk->bs) { - bdrv_make_anon(blk->bs); - } + assert(blk->root->bs->blk == blk); + + notifier_list_notify(&blk->remove_bs_notifiers, blk); + + blk_update_root_state(blk); + + blk->root->bs->blk = NULL; + bdrv_root_unref_child(blk->root); + blk->root = NULL; +} + +/* + * Associates a new BlockDriverState with @blk. + */ +void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs) +{ + assert(!blk->root && !bs->blk); + bdrv_ref(bs); + blk->root = bdrv_root_attach_child(bs, "root", &child_root); + bs->blk = blk; + + notifier_list_notify(&blk->insert_bs_notifiers, blk); } /* @@ -303,7 +451,7 @@ int blk_attach_dev(BlockBackend *blk, void *dev) } blk_ref(blk); blk->dev = dev; - bdrv_iostatus_reset(blk->bs); + blk_iostatus_reset(blk); return 0; } @@ -330,7 +478,7 @@ void blk_detach_dev(BlockBackend *blk, void *dev) blk->dev = NULL; blk->dev_ops = NULL; blk->dev_opaque = NULL; - bdrv_set_guest_block_size(blk->bs, 512); + blk->guest_block_size = 512; blk_unref(blk); } @@ -364,18 +512,15 @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, void blk_dev_change_media_cb(BlockBackend *blk, bool load) { if (blk->dev_ops && blk->dev_ops->change_media_cb) { - bool tray_was_closed = !blk_dev_is_tray_open(blk); + bool tray_was_open, tray_is_open; + tray_was_open = blk_dev_is_tray_open(blk); blk->dev_ops->change_media_cb(blk->dev_opaque, load); - if (tray_was_closed) { - /* tray open */ - qapi_event_send_device_tray_moved(blk_name(blk), - true, &error_abort); - } - if (load) { - /* tray close */ - qapi_event_send_device_tray_moved(blk_name(blk), - false, &error_abort); + tray_is_open = blk_dev_is_tray_open(blk); + + if (tray_was_open != tray_is_open) { + qapi_event_send_device_tray_moved(blk_name(blk), tray_is_open, + &error_abort); } } } @@ -390,6 +535,14 @@ bool blk_dev_has_removable_media(BlockBackend *blk) } /* + * Does @blk's attached device model have a tray? + */ +bool blk_dev_has_tray(BlockBackend *blk) +{ + return blk->dev_ops && blk->dev_ops->is_tray_open; +} + +/* * Notify @blk's attached device model of a media eject request. * If @force is true, the medium is about to be yanked out forcefully. */ @@ -405,7 +558,7 @@ void blk_dev_eject_request(BlockBackend *blk, bool force) */ bool blk_dev_is_tray_open(BlockBackend *blk) { - if (blk->dev_ops && blk->dev_ops->is_tray_open) { + if (blk_dev_has_tray(blk)) { return blk->dev_ops->is_tray_open(blk->dev_opaque); } return false; @@ -435,7 +588,53 @@ void blk_dev_resize_cb(BlockBackend *blk) void blk_iostatus_enable(BlockBackend *blk) { - bdrv_iostatus_enable(blk->bs); + blk->iostatus_enabled = true; + blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK; +} + +/* The I/O status is only enabled if the drive explicitly + * enables it _and_ the VM is configured to stop on errors */ +bool blk_iostatus_is_enabled(const BlockBackend *blk) +{ + return (blk->iostatus_enabled && + (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC || + blk->on_write_error == BLOCKDEV_ON_ERROR_STOP || + blk->on_read_error == BLOCKDEV_ON_ERROR_STOP)); +} + +BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk) +{ + return blk->iostatus; +} + +void blk_iostatus_disable(BlockBackend *blk) +{ + blk->iostatus_enabled = false; +} + +void blk_iostatus_reset(BlockBackend *blk) +{ + if (blk_iostatus_is_enabled(blk)) { + BlockDriverState *bs = blk_bs(blk); + blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK; + if (bs && bs->job) { + block_job_iostatus_reset(bs->job); + } + } +} + +void blk_iostatus_set_err(BlockBackend *blk, int error) +{ + assert(blk_iostatus_is_enabled(blk)); + if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) { + blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE : + BLOCK_DEVICE_IO_STATUS_FAILED; + } +} + +void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow) +{ + blk->allow_write_beyond_eof = allow; } static int blk_check_byte_request(BlockBackend *blk, int64_t offset, @@ -447,21 +646,23 @@ static int blk_check_byte_request(BlockBackend *blk, int64_t offset, return -EIO; } - if (!blk_is_inserted(blk)) { + if (!blk_is_available(blk)) { return -ENOMEDIUM; } - len = blk_getlength(blk); - if (len < 0) { - return len; - } - if (offset < 0) { return -EIO; } - if (offset > len || len - offset < size) { - return -EIO; + if (!blk->allow_write_beyond_eof) { + len = blk_getlength(blk); + if (len < 0) { + return len; + } + + if (offset > len || len - offset < size) { + return -EIO; + } } return 0; @@ -482,48 +683,144 @@ static int blk_check_request(BlockBackend *blk, int64_t sector_num, nb_sectors * BDRV_SECTOR_SIZE); } -int blk_read(BlockBackend *blk, int64_t sector_num, uint8_t *buf, - int nb_sectors) +static int coroutine_fn blk_co_preadv(BlockBackend *blk, int64_t offset, + unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { - int ret = blk_check_request(blk, sector_num, nb_sectors); + int ret = blk_check_byte_request(blk, offset, bytes); if (ret < 0) { return ret; } - return bdrv_read(blk->bs, sector_num, buf, nb_sectors); + return bdrv_co_do_preadv(blk_bs(blk), offset, bytes, qiov, flags); } -int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf, - int nb_sectors) +static int coroutine_fn blk_co_pwritev(BlockBackend *blk, int64_t offset, + unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) { - int ret = blk_check_request(blk, sector_num, nb_sectors); + int ret; + + ret = blk_check_byte_request(blk, offset, bytes); if (ret < 0) { return ret; } - return bdrv_read_unthrottled(blk->bs, sector_num, buf, nb_sectors); + if (!blk->enable_write_cache) { + flags |= BDRV_REQ_FUA; + } + + return bdrv_co_do_pwritev(blk_bs(blk), offset, bytes, qiov, flags); } -int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, - int nb_sectors) +typedef struct BlkRwCo { + BlockBackend *blk; + int64_t offset; + QEMUIOVector *qiov; + int ret; + BdrvRequestFlags flags; +} BlkRwCo; + +static void blk_read_entry(void *opaque) { - int ret = blk_check_request(blk, sector_num, nb_sectors); - if (ret < 0) { - return ret; + BlkRwCo *rwco = opaque; + + rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, rwco->qiov->size, + rwco->qiov, rwco->flags); +} + +static void blk_write_entry(void *opaque) +{ + BlkRwCo *rwco = opaque; + + rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, rwco->qiov->size, + rwco->qiov, rwco->flags); +} + +static int blk_prw(BlockBackend *blk, int64_t offset, uint8_t *buf, + int64_t bytes, CoroutineEntry co_entry, + BdrvRequestFlags flags) +{ + AioContext *aio_context; + QEMUIOVector qiov; + struct iovec iov; + Coroutine *co; + BlkRwCo rwco; + + iov = (struct iovec) { + .iov_base = buf, + .iov_len = bytes, + }; + qemu_iovec_init_external(&qiov, &iov, 1); + + rwco = (BlkRwCo) { + .blk = blk, + .offset = offset, + .qiov = &qiov, + .flags = flags, + .ret = NOT_DONE, + }; + + co = qemu_coroutine_create(co_entry); + qemu_coroutine_enter(co, &rwco); + + aio_context = blk_get_aio_context(blk); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); } - return bdrv_write(blk->bs, sector_num, buf, nb_sectors); + return rwco.ret; } -int blk_write_zeroes(BlockBackend *blk, int64_t sector_num, - int nb_sectors, BdrvRequestFlags flags) +static int blk_rw(BlockBackend *blk, int64_t sector_num, uint8_t *buf, + int nb_sectors, CoroutineEntry co_entry, + BdrvRequestFlags flags) { - int ret = blk_check_request(blk, sector_num, nb_sectors); + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + return blk_prw(blk, sector_num << BDRV_SECTOR_BITS, buf, + nb_sectors << BDRV_SECTOR_BITS, co_entry, flags); +} + +int blk_read(BlockBackend *blk, int64_t sector_num, uint8_t *buf, + int nb_sectors) +{ + return blk_rw(blk, sector_num, buf, nb_sectors, blk_read_entry, 0); +} + +int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf, + int nb_sectors) +{ + BlockDriverState *bs = blk_bs(blk); + bool enabled; + int ret; + + ret = blk_check_request(blk, sector_num, nb_sectors); if (ret < 0) { return ret; } - return bdrv_write_zeroes(blk->bs, sector_num, nb_sectors, flags); + enabled = bs->io_limits_enabled; + bs->io_limits_enabled = false; + ret = blk_read(blk, sector_num, buf, nb_sectors); + bs->io_limits_enabled = enabled; + return ret; +} + +int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, + int nb_sectors) +{ + return blk_rw(blk, sector_num, (uint8_t*) buf, nb_sectors, + blk_write_entry, 0); +} + +int blk_write_zeroes(BlockBackend *blk, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + return blk_rw(blk, sector_num, NULL, nb_sectors, blk_write_entry, + flags | BDRV_REQ_ZERO_WRITE); } static void error_callback_bh(void *opaque) @@ -534,13 +831,15 @@ static void error_callback_bh(void *opaque) qemu_aio_unref(acb); } -static BlockAIOCB *abort_aio_request(BlockBackend *blk, BlockCompletionFunc *cb, - void *opaque, int ret) +BlockAIOCB *blk_abort_aio_request(BlockBackend *blk, + BlockCompletionFunc *cb, + void *opaque, int ret) { struct BlockBackendAIOCB *acb; QEMUBH *bh; acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque); + acb->blk = blk; acb->ret = ret; bh = aio_bh_new(blk_get_aio_context(blk), error_callback_bh, acb); @@ -550,82 +849,182 @@ static BlockAIOCB *abort_aio_request(BlockBackend *blk, BlockCompletionFunc *cb, return &acb->common; } +typedef struct BlkAioEmAIOCB { + BlockAIOCB common; + BlkRwCo rwco; + int bytes; + bool has_returned; + QEMUBH* bh; +} BlkAioEmAIOCB; + +static const AIOCBInfo blk_aio_em_aiocb_info = { + .aiocb_size = sizeof(BlkAioEmAIOCB), +}; + +static void blk_aio_complete(BlkAioEmAIOCB *acb) +{ + if (acb->bh) { + assert(acb->has_returned); + qemu_bh_delete(acb->bh); + } + if (acb->has_returned) { + acb->common.cb(acb->common.opaque, acb->rwco.ret); + qemu_aio_unref(acb); + } +} + +static void blk_aio_complete_bh(void *opaque) +{ + blk_aio_complete(opaque); +} + +static BlockAIOCB *blk_aio_prwv(BlockBackend *blk, int64_t offset, int bytes, + QEMUIOVector *qiov, CoroutineEntry co_entry, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque) +{ + BlkAioEmAIOCB *acb; + Coroutine *co; + + acb = blk_aio_get(&blk_aio_em_aiocb_info, blk, cb, opaque); + acb->rwco = (BlkRwCo) { + .blk = blk, + .offset = offset, + .qiov = qiov, + .flags = flags, + .ret = NOT_DONE, + }; + acb->bytes = bytes; + acb->bh = NULL; + acb->has_returned = false; + + co = qemu_coroutine_create(co_entry); + qemu_coroutine_enter(co, acb); + + acb->has_returned = true; + if (acb->rwco.ret != NOT_DONE) { + acb->bh = aio_bh_new(blk_get_aio_context(blk), blk_aio_complete_bh, acb); + qemu_bh_schedule(acb->bh); + } + + return &acb->common; +} + +static void blk_aio_read_entry(void *opaque) +{ + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + + assert(rwco->qiov->size == acb->bytes); + rwco->ret = blk_co_preadv(rwco->blk, rwco->offset, acb->bytes, + rwco->qiov, rwco->flags); + blk_aio_complete(acb); +} + +static void blk_aio_write_entry(void *opaque) +{ + BlkAioEmAIOCB *acb = opaque; + BlkRwCo *rwco = &acb->rwco; + + assert(!rwco->qiov || rwco->qiov->size == acb->bytes); + rwco->ret = blk_co_pwritev(rwco->blk, rwco->offset, acb->bytes, + rwco->qiov, rwco->flags); + blk_aio_complete(acb); +} + BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, BlockCompletionFunc *cb, void *opaque) { - int ret = blk_check_request(blk, sector_num, nb_sectors); - if (ret < 0) { - return abort_aio_request(blk, cb, opaque, ret); + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return blk_abort_aio_request(blk, cb, opaque, -EINVAL); } - return bdrv_aio_write_zeroes(blk->bs, sector_num, nb_sectors, flags, - cb, opaque); + return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, NULL, + blk_aio_write_entry, flags | BDRV_REQ_ZERO_WRITE, + cb, opaque); } int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count) { - int ret = blk_check_byte_request(blk, offset, count); + int ret = blk_prw(blk, offset, buf, count, blk_read_entry, 0); if (ret < 0) { return ret; } - - return bdrv_pread(blk->bs, offset, buf, count); + return count; } int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count) { - int ret = blk_check_byte_request(blk, offset, count); + int ret = blk_prw(blk, offset, (void*) buf, count, blk_write_entry, 0); if (ret < 0) { return ret; } - - return bdrv_pwrite(blk->bs, offset, buf, count); + return count; } int64_t blk_getlength(BlockBackend *blk) { - return bdrv_getlength(blk->bs); + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_getlength(blk_bs(blk)); } void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr) { - bdrv_get_geometry(blk->bs, nb_sectors_ptr); + if (!blk_bs(blk)) { + *nb_sectors_ptr = 0; + } else { + bdrv_get_geometry(blk_bs(blk), nb_sectors_ptr); + } } int64_t blk_nb_sectors(BlockBackend *blk) { - return bdrv_nb_sectors(blk->bs); + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_nb_sectors(blk_bs(blk)); } BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num, QEMUIOVector *iov, int nb_sectors, BlockCompletionFunc *cb, void *opaque) { - int ret = blk_check_request(blk, sector_num, nb_sectors); - if (ret < 0) { - return abort_aio_request(blk, cb, opaque, ret); + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return blk_abort_aio_request(blk, cb, opaque, -EINVAL); } - return bdrv_aio_readv(blk->bs, sector_num, iov, nb_sectors, cb, opaque); + assert(nb_sectors << BDRV_SECTOR_BITS == iov->size); + return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov, + blk_aio_read_entry, 0, cb, opaque); } BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num, QEMUIOVector *iov, int nb_sectors, BlockCompletionFunc *cb, void *opaque) { - int ret = blk_check_request(blk, sector_num, nb_sectors); - if (ret < 0) { - return abort_aio_request(blk, cb, opaque, ret); + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return blk_abort_aio_request(blk, cb, opaque, -EINVAL); } - return bdrv_aio_writev(blk->bs, sector_num, iov, nb_sectors, cb, opaque); + assert(nb_sectors << BDRV_SECTOR_BITS == iov->size); + return blk_aio_prwv(blk, sector_num << BDRV_SECTOR_BITS, iov->size, iov, + blk_aio_write_entry, 0, cb, opaque); } BlockAIOCB *blk_aio_flush(BlockBackend *blk, BlockCompletionFunc *cb, void *opaque) { - return bdrv_aio_flush(blk->bs, cb, opaque); + if (!blk_is_available(blk)) { + return blk_abort_aio_request(blk, cb, opaque, -ENOMEDIUM); + } + + return bdrv_aio_flush(blk_bs(blk), cb, opaque); } BlockAIOCB *blk_aio_discard(BlockBackend *blk, @@ -634,10 +1033,10 @@ BlockAIOCB *blk_aio_discard(BlockBackend *blk, { int ret = blk_check_request(blk, sector_num, nb_sectors); if (ret < 0) { - return abort_aio_request(blk, cb, opaque, ret); + return blk_abort_aio_request(blk, cb, opaque, ret); } - return bdrv_aio_discard(blk->bs, sector_num, nb_sectors, cb, opaque); + return bdrv_aio_discard(blk_bs(blk), sector_num, nb_sectors, cb, opaque); } void blk_aio_cancel(BlockAIOCB *acb) @@ -661,18 +1060,26 @@ int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs) } } - return bdrv_aio_multiwrite(blk->bs, reqs, num_reqs); + return bdrv_aio_multiwrite(blk_bs(blk), reqs, num_reqs); } int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf) { - return bdrv_ioctl(blk->bs, req, buf); + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_ioctl(blk_bs(blk), req, buf); } BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, BlockCompletionFunc *cb, void *opaque) { - return bdrv_aio_ioctl(blk->bs, req, buf, cb, opaque); + if (!blk_is_available(blk)) { + return blk_abort_aio_request(blk, cb, opaque, -ENOMEDIUM); + } + + return bdrv_aio_ioctl(blk_bs(blk), req, buf, cb, opaque); } int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors) @@ -682,27 +1089,32 @@ int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors) return ret; } - return bdrv_co_discard(blk->bs, sector_num, nb_sectors); + return bdrv_co_discard(blk_bs(blk), sector_num, nb_sectors); } int blk_co_flush(BlockBackend *blk) { - return bdrv_co_flush(blk->bs); + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_co_flush(blk_bs(blk)); } int blk_flush(BlockBackend *blk) { - return bdrv_flush(blk->bs); -} + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } -int blk_flush_all(void) -{ - return bdrv_flush_all(); + return bdrv_flush(blk_bs(blk)); } void blk_drain(BlockBackend *blk) { - bdrv_drain(blk->bs); + if (blk_bs(blk)) { + bdrv_drain(blk_bs(blk)); + } } void blk_drain_all(void) @@ -710,119 +1122,273 @@ void blk_drain_all(void) bdrv_drain_all(); } +void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, + BlockdevOnError on_write_error) +{ + blk->on_read_error = on_read_error; + blk->on_write_error = on_write_error; +} + BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read) { - return bdrv_get_on_error(blk->bs, is_read); + return is_read ? blk->on_read_error : blk->on_write_error; } BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read, int error) { - return bdrv_get_error_action(blk->bs, is_read, error); + BlockdevOnError on_err = blk_get_on_error(blk, is_read); + + switch (on_err) { + case BLOCKDEV_ON_ERROR_ENOSPC: + return (error == ENOSPC) ? + BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT; + case BLOCKDEV_ON_ERROR_STOP: + return BLOCK_ERROR_ACTION_STOP; + case BLOCKDEV_ON_ERROR_REPORT: + return BLOCK_ERROR_ACTION_REPORT; + case BLOCKDEV_ON_ERROR_IGNORE: + return BLOCK_ERROR_ACTION_IGNORE; + default: + abort(); + } +} + +static void send_qmp_error_event(BlockBackend *blk, + BlockErrorAction action, + bool is_read, int error) +{ + IoOperationType optype; + + optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; + qapi_event_send_block_io_error(blk_name(blk), optype, action, + blk_iostatus_is_enabled(blk), + error == ENOSPC, strerror(error), + &error_abort); } +/* This is done by device models because, while the block layer knows + * about the error, it does not know whether an operation comes from + * the device or the block layer (from a job, for example). + */ void blk_error_action(BlockBackend *blk, BlockErrorAction action, bool is_read, int error) { - bdrv_error_action(blk->bs, action, is_read, error); + assert(error >= 0); + + if (action == BLOCK_ERROR_ACTION_STOP) { + /* First set the iostatus, so that "info block" returns an iostatus + * that matches the events raised so far (an additional error iostatus + * is fine, but not a lost one). + */ + blk_iostatus_set_err(blk, error); + + /* Then raise the request to stop the VM and the event. + * qemu_system_vmstop_request_prepare has two effects. First, + * it ensures that the STOP event always comes after the + * BLOCK_IO_ERROR event. Second, it ensures that even if management + * can observe the STOP event and do a "cont" before the STOP + * event is issued, the VM will not stop. In this case, vm_start() + * also ensures that the STOP/RESUME pair of events is emitted. + */ + qemu_system_vmstop_request_prepare(); + send_qmp_error_event(blk, action, is_read, error); + qemu_system_vmstop_request(RUN_STATE_IO_ERROR); + } else { + send_qmp_error_event(blk, action, is_read, error); + } } int blk_is_read_only(BlockBackend *blk) { - return bdrv_is_read_only(blk->bs); + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + return bdrv_is_read_only(bs); + } else { + return blk->root_state.read_only; + } } int blk_is_sg(BlockBackend *blk) { - return bdrv_is_sg(blk->bs); + BlockDriverState *bs = blk_bs(blk); + + if (!bs) { + return 0; + } + + return bdrv_is_sg(bs); } int blk_enable_write_cache(BlockBackend *blk) { - return bdrv_enable_write_cache(blk->bs); + return blk->enable_write_cache; } void blk_set_enable_write_cache(BlockBackend *blk, bool wce) { - bdrv_set_enable_write_cache(blk->bs, wce); + blk->enable_write_cache = wce; } void blk_invalidate_cache(BlockBackend *blk, Error **errp) { - bdrv_invalidate_cache(blk->bs, errp); + BlockDriverState *bs = blk_bs(blk); + + if (!bs) { + error_setg(errp, "Device '%s' has no medium", blk->name); + return; + } + + bdrv_invalidate_cache(bs, errp); +} + +bool blk_is_inserted(BlockBackend *blk) +{ + BlockDriverState *bs = blk_bs(blk); + + return bs && bdrv_is_inserted(bs); } -int blk_is_inserted(BlockBackend *blk) +bool blk_is_available(BlockBackend *blk) { - return bdrv_is_inserted(blk->bs); + return blk_is_inserted(blk) && !blk_dev_is_tray_open(blk); } void blk_lock_medium(BlockBackend *blk, bool locked) { - bdrv_lock_medium(blk->bs, locked); + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + bdrv_lock_medium(bs, locked); + } } void blk_eject(BlockBackend *blk, bool eject_flag) { - bdrv_eject(blk->bs, eject_flag); + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + bdrv_eject(bs, eject_flag); + } } int blk_get_flags(BlockBackend *blk) { - return bdrv_get_flags(blk->bs); + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + return bdrv_get_flags(bs); + } else { + return blk->root_state.open_flags; + } } int blk_get_max_transfer_length(BlockBackend *blk) { - return blk->bs->bl.max_transfer_length; + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + return bs->bl.max_transfer_length; + } else { + return 0; + } +} + +int blk_get_max_iov(BlockBackend *blk) +{ + return blk->root->bs->bl.max_iov; } void blk_set_guest_block_size(BlockBackend *blk, int align) { - bdrv_set_guest_block_size(blk->bs, align); + blk->guest_block_size = align; +} + +void *blk_try_blockalign(BlockBackend *blk, size_t size) +{ + return qemu_try_blockalign(blk ? blk_bs(blk) : NULL, size); } void *blk_blockalign(BlockBackend *blk, size_t size) { - return qemu_blockalign(blk ? blk->bs : NULL, size); + return qemu_blockalign(blk ? blk_bs(blk) : NULL, size); } bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp) { - return bdrv_op_is_blocked(blk->bs, op, errp); + BlockDriverState *bs = blk_bs(blk); + + if (!bs) { + return false; + } + + return bdrv_op_is_blocked(bs, op, errp); } void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason) { - bdrv_op_unblock(blk->bs, op, reason); + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + bdrv_op_unblock(bs, op, reason); + } } void blk_op_block_all(BlockBackend *blk, Error *reason) { - bdrv_op_block_all(blk->bs, reason); + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + bdrv_op_block_all(bs, reason); + } } void blk_op_unblock_all(BlockBackend *blk, Error *reason) { - bdrv_op_unblock_all(blk->bs, reason); + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + bdrv_op_unblock_all(bs, reason); + } } AioContext *blk_get_aio_context(BlockBackend *blk) { - return bdrv_get_aio_context(blk->bs); + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + return bdrv_get_aio_context(bs); + } else { + return qemu_get_aio_context(); + } +} + +static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb) +{ + BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb); + return blk_get_aio_context(blk_acb->blk); } void blk_set_aio_context(BlockBackend *blk, AioContext *new_context) { - bdrv_set_aio_context(blk->bs, new_context); + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + bdrv_set_aio_context(bs, new_context); + } } void blk_add_aio_context_notifier(BlockBackend *blk, void (*attached_aio_context)(AioContext *new_context, void *opaque), void (*detach_aio_context)(void *opaque), void *opaque) { - bdrv_add_aio_context_notifier(blk->bs, attached_aio_context, - detach_aio_context, opaque); + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + bdrv_add_aio_context_notifier(bs, attached_aio_context, + detach_aio_context, opaque); + } } void blk_remove_aio_context_notifier(BlockBackend *blk, @@ -831,28 +1397,45 @@ void blk_remove_aio_context_notifier(BlockBackend *blk, void (*detach_aio_context)(void *), void *opaque) { - bdrv_remove_aio_context_notifier(blk->bs, attached_aio_context, - detach_aio_context, opaque); + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + bdrv_remove_aio_context_notifier(bs, attached_aio_context, + detach_aio_context, opaque); + } } -void blk_add_close_notifier(BlockBackend *blk, Notifier *notify) +void blk_add_remove_bs_notifier(BlockBackend *blk, Notifier *notify) { - bdrv_add_close_notifier(blk->bs, notify); + notifier_list_add(&blk->remove_bs_notifiers, notify); +} + +void blk_add_insert_bs_notifier(BlockBackend *blk, Notifier *notify) +{ + notifier_list_add(&blk->insert_bs_notifiers, notify); } void blk_io_plug(BlockBackend *blk) { - bdrv_io_plug(blk->bs); + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + bdrv_io_plug(bs); + } } void blk_io_unplug(BlockBackend *blk) { - bdrv_io_unplug(blk->bs); + BlockDriverState *bs = blk_bs(blk); + + if (bs) { + bdrv_io_unplug(bs); + } } BlockAcctStats *blk_get_stats(BlockBackend *blk) { - return bdrv_get_stats(blk->bs); + return &blk->stats; } void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk, @@ -864,12 +1447,13 @@ void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk, int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) { - int ret = blk_check_request(blk, sector_num, nb_sectors); - if (ret < 0) { - return ret; + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; } - return bdrv_co_write_zeroes(blk->bs, sector_num, nb_sectors, flags); + return blk_co_pwritev(blk, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, NULL, + flags | BDRV_REQ_ZERO_WRITE); } int blk_write_compressed(BlockBackend *blk, int64_t sector_num, @@ -880,12 +1464,16 @@ int blk_write_compressed(BlockBackend *blk, int64_t sector_num, return ret; } - return bdrv_write_compressed(blk->bs, sector_num, buf, nb_sectors); + return bdrv_write_compressed(blk_bs(blk), sector_num, buf, nb_sectors); } int blk_truncate(BlockBackend *blk, int64_t offset) { - return bdrv_truncate(blk->bs, offset); + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_truncate(blk_bs(blk), offset); } int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors) @@ -895,26 +1483,153 @@ int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors) return ret; } - return bdrv_discard(blk->bs, sector_num, nb_sectors); + return bdrv_discard(blk_bs(blk), sector_num, nb_sectors); } int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, int64_t pos, int size) { - return bdrv_save_vmstate(blk->bs, buf, pos, size); + int ret; + + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + ret = bdrv_save_vmstate(blk_bs(blk), buf, pos, size); + if (ret < 0) { + return ret; + } + + if (ret == size && !blk->enable_write_cache) { + ret = bdrv_flush(blk_bs(blk)); + } + + return ret < 0 ? ret : size; } int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size) { - return bdrv_load_vmstate(blk->bs, buf, pos, size); + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_load_vmstate(blk_bs(blk), buf, pos, size); } int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz) { - return bdrv_probe_blocksizes(blk->bs, bsz); + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_probe_blocksizes(blk_bs(blk), bsz); } int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo) { - return bdrv_probe_geometry(blk->bs, geo); + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_probe_geometry(blk_bs(blk), geo); +} + +/* + * Updates the BlockBackendRootState object with data from the currently + * attached BlockDriverState. + */ +void blk_update_root_state(BlockBackend *blk) +{ + assert(blk->root); + + blk->root_state.open_flags = blk->root->bs->open_flags; + blk->root_state.read_only = blk->root->bs->read_only; + blk->root_state.detect_zeroes = blk->root->bs->detect_zeroes; + + if (blk->root_state.throttle_group) { + g_free(blk->root_state.throttle_group); + throttle_group_unref(blk->root_state.throttle_state); + } + if (blk->root->bs->throttle_state) { + const char *name = throttle_group_get_name(blk->root->bs); + blk->root_state.throttle_group = g_strdup(name); + blk->root_state.throttle_state = throttle_group_incref(name); + } else { + blk->root_state.throttle_group = NULL; + blk->root_state.throttle_state = NULL; + } +} + +/* + * Applies the information in the root state to the given BlockDriverState. This + * does not include the flags which have to be specified for bdrv_open(), use + * blk_get_open_flags_from_root_state() to inquire them. + */ +void blk_apply_root_state(BlockBackend *blk, BlockDriverState *bs) +{ + bs->detect_zeroes = blk->root_state.detect_zeroes; + if (blk->root_state.throttle_group) { + bdrv_io_limits_enable(bs, blk->root_state.throttle_group); + } +} + +/* + * Returns the flags to be used for bdrv_open() of a BlockDriverState which is + * supposed to inherit the root state. + */ +int blk_get_open_flags_from_root_state(BlockBackend *blk) +{ + int bs_flags; + + bs_flags = blk->root_state.read_only ? 0 : BDRV_O_RDWR; + bs_flags |= blk->root_state.open_flags & ~BDRV_O_RDWR; + + return bs_flags; +} + +BlockBackendRootState *blk_get_root_state(BlockBackend *blk) +{ + return &blk->root_state; +} + +int blk_commit_all(void) +{ + BlockBackend *blk = NULL; + + while ((blk = blk_all_next(blk)) != NULL) { + AioContext *aio_context = blk_get_aio_context(blk); + + aio_context_acquire(aio_context); + if (blk_is_inserted(blk) && blk->root->bs->backing) { + int ret = bdrv_commit(blk->root->bs); + if (ret < 0) { + aio_context_release(aio_context); + return ret; + } + } + aio_context_release(aio_context); + } + return 0; +} + +int blk_flush_all(void) +{ + BlockBackend *blk = NULL; + int result = 0; + + while ((blk = blk_all_next(blk)) != NULL) { + AioContext *aio_context = blk_get_aio_context(blk); + int ret; + + aio_context_acquire(aio_context); + if (blk_is_inserted(blk)) { + ret = blk_flush(blk); + if (ret < 0 && !result) { + result = ret; + } + } + aio_context_release(aio_context); + } + + return result; } diff --git a/qemu/block/bochs.c b/qemu/block/bochs.c index 199ac2b9a..af8b7abdf 100644 --- a/qemu/block/bochs.c +++ b/qemu/block/bochs.c @@ -22,6 +22,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "qemu-common.h" #include "block/block_int.h" #include "qemu/module.h" @@ -103,7 +105,7 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags, bs->read_only = 1; // no write support yet - ret = bdrv_pread(bs->file, 0, &bochs, sizeof(bochs)); + ret = bdrv_pread(bs->file->bs, 0, &bochs, sizeof(bochs)); if (ret < 0) { return ret; } @@ -137,7 +139,7 @@ static int bochs_open(BlockDriverState *bs, QDict *options, int flags, return -ENOMEM; } - ret = bdrv_pread(bs->file, le32_to_cpu(bochs.header), s->catalog_bitmap, + ret = bdrv_pread(bs->file->bs, le32_to_cpu(bochs.header), s->catalog_bitmap, s->catalog_size * 4); if (ret < 0) { goto fail; @@ -206,7 +208,7 @@ static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) (s->extent_blocks + s->bitmap_blocks)); /* read in bitmap for current extent */ - ret = bdrv_pread(bs->file, bitmap_offset + (extent_offset / 8), + ret = bdrv_pread(bs->file->bs, bitmap_offset + (extent_offset / 8), &bitmap_entry, 1); if (ret < 0) { return ret; @@ -229,7 +231,7 @@ static int bochs_read(BlockDriverState *bs, int64_t sector_num, if (block_offset < 0) { return block_offset; } else if (block_offset > 0) { - ret = bdrv_pread(bs->file, block_offset, buf, 512); + ret = bdrv_pread(bs->file->bs, block_offset, buf, 512); if (ret < 0) { return ret; } diff --git a/qemu/block/cloop.c b/qemu/block/cloop.c index f328be06f..a84f14019 100644 --- a/qemu/block/cloop.c +++ b/qemu/block/cloop.c @@ -21,6 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "qemu-common.h" #include "block/block_int.h" #include "qemu/module.h" @@ -66,7 +68,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags, bs->read_only = 1; /* read header */ - ret = bdrv_pread(bs->file, 128, &s->block_size, 4); + ret = bdrv_pread(bs->file->bs, 128, &s->block_size, 4); if (ret < 0) { return ret; } @@ -92,7 +94,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags, return -EINVAL; } - ret = bdrv_pread(bs->file, 128 + 4, &s->n_blocks, 4); + ret = bdrv_pread(bs->file->bs, 128 + 4, &s->n_blocks, 4); if (ret < 0) { return ret; } @@ -123,7 +125,7 @@ static int cloop_open(BlockDriverState *bs, QDict *options, int flags, return -ENOMEM; } - ret = bdrv_pread(bs->file, 128 + 4 + 4, s->offsets, offsets_size); + ret = bdrv_pread(bs->file->bs, 128 + 4 + 4, s->offsets, offsets_size); if (ret < 0) { goto fail; } @@ -203,8 +205,8 @@ static inline int cloop_read_block(BlockDriverState *bs, int block_num) int ret; uint32_t bytes = s->offsets[block_num + 1] - s->offsets[block_num]; - ret = bdrv_pread(bs->file, s->offsets[block_num], s->compressed_block, - bytes); + ret = bdrv_pread(bs->file->bs, s->offsets[block_num], + s->compressed_block, bytes); if (ret != bytes) { return -1; } diff --git a/qemu/block/commit.c b/qemu/block/commit.c index 7312a5bdc..cba0e8c1e 100644 --- a/qemu/block/commit.c +++ b/qemu/block/commit.c @@ -12,11 +12,14 @@ * */ +#include "qemu/osdep.h" #include "trace.h" #include "block/block_int.h" #include "block/blockjob.h" +#include "qapi/error.h" #include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" +#include "sysemu/block-backend.h" enum { /* @@ -213,7 +216,7 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, if ((on_error == BLOCKDEV_ON_ERROR_STOP || on_error == BLOCKDEV_ON_ERROR_ENOSPC) && - !bdrv_iostatus_is_enabled(bs)) { + (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { error_setg(errp, "Invalid parameter combination"); return; } @@ -235,14 +238,14 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base, orig_overlay_flags = bdrv_get_flags(overlay_bs); /* convert base & overlay_bs to r/w, if necessary */ - if (!(orig_base_flags & BDRV_O_RDWR)) { - reopen_queue = bdrv_reopen_queue(reopen_queue, base, - orig_base_flags | BDRV_O_RDWR); - } if (!(orig_overlay_flags & BDRV_O_RDWR)) { - reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs, + reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs, NULL, orig_overlay_flags | BDRV_O_RDWR); } + if (!(orig_base_flags & BDRV_O_RDWR)) { + reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL, + orig_base_flags | BDRV_O_RDWR); + } if (reopen_queue) { bdrv_reopen_multiple(reopen_queue, &local_err); if (local_err != NULL) { diff --git a/qemu/block/crypto.c b/qemu/block/crypto.c new file mode 100644 index 000000000..1903e84fb --- /dev/null +++ b/qemu/block/crypto.c @@ -0,0 +1,586 @@ +/* + * QEMU block full disk encryption + * + * Copyright (c) 2015-2016 Red Hat, Inc. + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, see <http://www.gnu.org/licenses/>. + * + */ + +#include "qemu/osdep.h" + +#include "block/block_int.h" +#include "sysemu/block-backend.h" +#include "crypto/block.h" +#include "qapi/opts-visitor.h" +#include "qapi-visit.h" +#include "qapi/error.h" + +#define BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET "key-secret" +#define BLOCK_CRYPTO_OPT_LUKS_CIPHER_ALG "cipher-alg" +#define BLOCK_CRYPTO_OPT_LUKS_CIPHER_MODE "cipher-mode" +#define BLOCK_CRYPTO_OPT_LUKS_IVGEN_ALG "ivgen-alg" +#define BLOCK_CRYPTO_OPT_LUKS_IVGEN_HASH_ALG "ivgen-hash-alg" +#define BLOCK_CRYPTO_OPT_LUKS_HASH_ALG "hash-alg" + +typedef struct BlockCrypto BlockCrypto; + +struct BlockCrypto { + QCryptoBlock *block; +}; + + +static int block_crypto_probe_generic(QCryptoBlockFormat format, + const uint8_t *buf, + int buf_size, + const char *filename) +{ + if (qcrypto_block_has_format(format, buf, buf_size)) { + return 100; + } else { + return 0; + } +} + + +static ssize_t block_crypto_read_func(QCryptoBlock *block, + size_t offset, + uint8_t *buf, + size_t buflen, + Error **errp, + void *opaque) +{ + BlockDriverState *bs = opaque; + ssize_t ret; + + ret = bdrv_pread(bs->file->bs, offset, buf, buflen); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read encryption header"); + return ret; + } + return ret; +} + + +struct BlockCryptoCreateData { + const char *filename; + QemuOpts *opts; + BlockBackend *blk; + uint64_t size; +}; + + +static ssize_t block_crypto_write_func(QCryptoBlock *block, + size_t offset, + const uint8_t *buf, + size_t buflen, + Error **errp, + void *opaque) +{ + struct BlockCryptoCreateData *data = opaque; + ssize_t ret; + + ret = blk_pwrite(data->blk, offset, buf, buflen); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write encryption header"); + return ret; + } + return ret; +} + + +static ssize_t block_crypto_init_func(QCryptoBlock *block, + size_t headerlen, + Error **errp, + void *opaque) +{ + struct BlockCryptoCreateData *data = opaque; + int ret; + + /* User provided size should reflect amount of space made + * available to the guest, so we must take account of that + * which will be used by the crypto header + */ + data->size += headerlen; + + qemu_opt_set_number(data->opts, BLOCK_OPT_SIZE, data->size, &error_abort); + ret = bdrv_create_file(data->filename, data->opts, errp); + if (ret < 0) { + return -1; + } + + data->blk = blk_new_open(data->filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, errp); + if (!data->blk) { + return -1; + } + + return 0; +} + + +static QemuOptsList block_crypto_runtime_opts_luks = { + .name = "crypto", + .head = QTAILQ_HEAD_INITIALIZER(block_crypto_runtime_opts_luks.head), + .desc = { + { + .name = BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET, + .type = QEMU_OPT_STRING, + .help = "ID of the secret that provides the encryption key", + }, + { /* end of list */ } + }, +}; + + +static QemuOptsList block_crypto_create_opts_luks = { + .name = "crypto", + .head = QTAILQ_HEAD_INITIALIZER(block_crypto_create_opts_luks.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_CRYPTO_OPT_LUKS_KEY_SECRET, + .type = QEMU_OPT_STRING, + .help = "ID of the secret that provides the encryption key", + }, + { + .name = BLOCK_CRYPTO_OPT_LUKS_CIPHER_ALG, + .type = QEMU_OPT_STRING, + .help = "Name of encryption cipher algorithm", + }, + { + .name = BLOCK_CRYPTO_OPT_LUKS_CIPHER_MODE, + .type = QEMU_OPT_STRING, + .help = "Name of encryption cipher mode", + }, + { + .name = BLOCK_CRYPTO_OPT_LUKS_IVGEN_ALG, + .type = QEMU_OPT_STRING, + .help = "Name of IV generator algorithm", + }, + { + .name = BLOCK_CRYPTO_OPT_LUKS_IVGEN_HASH_ALG, + .type = QEMU_OPT_STRING, + .help = "Name of IV generator hash algorithm", + }, + { + .name = BLOCK_CRYPTO_OPT_LUKS_HASH_ALG, + .type = QEMU_OPT_STRING, + .help = "Name of encryption hash algorithm", + }, + { /* end of list */ } + }, +}; + + +static QCryptoBlockOpenOptions * +block_crypto_open_opts_init(QCryptoBlockFormat format, + QemuOpts *opts, + Error **errp) +{ + OptsVisitor *ov; + QCryptoBlockOpenOptions *ret = NULL; + Error *local_err = NULL; + Error *end_err = NULL; + + ret = g_new0(QCryptoBlockOpenOptions, 1); + ret->format = format; + + ov = opts_visitor_new(opts); + + visit_start_struct(opts_get_visitor(ov), + NULL, NULL, 0, &local_err); + if (local_err) { + goto out; + } + + switch (format) { + case Q_CRYPTO_BLOCK_FORMAT_LUKS: + visit_type_QCryptoBlockOptionsLUKS_members( + opts_get_visitor(ov), &ret->u.luks, &local_err); + break; + + default: + error_setg(&local_err, "Unsupported block format %d", format); + break; + } + + visit_end_struct(opts_get_visitor(ov), &end_err); + error_propagate(&local_err, end_err); + + out: + if (local_err) { + error_propagate(errp, local_err); + qapi_free_QCryptoBlockOpenOptions(ret); + ret = NULL; + } + opts_visitor_cleanup(ov); + return ret; +} + + +static QCryptoBlockCreateOptions * +block_crypto_create_opts_init(QCryptoBlockFormat format, + QemuOpts *opts, + Error **errp) +{ + OptsVisitor *ov; + QCryptoBlockCreateOptions *ret = NULL; + Error *local_err = NULL; + Error *end_err = NULL; + + ret = g_new0(QCryptoBlockCreateOptions, 1); + ret->format = format; + + ov = opts_visitor_new(opts); + + visit_start_struct(opts_get_visitor(ov), + NULL, NULL, 0, &local_err); + if (local_err) { + goto out; + } + + switch (format) { + case Q_CRYPTO_BLOCK_FORMAT_LUKS: + visit_type_QCryptoBlockCreateOptionsLUKS_members( + opts_get_visitor(ov), &ret->u.luks, &local_err); + break; + + default: + error_setg(&local_err, "Unsupported block format %d", format); + break; + } + + visit_end_struct(opts_get_visitor(ov), &end_err); + error_propagate(&local_err, end_err); + + out: + if (local_err) { + error_propagate(errp, local_err); + qapi_free_QCryptoBlockCreateOptions(ret); + ret = NULL; + } + opts_visitor_cleanup(ov); + return ret; +} + + +static int block_crypto_open_generic(QCryptoBlockFormat format, + QemuOptsList *opts_spec, + BlockDriverState *bs, + QDict *options, + int flags, + Error **errp) +{ + BlockCrypto *crypto = bs->opaque; + QemuOpts *opts = NULL; + Error *local_err = NULL; + int ret = -EINVAL; + QCryptoBlockOpenOptions *open_opts = NULL; + unsigned int cflags = 0; + + opts = qemu_opts_create(opts_spec, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto cleanup; + } + + open_opts = block_crypto_open_opts_init(format, opts, errp); + if (!open_opts) { + goto cleanup; + } + + if (flags & BDRV_O_NO_IO) { + cflags |= QCRYPTO_BLOCK_OPEN_NO_IO; + } + crypto->block = qcrypto_block_open(open_opts, + block_crypto_read_func, + bs, + cflags, + errp); + + if (!crypto->block) { + ret = -EIO; + goto cleanup; + } + + bs->encrypted = 1; + bs->valid_key = 1; + + ret = 0; + cleanup: + qapi_free_QCryptoBlockOpenOptions(open_opts); + return ret; +} + + +static int block_crypto_create_generic(QCryptoBlockFormat format, + const char *filename, + QemuOpts *opts, + Error **errp) +{ + int ret = -EINVAL; + QCryptoBlockCreateOptions *create_opts = NULL; + QCryptoBlock *crypto = NULL; + struct BlockCryptoCreateData data = { + .size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE), + .opts = opts, + .filename = filename, + }; + + create_opts = block_crypto_create_opts_init(format, opts, errp); + if (!create_opts) { + return -1; + } + + crypto = qcrypto_block_create(create_opts, + block_crypto_init_func, + block_crypto_write_func, + &data, + errp); + + if (!crypto) { + ret = -EIO; + goto cleanup; + } + + ret = 0; + cleanup: + qcrypto_block_free(crypto); + blk_unref(data.blk); + qapi_free_QCryptoBlockCreateOptions(create_opts); + return ret; +} + +static int block_crypto_truncate(BlockDriverState *bs, int64_t offset) +{ + BlockCrypto *crypto = bs->opaque; + size_t payload_offset = + qcrypto_block_get_payload_offset(crypto->block); + + offset += payload_offset; + + return bdrv_truncate(bs->file->bs, offset); +} + +static void block_crypto_close(BlockDriverState *bs) +{ + BlockCrypto *crypto = bs->opaque; + qcrypto_block_free(crypto->block); +} + + +#define BLOCK_CRYPTO_MAX_SECTORS 32 + +static coroutine_fn int +block_crypto_co_readv(BlockDriverState *bs, int64_t sector_num, + int remaining_sectors, QEMUIOVector *qiov) +{ + BlockCrypto *crypto = bs->opaque; + int cur_nr_sectors; /* number of sectors in current iteration */ + uint64_t bytes_done = 0; + uint8_t *cipher_data = NULL; + QEMUIOVector hd_qiov; + int ret = 0; + size_t payload_offset = + qcrypto_block_get_payload_offset(crypto->block) / 512; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + /* Bounce buffer so we have a linear mem region for + * entire sector. XXX optimize so we avoid bounce + * buffer in case that qiov->niov == 1 + */ + cipher_data = + qemu_try_blockalign(bs->file->bs, MIN(BLOCK_CRYPTO_MAX_SECTORS * 512, + qiov->size)); + if (cipher_data == NULL) { + ret = -ENOMEM; + goto cleanup; + } + + while (remaining_sectors) { + cur_nr_sectors = remaining_sectors; + + if (cur_nr_sectors > BLOCK_CRYPTO_MAX_SECTORS) { + cur_nr_sectors = BLOCK_CRYPTO_MAX_SECTORS; + } + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_add(&hd_qiov, cipher_data, cur_nr_sectors * 512); + + ret = bdrv_co_readv(bs->file->bs, + payload_offset + sector_num, + cur_nr_sectors, &hd_qiov); + if (ret < 0) { + goto cleanup; + } + + if (qcrypto_block_decrypt(crypto->block, + sector_num, + cipher_data, cur_nr_sectors * 512, + NULL) < 0) { + ret = -EIO; + goto cleanup; + } + + qemu_iovec_from_buf(qiov, bytes_done, + cipher_data, cur_nr_sectors * 512); + + remaining_sectors -= cur_nr_sectors; + sector_num += cur_nr_sectors; + bytes_done += cur_nr_sectors * 512; + } + + cleanup: + qemu_iovec_destroy(&hd_qiov); + qemu_vfree(cipher_data); + + return ret; +} + + +static coroutine_fn int +block_crypto_co_writev(BlockDriverState *bs, int64_t sector_num, + int remaining_sectors, QEMUIOVector *qiov) +{ + BlockCrypto *crypto = bs->opaque; + int cur_nr_sectors; /* number of sectors in current iteration */ + uint64_t bytes_done = 0; + uint8_t *cipher_data = NULL; + QEMUIOVector hd_qiov; + int ret = 0; + size_t payload_offset = + qcrypto_block_get_payload_offset(crypto->block) / 512; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + /* Bounce buffer so we have a linear mem region for + * entire sector. XXX optimize so we avoid bounce + * buffer in case that qiov->niov == 1 + */ + cipher_data = + qemu_try_blockalign(bs->file->bs, MIN(BLOCK_CRYPTO_MAX_SECTORS * 512, + qiov->size)); + if (cipher_data == NULL) { + ret = -ENOMEM; + goto cleanup; + } + + while (remaining_sectors) { + cur_nr_sectors = remaining_sectors; + + if (cur_nr_sectors > BLOCK_CRYPTO_MAX_SECTORS) { + cur_nr_sectors = BLOCK_CRYPTO_MAX_SECTORS; + } + + qemu_iovec_to_buf(qiov, bytes_done, + cipher_data, cur_nr_sectors * 512); + + if (qcrypto_block_encrypt(crypto->block, + sector_num, + cipher_data, cur_nr_sectors * 512, + NULL) < 0) { + ret = -EIO; + goto cleanup; + } + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_add(&hd_qiov, cipher_data, cur_nr_sectors * 512); + + ret = bdrv_co_writev(bs->file->bs, + payload_offset + sector_num, + cur_nr_sectors, &hd_qiov); + if (ret < 0) { + goto cleanup; + } + + remaining_sectors -= cur_nr_sectors; + sector_num += cur_nr_sectors; + bytes_done += cur_nr_sectors * 512; + } + + cleanup: + qemu_iovec_destroy(&hd_qiov); + qemu_vfree(cipher_data); + + return ret; +} + + +static int64_t block_crypto_getlength(BlockDriverState *bs) +{ + BlockCrypto *crypto = bs->opaque; + int64_t len = bdrv_getlength(bs->file->bs); + + ssize_t offset = qcrypto_block_get_payload_offset(crypto->block); + + len -= offset; + + return len; +} + + +static int block_crypto_probe_luks(const uint8_t *buf, + int buf_size, + const char *filename) { + return block_crypto_probe_generic(Q_CRYPTO_BLOCK_FORMAT_LUKS, + buf, buf_size, filename); +} + +static int block_crypto_open_luks(BlockDriverState *bs, + QDict *options, + int flags, + Error **errp) +{ + return block_crypto_open_generic(Q_CRYPTO_BLOCK_FORMAT_LUKS, + &block_crypto_runtime_opts_luks, + bs, options, flags, errp); +} + +static int block_crypto_create_luks(const char *filename, + QemuOpts *opts, + Error **errp) +{ + return block_crypto_create_generic(Q_CRYPTO_BLOCK_FORMAT_LUKS, + filename, opts, errp); +} + +BlockDriver bdrv_crypto_luks = { + .format_name = "luks", + .instance_size = sizeof(BlockCrypto), + .bdrv_probe = block_crypto_probe_luks, + .bdrv_open = block_crypto_open_luks, + .bdrv_close = block_crypto_close, + .bdrv_create = block_crypto_create_luks, + .bdrv_truncate = block_crypto_truncate, + .create_opts = &block_crypto_create_opts_luks, + + .bdrv_co_readv = block_crypto_co_readv, + .bdrv_co_writev = block_crypto_co_writev, + .bdrv_getlength = block_crypto_getlength, +}; + +static void block_crypto_init(void) +{ + bdrv_register(&bdrv_crypto_luks); +} + +block_init(block_crypto_init); diff --git a/qemu/block/curl.c b/qemu/block/curl.c index 032cc8ae2..5a8f8b623 100644 --- a/qemu/block/curl.c +++ b/qemu/block/curl.c @@ -21,12 +21,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "qemu-common.h" #include "qemu/error-report.h" #include "block/block_int.h" #include "qapi/qmp/qbool.h" #include "qapi/qmp/qstring.h" +#include "crypto/secret.h" #include <curl/curl.h> +#include "qemu/cutils.h" // #define DEBUG_CURL // #define DEBUG_VERBOSE @@ -77,6 +81,10 @@ static CURLMcode __curl_multi_socket_action(CURLM *multi_handle, #define CURL_BLOCK_OPT_SSLVERIFY "sslverify" #define CURL_BLOCK_OPT_TIMEOUT "timeout" #define CURL_BLOCK_OPT_COOKIE "cookie" +#define CURL_BLOCK_OPT_USERNAME "username" +#define CURL_BLOCK_OPT_PASSWORD_SECRET "password-secret" +#define CURL_BLOCK_OPT_PROXY_USERNAME "proxy-username" +#define CURL_BLOCK_OPT_PROXY_PASSWORD_SECRET "proxy-password-secret" struct BDRVCURLState; @@ -119,6 +127,10 @@ typedef struct BDRVCURLState { char *cookie; bool accept_range; AioContext *aio_context; + char *username; + char *password; + char *proxyusername; + char *proxypassword; } BDRVCURLState; static void curl_clean_state(CURLState *s); @@ -154,18 +166,20 @@ static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, fd); switch (action) { case CURL_POLL_IN: - aio_set_fd_handler(s->aio_context, fd, curl_multi_read, - NULL, state); + aio_set_fd_handler(s->aio_context, fd, false, + curl_multi_read, NULL, state); break; case CURL_POLL_OUT: - aio_set_fd_handler(s->aio_context, fd, NULL, curl_multi_do, state); + aio_set_fd_handler(s->aio_context, fd, false, + NULL, curl_multi_do, state); break; case CURL_POLL_INOUT: - aio_set_fd_handler(s->aio_context, fd, curl_multi_read, - curl_multi_do, state); + aio_set_fd_handler(s->aio_context, fd, false, + curl_multi_read, curl_multi_do, state); break; case CURL_POLL_REMOVE: - aio_set_fd_handler(s->aio_context, fd, NULL, NULL, NULL); + aio_set_fd_handler(s->aio_context, fd, false, + NULL, NULL, NULL); break; } @@ -416,6 +430,21 @@ static CURLState *curl_init_state(BlockDriverState *bs, BDRVCURLState *s) curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg); curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1); + if (s->username) { + curl_easy_setopt(state->curl, CURLOPT_USERNAME, s->username); + } + if (s->password) { + curl_easy_setopt(state->curl, CURLOPT_PASSWORD, s->password); + } + if (s->proxyusername) { + curl_easy_setopt(state->curl, + CURLOPT_PROXYUSERNAME, s->proxyusername); + } + if (s->proxypassword) { + curl_easy_setopt(state->curl, + CURLOPT_PROXYPASSWORD, s->proxypassword); + } + /* Restrict supported protocols to avoid security issues in the more * obscure protocols. For example, do not allow POP3/SMTP/IMAP see * CVE-2013-0249. @@ -522,10 +551,31 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_STRING, .help = "Pass the cookie or list of cookies with each request" }, + { + .name = CURL_BLOCK_OPT_USERNAME, + .type = QEMU_OPT_STRING, + .help = "Username for HTTP auth" + }, + { + .name = CURL_BLOCK_OPT_PASSWORD_SECRET, + .type = QEMU_OPT_STRING, + .help = "ID of secret used as password for HTTP auth", + }, + { + .name = CURL_BLOCK_OPT_PROXY_USERNAME, + .type = QEMU_OPT_STRING, + .help = "Username for HTTP proxy auth" + }, + { + .name = CURL_BLOCK_OPT_PROXY_PASSWORD_SECRET, + .type = QEMU_OPT_STRING, + .help = "ID of secret used as password for HTTP proxy auth", + }, { /* end of list */ } }, }; + static int curl_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -536,6 +586,7 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags, const char *file; const char *cookie; double d; + const char *secretid; static int inited = 0; @@ -577,6 +628,26 @@ static int curl_open(BlockDriverState *bs, QDict *options, int flags, goto out_noclean; } + s->username = g_strdup(qemu_opt_get(opts, CURL_BLOCK_OPT_USERNAME)); + secretid = qemu_opt_get(opts, CURL_BLOCK_OPT_PASSWORD_SECRET); + + if (secretid) { + s->password = qcrypto_secret_lookup_as_utf8(secretid, errp); + if (!s->password) { + goto out_noclean; + } + } + + s->proxyusername = g_strdup( + qemu_opt_get(opts, CURL_BLOCK_OPT_PROXY_USERNAME)); + secretid = qemu_opt_get(opts, CURL_BLOCK_OPT_PROXY_PASSWORD_SECRET); + if (secretid) { + s->proxypassword = qcrypto_secret_lookup_as_utf8(secretid, errp); + if (!s->proxypassword) { + goto out_noclean; + } + } + if (!inited) { curl_global_init(CURL_GLOBAL_ALL); inited = 1; diff --git a/qemu/block/dirty-bitmap.c b/qemu/block/dirty-bitmap.c new file mode 100644 index 000000000..4902ca557 --- /dev/null +++ b/qemu/block/dirty-bitmap.c @@ -0,0 +1,387 @@ +/* + * Block Dirty Bitmap + * + * Copyright (c) 2016 Red Hat. Inc + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu-common.h" +#include "trace.h" +#include "block/block_int.h" +#include "block/blockjob.h" + +/** + * A BdrvDirtyBitmap can be in three possible states: + * (1) successor is NULL and disabled is false: full r/w mode + * (2) successor is NULL and disabled is true: read only mode ("disabled") + * (3) successor is set: frozen mode. + * A frozen bitmap cannot be renamed, deleted, anonymized, cleared, set, + * or enabled. A frozen bitmap can only abdicate() or reclaim(). + */ +struct BdrvDirtyBitmap { + HBitmap *bitmap; /* Dirty sector bitmap implementation */ + BdrvDirtyBitmap *successor; /* Anonymous child; implies frozen status */ + char *name; /* Optional non-empty unique ID */ + int64_t size; /* Size of the bitmap (Number of sectors) */ + bool disabled; /* Bitmap is read-only */ + QLIST_ENTRY(BdrvDirtyBitmap) list; +}; + +BdrvDirtyBitmap *bdrv_find_dirty_bitmap(BlockDriverState *bs, const char *name) +{ + BdrvDirtyBitmap *bm; + + assert(name); + QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { + if (bm->name && !strcmp(name, bm->name)) { + return bm; + } + } + return NULL; +} + +void bdrv_dirty_bitmap_make_anon(BdrvDirtyBitmap *bitmap) +{ + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + g_free(bitmap->name); + bitmap->name = NULL; +} + +BdrvDirtyBitmap *bdrv_create_dirty_bitmap(BlockDriverState *bs, + uint32_t granularity, + const char *name, + Error **errp) +{ + int64_t bitmap_size; + BdrvDirtyBitmap *bitmap; + uint32_t sector_granularity; + + assert((granularity & (granularity - 1)) == 0); + + if (name && bdrv_find_dirty_bitmap(bs, name)) { + error_setg(errp, "Bitmap already exists: %s", name); + return NULL; + } + sector_granularity = granularity >> BDRV_SECTOR_BITS; + assert(sector_granularity); + bitmap_size = bdrv_nb_sectors(bs); + if (bitmap_size < 0) { + error_setg_errno(errp, -bitmap_size, "could not get length of device"); + errno = -bitmap_size; + return NULL; + } + bitmap = g_new0(BdrvDirtyBitmap, 1); + bitmap->bitmap = hbitmap_alloc(bitmap_size, ctz32(sector_granularity)); + bitmap->size = bitmap_size; + bitmap->name = g_strdup(name); + bitmap->disabled = false; + QLIST_INSERT_HEAD(&bs->dirty_bitmaps, bitmap, list); + return bitmap; +} + +bool bdrv_dirty_bitmap_frozen(BdrvDirtyBitmap *bitmap) +{ + return bitmap->successor; +} + +bool bdrv_dirty_bitmap_enabled(BdrvDirtyBitmap *bitmap) +{ + return !(bitmap->disabled || bitmap->successor); +} + +DirtyBitmapStatus bdrv_dirty_bitmap_status(BdrvDirtyBitmap *bitmap) +{ + if (bdrv_dirty_bitmap_frozen(bitmap)) { + return DIRTY_BITMAP_STATUS_FROZEN; + } else if (!bdrv_dirty_bitmap_enabled(bitmap)) { + return DIRTY_BITMAP_STATUS_DISABLED; + } else { + return DIRTY_BITMAP_STATUS_ACTIVE; + } +} + +/** + * Create a successor bitmap destined to replace this bitmap after an operation. + * Requires that the bitmap is not frozen and has no successor. + */ +int bdrv_dirty_bitmap_create_successor(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, Error **errp) +{ + uint64_t granularity; + BdrvDirtyBitmap *child; + + if (bdrv_dirty_bitmap_frozen(bitmap)) { + error_setg(errp, "Cannot create a successor for a bitmap that is " + "currently frozen"); + return -1; + } + assert(!bitmap->successor); + + /* Create an anonymous successor */ + granularity = bdrv_dirty_bitmap_granularity(bitmap); + child = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); + if (!child) { + return -1; + } + + /* Successor will be on or off based on our current state. */ + child->disabled = bitmap->disabled; + + /* Install the successor and freeze the parent */ + bitmap->successor = child; + return 0; +} + +/** + * For a bitmap with a successor, yield our name to the successor, + * delete the old bitmap, and return a handle to the new bitmap. + */ +BdrvDirtyBitmap *bdrv_dirty_bitmap_abdicate(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + Error **errp) +{ + char *name; + BdrvDirtyBitmap *successor = bitmap->successor; + + if (successor == NULL) { + error_setg(errp, "Cannot relinquish control if " + "there's no successor present"); + return NULL; + } + + name = bitmap->name; + bitmap->name = NULL; + successor->name = name; + bitmap->successor = NULL; + bdrv_release_dirty_bitmap(bs, bitmap); + + return successor; +} + +/** + * In cases of failure where we can no longer safely delete the parent, + * we may wish to re-join the parent and child/successor. + * The merged parent will be un-frozen, but not explicitly re-enabled. + */ +BdrvDirtyBitmap *bdrv_reclaim_dirty_bitmap(BlockDriverState *bs, + BdrvDirtyBitmap *parent, + Error **errp) +{ + BdrvDirtyBitmap *successor = parent->successor; + + if (!successor) { + error_setg(errp, "Cannot reclaim a successor when none is present"); + return NULL; + } + + if (!hbitmap_merge(parent->bitmap, successor->bitmap)) { + error_setg(errp, "Merging of parent and successor bitmap failed"); + return NULL; + } + bdrv_release_dirty_bitmap(bs, successor); + parent->successor = NULL; + + return parent; +} + +/** + * Truncates _all_ bitmaps attached to a BDS. + */ +void bdrv_dirty_bitmap_truncate(BlockDriverState *bs) +{ + BdrvDirtyBitmap *bitmap; + uint64_t size = bdrv_nb_sectors(bs); + + QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + hbitmap_truncate(bitmap->bitmap, size); + bitmap->size = size; + } +} + +static void bdrv_do_release_matching_dirty_bitmap(BlockDriverState *bs, + BdrvDirtyBitmap *bitmap, + bool only_named) +{ + BdrvDirtyBitmap *bm, *next; + QLIST_FOREACH_SAFE(bm, &bs->dirty_bitmaps, list, next) { + if ((!bitmap || bm == bitmap) && (!only_named || bm->name)) { + assert(!bdrv_dirty_bitmap_frozen(bm)); + QLIST_REMOVE(bm, list); + hbitmap_free(bm->bitmap); + g_free(bm->name); + g_free(bm); + + if (bitmap) { + return; + } + } + } +} + +void bdrv_release_dirty_bitmap(BlockDriverState *bs, BdrvDirtyBitmap *bitmap) +{ + bdrv_do_release_matching_dirty_bitmap(bs, bitmap, false); +} + +/** + * Release all named dirty bitmaps attached to a BDS (for use in bdrv_close()). + * There must not be any frozen bitmaps attached. + */ +void bdrv_release_named_dirty_bitmaps(BlockDriverState *bs) +{ + bdrv_do_release_matching_dirty_bitmap(bs, NULL, true); +} + +void bdrv_disable_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + bitmap->disabled = true; +} + +void bdrv_enable_dirty_bitmap(BdrvDirtyBitmap *bitmap) +{ + assert(!bdrv_dirty_bitmap_frozen(bitmap)); + bitmap->disabled = false; +} + +BlockDirtyInfoList *bdrv_query_dirty_bitmaps(BlockDriverState *bs) +{ + BdrvDirtyBitmap *bm; + BlockDirtyInfoList *list = NULL; + BlockDirtyInfoList **plist = &list; + + QLIST_FOREACH(bm, &bs->dirty_bitmaps, list) { + BlockDirtyInfo *info = g_new0(BlockDirtyInfo, 1); + BlockDirtyInfoList *entry = g_new0(BlockDirtyInfoList, 1); + info->count = bdrv_get_dirty_count(bm); + info->granularity = bdrv_dirty_bitmap_granularity(bm); + info->has_name = !!bm->name; + info->name = g_strdup(bm->name); + info->status = bdrv_dirty_bitmap_status(bm); + entry->value = info; + *plist = entry; + plist = &entry->next; + } + + return list; +} + +int bdrv_get_dirty(BlockDriverState *bs, BdrvDirtyBitmap *bitmap, + int64_t sector) +{ + if (bitmap) { + return hbitmap_get(bitmap->bitmap, sector); + } else { + return 0; + } +} + +/** + * Chooses a default granularity based on the existing cluster size, + * but clamped between [4K, 64K]. Defaults to 64K in the case that there + * is no cluster size information available. + */ +uint32_t bdrv_get_default_bitmap_granularity(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + uint32_t granularity; + + if (bdrv_get_info(bs, &bdi) >= 0 && bdi.cluster_size > 0) { + granularity = MAX(4096, bdi.cluster_size); + granularity = MIN(65536, granularity); + } else { + granularity = 65536; + } + + return granularity; +} + +uint32_t bdrv_dirty_bitmap_granularity(BdrvDirtyBitmap *bitmap) +{ + return BDRV_SECTOR_SIZE << hbitmap_granularity(bitmap->bitmap); +} + +void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, HBitmapIter *hbi) +{ + hbitmap_iter_init(hbi, bitmap->bitmap, 0); +} + +void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int nr_sectors) +{ + assert(bdrv_dirty_bitmap_enabled(bitmap)); + hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); +} + +void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap, + int64_t cur_sector, int nr_sectors) +{ + assert(bdrv_dirty_bitmap_enabled(bitmap)); + hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors); +} + +void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out) +{ + assert(bdrv_dirty_bitmap_enabled(bitmap)); + if (!out) { + hbitmap_reset_all(bitmap->bitmap); + } else { + HBitmap *backup = bitmap->bitmap; + bitmap->bitmap = hbitmap_alloc(bitmap->size, + hbitmap_granularity(backup)); + *out = backup; + } +} + +void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in) +{ + HBitmap *tmp = bitmap->bitmap; + assert(bdrv_dirty_bitmap_enabled(bitmap)); + bitmap->bitmap = in; + hbitmap_free(tmp); +} + +void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, + int nr_sectors) +{ + BdrvDirtyBitmap *bitmap; + QLIST_FOREACH(bitmap, &bs->dirty_bitmaps, list) { + if (!bdrv_dirty_bitmap_enabled(bitmap)) { + continue; + } + hbitmap_set(bitmap->bitmap, cur_sector, nr_sectors); + } +} + +/** + * Advance an HBitmapIter to an arbitrary offset. + */ +void bdrv_set_dirty_iter(HBitmapIter *hbi, int64_t offset) +{ + assert(hbi->hb); + hbitmap_iter_init(hbi, hbi->hb, offset); +} + +int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap) +{ + return hbitmap_count(bitmap->bitmap); +} diff --git a/qemu/block/dmg.c b/qemu/block/dmg.c index 9f2528169..a496eb7c9 100644 --- a/qemu/block/dmg.c +++ b/qemu/block/dmg.c @@ -21,6 +21,8 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "qemu-common.h" #include "block/block_int.h" #include "qemu/bswap.h" @@ -85,7 +87,7 @@ static int read_uint64(BlockDriverState *bs, int64_t offset, uint64_t *result) uint64_t buffer; int ret; - ret = bdrv_pread(bs->file, offset, &buffer, 8); + ret = bdrv_pread(bs->file->bs, offset, &buffer, 8); if (ret < 0) { return ret; } @@ -99,7 +101,7 @@ static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result) uint32_t buffer; int ret; - ret = bdrv_pread(bs->file, offset, &buffer, 4); + ret = bdrv_pread(bs->file->bs, offset, &buffer, 4); if (ret < 0) { return ret; } @@ -354,7 +356,7 @@ static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds, offset += 4; buffer = g_realloc(buffer, count); - ret = bdrv_pread(bs->file, offset, buffer, count); + ret = bdrv_pread(bs->file->bs, offset, buffer, count); if (ret < 0) { goto fail; } @@ -391,7 +393,7 @@ static int dmg_read_plist_xml(BlockDriverState *bs, DmgHeaderState *ds, buffer = g_malloc(info_length + 1); buffer[info_length] = '\0'; - ret = bdrv_pread(bs->file, info_begin, buffer, info_length); + ret = bdrv_pread(bs->file->bs, info_begin, buffer, info_length); if (ret != info_length) { ret = -EINVAL; goto fail; @@ -446,7 +448,7 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, ds.max_sectors_per_chunk = 1; /* locate the UDIF trailer */ - offset = dmg_find_koly_offset(bs->file, errp); + offset = dmg_find_koly_offset(bs->file->bs, errp); if (offset < 0) { ret = offset; goto fail; @@ -514,9 +516,9 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, } /* initialize zlib engine */ - s->compressed_chunk = qemu_try_blockalign(bs->file, + s->compressed_chunk = qemu_try_blockalign(bs->file->bs, ds.max_compressed_size + 1); - s->uncompressed_chunk = qemu_try_blockalign(bs->file, + s->uncompressed_chunk = qemu_try_blockalign(bs->file->bs, 512 * ds.max_sectors_per_chunk); if (s->compressed_chunk == NULL || s->uncompressed_chunk == NULL) { ret = -ENOMEM; @@ -592,7 +594,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) case 0x80000005: { /* zlib compressed */ /* we need to buffer, because only the chunk as whole can be * inflated. */ - ret = bdrv_pread(bs->file, s->offsets[chunk], + ret = bdrv_pread(bs->file->bs, s->offsets[chunk], s->compressed_chunk, s->lengths[chunk]); if (ret != s->lengths[chunk]) { return -1; @@ -616,7 +618,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) case 0x80000006: /* bzip2 compressed */ /* we need to buffer, because only the chunk as whole can be * inflated. */ - ret = bdrv_pread(bs->file, s->offsets[chunk], + ret = bdrv_pread(bs->file->bs, s->offsets[chunk], s->compressed_chunk, s->lengths[chunk]); if (ret != s->lengths[chunk]) { return -1; @@ -641,7 +643,7 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) break; #endif /* CONFIG_BZIP2 */ case 1: /* copy */ - ret = bdrv_pread(bs->file, s->offsets[chunk], + ret = bdrv_pread(bs->file->bs, s->offsets[chunk], s->uncompressed_chunk, s->lengths[chunk]); if (ret != s->lengths[chunk]) { return -1; diff --git a/qemu/block/gluster.c b/qemu/block/gluster.c index 1eb3a8c39..a8aaacf64 100644 --- a/qemu/block/gluster.c +++ b/qemu/block/gluster.c @@ -7,8 +7,10 @@ * See the COPYING file in the top-level directory. * */ +#include "qemu/osdep.h" #include <glusterfs/api/glfs.h> #include "block/block_int.h" +#include "qapi/error.h" #include "qemu/uri.h" typedef struct GlusterAIOCB { @@ -245,7 +247,7 @@ static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) if (!ret || ret == acb->size) { acb->ret = 0; /* Success */ } else if (ret < 0) { - acb->ret = ret; /* Read/Write failed */ + acb->ret = -errno; /* Read/Write failed */ } else { acb->ret = -EIO; /* Partial read/write - fail it */ } @@ -312,6 +314,23 @@ static int qemu_gluster_open(BlockDriverState *bs, QDict *options, goto out; } +#ifdef CONFIG_GLUSTERFS_XLATOR_OPT + /* Without this, if fsync fails for a recoverable reason (for instance, + * ENOSPC), gluster will dump its cache, preventing retries. This means + * almost certain data loss. Not all gluster versions support the + * 'resync-failed-syncs-after-fsync' key value, but there is no way to + * discover during runtime if it is supported (this api returns success for + * unknown key/value pairs) */ + ret = glfs_set_xlator_option(s->glfs, "*-write-behind", + "resync-failed-syncs-after-fsync", + "on"); + if (ret < 0) { + error_setg_errno(errp, errno, "Unable to set xlator key/value pair"); + ret = -errno; + goto out; + } +#endif + qemu_gluster_parse_flags(bdrv_flags, &open_flags); s->fd = glfs_open(s->glfs, gconf->image, open_flags); @@ -364,6 +383,16 @@ static int qemu_gluster_reopen_prepare(BDRVReopenState *state, goto exit; } +#ifdef CONFIG_GLUSTERFS_XLATOR_OPT + ret = glfs_set_xlator_option(reop_s->glfs, "*-write-behind", + "resync-failed-syncs-after-fsync", "on"); + if (ret < 0) { + error_setg_errno(errp, errno, "Unable to set xlator key/value pair"); + ret = -errno; + goto exit; + } +#endif + reop_s->fd = glfs_open(reop_s->glfs, gconf->image, open_flags); if (reop_s->fd == NULL) { /* reops->glfs will be cleaned up in _abort */ @@ -429,28 +458,23 @@ static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) { int ret; - GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); + GlusterAIOCB acb; BDRVGlusterState *s = bs->opaque; off_t size = nb_sectors * BDRV_SECTOR_SIZE; off_t offset = sector_num * BDRV_SECTOR_SIZE; - acb->size = size; - acb->ret = 0; - acb->coroutine = qemu_coroutine_self(); - acb->aio_context = bdrv_get_aio_context(bs); + acb.size = size; + acb.ret = 0; + acb.coroutine = qemu_coroutine_self(); + acb.aio_context = bdrv_get_aio_context(bs); - ret = glfs_zerofill_async(s->fd, offset, size, &gluster_finish_aiocb, acb); + ret = glfs_zerofill_async(s->fd, offset, size, gluster_finish_aiocb, &acb); if (ret < 0) { - ret = -errno; - goto out; + return -errno; } qemu_coroutine_yield(); - ret = acb->ret; - -out: - g_slice_free(GlusterAIOCB, acb); - return ret; + return acb.ret; } static inline bool gluster_supports_zerofill(void) @@ -541,35 +565,30 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write) { int ret; - GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); + GlusterAIOCB acb; BDRVGlusterState *s = bs->opaque; size_t size = nb_sectors * BDRV_SECTOR_SIZE; off_t offset = sector_num * BDRV_SECTOR_SIZE; - acb->size = size; - acb->ret = 0; - acb->coroutine = qemu_coroutine_self(); - acb->aio_context = bdrv_get_aio_context(bs); + acb.size = size; + acb.ret = 0; + acb.coroutine = qemu_coroutine_self(); + acb.aio_context = bdrv_get_aio_context(bs); if (write) { ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0, - &gluster_finish_aiocb, acb); + gluster_finish_aiocb, &acb); } else { ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0, - &gluster_finish_aiocb, acb); + gluster_finish_aiocb, &acb); } if (ret < 0) { - ret = -errno; - goto out; + return -errno; } qemu_coroutine_yield(); - ret = acb->ret; - -out: - g_slice_free(GlusterAIOCB, acb); - return ret; + return acb.ret; } static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset) @@ -597,28 +616,58 @@ static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs, return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1); } +static void qemu_gluster_close(BlockDriverState *bs) +{ + BDRVGlusterState *s = bs->opaque; + + if (s->fd) { + glfs_close(s->fd); + s->fd = NULL; + } + glfs_fini(s->glfs); +} + static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs) { int ret; - GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); + GlusterAIOCB acb; BDRVGlusterState *s = bs->opaque; - acb->size = 0; - acb->ret = 0; - acb->coroutine = qemu_coroutine_self(); - acb->aio_context = bdrv_get_aio_context(bs); + acb.size = 0; + acb.ret = 0; + acb.coroutine = qemu_coroutine_self(); + acb.aio_context = bdrv_get_aio_context(bs); - ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb); + ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb); if (ret < 0) { ret = -errno; - goto out; + goto error; } qemu_coroutine_yield(); - ret = acb->ret; - -out: - g_slice_free(GlusterAIOCB, acb); + if (acb.ret < 0) { + ret = acb.ret; + goto error; + } + + return acb.ret; + +error: + /* Some versions of Gluster (3.5.6 -> 3.5.8?) will not retain its cache + * after a fsync failure, so we have no way of allowing the guest to safely + * continue. Gluster versions prior to 3.5.6 don't retain the cache + * either, but will invalidate the fd on error, so this is again our only + * option. + * + * The 'resync-failed-syncs-after-fsync' xlator option for the + * write-behind cache will cause later gluster versions to retain its + * cache after error, so long as the fd remains open. However, we + * currently have no way of knowing if this option is supported. + * + * TODO: Once gluster provides a way for us to determine if the option + * is supported, bypass the closure and setting drv to NULL. */ + qemu_gluster_close(bs); + bs->drv = NULL; return ret; } @@ -627,28 +676,23 @@ static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { int ret; - GlusterAIOCB *acb = g_slice_new(GlusterAIOCB); + GlusterAIOCB acb; BDRVGlusterState *s = bs->opaque; size_t size = nb_sectors * BDRV_SECTOR_SIZE; off_t offset = sector_num * BDRV_SECTOR_SIZE; - acb->size = 0; - acb->ret = 0; - acb->coroutine = qemu_coroutine_self(); - acb->aio_context = bdrv_get_aio_context(bs); + acb.size = 0; + acb.ret = 0; + acb.coroutine = qemu_coroutine_self(); + acb.aio_context = bdrv_get_aio_context(bs); - ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb); + ret = glfs_discard_async(s->fd, offset, size, gluster_finish_aiocb, &acb); if (ret < 0) { - ret = -errno; - goto out; + return -errno; } qemu_coroutine_yield(); - ret = acb->ret; - -out: - g_slice_free(GlusterAIOCB, acb); - return ret; + return acb.ret; } #endif @@ -679,17 +723,6 @@ static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs) } } -static void qemu_gluster_close(BlockDriverState *bs) -{ - BDRVGlusterState *s = bs->opaque; - - if (s->fd) { - glfs_close(s->fd); - s->fd = NULL; - } - glfs_fini(s->glfs); -} - static int qemu_gluster_has_zero_init(BlockDriverState *bs) { /* GlusterFS volume could be backed by a block device */ diff --git a/qemu/block/io.c b/qemu/block/io.c index d4bc83b33..a7dbf85b1 100644 --- a/qemu/block/io.c +++ b/qemu/block/io.c @@ -22,10 +22,14 @@ * THE SOFTWARE. */ +#include "qemu/osdep.h" #include "trace.h" +#include "sysemu/block-backend.h" #include "block/blockjob.h" #include "block/block_int.h" #include "block/throttle-groups.h" +#include "qemu/cutils.h" +#include "qapi/error.h" #include "qemu/error-report.h" #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ @@ -42,12 +46,6 @@ static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov); -static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); -static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, - int64_t offset, unsigned int bytes, QEMUIOVector *qiov, - BdrvRequestFlags flags); static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, int64_t sector_num, QEMUIOVector *qiov, @@ -156,38 +154,45 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) /* Take some limits from the children as a default */ if (bs->file) { - bdrv_refresh_limits(bs->file, &local_err); + bdrv_refresh_limits(bs->file->bs, &local_err); if (local_err) { error_propagate(errp, local_err); return; } - bs->bl.opt_transfer_length = bs->file->bl.opt_transfer_length; - bs->bl.max_transfer_length = bs->file->bl.max_transfer_length; - bs->bl.min_mem_alignment = bs->file->bl.min_mem_alignment; - bs->bl.opt_mem_alignment = bs->file->bl.opt_mem_alignment; + bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length; + bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length; + bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment; + bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment; + bs->bl.max_iov = bs->file->bs->bl.max_iov; } else { bs->bl.min_mem_alignment = 512; bs->bl.opt_mem_alignment = getpagesize(); + + /* Safe default since most protocols use readv()/writev()/etc */ + bs->bl.max_iov = IOV_MAX; } - if (bs->backing_hd) { - bdrv_refresh_limits(bs->backing_hd, &local_err); + if (bs->backing) { + bdrv_refresh_limits(bs->backing->bs, &local_err); if (local_err) { error_propagate(errp, local_err); return; } bs->bl.opt_transfer_length = MAX(bs->bl.opt_transfer_length, - bs->backing_hd->bl.opt_transfer_length); + bs->backing->bs->bl.opt_transfer_length); bs->bl.max_transfer_length = MIN_NON_ZERO(bs->bl.max_transfer_length, - bs->backing_hd->bl.max_transfer_length); + bs->backing->bs->bl.max_transfer_length); bs->bl.opt_mem_alignment = MAX(bs->bl.opt_mem_alignment, - bs->backing_hd->bl.opt_mem_alignment); + bs->backing->bs->bl.opt_mem_alignment); bs->bl.min_mem_alignment = MAX(bs->bl.min_mem_alignment, - bs->backing_hd->bl.min_mem_alignment); + bs->backing->bs->bl.min_mem_alignment); + bs->bl.max_iov = + MIN(bs->bl.max_iov, + bs->backing->bs->bl.max_iov); } /* Then let the driver override it */ @@ -213,8 +218,10 @@ void bdrv_disable_copy_on_read(BlockDriverState *bs) } /* Check if any requests are in-flight (including throttled requests) */ -static bool bdrv_requests_pending(BlockDriverState *bs) +bool bdrv_requests_pending(BlockDriverState *bs) { + BdrvChild *child; + if (!QLIST_EMPTY(&bs->tracked_requests)) { return true; } @@ -224,17 +231,72 @@ static bool bdrv_requests_pending(BlockDriverState *bs) if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { return true; } - if (bs->file && bdrv_requests_pending(bs->file)) { - return true; - } - if (bs->backing_hd && bdrv_requests_pending(bs->backing_hd)) { - return true; + + QLIST_FOREACH(child, &bs->children, next) { + if (bdrv_requests_pending(child->bs)) { + return true; + } } + return false; } +static void bdrv_drain_recurse(BlockDriverState *bs) +{ + BdrvChild *child; + + if (bs->drv && bs->drv->bdrv_drain) { + bs->drv->bdrv_drain(bs); + } + QLIST_FOREACH(child, &bs->children, next) { + bdrv_drain_recurse(child->bs); + } +} + +typedef struct { + Coroutine *co; + BlockDriverState *bs; + QEMUBH *bh; + bool done; +} BdrvCoDrainData; + +static void bdrv_co_drain_bh_cb(void *opaque) +{ + BdrvCoDrainData *data = opaque; + Coroutine *co = data->co; + + qemu_bh_delete(data->bh); + bdrv_drain(data->bs); + data->done = true; + qemu_coroutine_enter(co, NULL); +} + +void coroutine_fn bdrv_co_drain(BlockDriverState *bs) +{ + BdrvCoDrainData data; + + /* Calling bdrv_drain() from a BH ensures the current coroutine yields and + * other coroutines run if they were queued from + * qemu_co_queue_run_restart(). */ + + assert(qemu_in_coroutine()); + data = (BdrvCoDrainData) { + .co = qemu_coroutine_self(), + .bs = bs, + .done = false, + .bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_drain_bh_cb, &data), + }; + qemu_bh_schedule(data.bh); + + qemu_coroutine_yield(); + /* If we are resumed from some other event (such as an aio completion or a + * timer callback), it is a bug in the caller that should be fixed. */ + assert(data.done); +} + /* - * Wait for pending requests to complete on a single BlockDriverState subtree + * Wait for pending requests to complete on a single BlockDriverState subtree, + * and suspend block driver's internal I/O until next request arrives. * * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState * AioContext. @@ -247,6 +309,11 @@ void bdrv_drain(BlockDriverState *bs) { bool busy = true; + bdrv_drain_recurse(bs); + if (qemu_in_coroutine()) { + bdrv_co_drain(bs); + return; + } while (busy) { /* Keep iterating */ bdrv_flush_io_queue(bs); @@ -275,6 +342,7 @@ void bdrv_drain_all(void) if (bs->job) { block_job_pause(bs->job); } + bdrv_drain_recurse(bs); aio_context_release(aio_context); if (!g_slist_find(aio_ctxs, aio_context)) { @@ -344,13 +412,14 @@ static void tracked_request_end(BdrvTrackedRequest *req) static void tracked_request_begin(BdrvTrackedRequest *req, BlockDriverState *bs, int64_t offset, - unsigned int bytes, bool is_write) + unsigned int bytes, + enum BdrvTrackedRequestType type) { *req = (BdrvTrackedRequest){ .bs = bs, .offset = offset, .bytes = bytes, - .is_write = is_write, + .type = type, .co = qemu_coroutine_self(), .serialising = false, .overlap_offset = offset, @@ -593,20 +662,6 @@ int bdrv_read(BlockDriverState *bs, int64_t sector_num, return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); } -/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ -int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, - uint8_t *buf, int nb_sectors) -{ - bool enabled; - int ret; - - enabled = bs->io_limits_enabled; - bs->io_limits_enabled = false; - ret = bdrv_read(bs, sector_num, buf, nb_sectors); - bs->io_limits_enabled = enabled; - return ret; -} - /* Return < 0 if error. Important errors are: -EIO generic I/O error (may happen for all errors) -ENOMEDIUM No media inserted. @@ -637,6 +692,7 @@ int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) { int64_t target_sectors, ret, nb_sectors, sector_num = 0; + BlockDriverState *file; int n; target_sectors = bdrv_nb_sectors(bs); @@ -649,7 +705,7 @@ int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) if (nb_sectors <= 0) { return 0; } - ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); + ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n, &file); if (ret < 0) { error_report("error getting block status at sector %" PRId64 ": %s", sector_num, strerror(-ret)); @@ -736,9 +792,9 @@ int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, return ret; } - /* No flush needed for cache modes that already do it */ - if (bs->enable_write_cache) { - bdrv_flush(bs); + ret = bdrv_flush(bs); + if (ret < 0) { + return ret; } return 0; @@ -833,6 +889,7 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); assert(!qiov || bytes == qiov->size); + assert((bs->open_flags & BDRV_O_NO_IO) == 0); /* Handle Copy on Read and associated serialisation */ if (flags & BDRV_REQ_COPY_ON_READ) { @@ -844,7 +901,9 @@ static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, mark_request_serialising(req, bdrv_get_cluster_size(bs)); } - wait_serialising_requests(req); + if (!(flags & BDRV_REQ_NO_SERIALISING)) { + wait_serialising_requests(req); + } if (flags & BDRV_REQ_COPY_ON_READ) { int pnum; @@ -908,7 +967,7 @@ out: /* * Handle a read request in coroutine context */ -static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, +int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { @@ -932,7 +991,8 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, return ret; } - if (bs->copy_on_read) { + /* Don't do copy-on-read if we read data before write operation */ + if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) { flags |= BDRV_REQ_COPY_ON_READ; } @@ -966,7 +1026,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, bytes = ROUND_UP(bytes, align); } - tracked_request_begin(&req, bs, offset, bytes, false); + tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, use_local_qiov ? &local_qiov : qiov, flags); @@ -1001,6 +1061,15 @@ int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); } +int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, + BDRV_REQ_NO_SERIALISING); +} + int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { @@ -1107,6 +1176,7 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); assert(!qiov || bytes == qiov->size); + assert((bs->open_flags & BDRV_O_NO_IO) == 0); waited = wait_serialising_requests(req); assert(!waited || !req->serialising); @@ -1127,21 +1197,30 @@ static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, if (ret < 0) { /* Do nothing, write notifier decided to fail this request */ } else if (flags & BDRV_REQ_ZERO_WRITE) { - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_ZERO); + bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); + } else if (drv->bdrv_co_writev_flags) { + bdrv_debug_event(bs, BLKDBG_PWRITEV); + ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov, + flags); } else { - BLKDBG_EVENT(bs, BLKDBG_PWRITEV); + assert(drv->supported_write_flags == 0); + bdrv_debug_event(bs, BLKDBG_PWRITEV); ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_DONE); + bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); - if (ret == 0 && !bs->enable_write_cache) { + if (ret == 0 && (flags & BDRV_REQ_FUA) && + !(drv->supported_write_flags & BDRV_REQ_FUA)) + { ret = bdrv_co_flush(bs); } bdrv_set_dirty(bs, sector_num, nb_sectors); - block_acct_highest_sector(&bs->stats, sector_num, nb_sectors); + if (bs->wr_highest_offset < offset + bytes) { + bs->wr_highest_offset = offset + bytes; + } if (ret >= 0) { bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); @@ -1182,13 +1261,13 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, /* RMW the unaligned part before head. */ mark_request_serialising(req, align); wait_serialising_requests(req); - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, align, &local_qiov, 0); if (ret < 0) { goto fail; } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); memset(buf + head_padding_bytes, 0, zero_bytes); ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, @@ -1220,13 +1299,13 @@ static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, /* RMW the unaligned part after tail. */ mark_request_serialising(req, align); wait_serialising_requests(req); - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); ret = bdrv_aligned_preadv(bs, req, offset, align, align, &local_qiov, 0); if (ret < 0) { goto fail; } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); memset(buf, 0, bytes); ret = bdrv_aligned_pwritev(bs, req, offset, align, @@ -1241,7 +1320,7 @@ fail: /* * Handle a write request in coroutine context */ -static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, +int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, int64_t offset, unsigned int bytes, QEMUIOVector *qiov, BdrvRequestFlags flags) { @@ -1260,6 +1339,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, if (bs->read_only) { return -EPERM; } + assert(!(bs->open_flags & BDRV_O_INACTIVE)); ret = bdrv_check_byte_request(bs, offset, bytes); if (ret < 0) { @@ -1276,7 +1356,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, * Pad qiov with the read parts and be sure to have a tracked request not * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. */ - tracked_request_begin(&req, bs, offset, bytes, true); + tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); if (!qiov) { ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); @@ -1297,13 +1377,13 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, }; qemu_iovec_init_external(&head_qiov, &head_iov, 1); - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_HEAD); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, align, &head_qiov, 0); if (ret < 0) { goto fail; } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); qemu_iovec_init(&local_qiov, qiov->niov + 2); qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); @@ -1331,13 +1411,13 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, }; qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_TAIL); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, align, &tail_qiov, 0); if (ret < 0) { goto fail; } - BLKDBG_EVENT(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); if (!use_local_qiov) { qemu_iovec_init(&local_qiov, qiov->niov + 1); @@ -1401,29 +1481,10 @@ int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, BDRV_REQ_ZERO_WRITE | flags); } -int bdrv_flush_all(void) -{ - BlockDriverState *bs = NULL; - int result = 0; - - while ((bs = bdrv_next(bs))) { - AioContext *aio_context = bdrv_get_aio_context(bs); - int ret; - - aio_context_acquire(aio_context); - ret = bdrv_flush(bs); - if (ret < 0 && !result) { - result = ret; - } - aio_context_release(aio_context); - } - - return result; -} - typedef struct BdrvCoGetBlockStatusData { BlockDriverState *bs; BlockDriverState *base; + BlockDriverState **file; int64_t sector_num; int nb_sectors; int *pnum; @@ -1445,10 +1506,14 @@ typedef struct BdrvCoGetBlockStatusData { * * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes * beyond the end of the disk image it will be clamped. + * + * If returned value is positive and BDRV_BLOCK_OFFSET_VALID bit is set, 'file' + * points to the BDS which the sector range is allocated in. */ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) + int nb_sectors, int *pnum, + BlockDriverState **file) { int64_t total_sectors; int64_t n; @@ -1478,7 +1543,9 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, return ret; } - ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); + *file = NULL; + ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum, + file); if (ret < 0) { *pnum = 0; return ret; @@ -1486,8 +1553,8 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, if (ret & BDRV_BLOCK_RAW) { assert(ret & BDRV_BLOCK_OFFSET_VALID); - return bdrv_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, - *pnum, pnum); + return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, + *pnum, pnum, file); } if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { @@ -1495,8 +1562,8 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, } else { if (bdrv_unallocated_blocks_are_zero(bs)) { ret |= BDRV_BLOCK_ZERO; - } else if (bs->backing_hd) { - BlockDriverState *bs2 = bs->backing_hd; + } else if (bs->backing) { + BlockDriverState *bs2 = bs->backing->bs; int64_t nb_sectors2 = bdrv_nb_sectors(bs2); if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { ret |= BDRV_BLOCK_ZERO; @@ -1504,13 +1571,14 @@ static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, } } - if (bs->file && + if (*file && *file != bs && (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && (ret & BDRV_BLOCK_OFFSET_VALID)) { + BlockDriverState *file2; int file_pnum; - ret2 = bdrv_co_get_block_status(bs->file, ret >> BDRV_SECTOR_BITS, - *pnum, &file_pnum); + ret2 = bdrv_co_get_block_status(*file, ret >> BDRV_SECTOR_BITS, + *pnum, &file_pnum, &file2); if (ret2 >= 0) { /* Ignore errors. This is just providing extra information, it * is useful but not necessary. @@ -1535,14 +1603,15 @@ static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, BlockDriverState *base, int64_t sector_num, int nb_sectors, - int *pnum) + int *pnum, + BlockDriverState **file) { BlockDriverState *p; int64_t ret = 0; assert(bs != base); - for (p = bs; p != base; p = p->backing_hd) { - ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum); + for (p = bs; p != base; p = backing_bs(p)) { + ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum, file); if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { break; } @@ -1561,7 +1630,8 @@ static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) data->ret = bdrv_co_get_block_status_above(data->bs, data->base, data->sector_num, data->nb_sectors, - data->pnum); + data->pnum, + data->file); data->done = true; } @@ -1573,12 +1643,14 @@ static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) int64_t bdrv_get_block_status_above(BlockDriverState *bs, BlockDriverState *base, int64_t sector_num, - int nb_sectors, int *pnum) + int nb_sectors, int *pnum, + BlockDriverState **file) { Coroutine *co; BdrvCoGetBlockStatusData data = { .bs = bs, .base = base, + .file = file, .sector_num = sector_num, .nb_sectors = nb_sectors, .pnum = pnum, @@ -1602,16 +1674,19 @@ int64_t bdrv_get_block_status_above(BlockDriverState *bs, int64_t bdrv_get_block_status(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) + int nb_sectors, int *pnum, + BlockDriverState **file) { - return bdrv_get_block_status_above(bs, bs->backing_hd, - sector_num, nb_sectors, pnum); + return bdrv_get_block_status_above(bs, backing_bs(bs), + sector_num, nb_sectors, pnum, file); } int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, int nb_sectors, int *pnum) { - int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); + BlockDriverState *file; + int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum, + &file); if (ret < 0) { return ret; } @@ -1662,7 +1737,7 @@ int bdrv_is_allocated_above(BlockDriverState *top, n = pnum_inter; } - intermediate = intermediate->backing_hd; + intermediate = backing_bs(intermediate); } *pnum = n; @@ -1713,7 +1788,7 @@ int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) } else if (drv->bdrv_save_vmstate) { return drv->bdrv_save_vmstate(bs, qiov, pos); } else if (bs->file) { - return bdrv_writev_vmstate(bs->file, qiov, pos); + return bdrv_writev_vmstate(bs->file->bs, qiov, pos); } return -ENOTSUP; @@ -1728,7 +1803,7 @@ int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, if (drv->bdrv_load_vmstate) return drv->bdrv_load_vmstate(bs, buf, pos, size); if (bs->file) - return bdrv_load_vmstate(bs->file, buf, pos, size); + return bdrv_load_vmstate(bs->file->bs, buf, pos, size); return -ENOTSUP; } @@ -1849,7 +1924,8 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, merge = 1; } - if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { + if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > + bs->bl.max_iov) { merge = 0; } @@ -1893,7 +1969,10 @@ static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, } } - block_acct_merge_done(&bs->stats, BLOCK_ACCT_WRITE, num_reqs - outidx - 1); + if (bs->blk) { + block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE, + num_reqs - outidx - 1); + } return outidx + 1; } @@ -2208,7 +2287,7 @@ void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, { BlockAIOCB *acb; - acb = g_slice_alloc(aiocb_info->aiocb_size); + acb = g_malloc(aiocb_info->aiocb_size); acb->aiocb_info = aiocb_info; acb->bs = bs; acb->cb = cb; @@ -2228,7 +2307,7 @@ void qemu_aio_unref(void *p) BlockAIOCB *acb = p; assert(acb->refcnt > 0); if (--acb->refcnt == 0) { - g_slice_free1(acb->aiocb_info->aiocb_size, acb); + g_free(acb); } } @@ -2298,18 +2377,27 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque) int coroutine_fn bdrv_co_flush(BlockDriverState *bs) { int ret; + BdrvTrackedRequest req; if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || bdrv_is_sg(bs)) { return 0; } + tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH); + + /* Write back all layers by calling one driver function */ + if (bs->drv->bdrv_co_flush) { + ret = bs->drv->bdrv_co_flush(bs); + goto out; + } + /* Write back cached data to the OS even with cache=unsafe */ BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); if (bs->drv->bdrv_co_flush_to_os) { ret = bs->drv->bdrv_co_flush_to_os(bs); if (ret < 0) { - return ret; + goto out; } } @@ -2349,14 +2437,17 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs) ret = 0; } if (ret < 0) { - return ret; + goto out; } /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH * in the case of cache=unsafe, so there are no useless flushes. */ flush_parent: - return bdrv_co_flush(bs->file); + ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; +out: + tracked_request_end(&req); + return ret; } int bdrv_flush(BlockDriverState *bs) @@ -2399,6 +2490,7 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque) int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { + BdrvTrackedRequest req; int max_discard, ret; if (!bs->drv) { @@ -2411,6 +2503,7 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, } else if (bs->read_only) { return -EPERM; } + assert(!(bs->open_flags & BDRV_O_INACTIVE)); /* Do nothing if disabled. */ if (!(bs->open_flags & BDRV_O_UNMAP)) { @@ -2421,6 +2514,8 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, return 0; } + tracked_request_begin(&req, bs, sector_num, nb_sectors, + BDRV_TRACKED_DISCARD); bdrv_set_dirty(bs, sector_num, nb_sectors); max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); @@ -2454,20 +2549,24 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, bdrv_co_io_em_complete, &co); if (acb == NULL) { - return -EIO; + ret = -EIO; + goto out; } else { qemu_coroutine_yield(); ret = co.ret; } } if (ret && ret != -ENOTSUP) { - return ret; + goto out; } sector_num += num; nb_sectors -= num; } - return 0; + ret = 0; +out: + tracked_request_end(&req); + return ret; } int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) @@ -2496,26 +2595,110 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) return rwco.ret; } -/* needed for generic scsi interface */ +typedef struct { + CoroutineIOCompletion *co; + QEMUBH *bh; +} BdrvIoctlCompletionData; -int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +static void bdrv_ioctl_bh_cb(void *opaque) +{ + BdrvIoctlCompletionData *data = opaque; + + bdrv_co_io_em_complete(data->co, -ENOTSUP); + qemu_bh_delete(data->bh); +} + +static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf) { BlockDriver *drv = bs->drv; + BdrvTrackedRequest tracked_req; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + BlockAIOCB *acb; - if (drv && drv->bdrv_ioctl) - return drv->bdrv_ioctl(bs, req, buf); - return -ENOTSUP; + tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL); + if (!drv || !drv->bdrv_aio_ioctl) { + co.ret = -ENOTSUP; + goto out; + } + + acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); + if (!acb) { + BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1); + data->bh = aio_bh_new(bdrv_get_aio_context(bs), + bdrv_ioctl_bh_cb, data); + data->co = &co; + qemu_bh_schedule(data->bh); + } + qemu_coroutine_yield(); +out: + tracked_request_end(&tracked_req); + return co.ret; +} + +typedef struct { + BlockDriverState *bs; + int req; + void *buf; + int ret; +} BdrvIoctlCoData; + +static void coroutine_fn bdrv_co_ioctl_entry(void *opaque) +{ + BdrvIoctlCoData *data = opaque; + data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf); +} + +/* needed for generic scsi interface */ +int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + BdrvIoctlCoData data = { + .bs = bs, + .req = req, + .buf = buf, + .ret = -EINPROGRESS, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_co_ioctl_entry(&data); + } else { + Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry); + + qemu_coroutine_enter(co, &data); + while (data.ret == -EINPROGRESS) { + aio_poll(bdrv_get_aio_context(bs), true); + } + } + return data.ret; +} + +static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + acb->req.error = bdrv_co_do_ioctl(acb->common.bs, + acb->req.req, acb->req.buf); + bdrv_co_complete(acb); } BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, unsigned long int req, void *buf, BlockCompletionFunc *cb, void *opaque) { - BlockDriver *drv = bs->drv; + BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info, + bs, cb, opaque); + Coroutine *co; + + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.req = req; + acb->req.buf = buf; + co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry); + qemu_coroutine_enter(co, acb); - if (drv && drv->bdrv_aio_ioctl) - return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque); - return NULL; + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; } void *qemu_blockalign(BlockDriverState *bs, size_t size) @@ -2584,7 +2767,7 @@ void bdrv_io_plug(BlockDriverState *bs) if (drv && drv->bdrv_io_plug) { drv->bdrv_io_plug(bs); } else if (bs->file) { - bdrv_io_plug(bs->file); + bdrv_io_plug(bs->file->bs); } } @@ -2594,7 +2777,7 @@ void bdrv_io_unplug(BlockDriverState *bs) if (drv && drv->bdrv_io_unplug) { drv->bdrv_io_unplug(bs); } else if (bs->file) { - bdrv_io_unplug(bs->file); + bdrv_io_unplug(bs->file->bs); } } @@ -2604,7 +2787,24 @@ void bdrv_flush_io_queue(BlockDriverState *bs) if (drv && drv->bdrv_flush_io_queue) { drv->bdrv_flush_io_queue(bs); } else if (bs->file) { - bdrv_flush_io_queue(bs->file); + bdrv_flush_io_queue(bs->file->bs); } bdrv_start_throttled_reqs(bs); } + +void bdrv_drained_begin(BlockDriverState *bs) +{ + if (!bs->quiesce_counter++) { + aio_disable_external(bdrv_get_aio_context(bs)); + } + bdrv_drain(bs); +} + +void bdrv_drained_end(BlockDriverState *bs) +{ + assert(bs->quiesce_counter > 0); + if (--bs->quiesce_counter > 0) { + return; + } + aio_enable_external(bdrv_get_aio_context(bs)); +} diff --git a/qemu/block/iscsi.c b/qemu/block/iscsi.c index 50029168e..302baf84c 100644 --- a/qemu/block/iscsi.c +++ b/qemu/block/iscsi.c @@ -23,7 +23,7 @@ * THE SOFTWARE. */ -#include "config-host.h" +#include "qemu/osdep.h" #include <poll.h> #include <math.h> @@ -39,6 +39,7 @@ #include "sysemu/sysemu.h" #include "qmp-commands.h" #include "qapi/qmp/qstring.h" +#include "crypto/secret.h" #include <iscsi/iscsi.h> #include <iscsi/scsi-lowlevel.h> @@ -69,7 +70,6 @@ typedef struct IscsiLun { bool lbprz; bool dpofua; bool has_write_same; - bool force_next_flush; bool request_timed_out; } IscsiLun; @@ -83,7 +83,7 @@ typedef struct IscsiTask { QEMUBH *bh; IscsiLun *iscsilun; QEMUTimer retry_timer; - bool force_next_flush; + int err_code; } IscsiTask; typedef struct IscsiAIOCB { @@ -96,6 +96,7 @@ typedef struct IscsiAIOCB { int status; int64_t sector_num; int nb_sectors; + int ret; #ifdef __linux__ sg_io_hdr_t *ioh; #endif @@ -169,19 +170,70 @@ static inline unsigned exp_random(double mean) return -mean * log((double)rand() / RAND_MAX); } -/* SCSI_STATUS_TASK_SET_FULL and SCSI_STATUS_TIMEOUT were introduced - * in libiscsi 1.10.0 as part of an enum. The LIBISCSI_API_VERSION - * macro was introduced in 1.11.0. So use the API_VERSION macro as - * a hint that the macros are defined and define them ourselves - * otherwise to keep the required libiscsi version at 1.9.0 */ -#if !defined(LIBISCSI_API_VERSION) -#define QEMU_SCSI_STATUS_TASK_SET_FULL 0x28 -#define QEMU_SCSI_STATUS_TIMEOUT 0x0f000002 -#else -#define QEMU_SCSI_STATUS_TASK_SET_FULL SCSI_STATUS_TASK_SET_FULL -#define QEMU_SCSI_STATUS_TIMEOUT SCSI_STATUS_TIMEOUT +/* SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST was introduced in + * libiscsi 1.10.0, together with other constants we need. Use it as + * a hint that we have to define them ourselves if needed, to keep the + * minimum required libiscsi version at 1.9.0. We use an ASCQ macro for + * the test because SCSI_STATUS_* is an enum. + * + * To guard against future changes where SCSI_SENSE_ASCQ_* also becomes + * an enum, check against the LIBISCSI_API_VERSION macro, which was + * introduced in 1.11.0. If it is present, there is no need to define + * anything. + */ +#if !defined(SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST) && \ + !defined(LIBISCSI_API_VERSION) +#define SCSI_STATUS_TASK_SET_FULL 0x28 +#define SCSI_STATUS_TIMEOUT 0x0f000002 +#define SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST 0x2600 +#define SCSI_SENSE_ASCQ_PARAMETER_LIST_LENGTH_ERROR 0x1a00 #endif +static int iscsi_translate_sense(struct scsi_sense *sense) +{ + int ret; + + switch (sense->key) { + case SCSI_SENSE_NOT_READY: + return -EBUSY; + case SCSI_SENSE_DATA_PROTECTION: + return -EACCES; + case SCSI_SENSE_COMMAND_ABORTED: + return -ECANCELED; + case SCSI_SENSE_ILLEGAL_REQUEST: + /* Parse ASCQ */ + break; + default: + return -EIO; + } + switch (sense->ascq) { + case SCSI_SENSE_ASCQ_PARAMETER_LIST_LENGTH_ERROR: + case SCSI_SENSE_ASCQ_INVALID_OPERATION_CODE: + case SCSI_SENSE_ASCQ_INVALID_FIELD_IN_CDB: + case SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST: + ret = -EINVAL; + break; + case SCSI_SENSE_ASCQ_LBA_OUT_OF_RANGE: + ret = -ENOSPC; + break; + case SCSI_SENSE_ASCQ_LOGICAL_UNIT_NOT_SUPPORTED: + ret = -ENOTSUP; + break; + case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT: + case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT_TRAY_CLOSED: + case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT_TRAY_OPEN: + ret = -ENOMEDIUM; + break; + case SCSI_SENSE_ASCQ_WRITE_PROTECTED: + ret = -EACCES; + break; + default: + ret = -EIO; + break; + } + return ret; +} + static void iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, void *command_data, void *opaque) @@ -203,11 +255,11 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, goto out; } if (status == SCSI_STATUS_BUSY || - status == QEMU_SCSI_STATUS_TIMEOUT || - status == QEMU_SCSI_STATUS_TASK_SET_FULL) { + status == SCSI_STATUS_TIMEOUT || + status == SCSI_STATUS_TASK_SET_FULL) { unsigned retry_time = exp_random(iscsi_retry_times[iTask->retries - 1]); - if (status == QEMU_SCSI_STATUS_TIMEOUT) { + if (status == SCSI_STATUS_TIMEOUT) { /* make sure the request is rescheduled AFTER the * reconnect is initiated */ retry_time = EVENT_INTERVAL * 2; @@ -226,9 +278,8 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, return; } } + iTask->err_code = iscsi_translate_sense(&task->sense); error_report("iSCSI Failure: %s", iscsi_get_error(iscsi)); - } else { - iTask->iscsilun->force_next_flush |= iTask->force_next_flush; } out: @@ -291,8 +342,8 @@ iscsi_set_events(IscsiLun *iscsilun) int ev = iscsi_which_events(iscsi); if (ev != iscsilun->events) { - aio_set_fd_handler(iscsilun->aio_context, - iscsi_get_fd(iscsi), + aio_set_fd_handler(iscsilun->aio_context, iscsi_get_fd(iscsi), + false, (ev & POLLIN) ? iscsi_process_read : NULL, (ev & POLLOUT) ? iscsi_process_write : NULL, iscsilun); @@ -397,15 +448,15 @@ static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num, } } -static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, - QEMUIOVector *iov) +static int coroutine_fn +iscsi_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + QEMUIOVector *iov, int flags) { IscsiLun *iscsilun = bs->opaque; struct IscsiTask iTask; uint64_t lba; uint32_t num_sectors; - int fua; + bool fua; if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { return -EINVAL; @@ -421,8 +472,7 @@ static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, num_sectors = sector_qemu2lun(nb_sectors, iscsilun); iscsi_co_init_iscsitask(iscsilun, &iTask); retry: - fua = iscsilun->dpofua && !bs->enable_write_cache; - iTask.force_next_flush = !fua; + fua = iscsilun->dpofua && (flags & BDRV_REQ_FUA); if (iscsilun->use_16_for_rw) { iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, NULL, num_sectors * iscsilun->block_size, @@ -455,7 +505,7 @@ retry: } if (iTask.status != SCSI_STATUS_GOOD) { - return -EIO; + return iTask.err_code; } iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); @@ -463,6 +513,13 @@ retry: return 0; } +static int coroutine_fn +iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return iscsi_co_writev_flags(bs, sector_num, nb_sectors, iov, 0); +} + static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun, int64_t sector_num, int nb_sectors) @@ -478,7 +535,8 @@ static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun, static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) + int nb_sectors, int *pnum, + BlockDriverState **file) { IscsiLun *iscsilun = bs->opaque; struct scsi_get_lba_status *lbas = NULL; @@ -570,6 +628,9 @@ out: if (iTask.task != NULL) { scsi_free_scsi_task(iTask.task); } + if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) { + *file = bs; + } return ret; } @@ -596,7 +657,8 @@ static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) { int64_t ret; int pnum; - ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum); + BlockDriverState *file; + ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum, &file); if (ret < 0) { return ret; } @@ -644,7 +706,7 @@ retry: } if (iTask.status != SCSI_STATUS_GOOD) { - return -EIO; + return iTask.err_code; } return 0; @@ -655,11 +717,6 @@ static int coroutine_fn iscsi_co_flush(BlockDriverState *bs) IscsiLun *iscsilun = bs->opaque; struct IscsiTask iTask; - if (!iscsilun->force_next_flush) { - return 0; - } - iscsilun->force_next_flush = false; - iscsi_co_init_iscsitask(iscsilun, &iTask); retry: if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0, @@ -683,7 +740,7 @@ retry: } if (iTask.status != SCSI_STATUS_GOOD) { - return -EIO; + return iTask.err_code; } return 0; @@ -703,7 +760,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status, if (status < 0) { error_report("Failed to ioctl(SG_IO) to iSCSI lun. %s", iscsi_get_error(iscsi)); - acb->status = -EIO; + acb->status = iscsi_translate_sense(&acb->task->sense); } acb->ioh->driver_status = 0; @@ -726,6 +783,38 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status, iscsi_schedule_bh(acb); } +static void iscsi_ioctl_bh_completion(void *opaque) +{ + IscsiAIOCB *acb = opaque; + + qemu_bh_delete(acb->bh); + acb->common.cb(acb->common.opaque, acb->ret); + qemu_aio_unref(acb); +} + +static void iscsi_ioctl_handle_emulated(IscsiAIOCB *acb, int req, void *buf) +{ + BlockDriverState *bs = acb->common.bs; + IscsiLun *iscsilun = bs->opaque; + int ret = 0; + + switch (req) { + case SG_GET_VERSION_NUM: + *(int *)buf = 30000; + break; + case SG_GET_SCSI_ID: + ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type; + break; + default: + ret = -EINVAL; + } + assert(!acb->bh); + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), + iscsi_ioctl_bh_completion, acb); + acb->ret = ret; + qemu_bh_schedule(acb->bh); +} + static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, unsigned long int req, void *buf, BlockCompletionFunc *cb, void *opaque) @@ -735,8 +824,6 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, struct iscsi_data data; IscsiAIOCB *acb; - assert(req == SG_IO); - acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); acb->iscsilun = iscsilun; @@ -745,6 +832,11 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, acb->buf = NULL; acb->ioh = buf; + if (req != SG_IO) { + iscsi_ioctl_handle_emulated(acb, req, buf); + return &acb->common; + } + acb->task = malloc(sizeof(struct scsi_task)); if (acb->task == NULL) { error_report("iSCSI: Failed to allocate task for scsi command. %s", @@ -809,38 +901,6 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, return &acb->common; } -static void ioctl_cb(void *opaque, int status) -{ - int *p_status = opaque; - *p_status = status; -} - -static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) -{ - IscsiLun *iscsilun = bs->opaque; - int status; - - switch (req) { - case SG_GET_VERSION_NUM: - *(int *)buf = 30000; - break; - case SG_GET_SCSI_ID: - ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type; - break; - case SG_IO: - status = -EINPROGRESS; - iscsi_aio_ioctl(bs, req, buf, ioctl_cb, &status); - - while (status == -EINPROGRESS) { - aio_poll(iscsilun->aio_context, true); - } - - return 0; - default: - return -1; - } - return 0; -} #endif static int64_t @@ -905,7 +965,7 @@ retry: } if (iTask.status != SCSI_STATUS_GOOD) { - return -EIO; + return iTask.err_code; } iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); @@ -956,7 +1016,6 @@ coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, } iscsi_co_init_iscsitask(iscsilun, &iTask); - iTask.force_next_flush = true; retry: if (use_16_for_ws) { iTask.task = iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba, @@ -999,7 +1058,7 @@ retry: } if (iTask.status != SCSI_STATUS_GOOD) { - return -EIO; + return iTask.err_code; } if (flags & BDRV_REQ_MAY_UNMAP) { @@ -1018,6 +1077,8 @@ static void parse_chap(struct iscsi_context *iscsi, const char *target, QemuOpts *opts; const char *user = NULL; const char *password = NULL; + const char *secretid; + char *secret = NULL; list = qemu_find_opts("iscsi"); if (!list) { @@ -1037,8 +1098,20 @@ static void parse_chap(struct iscsi_context *iscsi, const char *target, return; } + secretid = qemu_opt_get(opts, "password-secret"); password = qemu_opt_get(opts, "password"); - if (!password) { + if (secretid && password) { + error_setg(errp, "'password' and 'password-secret' properties are " + "mutually exclusive"); + return; + } + if (secretid) { + secret = qcrypto_secret_lookup_as_utf8(secretid, errp); + if (!secret) { + return; + } + password = secret; + } else if (!password) { error_setg(errp, "CHAP username specified but no password was given"); return; } @@ -1046,6 +1119,8 @@ static void parse_chap(struct iscsi_context *iscsi, const char *target, if (iscsi_set_initiator_username_pwd(iscsi, user, password)) { error_setg(errp, "Failed to set initiator username and password"); } + + g_free(secret); } static void parse_header_digest(struct iscsi_context *iscsi, const char *target, @@ -1186,8 +1261,13 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) iscsilun->lbprz = !!rc16->lbprz; iscsilun->use_16_for_rw = (rc16->returned_lba > 0xffffffff); } + break; } - break; + if (task != NULL && task->status == SCSI_STATUS_CHECK_CONDITION + && task->sense.key == SCSI_SENSE_UNIT_ATTENTION) { + break; + } + /* Fall through and try READ CAPACITY(10) instead. */ case TYPE_ROM: task = iscsi_readcapacity10_sync(iscsilun->iscsi, iscsilun->lun, 0, 0); if (task != NULL && task->status == SCSI_STATUS_GOOD) { @@ -1213,7 +1293,11 @@ static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) && retries-- > 0); if (task == NULL || task->status != SCSI_STATUS_GOOD) { - error_setg(errp, "iSCSI: failed to send readcapacity10 command."); + error_setg(errp, "iSCSI: failed to send readcapacity10/16 command"); + } else if (!iscsilun->block_size || + iscsilun->block_size % BDRV_SECTOR_SIZE) { + error_setg(errp, "iSCSI: the target returned an invalid " + "block size of %d.", iscsilun->block_size); } if (task) { scsi_free_scsi_task(task); @@ -1276,9 +1360,8 @@ static void iscsi_detach_aio_context(BlockDriverState *bs) { IscsiLun *iscsilun = bs->opaque; - aio_set_fd_handler(iscsilun->aio_context, - iscsi_get_fd(iscsilun->iscsi), - NULL, NULL, NULL); + aio_set_fd_handler(iscsilun->aio_context, iscsi_get_fd(iscsilun->iscsi), + false, NULL, NULL, NULL); iscsilun->events = 0; if (iscsilun->nop_timer) { @@ -1765,10 +1848,11 @@ static BlockDriver bdrv_iscsi = { .bdrv_co_write_zeroes = iscsi_co_write_zeroes, .bdrv_co_readv = iscsi_co_readv, .bdrv_co_writev = iscsi_co_writev, + .bdrv_co_writev_flags = iscsi_co_writev_flags, + .supported_write_flags = BDRV_REQ_FUA, .bdrv_co_flush_to_disk = iscsi_co_flush, #ifdef __linux__ - .bdrv_ioctl = iscsi_ioctl, .bdrv_aio_ioctl = iscsi_aio_ioctl, #endif @@ -1789,6 +1873,11 @@ static QemuOptsList qemu_iscsi_opts = { .type = QEMU_OPT_STRING, .help = "password for CHAP authentication to target", },{ + .name = "password-secret", + .type = QEMU_OPT_STRING, + .help = "ID of the secret providing password for CHAP " + "authentication to target", + },{ .name = "header-digest", .type = QEMU_OPT_STRING, .help = "HeaderDigest setting. " diff --git a/qemu/block/linux-aio.c b/qemu/block/linux-aio.c index c991443c5..805757e02 100644 --- a/qemu/block/linux-aio.c +++ b/qemu/block/linux-aio.c @@ -7,6 +7,7 @@ * This work is licensed under the terms of the GNU GPL, version 2 or later. * See the COPYING file in the top-level directory. */ +#include "qemu/osdep.h" #include "qemu-common.h" #include "block/aio.h" #include "qemu/queue.h" @@ -287,7 +288,7 @@ void laio_detach_aio_context(void *s_, AioContext *old_context) { struct qemu_laio_state *s = s_; - aio_set_event_notifier(old_context, &s->e, NULL); + aio_set_event_notifier(old_context, &s->e, false, NULL); qemu_bh_delete(s->completion_bh); } @@ -296,7 +297,8 @@ void laio_attach_aio_context(void *s_, AioContext *new_context) struct qemu_laio_state *s = s_; s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s); - aio_set_event_notifier(new_context, &s->e, qemu_laio_completion_cb); + aio_set_event_notifier(new_context, &s->e, false, + qemu_laio_completion_cb); } void *laio_init(void) diff --git a/qemu/block/mirror.c b/qemu/block/mirror.c index fc4d8f561..039f48125 100644 --- a/qemu/block/mirror.c +++ b/qemu/block/mirror.c @@ -11,12 +11,16 @@ * */ +#include "qemu/osdep.h" #include "trace.h" #include "block/blockjob.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" +#include "qapi/error.h" #include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" #include "qemu/bitmap.h" +#include "qemu/error-report.h" #define SLICE_TIME 100000000ULL /* ns */ #define MAX_IN_FLIGHT 16 @@ -44,7 +48,6 @@ typedef struct MirrorBlockJob { BlockdevOnError on_source_error, on_target_error; bool synced; bool should_complete; - int64_t sector_num; int64_t granularity; size_t buf_size; int64_t bdev_length; @@ -60,6 +63,9 @@ typedef struct MirrorBlockJob { int sectors_in_flight; int ret; bool unmap; + bool waiting_for_io; + int target_cluster_sectors; + int max_iov; } MirrorBlockJob; typedef struct MirrorOp { @@ -102,7 +108,7 @@ static void mirror_iteration_done(MirrorOp *op, int ret) sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; chunk_num = op->sector_num / sectors_per_chunk; - nb_chunks = op->nb_sectors / sectors_per_chunk; + nb_chunks = DIV_ROUND_UP(op->nb_sectors, sectors_per_chunk); bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks); if (ret >= 0) { if (s->cow_bitmap) { @@ -112,13 +118,9 @@ static void mirror_iteration_done(MirrorOp *op, int ret) } qemu_iovec_destroy(&op->qiov); - g_slice_free(MirrorOp, op); + g_free(op); - /* Enter coroutine when it is not sleeping. The coroutine sleeps to - * rate-limit itself. The coroutine will eventually resume since there is - * a sleep timeout so don't wake it early. - */ - if (s->common.busy) { + if (s->waiting_for_io) { qemu_coroutine_enter(s->common.co, NULL); } } @@ -159,107 +161,97 @@ static void mirror_read_complete(void *opaque, int ret) mirror_write_complete, op); } -static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) +static inline void mirror_clip_sectors(MirrorBlockJob *s, + int64_t sector_num, + int *nb_sectors) { - BlockDriverState *source = s->common.bs; - int nb_sectors, sectors_per_chunk, nb_chunks; - int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector; - uint64_t delay_ns = 0; - MirrorOp *op; - int pnum; - int64_t ret; - - s->sector_num = hbitmap_iter_next(&s->hbi); - if (s->sector_num < 0) { - bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); - s->sector_num = hbitmap_iter_next(&s->hbi); - trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); - assert(s->sector_num >= 0); - } - - hbitmap_next_sector = s->sector_num; - sector_num = s->sector_num; - sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; - end = s->bdev_length / BDRV_SECTOR_SIZE; - - /* Extend the QEMUIOVector to include all adjacent blocks that will - * be copied in this operation. - * - * We have to do this if we have no backing file yet in the destination, - * and the cluster size is very large. Then we need to do COW ourselves. - * The first time a cluster is copied, copy it entirely. Note that, - * because both the granularity and the cluster size are powers of two, - * the number of sectors to copy cannot exceed one cluster. - * - * We also want to extend the QEMUIOVector to include more adjacent - * dirty blocks if possible, to limit the number of I/O operations and - * run efficiently even with a small granularity. - */ - nb_chunks = 0; - nb_sectors = 0; - next_sector = sector_num; - next_chunk = sector_num / sectors_per_chunk; + *nb_sectors = MIN(*nb_sectors, + s->bdev_length / BDRV_SECTOR_SIZE - sector_num); +} - /* Wait for I/O to this cluster (from a previous iteration) to be done. */ - while (test_bit(next_chunk, s->in_flight_bitmap)) { - trace_mirror_yield_in_flight(s, sector_num, s->in_flight); - qemu_coroutine_yield(); +/* Round sector_num and/or nb_sectors to target cluster if COW is needed, and + * return the offset of the adjusted tail sector against original. */ +static int mirror_cow_align(MirrorBlockJob *s, + int64_t *sector_num, + int *nb_sectors) +{ + bool need_cow; + int ret = 0; + int chunk_sectors = s->granularity >> BDRV_SECTOR_BITS; + int64_t align_sector_num = *sector_num; + int align_nb_sectors = *nb_sectors; + int max_sectors = chunk_sectors * s->max_iov; + + need_cow = !test_bit(*sector_num / chunk_sectors, s->cow_bitmap); + need_cow |= !test_bit((*sector_num + *nb_sectors - 1) / chunk_sectors, + s->cow_bitmap); + if (need_cow) { + bdrv_round_to_clusters(s->target, *sector_num, *nb_sectors, + &align_sector_num, &align_nb_sectors); + } + + if (align_nb_sectors > max_sectors) { + align_nb_sectors = max_sectors; + if (need_cow) { + align_nb_sectors = QEMU_ALIGN_DOWN(align_nb_sectors, + s->target_cluster_sectors); + } } + /* Clipping may result in align_nb_sectors unaligned to chunk boundary, but + * that doesn't matter because it's already the end of source image. */ + mirror_clip_sectors(s, align_sector_num, &align_nb_sectors); - do { - int added_sectors, added_chunks; - - if (!bdrv_get_dirty(source, s->dirty_bitmap, next_sector) || - test_bit(next_chunk, s->in_flight_bitmap)) { - assert(nb_sectors > 0); - break; - } + ret = align_sector_num + align_nb_sectors - (*sector_num + *nb_sectors); + *sector_num = align_sector_num; + *nb_sectors = align_nb_sectors; + assert(ret >= 0); + return ret; +} - added_sectors = sectors_per_chunk; - if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) { - bdrv_round_to_clusters(s->target, - next_sector, added_sectors, - &next_sector, &added_sectors); +static inline void mirror_wait_for_io(MirrorBlockJob *s) +{ + assert(!s->waiting_for_io); + s->waiting_for_io = true; + qemu_coroutine_yield(); + s->waiting_for_io = false; +} - /* On the first iteration, the rounding may make us copy - * sectors before the first dirty one. - */ - if (next_sector < sector_num) { - assert(nb_sectors == 0); - sector_num = next_sector; - next_chunk = next_sector / sectors_per_chunk; - } - } +/* Submit async read while handling COW. + * Returns: nb_sectors if no alignment is necessary, or + * (new_end - sector_num) if tail is rounded up or down due to + * alignment or buffer limit. + */ +static int mirror_do_read(MirrorBlockJob *s, int64_t sector_num, + int nb_sectors) +{ + BlockDriverState *source = s->common.bs; + int sectors_per_chunk, nb_chunks; + int ret = nb_sectors; + MirrorOp *op; - added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors)); - added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk; + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; - /* When doing COW, it may happen that there is not enough space for - * a full cluster. Wait if that is the case. - */ - while (nb_chunks == 0 && s->buf_free_count < added_chunks) { - trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight); - qemu_coroutine_yield(); - } - if (s->buf_free_count < nb_chunks + added_chunks) { - trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight); - break; - } + /* We can only handle as much as buf_size at a time. */ + nb_sectors = MIN(s->buf_size >> BDRV_SECTOR_BITS, nb_sectors); + assert(nb_sectors); - /* We have enough free space to copy these sectors. */ - bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks); + if (s->cow_bitmap) { + ret += mirror_cow_align(s, §or_num, &nb_sectors); + } + assert(nb_sectors << BDRV_SECTOR_BITS <= s->buf_size); + /* The sector range must meet granularity because: + * 1) Caller passes in aligned values; + * 2) mirror_cow_align is used only when target cluster is larger. */ + assert(!(sector_num % sectors_per_chunk)); + nb_chunks = DIV_ROUND_UP(nb_sectors, sectors_per_chunk); - nb_sectors += added_sectors; - nb_chunks += added_chunks; - next_sector += added_sectors; - next_chunk += added_chunks; - if (!s->synced && s->common.speed) { - delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors); - } - } while (delay_ns == 0 && next_sector < end); + while (s->buf_free_count < nb_chunks) { + trace_mirror_yield_in_flight(s, sector_num, s->in_flight); + mirror_wait_for_io(s); + } /* Allocate a MirrorOp that is used as an AIO callback. */ - op = g_slice_new(MirrorOp); + op = g_new(MirrorOp, 1); op->s = s; op->sector_num = sector_num; op->nb_sectors = nb_sectors; @@ -268,47 +260,158 @@ static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) * from s->buf_free. */ qemu_iovec_init(&op->qiov, nb_chunks); - next_sector = sector_num; while (nb_chunks-- > 0) { MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free); - size_t remaining = (nb_sectors * BDRV_SECTOR_SIZE) - op->qiov.size; + size_t remaining = nb_sectors * BDRV_SECTOR_SIZE - op->qiov.size; QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next); s->buf_free_count--; qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining)); - - /* Advance the HBitmapIter in parallel, so that we do not examine - * the same sector twice. - */ - if (next_sector > hbitmap_next_sector - && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) { - hbitmap_next_sector = hbitmap_iter_next(&s->hbi); - } - - next_sector += sectors_per_chunk; } - bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors); - /* Copy the dirty cluster. */ s->in_flight++; s->sectors_in_flight += nb_sectors; trace_mirror_one_iteration(s, sector_num, nb_sectors); - ret = bdrv_get_block_status_above(source, NULL, sector_num, - nb_sectors, &pnum); - if (ret < 0 || pnum < nb_sectors || - (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) { - bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, - mirror_read_complete, op); - } else if (ret & BDRV_BLOCK_ZERO) { + bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, + mirror_read_complete, op); + return ret; +} + +static void mirror_do_zero_or_discard(MirrorBlockJob *s, + int64_t sector_num, + int nb_sectors, + bool is_discard) +{ + MirrorOp *op; + + /* Allocate a MirrorOp that is used as an AIO callback. The qiov is zeroed + * so the freeing in mirror_iteration_done is nop. */ + op = g_new0(MirrorOp, 1); + op->s = s; + op->sector_num = sector_num; + op->nb_sectors = nb_sectors; + + s->in_flight++; + s->sectors_in_flight += nb_sectors; + if (is_discard) { + bdrv_aio_discard(s->target, sector_num, op->nb_sectors, + mirror_write_complete, op); + } else { bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, s->unmap ? BDRV_REQ_MAY_UNMAP : 0, mirror_write_complete, op); - } else { - assert(!(ret & BDRV_BLOCK_DATA)); - bdrv_aio_discard(s->target, sector_num, op->nb_sectors, - mirror_write_complete, op); + } +} + +static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) +{ + BlockDriverState *source = s->common.bs; + int64_t sector_num, first_chunk; + uint64_t delay_ns = 0; + /* At least the first dirty chunk is mirrored in one iteration. */ + int nb_chunks = 1; + int64_t end = s->bdev_length / BDRV_SECTOR_SIZE; + int sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; + + sector_num = hbitmap_iter_next(&s->hbi); + if (sector_num < 0) { + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); + sector_num = hbitmap_iter_next(&s->hbi); + trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); + assert(sector_num >= 0); + } + + first_chunk = sector_num / sectors_per_chunk; + while (test_bit(first_chunk, s->in_flight_bitmap)) { + trace_mirror_yield_in_flight(s, first_chunk, s->in_flight); + mirror_wait_for_io(s); + } + + /* Find the number of consective dirty chunks following the first dirty + * one, and wait for in flight requests in them. */ + while (nb_chunks * sectors_per_chunk < (s->buf_size >> BDRV_SECTOR_BITS)) { + int64_t hbitmap_next; + int64_t next_sector = sector_num + nb_chunks * sectors_per_chunk; + int64_t next_chunk = next_sector / sectors_per_chunk; + if (next_sector >= end || + !bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) { + break; + } + if (test_bit(next_chunk, s->in_flight_bitmap)) { + break; + } + + hbitmap_next = hbitmap_iter_next(&s->hbi); + if (hbitmap_next > next_sector || hbitmap_next < 0) { + /* The bitmap iterator's cache is stale, refresh it */ + bdrv_set_dirty_iter(&s->hbi, next_sector); + hbitmap_next = hbitmap_iter_next(&s->hbi); + } + assert(hbitmap_next == next_sector); + nb_chunks++; + } + + /* Clear dirty bits before querying the block status, because + * calling bdrv_get_block_status_above could yield - if some blocks are + * marked dirty in this window, we need to know. + */ + bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, + nb_chunks * sectors_per_chunk); + bitmap_set(s->in_flight_bitmap, sector_num / sectors_per_chunk, nb_chunks); + while (nb_chunks > 0 && sector_num < end) { + int ret; + int io_sectors; + BlockDriverState *file; + enum MirrorMethod { + MIRROR_METHOD_COPY, + MIRROR_METHOD_ZERO, + MIRROR_METHOD_DISCARD + } mirror_method = MIRROR_METHOD_COPY; + + assert(!(sector_num % sectors_per_chunk)); + ret = bdrv_get_block_status_above(source, NULL, sector_num, + nb_chunks * sectors_per_chunk, + &io_sectors, &file); + if (ret < 0) { + io_sectors = nb_chunks * sectors_per_chunk; + } + + io_sectors -= io_sectors % sectors_per_chunk; + if (io_sectors < sectors_per_chunk) { + io_sectors = sectors_per_chunk; + } else if (ret >= 0 && !(ret & BDRV_BLOCK_DATA)) { + int64_t target_sector_num; + int target_nb_sectors; + bdrv_round_to_clusters(s->target, sector_num, io_sectors, + &target_sector_num, &target_nb_sectors); + if (target_sector_num == sector_num && + target_nb_sectors == io_sectors) { + mirror_method = ret & BDRV_BLOCK_ZERO ? + MIRROR_METHOD_ZERO : + MIRROR_METHOD_DISCARD; + } + } + + mirror_clip_sectors(s, sector_num, &io_sectors); + switch (mirror_method) { + case MIRROR_METHOD_COPY: + io_sectors = mirror_do_read(s, sector_num, io_sectors); + break; + case MIRROR_METHOD_ZERO: + mirror_do_zero_or_discard(s, sector_num, io_sectors, false); + break; + case MIRROR_METHOD_DISCARD: + mirror_do_zero_or_discard(s, sector_num, io_sectors, true); + break; + default: + abort(); + } + assert(io_sectors); + sector_num += io_sectors; + nb_chunks -= DIV_ROUND_UP(io_sectors, sectors_per_chunk); + delay_ns += ratelimit_calculate_delay(&s->limit, io_sectors); } return delay_ns; } @@ -333,7 +436,7 @@ static void mirror_free_init(MirrorBlockJob *s) static void mirror_drain(MirrorBlockJob *s) { while (s->in_flight > 0) { - qemu_coroutine_yield(); + mirror_wait_for_io(s); } } @@ -346,6 +449,11 @@ static void mirror_exit(BlockJob *job, void *opaque) MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); MirrorExitData *data = opaque; AioContext *replace_aio_context = NULL; + BlockDriverState *src = s->common.bs; + + /* Make sure that the source BDS doesn't go away before we called + * block_job_completed(). */ + bdrv_ref(src); if (s->to_replace) { replace_aio_context = bdrv_get_aio_context(s->to_replace); @@ -357,18 +465,22 @@ static void mirror_exit(BlockJob *job, void *opaque) if (s->to_replace) { to_replace = s->to_replace; } + + /* This was checked in mirror_start_job(), but meanwhile one of the + * nodes could have been newly attached to a BlockBackend. */ + if (to_replace->blk && s->target->blk) { + error_report("block job: Can't create node with two BlockBackends"); + data->ret = -EINVAL; + goto out; + } + if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) { bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL); } - bdrv_swap(s->target, to_replace); - if (s->common.driver->job_type == BLOCK_JOB_TYPE_COMMIT) { - /* drop the bs loop chain formed by the swap: break the loop then - * trigger the unref from the top one */ - BlockDriverState *p = s->base->backing_hd; - bdrv_set_backing_hd(s->base, NULL); - bdrv_unref(p); - } + bdrv_replace_in_backing_chain(to_replace, s->target); } + +out: if (s->to_replace) { bdrv_op_unblock_all(s->to_replace, s->replace_blocker); error_free(s->replace_blocker); @@ -378,9 +490,15 @@ static void mirror_exit(BlockJob *job, void *opaque) aio_context_release(replace_aio_context); } g_free(s->replaces); + bdrv_op_unblock_all(s->target, s->common.blocker); bdrv_unref(s->target); block_job_completed(&s->common, data->ret); g_free(data); + bdrv_drained_end(src); + if (qemu_get_aio_context() == bdrv_get_aio_context(src)) { + aio_enable_external(iohandler_get_aio_context()); + } + bdrv_unref(src); } static void coroutine_fn mirror_run(void *opaque) @@ -395,6 +513,7 @@ static void coroutine_fn mirror_run(void *opaque) checking for a NULL string */ int ret = 0; int n; + int target_cluster_size = BDRV_SECTOR_SIZE; if (block_job_is_cancelled(&s->common)) { goto immediate_exit; @@ -424,16 +543,16 @@ static void coroutine_fn mirror_run(void *opaque) */ bdrv_get_backing_filename(s->target, backing_filename, sizeof(backing_filename)); - if (backing_filename[0] && !s->target->backing_hd) { - ret = bdrv_get_info(s->target, &bdi); - if (ret < 0) { - goto immediate_exit; - } - if (s->granularity < bdi.cluster_size) { - s->buf_size = MAX(s->buf_size, bdi.cluster_size); - s->cow_bitmap = bitmap_new(length); - } + if (!bdrv_get_info(s->target, &bdi) && bdi.cluster_size) { + target_cluster_size = bdi.cluster_size; } + if (backing_filename[0] && !s->target->backing + && s->granularity < target_cluster_size) { + s->buf_size = MAX(s->buf_size, target_cluster_size); + s->cow_bitmap = bitmap_new(length); + } + s->target_cluster_sectors = target_cluster_size >> BDRV_SECTOR_BITS; + s->max_iov = MIN(s->common.bs->bl.max_iov, s->target->bl.max_iov); end = s->bdev_length / BDRV_SECTOR_SIZE; s->buf = qemu_try_blockalign(bs, s->buf_size); @@ -448,6 +567,8 @@ static void coroutine_fn mirror_run(void *opaque) if (!s->is_none_mode) { /* First part, loop on the sectors and initialize the dirty bitmap. */ BlockDriverState *base = s->base; + bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(s->target); + for (sector_num = 0; sector_num < end; ) { /* Just to make sure we are not exceeding int limit. */ int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS, @@ -470,7 +591,7 @@ static void coroutine_fn mirror_run(void *opaque) } assert(n > 0); - if (ret == 1) { + if (ret == 1 || mark_all_dirty) { bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); } sector_num += n; @@ -506,7 +627,7 @@ static void coroutine_fn mirror_run(void *opaque) if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || (cnt == 0 && s->in_flight > 0)) { trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt); - qemu_coroutine_yield(); + mirror_wait_for_io(s); continue; } else if (cnt != 0) { delay_ns = mirror_iteration(s); @@ -549,7 +670,7 @@ static void coroutine_fn mirror_run(void *opaque) * mirror_populate runs. */ trace_mirror_before_drain(s, cnt); - bdrv_drain(bs); + bdrv_co_drain(bs); cnt = bdrv_get_dirty_count(s->dirty_bitmap); } @@ -589,10 +710,21 @@ immediate_exit: g_free(s->cow_bitmap); g_free(s->in_flight_bitmap); bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); - bdrv_iostatus_disable(s->target); + if (s->target->blk) { + blk_iostatus_disable(s->target->blk); + } data = g_malloc(sizeof(*data)); data->ret = ret; + /* Before we switch to target in mirror_exit, make sure data doesn't + * change. */ + bdrv_drained_begin(s->common.bs); + if (qemu_get_aio_context() == bdrv_get_aio_context(bs)) { + /* FIXME: virtio host notifiers run on iohandler_ctx, therefore the + * above bdrv_drained_end isn't enough to quiesce it. This is ugly, we + * need a block layer API change to achieve this. */ + aio_disable_external(iohandler_get_aio_context()); + } block_job_defer_to_main_loop(&s->common, mirror_exit, data); } @@ -611,7 +743,9 @@ static void mirror_iostatus_reset(BlockJob *job) { MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); - bdrv_iostatus_reset(s->target); + if (s->target->blk) { + blk_iostatus_reset(s->target->blk); + } } static void mirror_complete(BlockJob *job, Error **errp) @@ -620,14 +754,13 @@ static void mirror_complete(BlockJob *job, Error **errp) Error *local_err = NULL; int ret; - ret = bdrv_open_backing_file(s->target, NULL, &local_err); + ret = bdrv_open_backing_file(s->target, NULL, "backing", &local_err); if (ret < 0) { error_propagate(errp, local_err); return; } if (!s->synced) { - error_setg(errp, QERR_BLOCK_JOB_NOT_READY, - bdrv_get_device_name(job->bs)); + error_setg(errp, QERR_BLOCK_JOB_NOT_READY, job->id); return; } @@ -635,9 +768,9 @@ static void mirror_complete(BlockJob *job, Error **errp) if (s->replaces) { AioContext *replace_aio_context; - s->to_replace = check_to_replace_node(s->replaces, &local_err); + s->to_replace = bdrv_find_node(s->replaces); if (!s->to_replace) { - error_propagate(errp, local_err); + error_setg(errp, "Node name '%s' not found", s->replaces); return; } @@ -686,6 +819,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, bool is_none_mode, BlockDriverState *base) { MirrorBlockJob *s; + BlockDriverState *replaced_bs; if (granularity == 0) { granularity = bdrv_get_default_bitmap_granularity(target); @@ -695,7 +829,7 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && - !bdrv_iostatus_is_enabled(bs)) { + (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); return; } @@ -709,6 +843,21 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, buf_size = DEFAULT_MIRROR_BUF_SIZE; } + /* We can't support this case as long as the block layer can't handle + * multiple BlockBackends per BlockDriverState. */ + if (replaces) { + replaced_bs = bdrv_lookup_bs(replaces, replaces, errp); + if (replaced_bs == NULL) { + return; + } + } else { + replaced_bs = bs; + } + if (replaced_bs->blk && target->blk) { + error_setg(errp, "Can't create node with two BlockBackends"); + return; + } + s = block_job_create(driver, bs, speed, cb, opaque, errp); if (!s) { return; @@ -727,12 +876,16 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); if (!s->dirty_bitmap) { g_free(s->replaces); - block_job_release(bs); + block_job_unref(&s->common); return; } - bdrv_set_enable_write_cache(s->target, true); - bdrv_set_on_error(s->target, on_target_error, on_target_error); - bdrv_iostatus_enable(s->target); + + bdrv_op_block_all(s->target, s->common.blocker); + + if (s->target->blk) { + blk_set_on_error(s->target->blk, on_target_error, on_target_error); + blk_iostatus_enable(s->target->blk); + } s->common.co = qemu_coroutine_create(mirror_run); trace_mirror_start(bs, s, s->common.co, opaque); qemu_coroutine_enter(s->common.co, s); @@ -755,7 +908,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target, return; } is_none_mode = mode == MIRROR_SYNC_MODE_NONE; - base = mode == MIRROR_SYNC_MODE_TOP ? bs->backing_hd : NULL; + base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL; mirror_start_job(bs, target, replaces, speed, granularity, buf_size, on_source_error, on_target_error, unmap, cb, opaque, errp, diff --git a/qemu/block/nbd-client.c b/qemu/block/nbd-client.c index e1bb9198c..878e879ac 100644 --- a/qemu/block/nbd-client.c +++ b/qemu/block/nbd-client.c @@ -26,8 +26,8 @@ * THE SOFTWARE. */ +#include "qemu/osdep.h" #include "nbd-client.h" -#include "qemu/sockets.h" #define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs)) #define INDEX_TO_HANDLE(bs, index) ((index) ^ ((uint64_t)(intptr_t)bs)) @@ -47,13 +47,21 @@ static void nbd_teardown_connection(BlockDriverState *bs) { NbdClientSession *client = nbd_get_client_session(bs); + if (!client->ioc) { /* Already closed */ + return; + } + /* finish any pending coroutines */ - shutdown(client->sock, 2); + qio_channel_shutdown(client->ioc, + QIO_CHANNEL_SHUTDOWN_BOTH, + NULL); nbd_recv_coroutines_enter_all(client); nbd_client_detach_aio_context(bs); - closesocket(client->sock); - client->sock = -1; + object_unref(OBJECT(client->sioc)); + client->sioc = NULL; + object_unref(OBJECT(client->ioc)); + client->ioc = NULL; } static void nbd_reply_ready(void *opaque) @@ -63,12 +71,16 @@ static void nbd_reply_ready(void *opaque) uint64_t i; int ret; + if (!s->ioc) { /* Already closed */ + return; + } + if (s->reply.handle == 0) { /* No reply already in flight. Fetch a header. It is possible * that another thread has done the same thing in parallel, so * the socket is not readable anymore. */ - ret = nbd_receive_reply(s->sock, &s->reply); + ret = nbd_receive_reply(s->ioc, &s->reply); if (ret == -EAGAIN) { return; } @@ -119,32 +131,36 @@ static int nbd_co_send_request(BlockDriverState *bs, } } + g_assert(qemu_in_coroutine()); assert(i < MAX_NBD_REQUESTS); request->handle = INDEX_TO_HANDLE(s, i); + + if (!s->ioc) { + qemu_co_mutex_unlock(&s->send_mutex); + return -EPIPE; + } + s->send_coroutine = qemu_coroutine_self(); aio_context = bdrv_get_aio_context(bs); - aio_set_fd_handler(aio_context, s->sock, + aio_set_fd_handler(aio_context, s->sioc->fd, false, nbd_reply_ready, nbd_restart_write, bs); if (qiov) { - if (!s->is_unix) { - socket_set_cork(s->sock, 1); - } - rc = nbd_send_request(s->sock, request); + qio_channel_set_cork(s->ioc, true); + rc = nbd_send_request(s->ioc, request); if (rc >= 0) { - ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov, - offset, request->len); + ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, + offset, request->len, 0); if (ret != request->len) { rc = -EIO; } } - if (!s->is_unix) { - socket_set_cork(s->sock, 0); - } + qio_channel_set_cork(s->ioc, false); } else { - rc = nbd_send_request(s->sock, request); + rc = nbd_send_request(s->ioc, request); } - aio_set_fd_handler(aio_context, s->sock, nbd_reply_ready, NULL, bs); + aio_set_fd_handler(aio_context, s->sioc->fd, false, + nbd_reply_ready, NULL, bs); s->send_coroutine = NULL; qemu_co_mutex_unlock(&s->send_mutex); return rc; @@ -160,12 +176,13 @@ static void nbd_co_receive_reply(NbdClientSession *s, * peek at the next reply and avoid yielding if it's ours? */ qemu_coroutine_yield(); *reply = s->reply; - if (reply->handle != request->handle) { + if (reply->handle != request->handle || + !s->ioc) { reply->error = EIO; } else { if (qiov && reply->error == 0) { - ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov, - offset, request->len); + ret = nbd_wr_syncv(s->ioc, qiov->iov, qiov->niov, + offset, request->len, 1); if (ret != request->len) { reply->error = EIO; } @@ -226,15 +243,15 @@ static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num, static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, - int offset) + int offset, int *flags) { NbdClientSession *client = nbd_get_client_session(bs); struct nbd_request request = { .type = NBD_CMD_WRITE }; struct nbd_reply reply; ssize_t ret; - if (!bdrv_enable_write_cache(bs) && - (client->nbdflags & NBD_FLAG_SEND_FUA)) { + if ((*flags & BDRV_REQ_FUA) && (client->nbdflags & NBD_FLAG_SEND_FUA)) { + *flags &= ~BDRV_REQ_FUA; request.type |= NBD_CMD_FLAG_FUA; } @@ -274,12 +291,13 @@ int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, } int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) + int nb_sectors, QEMUIOVector *qiov, int *flags) { int offset = 0; int ret; while (nb_sectors > NBD_MAX_SECTORS) { - ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); + ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset, + flags); if (ret < 0) { return ret; } @@ -287,7 +305,7 @@ int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, sector_num += NBD_MAX_SECTORS; nb_sectors -= NBD_MAX_SECTORS; } - return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset); + return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset, flags); } int nbd_client_co_flush(BlockDriverState *bs) @@ -301,10 +319,6 @@ int nbd_client_co_flush(BlockDriverState *bs) return 0; } - if (client->nbdflags & NBD_FLAG_SEND_FUA) { - request.type |= NBD_CMD_FLAG_FUA; - } - request.from = 0; request.len = 0; @@ -348,14 +362,15 @@ int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, void nbd_client_detach_aio_context(BlockDriverState *bs) { aio_set_fd_handler(bdrv_get_aio_context(bs), - nbd_get_client_session(bs)->sock, NULL, NULL, NULL); + nbd_get_client_session(bs)->sioc->fd, + false, NULL, NULL, NULL); } void nbd_client_attach_aio_context(BlockDriverState *bs, AioContext *new_context) { - aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sock, - nbd_reply_ready, NULL, bs); + aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sioc->fd, + false, nbd_reply_ready, NULL, bs); } void nbd_client_close(BlockDriverState *bs) @@ -367,16 +382,20 @@ void nbd_client_close(BlockDriverState *bs) .len = 0 }; - if (client->sock == -1) { + if (client->ioc == NULL) { return; } - nbd_send_request(client->sock, &request); + nbd_send_request(client->ioc, &request); nbd_teardown_connection(bs); } -int nbd_client_init(BlockDriverState *bs, int sock, const char *export, +int nbd_client_init(BlockDriverState *bs, + QIOChannelSocket *sioc, + const char *export, + QCryptoTLSCreds *tlscreds, + const char *hostname, Error **errp) { NbdClientSession *client = nbd_get_client_session(bs); @@ -384,22 +403,32 @@ int nbd_client_init(BlockDriverState *bs, int sock, const char *export, /* NBD handshake */ logout("session init %s\n", export); - qemu_set_block(sock); - ret = nbd_receive_negotiate(sock, export, - &client->nbdflags, &client->size, errp); + qio_channel_set_blocking(QIO_CHANNEL(sioc), true, NULL); + + ret = nbd_receive_negotiate(QIO_CHANNEL(sioc), export, + &client->nbdflags, + tlscreds, hostname, + &client->ioc, + &client->size, errp); if (ret < 0) { logout("Failed to negotiate with the NBD server\n"); - closesocket(sock); return ret; } qemu_co_mutex_init(&client->send_mutex); qemu_co_mutex_init(&client->free_sema); - client->sock = sock; + client->sioc = sioc; + object_ref(OBJECT(client->sioc)); + + if (!client->ioc) { + client->ioc = QIO_CHANNEL(sioc); + object_ref(OBJECT(client->ioc)); + } /* Now that we're connected, set the socket to be non-blocking and * kick the reply mechanism. */ - qemu_set_nonblock(sock); + qio_channel_set_blocking(QIO_CHANNEL(sioc), false, NULL); + nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs)); logout("Established connection with NBD server\n"); diff --git a/qemu/block/nbd-client.h b/qemu/block/nbd-client.h index e8413408b..bc7aec079 100644 --- a/qemu/block/nbd-client.h +++ b/qemu/block/nbd-client.h @@ -4,6 +4,7 @@ #include "qemu-common.h" #include "block/nbd.h" #include "block/block_int.h" +#include "io/channel-socket.h" /* #define DEBUG_NBD */ @@ -17,7 +18,8 @@ #define MAX_NBD_REQUESTS 16 typedef struct NbdClientSession { - int sock; + QIOChannelSocket *sioc; /* The master data channel */ + QIOChannel *ioc; /* The current I/O channel which may differ (eg TLS) */ uint32_t nbdflags; off_t size; @@ -34,7 +36,11 @@ typedef struct NbdClientSession { NbdClientSession *nbd_get_client_session(BlockDriverState *bs); -int nbd_client_init(BlockDriverState *bs, int sock, const char *export_name, +int nbd_client_init(BlockDriverState *bs, + QIOChannelSocket *sock, + const char *export_name, + QCryptoTLSCreds *tlscreds, + const char *hostname, Error **errp); void nbd_client_close(BlockDriverState *bs); @@ -42,7 +48,7 @@ int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors); int nbd_client_co_flush(BlockDriverState *bs); int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov); + int nb_sectors, QEMUIOVector *qiov, int *flags); int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov); diff --git a/qemu/block/nbd.c b/qemu/block/nbd.c index 217618612..f7ea3b360 100644 --- a/qemu/block/nbd.c +++ b/qemu/block/nbd.c @@ -26,24 +26,22 @@ * THE SOFTWARE. */ +#include "qemu/osdep.h" #include "block/nbd-client.h" +#include "qapi/error.h" #include "qemu/uri.h" #include "block/block_int.h" #include "qemu/module.h" -#include "qemu/sockets.h" #include "qapi/qmp/qdict.h" #include "qapi/qmp/qjson.h" #include "qapi/qmp/qint.h" #include "qapi/qmp/qstring.h" - -#include <sys/types.h> -#include <unistd.h> +#include "qemu/cutils.h" #define EN_OPTSTR ":exportname=" typedef struct BDRVNBDState { NbdClientSession client; - QemuOpts *socket_opts; } BDRVNBDState; static int nbd_parse_uri(const char *filename, QDict *options) @@ -190,10 +188,10 @@ out: g_free(file); } -static void nbd_config(BDRVNBDState *s, QDict *options, char **export, - Error **errp) +static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options, char **export, + Error **errp) { - Error *local_err = NULL; + SocketAddress *saddr; if (qdict_haskey(options, "path") == qdict_haskey(options, "host")) { if (qdict_haskey(options, "path")) { @@ -201,28 +199,39 @@ static void nbd_config(BDRVNBDState *s, QDict *options, char **export, } else { error_setg(errp, "one of path and host must be specified."); } - return; + return NULL; } - s->client.is_unix = qdict_haskey(options, "path"); - s->socket_opts = qemu_opts_create(&socket_optslist, NULL, 0, - &error_abort); + saddr = g_new0(SocketAddress, 1); - qemu_opts_absorb_qdict(s->socket_opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return; + if (qdict_haskey(options, "path")) { + UnixSocketAddress *q_unix; + saddr->type = SOCKET_ADDRESS_KIND_UNIX; + q_unix = saddr->u.q_unix.data = g_new0(UnixSocketAddress, 1); + q_unix->path = g_strdup(qdict_get_str(options, "path")); + qdict_del(options, "path"); + } else { + InetSocketAddress *inet; + saddr->type = SOCKET_ADDRESS_KIND_INET; + inet = saddr->u.inet.data = g_new0(InetSocketAddress, 1); + inet->host = g_strdup(qdict_get_str(options, "host")); + if (!qdict_get_try_str(options, "port")) { + inet->port = g_strdup_printf("%d", NBD_DEFAULT_PORT); + } else { + inet->port = g_strdup(qdict_get_str(options, "port")); + } + qdict_del(options, "host"); + qdict_del(options, "port"); } - if (!qemu_opt_get(s->socket_opts, "port")) { - qemu_opt_set_number(s->socket_opts, "port", NBD_DEFAULT_PORT, - &error_abort); - } + s->client.is_unix = saddr->type == SOCKET_ADDRESS_KIND_UNIX; *export = g_strdup(qdict_get_try_str(options, "export")); if (*export) { qdict_del(options, "export"); } + + return saddr; } NbdClientSession *nbd_get_client_session(BlockDriverState *bs) @@ -231,57 +240,113 @@ NbdClientSession *nbd_get_client_session(BlockDriverState *bs) return &s->client; } -static int nbd_establish_connection(BlockDriverState *bs, Error **errp) +static QIOChannelSocket *nbd_establish_connection(SocketAddress *saddr, + Error **errp) { - BDRVNBDState *s = bs->opaque; - int sock; + QIOChannelSocket *sioc; + Error *local_err = NULL; - if (s->client.is_unix) { - sock = unix_connect_opts(s->socket_opts, errp, NULL, NULL); - } else { - sock = inet_connect_opts(s->socket_opts, errp, NULL, NULL); - if (sock >= 0) { - socket_set_nodelay(sock); - } + sioc = qio_channel_socket_new(); + + qio_channel_socket_connect_sync(sioc, + saddr, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + return NULL; } - /* Failed to establish connection */ - if (sock < 0) { - logout("Failed to establish connection to NBD server\n"); - return -EIO; + qio_channel_set_delay(QIO_CHANNEL(sioc), false); + + return sioc; +} + + +static QCryptoTLSCreds *nbd_get_tls_creds(const char *id, Error **errp) +{ + Object *obj; + QCryptoTLSCreds *creds; + + obj = object_resolve_path_component( + object_get_objects_root(), id); + if (!obj) { + error_setg(errp, "No TLS credentials with id '%s'", + id); + return NULL; + } + creds = (QCryptoTLSCreds *) + object_dynamic_cast(obj, TYPE_QCRYPTO_TLS_CREDS); + if (!creds) { + error_setg(errp, "Object with id '%s' is not TLS credentials", + id); + return NULL; } - return sock; + if (creds->endpoint != QCRYPTO_TLS_CREDS_ENDPOINT_CLIENT) { + error_setg(errp, + "Expecting TLS credentials with a client endpoint"); + return NULL; + } + object_ref(obj); + return creds; } + static int nbd_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { BDRVNBDState *s = bs->opaque; char *export = NULL; - int result, sock; - Error *local_err = NULL; + QIOChannelSocket *sioc = NULL; + SocketAddress *saddr; + const char *tlscredsid; + QCryptoTLSCreds *tlscreds = NULL; + const char *hostname = NULL; + int ret = -EINVAL; /* Pop the config into our state object. Exit if invalid. */ - nbd_config(s, options, &export, &local_err); - if (local_err) { - error_propagate(errp, local_err); - return -EINVAL; + saddr = nbd_config(s, options, &export, errp); + if (!saddr) { + goto error; + } + + tlscredsid = g_strdup(qdict_get_try_str(options, "tls-creds")); + if (tlscredsid) { + qdict_del(options, "tls-creds"); + tlscreds = nbd_get_tls_creds(tlscredsid, errp); + if (!tlscreds) { + goto error; + } + + if (saddr->type != SOCKET_ADDRESS_KIND_INET) { + error_setg(errp, "TLS only supported over IP sockets"); + goto error; + } + hostname = saddr->u.inet.data->host; } /* establish TCP connection, return error if it fails * TODO: Configurable retry-until-timeout behaviour. */ - sock = nbd_establish_connection(bs, errp); - if (sock < 0) { - g_free(export); - return sock; + sioc = nbd_establish_connection(saddr, errp); + if (!sioc) { + ret = -ECONNREFUSED; + goto error; } /* NBD handshake */ - result = nbd_client_init(bs, sock, export, errp); + ret = nbd_client_init(bs, sioc, export, + tlscreds, hostname, errp); + error: + if (sioc) { + object_unref(OBJECT(sioc)); + } + if (tlscreds) { + object_unref(OBJECT(tlscreds)); + } + qapi_free_SocketAddress(saddr); g_free(export); - return result; + return ret; } static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num, @@ -290,10 +355,29 @@ static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num, return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov); } +static int nbd_co_writev_flags(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov, int flags) +{ + int ret; + + ret = nbd_client_co_writev(bs, sector_num, nb_sectors, qiov, &flags); + if (ret < 0) { + return ret; + } + + /* The flag wasn't sent to the server, so we need to emulate it with an + * explicit flush */ + if (flags & BDRV_REQ_FUA) { + ret = nbd_client_co_flush(bs); + } + + return ret; +} + static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { - return nbd_client_co_writev(bs, sector_num, nb_sectors, qiov); + return nbd_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0); } static int nbd_co_flush(BlockDriverState *bs) @@ -315,9 +399,6 @@ static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num, static void nbd_close(BlockDriverState *bs) { - BDRVNBDState *s = bs->opaque; - - qemu_opts_del(s->socket_opts); nbd_client_close(bs); } @@ -339,13 +420,14 @@ static void nbd_attach_aio_context(BlockDriverState *bs, nbd_client_attach_aio_context(bs, new_context); } -static void nbd_refresh_filename(BlockDriverState *bs) +static void nbd_refresh_filename(BlockDriverState *bs, QDict *options) { QDict *opts = qdict_new(); - const char *path = qdict_get_try_str(bs->options, "path"); - const char *host = qdict_get_try_str(bs->options, "host"); - const char *port = qdict_get_try_str(bs->options, "port"); - const char *export = qdict_get_try_str(bs->options, "export"); + const char *path = qdict_get_try_str(options, "path"); + const char *host = qdict_get_try_str(options, "host"); + const char *port = qdict_get_try_str(options, "port"); + const char *export = qdict_get_try_str(options, "export"); + const char *tlscreds = qdict_get_try_str(options, "tls-creds"); qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("nbd"))); @@ -380,6 +462,9 @@ static void nbd_refresh_filename(BlockDriverState *bs) if (export) { qdict_put_obj(opts, "export", QOBJECT(qstring_from_str(export))); } + if (tlscreds) { + qdict_put_obj(opts, "tls-creds", QOBJECT(qstring_from_str(tlscreds))); + } bs->full_open_options = opts; } @@ -392,6 +477,8 @@ static BlockDriver bdrv_nbd = { .bdrv_file_open = nbd_open, .bdrv_co_readv = nbd_co_readv, .bdrv_co_writev = nbd_co_writev, + .bdrv_co_writev_flags = nbd_co_writev_flags, + .supported_write_flags = BDRV_REQ_FUA, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_discard = nbd_co_discard, @@ -410,6 +497,8 @@ static BlockDriver bdrv_nbd_tcp = { .bdrv_file_open = nbd_open, .bdrv_co_readv = nbd_co_readv, .bdrv_co_writev = nbd_co_writev, + .bdrv_co_writev_flags = nbd_co_writev_flags, + .supported_write_flags = BDRV_REQ_FUA, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_discard = nbd_co_discard, @@ -428,6 +517,8 @@ static BlockDriver bdrv_nbd_unix = { .bdrv_file_open = nbd_open, .bdrv_co_readv = nbd_co_readv, .bdrv_co_writev = nbd_co_writev, + .bdrv_co_writev_flags = nbd_co_writev_flags, + .supported_write_flags = BDRV_REQ_FUA, .bdrv_close = nbd_close, .bdrv_co_flush_to_os = nbd_co_flush, .bdrv_co_discard = nbd_co_discard, diff --git a/qemu/block/nfs.c b/qemu/block/nfs.c index c026ff688..9f51cc3f1 100644 --- a/qemu/block/nfs.c +++ b/qemu/block/nfs.c @@ -22,20 +22,23 @@ * THE SOFTWARE. */ -#include "config-host.h" +#include "qemu/osdep.h" #include <poll.h> #include "qemu-common.h" #include "qemu/config-file.h" #include "qemu/error-report.h" +#include "qapi/error.h" #include "block/block_int.h" #include "trace.h" #include "qemu/iov.h" #include "qemu/uri.h" +#include "qemu/cutils.h" #include "sysemu/sysemu.h" #include <nfsc/libnfs.h> #define QEMU_NFS_MAX_READAHEAD_SIZE 1048576 +#define QEMU_NFS_MAX_DEBUG_LEVEL 2 typedef struct NFSClient { struct nfs_context *context; @@ -43,6 +46,7 @@ typedef struct NFSClient { int events; bool has_zero_init; AioContext *aio_context; + blkcnt_t st_blocks; } NFSClient; typedef struct NFSRPC { @@ -62,11 +66,10 @@ static void nfs_set_events(NFSClient *client) { int ev = nfs_which_events(client->context); if (ev != client->events) { - aio_set_fd_handler(client->aio_context, - nfs_get_fd(client->context), + aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context), + false, (ev & POLLIN) ? nfs_process_read : NULL, - (ev & POLLOUT) ? nfs_process_write : NULL, - client); + (ev & POLLOUT) ? nfs_process_write : NULL, client); } client->events = ev; @@ -241,9 +244,8 @@ static void nfs_detach_aio_context(BlockDriverState *bs) { NFSClient *client = bs->opaque; - aio_set_fd_handler(client->aio_context, - nfs_get_fd(client->context), - NULL, NULL, NULL); + aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context), + false, NULL, NULL, NULL); client->events = 0; } @@ -262,9 +264,8 @@ static void nfs_client_close(NFSClient *client) if (client->fh) { nfs_close(client->context, client->fh); } - aio_set_fd_handler(client->aio_context, - nfs_get_fd(client->context), - NULL, NULL, NULL); + aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context), + false, NULL, NULL, NULL); nfs_destroy_context(client->context); } memset(client, 0, sizeof(NFSClient)); @@ -336,6 +337,17 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename, } nfs_set_readahead(client->context, val); #endif +#ifdef LIBNFS_FEATURE_DEBUG + } else if (!strcmp(qp->p[i].name, "debug")) { + /* limit the maximum debug level to avoid potential flooding + * of our log files. */ + if (val > QEMU_NFS_MAX_DEBUG_LEVEL) { + error_report("NFS Warning: Limiting NFS debug level" + " to %d", QEMU_NFS_MAX_DEBUG_LEVEL); + val = QEMU_NFS_MAX_DEBUG_LEVEL; + } + nfs_set_debug(client->context, val); +#endif } else { error_setg(errp, "Unknown NFS parameter name: %s", qp->p[i].name); @@ -374,6 +386,7 @@ static int64_t nfs_client_open(NFSClient *client, const char *filename, } ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE); + client->st_blocks = st.st_blocks; client->has_zero_init = S_ISREG(st.st_mode); goto out; fail: @@ -464,6 +477,11 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) NFSRPC task = {0}; struct stat st; + if (bdrv_is_read_only(bs) && + !(bs->open_flags & BDRV_O_NOCACHE)) { + return client->st_blocks * 512; + } + task.st = &st; if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb, &task) != 0) { @@ -475,7 +493,7 @@ static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) aio_poll(client->aio_context, true); } - return (task.ret < 0 ? task.ret : st.st_blocks * st.st_blksize); + return (task.ret < 0 ? task.ret : st.st_blocks * 512); } static int nfs_file_truncate(BlockDriverState *bs, int64_t offset) @@ -484,6 +502,34 @@ static int nfs_file_truncate(BlockDriverState *bs, int64_t offset) return nfs_ftruncate(client->context, client->fh, offset); } +/* Note that this will not re-establish a connection with the NFS server + * - it is effectively a NOP. */ +static int nfs_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + NFSClient *client = state->bs->opaque; + struct stat st; + int ret = 0; + + if (state->flags & BDRV_O_RDWR && bdrv_is_read_only(state->bs)) { + error_setg(errp, "Cannot open a read-only mount as read-write"); + return -EACCES; + } + + /* Update cache for read-only reopens */ + if (!(state->flags & BDRV_O_RDWR)) { + ret = nfs_fstat(client->context, client->fh, &st); + if (ret < 0) { + error_setg(errp, "Failed to fstat file: %s", + nfs_get_error(client->context)); + return ret; + } + client->st_blocks = st.st_blocks; + } + + return 0; +} + static BlockDriver bdrv_nfs = { .format_name = "nfs", .protocol_name = "nfs", @@ -499,6 +545,7 @@ static BlockDriver bdrv_nfs = { .bdrv_file_open = nfs_file_open, .bdrv_close = nfs_file_close, .bdrv_create = nfs_file_create, + .bdrv_reopen_prepare = nfs_reopen_prepare, .bdrv_co_readv = nfs_co_readv, .bdrv_co_writev = nfs_co_writev, diff --git a/qemu/block/null.c b/qemu/block/null.c index 7d083233f..396500bab 100644 --- a/qemu/block/null.c +++ b/qemu/block/null.c @@ -10,13 +10,17 @@ * See the COPYING file in the top-level directory. */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "block/block_int.h" #define NULL_OPT_LATENCY "latency-ns" +#define NULL_OPT_ZEROES "read-zeroes" typedef struct { int64_t length; int64_t latency_ns; + bool read_zeroes; } BDRVNullState; static QemuOptsList runtime_opts = { @@ -39,6 +43,11 @@ static QemuOptsList runtime_opts = { .help = "nanoseconds (approximated) to wait " "before completing request", }, + { + .name = NULL_OPT_ZEROES, + .type = QEMU_OPT_BOOL, + .help = "return zeroes when read", + }, { /* end of list */ } }, }; @@ -60,6 +69,7 @@ static int null_file_open(BlockDriverState *bs, QDict *options, int flags, error_setg(errp, "latency-ns is invalid"); ret = -EINVAL; } + s->read_zeroes = qemu_opt_get_bool(opts, NULL_OPT_ZEROES, false); qemu_opts_del(opts); return ret; } @@ -89,6 +99,12 @@ static coroutine_fn int null_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { + BDRVNullState *s = bs->opaque; + + if (s->read_zeroes) { + qemu_iovec_memset(qiov, 0, 0, nb_sectors * BDRV_SECTOR_SIZE); + } + return null_co_common(bs); } @@ -158,6 +174,12 @@ static BlockAIOCB *null_aio_readv(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque) { + BDRVNullState *s = bs->opaque; + + if (s->read_zeroes) { + qemu_iovec_memset(qiov, 0, 0, nb_sectors * BDRV_SECTOR_SIZE); + } + return null_aio_common(bs, cb, opaque); } @@ -183,6 +205,24 @@ static int null_reopen_prepare(BDRVReopenState *reopen_state, return 0; } +static int64_t coroutine_fn null_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum, + BlockDriverState **file) +{ + BDRVNullState *s = bs->opaque; + off_t start = sector_num * BDRV_SECTOR_SIZE; + + *pnum = nb_sectors; + *file = bs; + + if (s->read_zeroes) { + return BDRV_BLOCK_OFFSET_VALID | start | BDRV_BLOCK_ZERO; + } else { + return BDRV_BLOCK_OFFSET_VALID | start; + } +} + static BlockDriver bdrv_null_co = { .format_name = "null-co", .protocol_name = "null-co", @@ -196,6 +236,8 @@ static BlockDriver bdrv_null_co = { .bdrv_co_writev = null_co_writev, .bdrv_co_flush_to_disk = null_co_flush, .bdrv_reopen_prepare = null_reopen_prepare, + + .bdrv_co_get_block_status = null_co_get_block_status, }; static BlockDriver bdrv_null_aio = { @@ -211,6 +253,8 @@ static BlockDriver bdrv_null_aio = { .bdrv_aio_writev = null_aio_writev, .bdrv_aio_flush = null_aio_flush, .bdrv_reopen_prepare = null_reopen_prepare, + + .bdrv_co_get_block_status = null_co_get_block_status, }; static void bdrv_null_init(void) diff --git a/qemu/block/parallels.c b/qemu/block/parallels.c index 046b56844..324ed43ac 100644 --- a/qemu/block/parallels.c +++ b/qemu/block/parallels.c @@ -27,8 +27,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include "qemu/bitmap.h" #include "qapi/util.h" @@ -61,7 +64,7 @@ typedef struct ParallelsHeader { typedef enum ParallelsPreallocMode { PRL_PREALLOC_MODE_FALLOCATE = 0, PRL_PREALLOC_MODE_TRUNCATE = 1, - PRL_PREALLOC_MODE_MAX = 2, + PRL_PREALLOC_MODE__MAX = 2, } ParallelsPreallocMode; static const char *prealloc_mode_lookup[] = { @@ -202,13 +205,13 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num, to_allocate = (sector_num + *pnum + s->tracks - 1) / s->tracks - idx; space = to_allocate * s->tracks; - if (s->data_end + space > bdrv_getlength(bs->file) >> BDRV_SECTOR_BITS) { + if (s->data_end + space > bdrv_getlength(bs->file->bs) >> BDRV_SECTOR_BITS) { int ret; space += s->prealloc_size; if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) { - ret = bdrv_write_zeroes(bs->file, s->data_end, space, 0); + ret = bdrv_write_zeroes(bs->file->bs, s->data_end, space, 0); } else { - ret = bdrv_truncate(bs->file, + ret = bdrv_truncate(bs->file->bs, (s->data_end + space) << BDRV_SECTOR_BITS); } if (ret < 0) { @@ -220,7 +223,7 @@ static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num, s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier); s->data_end += s->tracks; bitmap_set(s->bat_dirty_bmap, - bat_entry_off(idx) / s->bat_dirty_block, 1); + bat_entry_off(idx + i) / s->bat_dirty_block, 1); } return bat2sect(s, idx) + sector_num % s->tracks; @@ -244,7 +247,8 @@ static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs) if (off + to_write > s->header_size) { to_write = s->header_size - off; } - ret = bdrv_pwrite(bs->file, off, (uint8_t *)s->header + off, to_write); + ret = bdrv_pwrite(bs->file->bs, off, (uint8_t *)s->header + off, + to_write); if (ret < 0) { qemu_co_mutex_unlock(&s->lock); return ret; @@ -259,7 +263,7 @@ static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs) static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum) + int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) { BDRVParallelsState *s = bs->opaque; int64_t offset; @@ -272,6 +276,7 @@ static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs, return 0; } + *file = bs->file->bs; return (offset << BDRV_SECTOR_BITS) | BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; } @@ -303,7 +308,7 @@ static coroutine_fn int parallels_co_writev(BlockDriverState *bs, qemu_iovec_reset(&hd_qiov); qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); - ret = bdrv_co_writev(bs->file, position, n, &hd_qiov); + ret = bdrv_co_writev(bs->file->bs, position, n, &hd_qiov); if (ret < 0) { break; } @@ -343,7 +348,7 @@ static coroutine_fn int parallels_co_readv(BlockDriverState *bs, qemu_iovec_reset(&hd_qiov); qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); - ret = bdrv_co_readv(bs->file, position, n, &hd_qiov); + ret = bdrv_co_readv(bs->file->bs, position, n, &hd_qiov); if (ret < 0) { break; } @@ -369,7 +374,7 @@ static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res, bool flush_bat = false; int cluster_size = s->tracks << BDRV_SECTOR_BITS; - size = bdrv_getlength(bs->file); + size = bdrv_getlength(bs->file->bs); if (size < 0) { res->check_errors++; return size; @@ -424,7 +429,7 @@ static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res, } if (flush_bat) { - ret = bdrv_pwrite_sync(bs->file, 0, s->header, s->header_size); + ret = bdrv_pwrite_sync(bs->file->bs, 0, s->header, s->header_size); if (ret < 0) { res->check_errors++; return ret; @@ -440,7 +445,7 @@ static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res, size - res->image_end_offset); res->leaks += count; if (fix & BDRV_FIX_LEAKS) { - ret = bdrv_truncate(bs->file, res->image_end_offset); + ret = bdrv_truncate(bs->file->bs, res->image_end_offset); if (ret < 0) { res->check_errors++; return ret; @@ -458,7 +463,7 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) int64_t total_size, cl_size; uint8_t tmp[BDRV_SECTOR_SIZE]; Error *local_err = NULL; - BlockDriverState *file; + BlockBackend *file; uint32_t bat_entries, bat_sectors; ParallelsHeader header; int ret; @@ -474,14 +479,16 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) return ret; } - file = NULL; - ret = bdrv_open(&file, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); - if (ret < 0) { + file = blk_new_open(filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); + if (file == NULL) { error_propagate(errp, local_err); - return ret; + return -EIO; } - ret = bdrv_truncate(file, 0); + + blk_set_allow_write_beyond_eof(file, true); + + ret = blk_truncate(file, 0); if (ret < 0) { goto exit; } @@ -505,18 +512,18 @@ static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) memset(tmp, 0, sizeof(tmp)); memcpy(tmp, &header, sizeof(header)); - ret = bdrv_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE); + ret = blk_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE); if (ret < 0) { goto exit; } - ret = bdrv_write_zeroes(file, 1, bat_sectors - 1, 0); + ret = blk_write_zeroes(file, 1, bat_sectors - 1, 0); if (ret < 0) { goto exit; } ret = 0; done: - bdrv_unref(file); + blk_unref(file); return ret; exit: @@ -546,12 +553,13 @@ static int parallels_probe(const uint8_t *buf, int buf_size, static int parallels_update_header(BlockDriverState *bs) { BDRVParallelsState *s = bs->opaque; - unsigned size = MAX(bdrv_opt_mem_align(bs->file), sizeof(ParallelsHeader)); + unsigned size = MAX(bdrv_opt_mem_align(bs->file->bs), + sizeof(ParallelsHeader)); if (size > s->header_size) { size = s->header_size; } - return bdrv_pwrite_sync(bs->file, 0, s->header, size); + return bdrv_pwrite_sync(bs->file->bs, 0, s->header, size); } static int parallels_open(BlockDriverState *bs, QDict *options, int flags, @@ -564,7 +572,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, Error *local_err = NULL; char *buf; - ret = bdrv_pread(bs->file, 0, &ph, sizeof(ph)); + ret = bdrv_pread(bs->file->bs, 0, &ph, sizeof(ph)); if (ret < 0) { goto fail; } @@ -603,8 +611,8 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, } size = bat_entry_off(s->bat_size); - s->header_size = ROUND_UP(size, bdrv_opt_mem_align(bs->file)); - s->header = qemu_try_blockalign(bs->file, s->header_size); + s->header_size = ROUND_UP(size, bdrv_opt_mem_align(bs->file->bs)); + s->header = qemu_try_blockalign(bs->file->bs, s->header_size); if (s->header == NULL) { ret = -ENOMEM; goto fail; @@ -619,7 +627,7 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, s->header_size = size; } - ret = bdrv_pread(bs->file, 0, s->header, s->header_size); + ret = bdrv_pread(bs->file->bs, 0, s->header, s->header_size); if (ret < 0) { goto fail; } @@ -658,13 +666,13 @@ static int parallels_open(BlockDriverState *bs, QDict *options, int flags, s->prealloc_size = MAX(s->tracks, s->prealloc_size >> BDRV_SECTOR_BITS); buf = qemu_opt_get_del(opts, PARALLELS_OPT_PREALLOC_MODE); s->prealloc_mode = qapi_enum_parse(prealloc_mode_lookup, buf, - PRL_PREALLOC_MODE_MAX, PRL_PREALLOC_MODE_FALLOCATE, &local_err); + PRL_PREALLOC_MODE__MAX, PRL_PREALLOC_MODE_FALLOCATE, &local_err); g_free(buf); if (local_err != NULL) { goto fail_options; } - if (!bdrv_has_zero_init(bs->file) || - bdrv_truncate(bs->file, bdrv_getlength(bs->file)) != 0) { + if (!bdrv_has_zero_init(bs->file->bs) || + bdrv_truncate(bs->file->bs, bdrv_getlength(bs->file->bs)) != 0) { s->prealloc_mode = PRL_PREALLOC_MODE_FALLOCATE; } @@ -707,7 +715,7 @@ static void parallels_close(BlockDriverState *bs) } if (bs->open_flags & BDRV_O_RDWR) { - bdrv_truncate(bs->file, s->data_end << BDRV_SECTOR_BITS); + bdrv_truncate(bs->file->bs, s->data_end << BDRV_SECTOR_BITS); } g_free(s->bat_dirty_bmap); diff --git a/qemu/block/qapi.c b/qemu/block/qapi.c index 2ce509711..c5f6ba643 100644 --- a/qemu/block/qapi.c +++ b/qemu/block/qapi.c @@ -22,6 +22,7 @@ * THE SOFTWARE. */ +#include "qemu/osdep.h" #include "block/qapi.h" #include "block/block_int.h" #include "block/throttle-groups.h" @@ -31,8 +32,10 @@ #include "qapi/qmp-output-visitor.h" #include "qapi/qmp/types.h" #include "sysemu/block-backend.h" +#include "qemu/cutils.h" -BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp) +BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk, + BlockDriverState *bs, Error **errp) { ImageInfo **p_image_info; BlockDriverState *bs0; @@ -46,7 +49,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp) info->cache = g_new(BlockdevCacheInfo, 1); *info->cache = (BlockdevCacheInfo) { - .writeback = bdrv_enable_write_cache(bs), + .writeback = blk ? blk_enable_write_cache(blk) : true, .direct = !!(bs->open_flags & BDRV_O_NOCACHE), .no_flush = !!(bs->open_flags & BDRV_O_NO_FLUSH), }; @@ -64,7 +67,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp) info->backing_file_depth = bdrv_get_backing_file_depth(bs); info->detect_zeroes = bs->detect_zeroes; - if (bs->io_limits_enabled) { + if (bs->throttle_state) { ThrottleConfig cfg; throttle_group_get_config(bs, &cfg); @@ -91,6 +94,26 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp) info->has_iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max; info->iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max; + info->has_bps_max_length = info->has_bps_max; + info->bps_max_length = + cfg.buckets[THROTTLE_BPS_TOTAL].burst_length; + info->has_bps_rd_max_length = info->has_bps_rd_max; + info->bps_rd_max_length = + cfg.buckets[THROTTLE_BPS_READ].burst_length; + info->has_bps_wr_max_length = info->has_bps_wr_max; + info->bps_wr_max_length = + cfg.buckets[THROTTLE_BPS_WRITE].burst_length; + + info->has_iops_max_length = info->has_iops_max; + info->iops_max_length = + cfg.buckets[THROTTLE_OPS_TOTAL].burst_length; + info->has_iops_rd_max_length = info->has_iops_rd_max; + info->iops_rd_max_length = + cfg.buckets[THROTTLE_OPS_READ].burst_length; + info->has_iops_wr_max_length = info->has_iops_wr_max; + info->iops_wr_max_length = + cfg.buckets[THROTTLE_OPS_WRITE].burst_length; + info->has_iops_size = cfg.op_size; info->iops_size = cfg.op_size; @@ -110,8 +133,8 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp) qapi_free_BlockDeviceInfo(info); return NULL; } - if (bs0->drv && bs0->backing_hd) { - bs0 = bs0->backing_hd; + if (bs0->drv && bs0->backing) { + bs0 = bs0->backing->bs; (*p_image_info)->has_backing_image = true; p_image_info = &((*p_image_info)->backing_image); } else { @@ -210,11 +233,13 @@ void bdrv_query_image_info(BlockDriverState *bs, Error *err = NULL; ImageInfo *info; + aio_context_acquire(bdrv_get_aio_context(bs)); + size = bdrv_getlength(bs); if (size < 0) { error_setg_errno(errp, -size, "Can't get size of device '%s'", bdrv_get_device_name(bs)); - return; + goto out; } info = g_new0(ImageInfo, 1); @@ -245,15 +270,18 @@ void bdrv_query_image_info(BlockDriverState *bs, info->has_backing_filename = true; bdrv_get_full_backing_filename(bs, backing_filename2, PATH_MAX, &err); if (err) { - error_propagate(errp, err); - qapi_free_ImageInfo(info); + /* Can't reconstruct the full backing filename, so we must omit + * this field and apply a Best Effort to this query. */ g_free(backing_filename2); - return; + backing_filename2 = NULL; + error_free(err); + err = NULL; } - if (strcmp(backing_filename, backing_filename2) != 0) { - info->full_backing_filename = - g_strdup(backing_filename2); + /* Always report the full_backing_filename if present, even if it's the + * same as backing_filename. That they are same is useful info. */ + if (backing_filename2) { + info->full_backing_filename = g_strdup(backing_filename2); info->has_full_backing_filename = true; } @@ -279,10 +307,13 @@ void bdrv_query_image_info(BlockDriverState *bs, default: error_propagate(errp, err); qapi_free_ImageInfo(info); - return; + goto out; } *p_info = info; + +out: + aio_context_release(bdrv_get_aio_context(bs)); } /* @p_info will be set only on success. */ @@ -296,24 +327,24 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, info->locked = blk_dev_is_medium_locked(blk); info->removable = blk_dev_has_removable_media(blk); - if (blk_dev_has_removable_media(blk)) { + if (blk_dev_has_tray(blk)) { info->has_tray_open = true; info->tray_open = blk_dev_is_tray_open(blk); } - if (bdrv_iostatus_is_enabled(bs)) { + if (blk_iostatus_is_enabled(blk)) { info->has_io_status = true; - info->io_status = bs->iostatus; + info->io_status = blk_iostatus(blk); } - if (!QLIST_EMPTY(&bs->dirty_bitmaps)) { + if (bs && !QLIST_EMPTY(&bs->dirty_bitmaps)) { info->has_dirty_bitmaps = true; info->dirty_bitmaps = bdrv_query_dirty_bitmaps(bs); } - if (bs->drv) { + if (bs && bs->drv) { info->has_inserted = true; - info->inserted = bdrv_block_device_info(bs, errp); + info->inserted = bdrv_block_device_info(blk, bs, errp); if (info->inserted == NULL) { goto err; } @@ -326,45 +357,115 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, qapi_free_BlockInfo(info); } -static BlockStats *bdrv_query_stats(const BlockDriverState *bs, - bool query_backing) +static BlockStats *bdrv_query_stats(BlockBackend *blk, + const BlockDriverState *bs, + bool query_backing); + +static void bdrv_query_blk_stats(BlockDeviceStats *ds, BlockBackend *blk) { - BlockStats *s; + BlockAcctStats *stats = blk_get_stats(blk); + BlockAcctTimedStats *ts = NULL; - s = g_malloc0(sizeof(*s)); + ds->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ]; + ds->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE]; + ds->rd_operations = stats->nr_ops[BLOCK_ACCT_READ]; + ds->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE]; - if (bdrv_get_device_name(bs)[0]) { - s->has_device = true; - s->device = g_strdup(bdrv_get_device_name(bs)); + ds->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ]; + ds->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE]; + ds->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH]; + + ds->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ]; + ds->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE]; + ds->invalid_flush_operations = + stats->invalid_ops[BLOCK_ACCT_FLUSH]; + + ds->rd_merged = stats->merged[BLOCK_ACCT_READ]; + ds->wr_merged = stats->merged[BLOCK_ACCT_WRITE]; + ds->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH]; + ds->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE]; + ds->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ]; + ds->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH]; + + ds->has_idle_time_ns = stats->last_access_time_ns > 0; + if (ds->has_idle_time_ns) { + ds->idle_time_ns = block_acct_idle_time_ns(stats); } + ds->account_invalid = stats->account_invalid; + ds->account_failed = stats->account_failed; + + while ((ts = block_acct_interval_next(stats, ts))) { + BlockDeviceTimedStatsList *timed_stats = + g_malloc0(sizeof(*timed_stats)); + BlockDeviceTimedStats *dev_stats = g_malloc0(sizeof(*dev_stats)); + timed_stats->next = ds->timed_stats; + timed_stats->value = dev_stats; + ds->timed_stats = timed_stats; + + TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ]; + TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE]; + TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH]; + + dev_stats->interval_length = ts->interval_length; + + dev_stats->min_rd_latency_ns = timed_average_min(rd); + dev_stats->max_rd_latency_ns = timed_average_max(rd); + dev_stats->avg_rd_latency_ns = timed_average_avg(rd); + + dev_stats->min_wr_latency_ns = timed_average_min(wr); + dev_stats->max_wr_latency_ns = timed_average_max(wr); + dev_stats->avg_wr_latency_ns = timed_average_avg(wr); + + dev_stats->min_flush_latency_ns = timed_average_min(fl); + dev_stats->max_flush_latency_ns = timed_average_max(fl); + dev_stats->avg_flush_latency_ns = timed_average_avg(fl); + + dev_stats->avg_rd_queue_depth = + block_acct_queue_depth(ts, BLOCK_ACCT_READ); + dev_stats->avg_wr_queue_depth = + block_acct_queue_depth(ts, BLOCK_ACCT_WRITE); + } +} + +static void bdrv_query_bds_stats(BlockStats *s, const BlockDriverState *bs, + bool query_backing) +{ if (bdrv_get_node_name(bs)[0]) { s->has_node_name = true; s->node_name = g_strdup(bdrv_get_node_name(bs)); } - s->stats = g_malloc0(sizeof(*s->stats)); - s->stats->rd_bytes = bs->stats.nr_bytes[BLOCK_ACCT_READ]; - s->stats->wr_bytes = bs->stats.nr_bytes[BLOCK_ACCT_WRITE]; - s->stats->rd_operations = bs->stats.nr_ops[BLOCK_ACCT_READ]; - s->stats->wr_operations = bs->stats.nr_ops[BLOCK_ACCT_WRITE]; - s->stats->rd_merged = bs->stats.merged[BLOCK_ACCT_READ]; - s->stats->wr_merged = bs->stats.merged[BLOCK_ACCT_WRITE]; - s->stats->wr_highest_offset = - bs->stats.wr_highest_sector * BDRV_SECTOR_SIZE; - s->stats->flush_operations = bs->stats.nr_ops[BLOCK_ACCT_FLUSH]; - s->stats->wr_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_WRITE]; - s->stats->rd_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_READ]; - s->stats->flush_total_time_ns = bs->stats.total_time_ns[BLOCK_ACCT_FLUSH]; + s->stats->wr_highest_offset = bs->wr_highest_offset; if (bs->file) { s->has_parent = true; - s->parent = bdrv_query_stats(bs->file, query_backing); + s->parent = bdrv_query_stats(NULL, bs->file->bs, query_backing); } - if (query_backing && bs->backing_hd) { + if (query_backing && bs->backing) { s->has_backing = true; - s->backing = bdrv_query_stats(bs->backing_hd, query_backing); + s->backing = bdrv_query_stats(NULL, bs->backing->bs, query_backing); + } + +} + +static BlockStats *bdrv_query_stats(BlockBackend *blk, + const BlockDriverState *bs, + bool query_backing) +{ + BlockStats *s; + + s = g_malloc0(sizeof(*s)); + s->stats = g_malloc0(sizeof(*s->stats)); + + if (blk) { + s->has_device = true; + s->device = g_strdup(blk_name(blk)); + bdrv_query_blk_stats(s->stats, blk); + } + if (bs) { + bdrv_query_bds_stats(s, bs, query_backing); } return s; @@ -381,7 +482,9 @@ BlockInfoList *qmp_query_block(Error **errp) bdrv_query_info(blk, &info->value, &local_err); if (local_err) { error_propagate(errp, local_err); - goto err; + g_free(info); + qapi_free_BlockInfoList(head); + return NULL; } *p_next = info; @@ -389,10 +492,20 @@ BlockInfoList *qmp_query_block(Error **errp) } return head; +} - err: - qapi_free_BlockInfoList(head); - return NULL; +static bool next_query_bds(BlockBackend **blk, BlockDriverState **bs, + bool query_nodes) +{ + if (query_nodes) { + *bs = bdrv_next_node(*bs); + return !!*bs; + } + + *blk = blk_next(*blk); + *bs = *blk ? blk_bs(*blk) : NULL; + + return !!*blk; } BlockStatsList *qmp_query_blockstats(bool has_query_nodes, @@ -400,17 +513,19 @@ BlockStatsList *qmp_query_blockstats(bool has_query_nodes, Error **errp) { BlockStatsList *head = NULL, **p_next = &head; + BlockBackend *blk = NULL; BlockDriverState *bs = NULL; /* Just to be safe if query_nodes is not always initialized */ query_nodes = has_query_nodes && query_nodes; - while ((bs = query_nodes ? bdrv_next_node(bs) : bdrv_next(bs))) { + while (next_query_bds(&blk, &bs, query_nodes)) { BlockStatsList *info = g_malloc0(sizeof(*info)); - AioContext *ctx = bdrv_get_aio_context(bs); + AioContext *ctx = blk ? blk_get_aio_context(blk) + : bdrv_get_aio_context(bs); aio_context_acquire(ctx); - info->value = bdrv_query_stats(bs, !query_nodes); + info->value = bdrv_query_stats(blk, bs, !query_nodes); aio_context_release(ctx); *p_next = info; @@ -535,11 +650,10 @@ static void dump_qlist(fprintf_function func_fprintf, void *f, int indentation, int i = 0; for (entry = qlist_first(list); entry; entry = qlist_next(entry), i++) { - qtype_code type = qobject_type(entry->value); + QType type = qobject_type(entry->value); bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST); - const char *format = composite ? "%*s[%i]:\n" : "%*s[%i]: "; - - func_fprintf(f, format, indentation * 4, "", i); + func_fprintf(f, "%*s[%i]:%c", indentation * 4, "", i, + composite ? '\n' : ' '); dump_qobject(func_fprintf, f, indentation + 1, entry->value); if (!composite) { func_fprintf(f, "\n"); @@ -553,10 +667,9 @@ static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, const QDictEntry *entry; for (entry = qdict_first(dict); entry; entry = qdict_next(dict, entry)) { - qtype_code type = qobject_type(entry->value); + QType type = qobject_type(entry->value); bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST); - const char *format = composite ? "%*s%s:\n" : "%*s%s: "; - char key[strlen(entry->key) + 1]; + char *key = g_malloc(strlen(entry->key) + 1); int i; /* replace dashes with spaces in key (variable) names */ @@ -564,12 +677,13 @@ static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, key[i] = entry->key[i] == '-' ? ' ' : entry->key[i]; } key[i] = 0; - - func_fprintf(f, format, indentation * 4, "", key); + func_fprintf(f, "%*s%s:%c", indentation * 4, "", key, + composite ? '\n' : ' '); dump_qobject(func_fprintf, f, indentation + 1, entry->value); if (!composite) { func_fprintf(f, "\n"); } + g_free(key); } } @@ -579,7 +693,7 @@ void bdrv_image_info_specific_dump(fprintf_function func_fprintf, void *f, QmpOutputVisitor *ov = qmp_output_visitor_new(); QObject *obj, *data; - visit_type_ImageInfoSpecific(qmp_output_get_visitor(ov), &info_spec, NULL, + visit_type_ImageInfoSpecific(qmp_output_get_visitor(ov), NULL, &info_spec, &error_abort); obj = qmp_output_get_qobject(ov); assert(qobject_type(obj) == QTYPE_QDICT); @@ -623,7 +737,10 @@ void bdrv_image_info_dump(fprintf_function func_fprintf, void *f, if (info->has_backing_filename) { func_fprintf(f, "backing file: %s", info->backing_filename); - if (info->has_full_backing_filename) { + if (!info->has_full_backing_filename) { + func_fprintf(f, " (cannot determine actual path)"); + } else if (strcmp(info->backing_filename, + info->full_backing_filename) != 0) { func_fprintf(f, " (actual path: %s)", info->full_backing_filename); } func_fprintf(f, "\n"); diff --git a/qemu/block/qcow.c b/qemu/block/qcow.c index 01fba54ce..60ddb12ec 100644 --- a/qemu/block/qcow.c +++ b/qemu/block/qcow.c @@ -21,8 +21,12 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "qemu-common.h" +#include "qemu/error-report.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include <zlib.h> #include "qapi/qmp/qerror.h" @@ -100,7 +104,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, int ret; QCowHeader header; - ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); + ret = bdrv_pread(bs->file->bs, 0, &header, sizeof(header)); if (ret < 0) { goto fail; } @@ -119,11 +123,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } if (header.version != QCOW_VERSION) { - char version[64]; - snprintf(version, sizeof(version), "QCOW version %" PRIu32, - header.version); - error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_or_node_name(bs), "qcow", version); + error_setg(errp, "Unsupported qcow version %" PRIu32, header.version); ret = -ENOTSUP; goto fail; } @@ -159,6 +159,14 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, } s->crypt_method_header = header.crypt_method; if (s->crypt_method_header) { + if (bdrv_uses_whitelist() && + s->crypt_method_header == QCOW_CRYPT_AES) { + error_report("qcow built-in AES encryption is deprecated"); + error_printf("Support for it will be removed in a future release.\n" + "You can use 'qemu-img convert' to switch to an\n" + "unencrypted qcow image, or a LUKS raw image.\n"); + } + bs->encrypted = 1; } s->cluster_bits = header.cluster_bits; @@ -193,7 +201,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } - ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, + ret = bdrv_pread(bs->file->bs, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { goto fail; @@ -205,7 +213,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, /* alloc L2 cache (max. 64k * 16 * 8 = 8 MB) */ s->l2_cache = - qemu_try_blockalign(bs->file, + qemu_try_blockalign(bs->file->bs, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); if (s->l2_cache == NULL) { error_setg(errp, "Could not allocate L2 table cache"); @@ -224,7 +232,7 @@ static int qcow_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto fail; } - ret = bdrv_pread(bs->file, header.backing_file_offset, + ret = bdrv_pread(bs->file->bs, header.backing_file_offset, bs->backing_file, len); if (ret < 0) { goto fail; @@ -369,13 +377,13 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, if (!allocate) return 0; /* allocate a new l2 entry */ - l2_offset = bdrv_getlength(bs->file); + l2_offset = bdrv_getlength(bs->file->bs); /* round to cluster size */ l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); /* update the L1 entry */ s->l1_table[l1_index] = l2_offset; tmp = cpu_to_be64(l2_offset); - if (bdrv_pwrite_sync(bs->file, + if (bdrv_pwrite_sync(bs->file->bs, s->l1_table_offset + l1_index * sizeof(tmp), &tmp, sizeof(tmp)) < 0) return 0; @@ -405,11 +413,12 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, l2_table = s->l2_cache + (min_index << s->l2_bits); if (new_l2_table) { memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); - if (bdrv_pwrite_sync(bs->file, l2_offset, l2_table, + if (bdrv_pwrite_sync(bs->file->bs, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) < 0) return 0; } else { - if (bdrv_pread(bs->file, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)) != + if (bdrv_pread(bs->file->bs, l2_offset, l2_table, + s->l2_size * sizeof(uint64_t)) != s->l2_size * sizeof(uint64_t)) return 0; } @@ -430,20 +439,21 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, overwritten */ if (decompress_cluster(bs, cluster_offset) < 0) return 0; - cluster_offset = bdrv_getlength(bs->file); + cluster_offset = bdrv_getlength(bs->file->bs); cluster_offset = (cluster_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); /* write the cluster content */ - if (bdrv_pwrite(bs->file, cluster_offset, s->cluster_cache, s->cluster_size) != + if (bdrv_pwrite(bs->file->bs, cluster_offset, s->cluster_cache, + s->cluster_size) != s->cluster_size) return -1; } else { - cluster_offset = bdrv_getlength(bs->file); + cluster_offset = bdrv_getlength(bs->file->bs); if (allocate == 1) { /* round to cluster size */ cluster_offset = (cluster_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); - bdrv_truncate(bs->file, cluster_offset + s->cluster_size); + bdrv_truncate(bs->file->bs, cluster_offset + s->cluster_size); /* if encrypted, we must initialize the cluster content which won't be written */ if (bs->encrypted && @@ -463,7 +473,8 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, errno = EIO; return -1; } - if (bdrv_pwrite(bs->file, cluster_offset + i * 512, + if (bdrv_pwrite(bs->file->bs, + cluster_offset + i * 512, s->cluster_data, 512) != 512) return -1; } @@ -477,7 +488,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, /* update L2 table */ tmp = cpu_to_be64(cluster_offset); l2_table[l2_index] = tmp; - if (bdrv_pwrite_sync(bs->file, l2_offset + l2_index * sizeof(tmp), + if (bdrv_pwrite_sync(bs->file->bs, l2_offset + l2_index * sizeof(tmp), &tmp, sizeof(tmp)) < 0) return 0; } @@ -485,7 +496,7 @@ static uint64_t get_cluster_offset(BlockDriverState *bs, } static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum) + int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) { BDRVQcowState *s = bs->opaque; int index_in_cluster, n; @@ -506,6 +517,7 @@ static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs, return BDRV_BLOCK_DATA; } cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); + *file = bs->file->bs; return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | cluster_offset; } @@ -546,7 +558,7 @@ static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) if (s->cluster_cache_offset != coffset) { csize = cluster_offset >> (63 - s->cluster_bits); csize &= (s->cluster_size - 1); - ret = bdrv_pread(bs->file, coffset, s->cluster_data, csize); + ret = bdrv_pread(bs->file->bs, coffset, s->cluster_data, csize); if (ret != csize) return -1; if (decompress_buffer(s->cluster_cache, s->cluster_size, @@ -594,13 +606,13 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, } if (!cluster_offset) { - if (bs->backing_hd) { + if (bs->backing) { /* read from the base image */ hd_iov.iov_base = (void *)buf; hd_iov.iov_len = n * 512; qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->backing_hd, sector_num, + ret = bdrv_co_readv(bs->backing->bs, sector_num, n, &hd_qiov); qemu_co_mutex_lock(&s->lock); if (ret < 0) { @@ -625,7 +637,7 @@ static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, hd_iov.iov_len = n * 512; qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file, + ret = bdrv_co_readv(bs->file->bs, (cluster_offset >> 9) + index_in_cluster, n, &hd_qiov); qemu_co_mutex_lock(&s->lock); @@ -727,7 +739,7 @@ static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, hd_iov.iov_len = n * 512; qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_writev(bs->file, + ret = bdrv_co_writev(bs->file->bs, (cluster_offset >> 9) + index_in_cluster, n, &hd_qiov); qemu_co_mutex_lock(&s->lock); @@ -775,7 +787,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) int flags = 0; Error *local_err = NULL; int ret; - BlockDriverState *qcow_bs; + BlockBackend *qcow_blk; /* Read out options */ total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), @@ -791,15 +803,17 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) goto cleanup; } - qcow_bs = NULL; - ret = bdrv_open(&qcow_bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); - if (ret < 0) { + qcow_blk = blk_new_open(filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); + if (qcow_blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto cleanup; } - ret = bdrv_truncate(qcow_bs, 0); + blk_set_allow_write_beyond_eof(qcow_blk, true); + + ret = blk_truncate(qcow_blk, 0); if (ret < 0) { goto exit; } @@ -839,13 +853,13 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) } /* write all the data */ - ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header)); + ret = blk_pwrite(qcow_blk, 0, &header, sizeof(header)); if (ret != sizeof(header)) { goto exit; } if (backing_file) { - ret = bdrv_pwrite(qcow_bs, sizeof(header), + ret = blk_pwrite(qcow_blk, sizeof(header), backing_file, backing_filename_len); if (ret != backing_filename_len) { goto exit; @@ -855,7 +869,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) tmp = g_malloc0(BDRV_SECTOR_SIZE); for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/ BDRV_SECTOR_SIZE); i++) { - ret = bdrv_pwrite(qcow_bs, header_size + + ret = blk_pwrite(qcow_blk, header_size + BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE); if (ret != BDRV_SECTOR_SIZE) { g_free(tmp); @@ -866,7 +880,7 @@ static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) g_free(tmp); ret = 0; exit: - bdrv_unref(qcow_bs); + blk_unref(qcow_blk); cleanup: g_free(backing_file); return ret; @@ -879,10 +893,10 @@ static int qcow_make_empty(BlockDriverState *bs) int ret; memset(s->l1_table, 0, l1_length); - if (bdrv_pwrite_sync(bs->file, s->l1_table_offset, s->l1_table, + if (bdrv_pwrite_sync(bs->file->bs, s->l1_table_offset, s->l1_table, l1_length) < 0) return -1; - ret = bdrv_truncate(bs->file, s->l1_table_offset + l1_length); + ret = bdrv_truncate(bs->file->bs, s->l1_table_offset + l1_length); if (ret < 0) return ret; @@ -962,7 +976,7 @@ static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, } cluster_offset &= s->cluster_offset_mask; - ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); + ret = bdrv_pwrite(bs->file->bs, cluster_offset, out_buf, out_len); if (ret < 0) { goto fail; } diff --git a/qemu/block/qcow2-cache.c b/qemu/block/qcow2-cache.c index 53b8afc3d..0fe8edae4 100644 --- a/qemu/block/qcow2-cache.c +++ b/qemu/block/qcow2-cache.c @@ -22,6 +22,13 @@ * THE SOFTWARE. */ +/* Needed for CONFIG_MADVISE */ +#include "qemu/osdep.h" + +#if defined(CONFIG_MADVISE) || defined(CONFIG_POSIX_MADVISE) +#include <sys/mman.h> +#endif + #include "block/block_int.h" #include "qemu-common.h" #include "qcow2.h" @@ -29,9 +36,9 @@ typedef struct Qcow2CachedTable { int64_t offset; - bool dirty; uint64_t lru_counter; int ref; + bool dirty; } Qcow2CachedTable; struct Qcow2Cache { @@ -41,34 +48,85 @@ struct Qcow2Cache { bool depends_on_flush; void *table_array; uint64_t lru_counter; + uint64_t cache_clean_lru_counter; }; static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs, Qcow2Cache *c, int table) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; return (uint8_t *) c->table_array + (size_t) table * s->cluster_size; } static inline int qcow2_cache_get_table_idx(BlockDriverState *bs, Qcow2Cache *c, void *table) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array; int idx = table_offset / s->cluster_size; assert(idx >= 0 && idx < c->size && table_offset % s->cluster_size == 0); return idx; } +static void qcow2_cache_table_release(BlockDriverState *bs, Qcow2Cache *c, + int i, int num_tables) +{ +#if QEMU_MADV_DONTNEED != QEMU_MADV_INVALID + BDRVQcow2State *s = bs->opaque; + void *t = qcow2_cache_get_table_addr(bs, c, i); + int align = getpagesize(); + size_t mem_size = (size_t) s->cluster_size * num_tables; + size_t offset = QEMU_ALIGN_UP((uintptr_t) t, align) - (uintptr_t) t; + size_t length = QEMU_ALIGN_DOWN(mem_size - offset, align); + if (length > 0) { + qemu_madvise((uint8_t *) t + offset, length, QEMU_MADV_DONTNEED); + } +#endif +} + +static inline bool can_clean_entry(Qcow2Cache *c, int i) +{ + Qcow2CachedTable *t = &c->entries[i]; + return t->ref == 0 && !t->dirty && t->offset != 0 && + t->lru_counter <= c->cache_clean_lru_counter; +} + +void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c) +{ + int i = 0; + while (i < c->size) { + int to_clean = 0; + + /* Skip the entries that we don't need to clean */ + while (i < c->size && !can_clean_entry(c, i)) { + i++; + } + + /* And count how many we can clean in a row */ + while (i < c->size && can_clean_entry(c, i)) { + c->entries[i].offset = 0; + c->entries[i].lru_counter = 0; + i++; + to_clean++; + } + + if (to_clean > 0) { + qcow2_cache_table_release(bs, c, i - to_clean, to_clean); + } + } + + c->cache_clean_lru_counter = c->lru_counter; +} + Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; Qcow2Cache *c; c = g_new0(Qcow2Cache, 1); c->size = num_tables; c->entries = g_try_new0(Qcow2CachedTable, num_tables); - c->table_array = qemu_try_blockalign(bs->file, + c->table_array = qemu_try_blockalign(bs->file->bs, (size_t) num_tables * s->cluster_size); if (!c->entries || !c->table_array) { @@ -113,7 +171,7 @@ static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c) static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int ret = 0; if (!c->entries[i].dirty || !c->entries[i].offset) { @@ -126,7 +184,7 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) if (c->depends) { ret = qcow2_cache_flush_dependency(bs, c); } else if (c->depends_on_flush) { - ret = bdrv_flush(bs->file); + ret = bdrv_flush(bs->file->bs); if (ret >= 0) { c->depends_on_flush = false; } @@ -157,7 +215,7 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); } - ret = bdrv_pwrite(bs->file, c->entries[i].offset, + ret = bdrv_pwrite(bs->file->bs, c->entries[i].offset, qcow2_cache_get_table_addr(bs, c, i), s->cluster_size); if (ret < 0) { return ret; @@ -170,7 +228,7 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int result = 0; int ret; int i; @@ -185,7 +243,7 @@ int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) } if (result == 0) { - ret = bdrv_flush(bs->file); + ret = bdrv_flush(bs->file->bs); if (ret < 0) { result = ret; } @@ -237,6 +295,8 @@ int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c) c->entries[i].lru_counter = 0; } + qcow2_cache_table_release(bs, c, 0, c->size); + c->lru_counter = 0; return 0; @@ -245,7 +305,7 @@ int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c) static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, void **table, bool read_from_disk) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int i; int ret; int lookup_index; @@ -295,7 +355,8 @@ static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); } - ret = bdrv_pread(bs->file, offset, qcow2_cache_get_table_addr(bs, c, i), + ret = bdrv_pread(bs->file->bs, offset, + qcow2_cache_get_table_addr(bs, c, i), s->cluster_size); if (ret < 0) { return ret; diff --git a/qemu/block/qcow2-cluster.c b/qemu/block/qcow2-cluster.c index b43f186eb..31ecc1030 100644 --- a/qemu/block/qcow2-cluster.c +++ b/qemu/block/qcow2-cluster.c @@ -22,8 +22,10 @@ * THE SOFTWARE. */ +#include "qemu/osdep.h" #include <zlib.h> +#include "qapi/error.h" #include "qemu-common.h" #include "block/block_int.h" #include "block/qcow2.h" @@ -32,7 +34,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, bool exact_size) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int new_l1_size2, ret, i; uint64_t *new_l1_table; int64_t old_l1_table_offset, old_l1_size; @@ -72,7 +74,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, #endif new_l1_size2 = sizeof(uint64_t) * new_l1_size; - new_l1_table = qemu_try_blockalign(bs->file, + new_l1_table = qemu_try_blockalign(bs->file->bs, align_offset(new_l1_size2, 512)); if (new_l1_table == NULL) { return -ENOMEM; @@ -105,7 +107,8 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); for(i = 0; i < s->l1_size; i++) new_l1_table[i] = cpu_to_be64(new_l1_table[i]); - ret = bdrv_pwrite_sync(bs->file, new_l1_table_offset, new_l1_table, new_l1_size2); + ret = bdrv_pwrite_sync(bs->file->bs, new_l1_table_offset, + new_l1_table, new_l1_size2); if (ret < 0) goto fail; for(i = 0; i < s->l1_size; i++) @@ -115,7 +118,8 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); cpu_to_be32w((uint32_t*)data, new_l1_size); stq_be_p(data + 4, new_l1_table_offset); - ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_size), data,sizeof(data)); + ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_size), + data, sizeof(data)); if (ret < 0) { goto fail; } @@ -148,7 +152,7 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, static int l2_load(BlockDriverState *bs, uint64_t l2_offset, uint64_t **l2_table) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int ret; ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table); @@ -163,7 +167,7 @@ static int l2_load(BlockDriverState *bs, uint64_t l2_offset, #define L1_ENTRIES_PER_SECTOR (512 / 8) int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t buf[L1_ENTRIES_PER_SECTOR] = { 0 }; int l1_start_index; int i, ret; @@ -182,8 +186,9 @@ int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index) } BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); - ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index, - buf, sizeof(buf)); + ret = bdrv_pwrite_sync(bs->file->bs, + s->l1_table_offset + 8 * l1_start_index, + buf, sizeof(buf)); if (ret < 0) { return ret; } @@ -203,7 +208,7 @@ int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index) static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t old_l2_offset; uint64_t *l2_table = NULL; int64_t l2_offset; @@ -298,7 +303,7 @@ fail: * as contiguous. (This allows it, for example, to stop at the first compressed * cluster which may require a different handling) */ -static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, +static int count_contiguous_clusters(int nb_clusters, int cluster_size, uint64_t *l2_table, uint64_t stop_flags) { int i; @@ -309,7 +314,7 @@ static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, if (!offset) return 0; - assert(qcow2_get_cluster_type(first_entry) != QCOW2_CLUSTER_COMPRESSED); + assert(qcow2_get_cluster_type(first_entry) == QCOW2_CLUSTER_NORMAL); for (i = 0; i < nb_clusters; i++) { uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask; @@ -321,14 +326,16 @@ static int count_contiguous_clusters(uint64_t nb_clusters, int cluster_size, return i; } -static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_table) +static int count_contiguous_clusters_by_type(int nb_clusters, + uint64_t *l2_table, + int wanted_type) { int i; for (i = 0; i < nb_clusters; i++) { int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i])); - if (type != QCOW2_CLUSTER_UNALLOCATED) { + if (type != wanted_type) { break; } } @@ -339,7 +346,7 @@ static int count_contiguous_free_clusters(uint64_t nb_clusters, uint64_t *l2_tab /* The crypt function is compatible with the linux cryptoloop algorithm for < 4 GB images. NOTE: out_buf == in_buf is supported */ -int qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, +int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, uint8_t *out_buf, const uint8_t *in_buf, int nb_sectors, bool enc, Error **errp) @@ -387,7 +394,7 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, uint64_t cluster_offset, int n_start, int n_end) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; QEMUIOVector qiov; struct iovec iov; int n, ret; @@ -440,7 +447,8 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs, } BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); - ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov); + ret = bdrv_co_writev(bs->file->bs, (cluster_offset >> 9) + n_start, n, + &qiov); if (ret < 0) { goto out; } @@ -469,7 +477,7 @@ out: int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, int *num, uint64_t *cluster_offset) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; unsigned int l2_index; uint64_t l1_index, l2_offset, *l2_table; int l1_bits, c; @@ -495,10 +503,11 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, if (nb_needed > nb_available) { nb_needed = nb_available; } + assert(nb_needed <= INT_MAX); *cluster_offset = 0; - /* seek the the l2 offset in the l1 table */ + /* seek to the l2 offset in the l1 table */ l1_index = offset >> l1_bits; if (l1_index >= s->l1_size) { @@ -530,6 +539,8 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); *cluster_offset = be64_to_cpu(l2_table[l2_index]); + + /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */ nb_clusters = size_to_clusters(s, nb_needed << 9); ret = qcow2_get_cluster_type(*cluster_offset); @@ -547,13 +558,14 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, ret = -EIO; goto fail; } - c = count_contiguous_clusters(nb_clusters, s->cluster_size, - &l2_table[l2_index], QCOW_OFLAG_ZERO); + c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index], + QCOW2_CLUSTER_ZERO); *cluster_offset = 0; break; case QCOW2_CLUSTER_UNALLOCATED: /* how many empty clusters ? */ - c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]); + c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index], + QCOW2_CLUSTER_UNALLOCATED); *cluster_offset = 0; break; case QCOW2_CLUSTER_NORMAL: @@ -606,13 +618,13 @@ static int get_cluster_table(BlockDriverState *bs, uint64_t offset, uint64_t **new_l2_table, int *new_l2_index) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; unsigned int l2_index; uint64_t l1_index, l2_offset; uint64_t *l2_table = NULL; int ret; - /* seek the the l2 offset in the l1 table */ + /* seek to the l2 offset in the l1 table */ l1_index = offset >> (s->l2_bits + s->cluster_bits); if (l1_index >= s->l1_size) { @@ -680,7 +692,7 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, uint64_t offset, int compressed_size) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int l2_index, ret; uint64_t *l2_table; int64_t cluster_offset; @@ -725,7 +737,7 @@ uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int ret; if (r->nb_sectors == 0) { @@ -754,7 +766,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r) int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int i, j = 0, l2_index, ret; uint64_t *old_cluster, *l2_table; uint64_t cluster_offset = m->alloc_offset; @@ -814,7 +826,6 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) /* * If this was a COW, we need to decrease the refcount of the old cluster. - * Also flush bs->file to get the right order for L2 and refcount update. * * Don't discard clusters that reach a refcount of 0 (e.g. compressed * clusters), the next write will reuse them anyway. @@ -837,7 +848,7 @@ err: * write, but require COW to be performed (this includes yet unallocated space, * which must copy from the backing file) */ -static int count_cow_clusters(BDRVQcowState *s, int nb_clusters, +static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters, uint64_t *l2_table, int l2_index) { int i; @@ -883,7 +894,7 @@ out: static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, uint64_t *cur_bytes, QCowL2Meta **m) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; QCowL2Meta *old_alloc; uint64_t bytes = *cur_bytes; @@ -956,11 +967,11 @@ static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int l2_index; uint64_t cluster_offset; uint64_t *l2_table; - unsigned int nb_clusters; + uint64_t nb_clusters; unsigned int keep_clusters; int ret; @@ -979,6 +990,7 @@ static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, l2_index = offset_to_l2_index(s, guest_offset); nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); /* Find L2 entry for the first involved cluster */ ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); @@ -1061,9 +1073,9 @@ out: * restarted, but the whole request should not be failed. */ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, - uint64_t *host_offset, unsigned int *nb_clusters) + uint64_t *host_offset, uint64_t *nb_clusters) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset, *host_offset, *nb_clusters); @@ -1079,7 +1091,7 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, *host_offset = cluster_offset; return 0; } else { - int ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); + int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); if (ret < 0) { return ret; } @@ -1111,11 +1123,11 @@ static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int l2_index; uint64_t *l2_table; uint64_t entry; - unsigned int nb_clusters; + uint64_t nb_clusters; int ret; uint64_t alloc_cluster_offset; @@ -1133,6 +1145,7 @@ static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, l2_index = offset_to_l2_index(s, guest_offset); nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); /* Find L2 entry for the first involved cluster */ ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); @@ -1263,7 +1276,7 @@ fail: int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, int *num, uint64_t *host_offset, QCowL2Meta **m) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t start, remaining; uint64_t cluster_offset; uint64_t cur_bytes; @@ -1397,7 +1410,7 @@ static int decompress_buffer(uint8_t *out_buf, int out_buf_size, int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int ret, csize, nb_csectors, sector_offset; uint64_t coffset; @@ -1407,7 +1420,8 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) sector_offset = coffset & 511; csize = nb_csectors * 512 - sector_offset; BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); - ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data, nb_csectors); + ret = bdrv_read(bs->file->bs, coffset >> 9, s->cluster_data, + nb_csectors); if (ret < 0) { return ret; } @@ -1426,9 +1440,10 @@ int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) * clusters. */ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, - unsigned int nb_clusters, enum qcow2_discard_type type, bool full_discard) + uint64_t nb_clusters, enum qcow2_discard_type type, + bool full_discard) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t *l2_table; int l2_index; int ret; @@ -1441,6 +1456,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, /* Limit nb_clusters to one L2 table */ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); for (i = 0; i < nb_clusters; i++) { uint64_t old_l2_entry; @@ -1462,7 +1478,7 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, */ switch (qcow2_get_cluster_type(old_l2_entry)) { case QCOW2_CLUSTER_UNALLOCATED: - if (full_discard || !bs->backing_hd) { + if (full_discard || !bs->backing) { continue; } break; @@ -1501,9 +1517,9 @@ static int discard_single_l2(BlockDriverState *bs, uint64_t offset, int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors, enum qcow2_discard_type type, bool full_discard) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t end_offset; - unsigned int nb_clusters; + uint64_t nb_clusters; int ret; end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS); @@ -1545,9 +1561,9 @@ fail: * clusters. */ static int zero_single_l2(BlockDriverState *bs, uint64_t offset, - unsigned int nb_clusters) + uint64_t nb_clusters) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t *l2_table; int l2_index; int ret; @@ -1560,6 +1576,7 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset, /* Limit nb_clusters to one L2 table */ nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); for (i = 0; i < nb_clusters; i++) { uint64_t old_offset; @@ -1583,8 +1600,8 @@ static int zero_single_l2(BlockDriverState *bs, uint64_t offset, int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors) { - BDRVQcowState *s = bs->opaque; - unsigned int nb_clusters; + BDRVQcow2State *s = bs->opaque; + uint64_t nb_clusters; int ret; /* The zero flag is only supported by version 3 and newer */ @@ -1626,9 +1643,10 @@ fail: static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, int l1_size, int64_t *visited_l1_entries, int64_t l1_entries, - BlockDriverAmendStatusCB *status_cb) + BlockDriverAmendStatusCB *status_cb, + void *cb_opaque) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; bool is_active_l1 = (l1_table == s->l1_table); uint64_t *l2_table = NULL; int ret; @@ -1637,7 +1655,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, if (!is_active_l1) { /* inactive L2 tables require a buffer to be stored in when loading * them from disk */ - l2_table = qemu_try_blockalign(bs->file, s->cluster_size); + l2_table = qemu_try_blockalign(bs->file->bs, s->cluster_size); if (l2_table == NULL) { return -ENOMEM; } @@ -1652,7 +1670,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, /* unallocated */ (*visited_l1_entries)++; if (status_cb) { - status_cb(bs, *visited_l1_entries, l1_entries); + status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque); } continue; } @@ -1671,8 +1689,8 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, (void **)&l2_table); } else { /* load inactive L2 tables from disk */ - ret = bdrv_read(bs->file, l2_offset / BDRV_SECTOR_SIZE, - (void *)l2_table, s->cluster_sectors); + ret = bdrv_read(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE, + (void *)l2_table, s->cluster_sectors); } if (ret < 0) { goto fail; @@ -1695,7 +1713,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, } if (!preallocated) { - if (!bs->backing_hd) { + if (!bs->backing) { /* not backed; therefore we can simply deallocate the * cluster */ l2_table[j] = 0; @@ -1746,7 +1764,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, goto fail; } - ret = bdrv_write_zeroes(bs->file, offset / BDRV_SECTOR_SIZE, + ret = bdrv_write_zeroes(bs->file->bs, offset / BDRV_SECTOR_SIZE, s->cluster_sectors, 0); if (ret < 0) { if (!preallocated) { @@ -1779,8 +1797,8 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, goto fail; } - ret = bdrv_write(bs->file, l2_offset / BDRV_SECTOR_SIZE, - (void *)l2_table, s->cluster_sectors); + ret = bdrv_write(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE, + (void *)l2_table, s->cluster_sectors); if (ret < 0) { goto fail; } @@ -1789,7 +1807,7 @@ static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, (*visited_l1_entries)++; if (status_cb) { - status_cb(bs, *visited_l1_entries, l1_entries); + status_cb(bs, *visited_l1_entries, l1_entries, cb_opaque); } } @@ -1813,9 +1831,10 @@ fail: * qcow2 version which doesn't yet support metadata zero clusters. */ int qcow2_expand_zero_clusters(BlockDriverState *bs, - BlockDriverAmendStatusCB *status_cb) + BlockDriverAmendStatusCB *status_cb, + void *cb_opaque) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t *l1_table = NULL; int64_t l1_entries = 0, visited_l1_entries = 0; int ret; @@ -1830,7 +1849,7 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs, ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size, &visited_l1_entries, l1_entries, - status_cb); + status_cb, cb_opaque); if (ret < 0) { goto fail; } @@ -1853,8 +1872,9 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs, l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE); - ret = bdrv_read(bs->file, s->snapshots[i].l1_table_offset / - BDRV_SECTOR_SIZE, (void *)l1_table, l1_sectors); + ret = bdrv_read(bs->file->bs, + s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE, + (void *)l1_table, l1_sectors); if (ret < 0) { goto fail; } @@ -1865,7 +1885,7 @@ int qcow2_expand_zero_clusters(BlockDriverState *bs, ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size, &visited_l1_entries, l1_entries, - status_cb); + status_cb, cb_opaque); if (ret < 0) { goto fail; } diff --git a/qemu/block/qcow2-refcount.c b/qemu/block/qcow2-refcount.c index b0ee42d81..ca6094ff5 100644 --- a/qemu/block/qcow2-refcount.c +++ b/qemu/block/qcow2-refcount.c @@ -22,6 +22,8 @@ * THE SOFTWARE. */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "qemu-common.h" #include "block/block_int.h" #include "block/qcow2.h" @@ -82,7 +84,7 @@ static Qcow2SetRefcountFunc *const set_refcount_funcs[] = { int qcow2_refcount_init(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; unsigned int refcount_table_size2, i; int ret; @@ -101,7 +103,7 @@ int qcow2_refcount_init(BlockDriverState *bs) goto fail; } BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); - ret = bdrv_pread(bs->file, s->refcount_table_offset, + ret = bdrv_pread(bs->file->bs, s->refcount_table_offset, s->refcount_table, refcount_table_size2); if (ret < 0) { goto fail; @@ -116,7 +118,7 @@ int qcow2_refcount_init(BlockDriverState *bs) void qcow2_refcount_close(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; g_free(s->refcount_table); } @@ -214,7 +216,7 @@ static int load_refcount_block(BlockDriverState *bs, int64_t refcount_block_offset, void **refcount_block) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int ret; BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD); @@ -231,7 +233,7 @@ static int load_refcount_block(BlockDriverState *bs, int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index, uint64_t *refcount) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t refcount_table_index, block_index; int64_t refcount_block_offset; int ret; @@ -274,7 +276,7 @@ int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index, * Rounds the refcount table size up to avoid growing the table for each single * refcount block that is allocated. */ -static unsigned int next_refcount_table_size(BDRVQcowState *s, +static unsigned int next_refcount_table_size(BDRVQcow2State *s, unsigned int min_size) { unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1; @@ -290,7 +292,7 @@ static unsigned int next_refcount_table_size(BDRVQcowState *s, /* Checks if two offsets are described by the same refcount block */ -static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a, +static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a, uint64_t offset_b) { uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits); @@ -308,7 +310,7 @@ static int in_same_refcount_block(BDRVQcowState *s, uint64_t offset_a, static int alloc_refcount_block(BlockDriverState *bs, int64_t cluster_index, void **refcount_block) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; unsigned int refcount_table_index; int ret; @@ -431,7 +433,7 @@ static int alloc_refcount_block(BlockDriverState *bs, if (refcount_table_index < s->refcount_table_size) { uint64_t data64 = cpu_to_be64(new_block); BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP); - ret = bdrv_pwrite_sync(bs->file, + ret = bdrv_pwrite_sync(bs->file->bs, s->refcount_table_offset + refcount_table_index * sizeof(uint64_t), &data64, sizeof(data64)); if (ret < 0) { @@ -535,7 +537,7 @@ static int alloc_refcount_block(BlockDriverState *bs, /* Write refcount blocks to disk */ BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); - ret = bdrv_pwrite_sync(bs->file, meta_offset, new_blocks, + ret = bdrv_pwrite_sync(bs->file->bs, meta_offset, new_blocks, blocks_clusters * s->cluster_size); g_free(new_blocks); new_blocks = NULL; @@ -549,7 +551,7 @@ static int alloc_refcount_block(BlockDriverState *bs, } BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE); - ret = bdrv_pwrite_sync(bs->file, table_offset, new_table, + ret = bdrv_pwrite_sync(bs->file->bs, table_offset, new_table, table_size * sizeof(uint64_t)); if (ret < 0) { goto fail_table; @@ -560,12 +562,16 @@ static int alloc_refcount_block(BlockDriverState *bs, } /* Hook up the new refcount table in the qcow2 header */ - uint8_t data[12]; - cpu_to_be64w((uint64_t*)data, table_offset); - cpu_to_be32w((uint32_t*)(data + 8), table_clusters); + struct QEMU_PACKED { + uint64_t d64; + uint32_t d32; + } data; + cpu_to_be64w(&data.d64, table_offset); + cpu_to_be32w(&data.d32, table_clusters); BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE); - ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, refcount_table_offset), - data, sizeof(data)); + ret = bdrv_pwrite_sync(bs->file->bs, + offsetof(QCowHeader, refcount_table_offset), + &data, sizeof(data)); if (ret < 0) { goto fail_table; } @@ -605,7 +611,7 @@ fail_block: void qcow2_process_discards(BlockDriverState *bs, int ret) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; Qcow2DiscardRegion *d, *next; QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) { @@ -613,7 +619,7 @@ void qcow2_process_discards(BlockDriverState *bs, int ret) /* Discard is optional, ignore the return value */ if (ret >= 0) { - bdrv_discard(bs->file, + bdrv_discard(bs->file->bs, d->offset >> BDRV_SECTOR_BITS, d->bytes >> BDRV_SECTOR_BITS); } @@ -625,7 +631,7 @@ void qcow2_process_discards(BlockDriverState *bs, int ret) static void update_refcount_discard(BlockDriverState *bs, uint64_t offset, uint64_t length) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; Qcow2DiscardRegion *d, *p, *next; QTAILQ_FOREACH(d, &s->discards, next) { @@ -682,7 +688,7 @@ static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, bool decrease, enum qcow2_discard_type type) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int64_t start, last, cluster_offset; void *refcount_block = NULL; int64_t old_table_index = -1; @@ -793,7 +799,7 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs, uint64_t addend, bool decrease, enum qcow2_discard_type type) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int ret; ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend, @@ -815,7 +821,7 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs, /* return < 0 if error */ static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t i, nb_clusters, refcount; int ret; @@ -875,10 +881,10 @@ int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) return offset; } -int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, - int nb_clusters) +int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int64_t nb_clusters) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t cluster_index, refcount; uint64_t i; int ret; @@ -916,7 +922,7 @@ int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, contiguous sectors. size must be <= cluster_size */ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int64_t offset; size_t free_in_cluster; int ret; @@ -949,11 +955,17 @@ int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) { offset = new_cluster; + free_in_cluster = s->cluster_size; + } else { + free_in_cluster += s->cluster_size; } } assert(offset); ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); + if (ret < 0) { + offset = 0; + } } while (ret == -EAGAIN); if (ret < 0) { return ret; @@ -992,7 +1004,7 @@ void qcow2_free_clusters(BlockDriverState *bs, void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, int nb_clusters, enum qcow2_discard_type type) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; switch (qcow2_get_cluster_type(l2_entry)) { case QCOW2_CLUSTER_COMPRESSED: @@ -1036,7 +1048,7 @@ void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, int qcow2_update_snapshot_refcount(BlockDriverState *bs, int64_t l1_table_offset, int l1_size, int addend) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount; bool l1_allocated = false; int64_t old_offset, old_l2_offset; @@ -1062,7 +1074,7 @@ int qcow2_update_snapshot_refcount(BlockDriverState *bs, } l1_allocated = true; - ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); + ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2); if (ret < 0) { goto fail; } @@ -1215,7 +1227,8 @@ fail: cpu_to_be64s(&l1_table[i]); } - ret = bdrv_pwrite_sync(bs->file, l1_table_offset, l1_table, l1_size2); + ret = bdrv_pwrite_sync(bs->file->bs, l1_table_offset, + l1_table, l1_size2); for (i = 0; i < l1_size; i++) { be64_to_cpus(&l1_table[i]); @@ -1233,7 +1246,7 @@ fail: /* refcount checking functions */ -static size_t refcount_array_byte_size(BDRVQcowState *s, uint64_t entries) +static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries) { /* This assertion holds because there is no way we can address more than * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because @@ -1256,10 +1269,10 @@ static size_t refcount_array_byte_size(BDRVQcowState *s, uint64_t entries) * refcount array buffer will be aligned to a cluster boundary, and the newly * allocated area will be zeroed. */ -static int realloc_refcount_array(BDRVQcowState *s, void **array, +static int realloc_refcount_array(BDRVQcow2State *s, void **array, int64_t *size, int64_t new_size) { - size_t old_byte_size, new_byte_size; + int64_t old_byte_size, new_byte_size; void *new_ptr; /* Round to clusters so the array can be directly written to disk */ @@ -1275,13 +1288,17 @@ static int realloc_refcount_array(BDRVQcowState *s, void **array, assert(new_byte_size > 0); + if (new_byte_size > SIZE_MAX) { + return -ENOMEM; + } + new_ptr = g_try_realloc(*array, new_byte_size); if (!new_ptr) { return -ENOMEM; } if (new_byte_size > old_byte_size) { - memset((void *)((uintptr_t)new_ptr + old_byte_size), 0, + memset((char *)new_ptr + old_byte_size, 0, new_byte_size - old_byte_size); } @@ -1294,7 +1311,7 @@ static int realloc_refcount_array(BDRVQcowState *s, void **array, /* * Increases the refcount for a range of clusters in a given refcount table. * This is used to construct a temporary refcount table out of L1 and L2 tables - * which can be compared the the refcount table saved in the image. + * which can be compared to the refcount table saved in the image. * * Modifies the number of errors in res. */ @@ -1304,7 +1321,7 @@ static int inc_refcounts(BlockDriverState *bs, int64_t *refcount_table_size, int64_t offset, int64_t size) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t start, last, cluster_offset, k, refcount; int ret; @@ -1330,6 +1347,9 @@ static int inc_refcounts(BlockDriverState *bs, if (refcount == s->refcount_max) { fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64 "\n", cluster_offset); + fprintf(stderr, "Use qemu-img amend to increase the refcount entry " + "width or qemu-img convert to create a clean copy if the " + "image cannot be opened for writing\n"); res->corruptions++; continue; } @@ -1357,7 +1377,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, int64_t *refcount_table_size, int64_t l2_offset, int flags) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t *l2_table, l2_entry; uint64_t next_contiguous_offset = 0; int i, l2_size, nb_csectors, ret; @@ -1366,7 +1386,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, l2_size = s->l2_size * sizeof(uint64_t); l2_table = g_malloc(l2_size); - ret = bdrv_pread(bs->file, l2_offset, l2_table, l2_size); + ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, l2_size); if (ret < 0) { fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); res->check_errors++; @@ -1477,7 +1497,7 @@ static int check_refcounts_l1(BlockDriverState *bs, int64_t l1_table_offset, int l1_size, int flags) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t *l1_table = NULL, l2_offset, l1_size2; int i, ret; @@ -1498,7 +1518,7 @@ static int check_refcounts_l1(BlockDriverState *bs, res->check_errors++; goto fail; } - ret = bdrv_pread(bs->file, l1_table_offset, l1_table, l1_size2); + ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2); if (ret < 0) { fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); res->check_errors++; @@ -1554,7 +1574,7 @@ fail: static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size); int ret; uint64_t refcount; @@ -1596,7 +1616,7 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, } } - ret = bdrv_pread(bs->file, l2_offset, l2_table, + ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, s->l2_size * sizeof(uint64_t)); if (ret < 0) { fprintf(stderr, "ERROR: Could not read L2 table: %s\n", @@ -1648,7 +1668,8 @@ static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, goto fail; } - ret = bdrv_pwrite(bs->file, l2_offset, l2_table, s->cluster_size); + ret = bdrv_pwrite(bs->file->bs, l2_offset, l2_table, + s->cluster_size); if (ret < 0) { fprintf(stderr, "ERROR: Could not write L2 table: %s\n", strerror(-ret)); @@ -1673,7 +1694,7 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix, bool *rebuild, void **refcount_table, int64_t *nb_clusters) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int64_t i, size; int ret; @@ -1703,11 +1724,11 @@ static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, goto resize_fail; } - ret = bdrv_truncate(bs->file, offset + s->cluster_size); + ret = bdrv_truncate(bs->file->bs, offset + s->cluster_size); if (ret < 0) { goto resize_fail; } - size = bdrv_getlength(bs->file); + size = bdrv_getlength(bs->file->bs); if (size < 0) { ret = size; goto resize_fail; @@ -1776,7 +1797,7 @@ static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix, bool *rebuild, void **refcount_table, int64_t *nb_clusters) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int64_t i; QCowSnapshot *sn; int ret; @@ -1840,7 +1861,7 @@ static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res, int64_t *highest_cluster, void *refcount_table, int64_t nb_clusters) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int64_t i; uint64_t refcount1, refcount2; int ret; @@ -1917,7 +1938,7 @@ static int64_t alloc_clusters_imrt(BlockDriverState *bs, int64_t *imrt_nb_clusters, int64_t *first_free_cluster) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int64_t cluster = *first_free_cluster, i; bool first_gap = true; int contiguous_free_clusters; @@ -1987,7 +2008,7 @@ static int rebuild_refcount_structure(BlockDriverState *bs, void **refcount_table, int64_t *nb_clusters) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0; int64_t refblock_offset, refblock_start, refblock_index; uint32_t reftable_size = 0; @@ -2081,7 +2102,7 @@ write_refblocks: on_disk_refblock = (void *)((char *) *refcount_table + refblock_index * s->cluster_size); - ret = bdrv_write(bs->file, refblock_offset / BDRV_SECTOR_SIZE, + ret = bdrv_write(bs->file->bs, refblock_offset / BDRV_SECTOR_SIZE, on_disk_refblock, s->cluster_sectors); if (ret < 0) { fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); @@ -2130,7 +2151,7 @@ write_refblocks: } assert(reftable_size < INT_MAX / sizeof(uint64_t)); - ret = bdrv_pwrite(bs->file, reftable_offset, on_disk_reftable, + ret = bdrv_pwrite(bs->file->bs, reftable_offset, on_disk_reftable, reftable_size * sizeof(uint64_t)); if (ret < 0) { fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); @@ -2142,8 +2163,8 @@ write_refblocks: reftable_offset); cpu_to_be32w(&reftable_offset_and_clusters.reftable_clusters, size_to_clusters(s, reftable_size * sizeof(uint64_t))); - ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, - refcount_table_offset), + ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, + refcount_table_offset), &reftable_offset_and_clusters, sizeof(reftable_offset_and_clusters)); if (ret < 0) { @@ -2174,14 +2195,14 @@ fail: int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, BdrvCheckMode fix) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; BdrvCheckResult pre_compare_res; int64_t size, highest_cluster, nb_clusters; void *refcount_table = NULL; bool rebuild = false; int ret; - size = bdrv_getlength(bs->file); + size = bdrv_getlength(bs->file->bs); if (size < 0) { res->check_errors++; return size; @@ -2311,7 +2332,7 @@ fail: int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, int64_t size) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int chk = s->overlap_check & ~ign; int i, j; @@ -2390,7 +2411,7 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, return -ENOMEM; } - ret = bdrv_pread(bs->file, l1_ofs, l1, l1_sz2); + ret = bdrv_pread(bs->file->bs, l1_ofs, l1, l1_sz2); if (ret < 0) { g_free(l1); return ret; @@ -2451,3 +2472,450 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, return 0; } + +/* A pointer to a function of this type is given to walk_over_reftable(). That + * function will create refblocks and pass them to a RefblockFinishOp once they + * are completed (@refblock). @refblock_empty is set if the refblock is + * completely empty. + * + * Along with the refblock, a corresponding reftable entry is passed, in the + * reftable @reftable (which may be reallocated) at @reftable_index. + * + * @allocated should be set to true if a new cluster has been allocated. + */ +typedef int (RefblockFinishOp)(BlockDriverState *bs, uint64_t **reftable, + uint64_t reftable_index, uint64_t *reftable_size, + void *refblock, bool refblock_empty, + bool *allocated, Error **errp); + +/** + * This "operation" for walk_over_reftable() allocates the refblock on disk (if + * it is not empty) and inserts its offset into the new reftable. The size of + * this new reftable is increased as required. + */ +static int alloc_refblock(BlockDriverState *bs, uint64_t **reftable, + uint64_t reftable_index, uint64_t *reftable_size, + void *refblock, bool refblock_empty, bool *allocated, + Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + int64_t offset; + + if (!refblock_empty && reftable_index >= *reftable_size) { + uint64_t *new_reftable; + uint64_t new_reftable_size; + + new_reftable_size = ROUND_UP(reftable_index + 1, + s->cluster_size / sizeof(uint64_t)); + if (new_reftable_size > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) { + error_setg(errp, + "This operation would make the refcount table grow " + "beyond the maximum size supported by QEMU, aborting"); + return -ENOTSUP; + } + + new_reftable = g_try_realloc(*reftable, new_reftable_size * + sizeof(uint64_t)); + if (!new_reftable) { + error_setg(errp, "Failed to increase reftable buffer size"); + return -ENOMEM; + } + + memset(new_reftable + *reftable_size, 0, + (new_reftable_size - *reftable_size) * sizeof(uint64_t)); + + *reftable = new_reftable; + *reftable_size = new_reftable_size; + } + + if (!refblock_empty && !(*reftable)[reftable_index]) { + offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (offset < 0) { + error_setg_errno(errp, -offset, "Failed to allocate refblock"); + return offset; + } + (*reftable)[reftable_index] = offset; + *allocated = true; + } + + return 0; +} + +/** + * This "operation" for walk_over_reftable() writes the refblock to disk at the + * offset specified by the new reftable's entry. It does not modify the new + * reftable or change any refcounts. + */ +static int flush_refblock(BlockDriverState *bs, uint64_t **reftable, + uint64_t reftable_index, uint64_t *reftable_size, + void *refblock, bool refblock_empty, bool *allocated, + Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + int64_t offset; + int ret; + + if (reftable_index < *reftable_size && (*reftable)[reftable_index]) { + offset = (*reftable)[reftable_index]; + + ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size); + if (ret < 0) { + error_setg_errno(errp, -ret, "Overlap check failed"); + return ret; + } + + ret = bdrv_pwrite(bs->file->bs, offset, refblock, s->cluster_size); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to write refblock"); + return ret; + } + } else { + assert(refblock_empty); + } + + return 0; +} + +/** + * This function walks over the existing reftable and every referenced refblock; + * if @new_set_refcount is non-NULL, it is called for every refcount entry to + * create an equal new entry in the passed @new_refblock. Once that + * @new_refblock is completely filled, @operation will be called. + * + * @status_cb and @cb_opaque are used for the amend operation's status callback. + * @index is the index of the walk_over_reftable() calls and @total is the total + * number of walk_over_reftable() calls per amend operation. Both are used for + * calculating the parameters for the status callback. + * + * @allocated is set to true if a new cluster has been allocated. + */ +static int walk_over_reftable(BlockDriverState *bs, uint64_t **new_reftable, + uint64_t *new_reftable_index, + uint64_t *new_reftable_size, + void *new_refblock, int new_refblock_size, + int new_refcount_bits, + RefblockFinishOp *operation, bool *allocated, + Qcow2SetRefcountFunc *new_set_refcount, + BlockDriverAmendStatusCB *status_cb, + void *cb_opaque, int index, int total, + Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t reftable_index; + bool new_refblock_empty = true; + int refblock_index; + int new_refblock_index = 0; + int ret; + + for (reftable_index = 0; reftable_index < s->refcount_table_size; + reftable_index++) + { + uint64_t refblock_offset = s->refcount_table[reftable_index] + & REFT_OFFSET_MASK; + + status_cb(bs, (uint64_t)index * s->refcount_table_size + reftable_index, + (uint64_t)total * s->refcount_table_size, cb_opaque); + + if (refblock_offset) { + void *refblock; + + if (offset_into_cluster(s, refblock_offset)) { + qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" + PRIx64 " unaligned (reftable index: %#" + PRIx64 ")", refblock_offset, + reftable_index); + error_setg(errp, + "Image is corrupt (unaligned refblock offset)"); + return -EIO; + } + + ret = qcow2_cache_get(bs, s->refcount_block_cache, refblock_offset, + &refblock); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to retrieve refblock"); + return ret; + } + + for (refblock_index = 0; refblock_index < s->refcount_block_size; + refblock_index++) + { + uint64_t refcount; + + if (new_refblock_index >= new_refblock_size) { + /* new_refblock is now complete */ + ret = operation(bs, new_reftable, *new_reftable_index, + new_reftable_size, new_refblock, + new_refblock_empty, allocated, errp); + if (ret < 0) { + qcow2_cache_put(bs, s->refcount_block_cache, &refblock); + return ret; + } + + (*new_reftable_index)++; + new_refblock_index = 0; + new_refblock_empty = true; + } + + refcount = s->get_refcount(refblock, refblock_index); + if (new_refcount_bits < 64 && refcount >> new_refcount_bits) { + uint64_t offset; + + qcow2_cache_put(bs, s->refcount_block_cache, &refblock); + + offset = ((reftable_index << s->refcount_block_bits) + + refblock_index) << s->cluster_bits; + + error_setg(errp, "Cannot decrease refcount entry width to " + "%i bits: Cluster at offset %#" PRIx64 " has a " + "refcount of %" PRIu64, new_refcount_bits, + offset, refcount); + return -EINVAL; + } + + if (new_set_refcount) { + new_set_refcount(new_refblock, new_refblock_index++, + refcount); + } else { + new_refblock_index++; + } + new_refblock_empty = new_refblock_empty && refcount == 0; + } + + qcow2_cache_put(bs, s->refcount_block_cache, &refblock); + } else { + /* No refblock means every refcount is 0 */ + for (refblock_index = 0; refblock_index < s->refcount_block_size; + refblock_index++) + { + if (new_refblock_index >= new_refblock_size) { + /* new_refblock is now complete */ + ret = operation(bs, new_reftable, *new_reftable_index, + new_reftable_size, new_refblock, + new_refblock_empty, allocated, errp); + if (ret < 0) { + return ret; + } + + (*new_reftable_index)++; + new_refblock_index = 0; + new_refblock_empty = true; + } + + if (new_set_refcount) { + new_set_refcount(new_refblock, new_refblock_index++, 0); + } else { + new_refblock_index++; + } + } + } + } + + if (new_refblock_index > 0) { + /* Complete the potentially existing partially filled final refblock */ + if (new_set_refcount) { + for (; new_refblock_index < new_refblock_size; + new_refblock_index++) + { + new_set_refcount(new_refblock, new_refblock_index, 0); + } + } + + ret = operation(bs, new_reftable, *new_reftable_index, + new_reftable_size, new_refblock, new_refblock_empty, + allocated, errp); + if (ret < 0) { + return ret; + } + + (*new_reftable_index)++; + } + + status_cb(bs, (uint64_t)(index + 1) * s->refcount_table_size, + (uint64_t)total * s->refcount_table_size, cb_opaque); + + return 0; +} + +int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order, + BlockDriverAmendStatusCB *status_cb, + void *cb_opaque, Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + Qcow2GetRefcountFunc *new_get_refcount; + Qcow2SetRefcountFunc *new_set_refcount; + void *new_refblock = qemu_blockalign(bs->file->bs, s->cluster_size); + uint64_t *new_reftable = NULL, new_reftable_size = 0; + uint64_t *old_reftable, old_reftable_size, old_reftable_offset; + uint64_t new_reftable_index = 0; + uint64_t i; + int64_t new_reftable_offset = 0, allocated_reftable_size = 0; + int new_refblock_size, new_refcount_bits = 1 << refcount_order; + int old_refcount_order; + int walk_index = 0; + int ret; + bool new_allocation; + + assert(s->qcow_version >= 3); + assert(refcount_order >= 0 && refcount_order <= 6); + + /* see qcow2_open() */ + new_refblock_size = 1 << (s->cluster_bits - (refcount_order - 3)); + + new_get_refcount = get_refcount_funcs[refcount_order]; + new_set_refcount = set_refcount_funcs[refcount_order]; + + + do { + int total_walks; + + new_allocation = false; + + /* At least we have to do this walk and the one which writes the + * refblocks; also, at least we have to do this loop here at least + * twice (normally), first to do the allocations, and second to + * determine that everything is correctly allocated, this then makes + * three walks in total */ + total_walks = MAX(walk_index + 2, 3); + + /* First, allocate the structures so they are present in the refcount + * structures */ + ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index, + &new_reftable_size, NULL, new_refblock_size, + new_refcount_bits, &alloc_refblock, + &new_allocation, NULL, status_cb, cb_opaque, + walk_index++, total_walks, errp); + if (ret < 0) { + goto done; + } + + new_reftable_index = 0; + + if (new_allocation) { + if (new_reftable_offset) { + qcow2_free_clusters(bs, new_reftable_offset, + allocated_reftable_size * sizeof(uint64_t), + QCOW2_DISCARD_NEVER); + } + + new_reftable_offset = qcow2_alloc_clusters(bs, new_reftable_size * + sizeof(uint64_t)); + if (new_reftable_offset < 0) { + error_setg_errno(errp, -new_reftable_offset, + "Failed to allocate the new reftable"); + ret = new_reftable_offset; + goto done; + } + allocated_reftable_size = new_reftable_size; + } + } while (new_allocation); + + /* Second, write the new refblocks */ + ret = walk_over_reftable(bs, &new_reftable, &new_reftable_index, + &new_reftable_size, new_refblock, + new_refblock_size, new_refcount_bits, + &flush_refblock, &new_allocation, new_set_refcount, + status_cb, cb_opaque, walk_index, walk_index + 1, + errp); + if (ret < 0) { + goto done; + } + assert(!new_allocation); + + + /* Write the new reftable */ + ret = qcow2_pre_write_overlap_check(bs, 0, new_reftable_offset, + new_reftable_size * sizeof(uint64_t)); + if (ret < 0) { + error_setg_errno(errp, -ret, "Overlap check failed"); + goto done; + } + + for (i = 0; i < new_reftable_size; i++) { + cpu_to_be64s(&new_reftable[i]); + } + + ret = bdrv_pwrite(bs->file->bs, new_reftable_offset, new_reftable, + new_reftable_size * sizeof(uint64_t)); + + for (i = 0; i < new_reftable_size; i++) { + be64_to_cpus(&new_reftable[i]); + } + + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to write the new reftable"); + goto done; + } + + + /* Empty the refcount cache */ + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to flush the refblock cache"); + goto done; + } + + /* Update the image header to point to the new reftable; this only updates + * the fields which are relevant to qcow2_update_header(); other fields + * such as s->refcount_table or s->refcount_bits stay stale for now + * (because we have to restore everything if qcow2_update_header() fails) */ + old_refcount_order = s->refcount_order; + old_reftable_size = s->refcount_table_size; + old_reftable_offset = s->refcount_table_offset; + + s->refcount_order = refcount_order; + s->refcount_table_size = new_reftable_size; + s->refcount_table_offset = new_reftable_offset; + + ret = qcow2_update_header(bs); + if (ret < 0) { + s->refcount_order = old_refcount_order; + s->refcount_table_size = old_reftable_size; + s->refcount_table_offset = old_reftable_offset; + error_setg_errno(errp, -ret, "Failed to update the qcow2 header"); + goto done; + } + + /* Now update the rest of the in-memory information */ + old_reftable = s->refcount_table; + s->refcount_table = new_reftable; + + s->refcount_bits = 1 << refcount_order; + s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1); + s->refcount_max += s->refcount_max - 1; + + s->refcount_block_bits = s->cluster_bits - (refcount_order - 3); + s->refcount_block_size = 1 << s->refcount_block_bits; + + s->get_refcount = new_get_refcount; + s->set_refcount = new_set_refcount; + + /* For cleaning up all old refblocks and the old reftable below the "done" + * label */ + new_reftable = old_reftable; + new_reftable_size = old_reftable_size; + new_reftable_offset = old_reftable_offset; + +done: + if (new_reftable) { + /* On success, new_reftable actually points to the old reftable (and + * new_reftable_size is the old reftable's size); but that is just + * fine */ + for (i = 0; i < new_reftable_size; i++) { + uint64_t offset = new_reftable[i] & REFT_OFFSET_MASK; + if (offset) { + qcow2_free_clusters(bs, offset, s->cluster_size, + QCOW2_DISCARD_OTHER); + } + } + g_free(new_reftable); + + if (new_reftable_offset > 0) { + qcow2_free_clusters(bs, new_reftable_offset, + new_reftable_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); + } + } + + qemu_vfree(new_refblock); + return ret; +} diff --git a/qemu/block/qcow2-snapshot.c b/qemu/block/qcow2-snapshot.c index b6f58c13e..5f4a17e47 100644 --- a/qemu/block/qcow2-snapshot.c +++ b/qemu/block/qcow2-snapshot.c @@ -22,14 +22,16 @@ * THE SOFTWARE. */ -#include "qemu-common.h" +#include "qemu/osdep.h" +#include "qapi/error.h" #include "block/block_int.h" #include "block/qcow2.h" #include "qemu/error-report.h" +#include "qemu/cutils.h" void qcow2_free_snapshots(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int i; for(i = 0; i < s->nb_snapshots; i++) { @@ -43,7 +45,7 @@ void qcow2_free_snapshots(BlockDriverState *bs) int qcow2_read_snapshots(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; QCowSnapshotHeader h; QCowSnapshotExtraData extra; QCowSnapshot *sn; @@ -64,7 +66,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) for(i = 0; i < s->nb_snapshots; i++) { /* Read statically sized part of the snapshot header */ offset = align_offset(offset, 8); - ret = bdrv_pread(bs->file, offset, &h, sizeof(h)); + ret = bdrv_pread(bs->file->bs, offset, &h, sizeof(h)); if (ret < 0) { goto fail; } @@ -83,7 +85,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) name_size = be16_to_cpu(h.name_size); /* Read extra data */ - ret = bdrv_pread(bs->file, offset, &extra, + ret = bdrv_pread(bs->file->bs, offset, &extra, MIN(sizeof(extra), extra_data_size)); if (ret < 0) { goto fail; @@ -102,7 +104,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) /* Read snapshot ID */ sn->id_str = g_malloc(id_str_size + 1); - ret = bdrv_pread(bs->file, offset, sn->id_str, id_str_size); + ret = bdrv_pread(bs->file->bs, offset, sn->id_str, id_str_size); if (ret < 0) { goto fail; } @@ -111,7 +113,7 @@ int qcow2_read_snapshots(BlockDriverState *bs) /* Read snapshot name */ sn->name = g_malloc(name_size + 1); - ret = bdrv_pread(bs->file, offset, sn->name, name_size); + ret = bdrv_pread(bs->file->bs, offset, sn->name, name_size); if (ret < 0) { goto fail; } @@ -136,7 +138,7 @@ fail: /* add at the end of the file a new list of snapshots */ static int qcow2_write_snapshots(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; QCowSnapshot *sn; QCowSnapshotHeader h; QCowSnapshotExtraData extra; @@ -214,25 +216,25 @@ static int qcow2_write_snapshots(BlockDriverState *bs) h.name_size = cpu_to_be16(name_size); offset = align_offset(offset, 8); - ret = bdrv_pwrite(bs->file, offset, &h, sizeof(h)); + ret = bdrv_pwrite(bs->file->bs, offset, &h, sizeof(h)); if (ret < 0) { goto fail; } offset += sizeof(h); - ret = bdrv_pwrite(bs->file, offset, &extra, sizeof(extra)); + ret = bdrv_pwrite(bs->file->bs, offset, &extra, sizeof(extra)); if (ret < 0) { goto fail; } offset += sizeof(extra); - ret = bdrv_pwrite(bs->file, offset, sn->id_str, id_str_size); + ret = bdrv_pwrite(bs->file->bs, offset, sn->id_str, id_str_size); if (ret < 0) { goto fail; } offset += id_str_size; - ret = bdrv_pwrite(bs->file, offset, sn->name, name_size); + ret = bdrv_pwrite(bs->file->bs, offset, sn->name, name_size); if (ret < 0) { goto fail; } @@ -254,7 +256,7 @@ static int qcow2_write_snapshots(BlockDriverState *bs) header_data.nb_snapshots = cpu_to_be32(s->nb_snapshots); header_data.snapshots_offset = cpu_to_be64(snapshots_offset); - ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, nb_snapshots), + ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, nb_snapshots), &header_data, sizeof(header_data)); if (ret < 0) { goto fail; @@ -278,7 +280,7 @@ fail: static void find_new_snapshot_id(BlockDriverState *bs, char *id_str, int id_str_size) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; QCowSnapshot *sn; int i; unsigned long id, id_max = 0; @@ -296,7 +298,7 @@ static int find_snapshot_by_id_and_name(BlockDriverState *bs, const char *id, const char *name) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int i; if (id && name) { @@ -338,7 +340,7 @@ static int find_snapshot_by_id_or_name(BlockDriverState *bs, /* if no id is provided, a new one is constructed */ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; QCowSnapshot *new_snapshot_list = NULL; QCowSnapshot *old_snapshot_list = NULL; QCowSnapshot sn1, *sn = &sn1; @@ -396,7 +398,7 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) goto fail; } - ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table, + ret = bdrv_pwrite(bs->file->bs, sn->l1_table_offset, l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { goto fail; @@ -461,7 +463,7 @@ fail: /* copy the snapshot 'snapshot_name' into the current disk image */ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; QCowSnapshot *sn; int i, snapshot_index; int cur_l1_bytes, sn_l1_bytes; @@ -509,7 +511,8 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) goto fail; } - ret = bdrv_pread(bs->file, sn->l1_table_offset, sn_l1_table, sn_l1_bytes); + ret = bdrv_pread(bs->file->bs, sn->l1_table_offset, + sn_l1_table, sn_l1_bytes); if (ret < 0) { goto fail; } @@ -526,7 +529,7 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) goto fail; } - ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table, + ret = bdrv_pwrite_sync(bs->file->bs, s->l1_table_offset, sn_l1_table, cur_l1_bytes); if (ret < 0) { goto fail; @@ -587,7 +590,7 @@ int qcow2_snapshot_delete(BlockDriverState *bs, const char *name, Error **errp) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; QCowSnapshot sn; int snapshot_index, ret; @@ -650,7 +653,7 @@ int qcow2_snapshot_delete(BlockDriverState *bs, int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; QEMUSnapshotInfo *sn_tab, *sn_info; QCowSnapshot *sn; int i; @@ -683,7 +686,7 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs, Error **errp) { int i, snapshot_index; - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; QCowSnapshot *sn; uint64_t *new_l1_table; int new_l1_bytes; @@ -706,13 +709,14 @@ int qcow2_snapshot_load_tmp(BlockDriverState *bs, return -EFBIG; } new_l1_bytes = sn->l1_size * sizeof(uint64_t); - new_l1_table = qemu_try_blockalign(bs->file, + new_l1_table = qemu_try_blockalign(bs->file->bs, align_offset(new_l1_bytes, 512)); if (new_l1_table == NULL) { return -ENOMEM; } - ret = bdrv_pread(bs->file, sn->l1_table_offset, new_l1_table, new_l1_bytes); + ret = bdrv_pread(bs->file->bs, sn->l1_table_offset, + new_l1_table, new_l1_bytes); if (ret < 0) { error_setg(errp, "Failed to read l1 table for snapshot"); qemu_vfree(new_l1_table); diff --git a/qemu/block/qcow2.c b/qemu/block/qcow2.c index 76c331b38..470734be9 100644 --- a/qemu/block/qcow2.c +++ b/qemu/block/qcow2.c @@ -21,8 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ -#include "qemu-common.h" +#include "qemu/osdep.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include <zlib.h> #include "block/qcow2.h" @@ -34,6 +35,7 @@ #include "qapi-event.h" #include "trace.h" #include "qemu/option_int.h" +#include "qemu/cutils.h" /* Differences with QCOW: @@ -85,7 +87,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, uint64_t end_offset, void **p_feature_table, Error **errp) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; QCowExtension ext; uint64_t offset; int ret; @@ -104,7 +106,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, printf("attempting to read extended header in offset %lu\n", offset); #endif - ret = bdrv_pread(bs->file, offset, &ext, sizeof(ext)); + ret = bdrv_pread(bs->file->bs, offset, &ext, sizeof(ext)); if (ret < 0) { error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: " "pread fail from offset %" PRIu64, offset); @@ -132,7 +134,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, sizeof(bs->backing_format)); return 2; } - ret = bdrv_pread(bs->file, offset, bs->backing_format, ext.len); + ret = bdrv_pread(bs->file->bs, offset, bs->backing_format, ext.len); if (ret < 0) { error_setg_errno(errp, -ret, "ERROR: ext_backing_format: " "Could not read format name"); @@ -148,7 +150,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, case QCOW2_EXT_MAGIC_FEATURE_TABLE: if (p_feature_table != NULL) { void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); - ret = bdrv_pread(bs->file, offset , feature_table, ext.len); + ret = bdrv_pread(bs->file->bs, offset , feature_table, ext.len); if (ret < 0) { error_setg_errno(errp, -ret, "ERROR: ext_feature_table: " "Could not read table"); @@ -169,7 +171,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, uext->len = ext.len; QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next); - ret = bdrv_pread(bs->file, offset , uext->data, uext->len); + ret = bdrv_pread(bs->file->bs, offset , uext->data, uext->len); if (ret < 0) { error_setg_errno(errp, -ret, "ERROR: unknown extension: " "Could not read data"); @@ -187,7 +189,7 @@ static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, static void cleanup_unknown_header_ext(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; Qcow2UnknownHeaderExtension *uext, *next; QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) { @@ -196,22 +198,8 @@ static void cleanup_unknown_header_ext(BlockDriverState *bs) } } -static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs, - Error **errp, const char *fmt, ...) -{ - char msg[64]; - va_list ap; - - va_start(ap, fmt); - vsnprintf(msg, sizeof(msg), fmt, ap); - va_end(ap); - - error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_or_node_name(bs), "qcow2", msg); -} - -static void report_unsupported_feature(BlockDriverState *bs, - Error **errp, Qcow2Feature *table, uint64_t mask) +static void report_unsupported_feature(Error **errp, Qcow2Feature *table, + uint64_t mask) { char *features = g_strdup(""); char *old; @@ -236,7 +224,7 @@ static void report_unsupported_feature(BlockDriverState *bs, g_free(old); } - report_unsupported(bs, errp, "%s", features); + error_setg(errp, "Unsupported qcow2 feature(s): %s", features); g_free(features); } @@ -249,7 +237,7 @@ static void report_unsupported_feature(BlockDriverState *bs, */ int qcow2_mark_dirty(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t val; int ret; @@ -260,12 +248,12 @@ int qcow2_mark_dirty(BlockDriverState *bs) } val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY); - ret = bdrv_pwrite(bs->file, offsetof(QCowHeader, incompatible_features), + ret = bdrv_pwrite(bs->file->bs, offsetof(QCowHeader, incompatible_features), &val, sizeof(val)); if (ret < 0) { return ret; } - ret = bdrv_flush(bs->file); + ret = bdrv_flush(bs->file->bs); if (ret < 0) { return ret; } @@ -282,7 +270,7 @@ int qcow2_mark_dirty(BlockDriverState *bs) */ static int qcow2_mark_clean(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { int ret; @@ -304,7 +292,7 @@ static int qcow2_mark_clean(BlockDriverState *bs) */ int qcow2_mark_corrupt(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT; return qcow2_update_header(bs); @@ -316,7 +304,7 @@ int qcow2_mark_corrupt(BlockDriverState *bs) */ int qcow2_mark_consistent(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { int ret = bdrv_flush(bs); @@ -351,7 +339,7 @@ static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, static int validate_table_offset(BlockDriverState *bs, uint64_t offset, uint64_t entries, size_t entry_len) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t size; /* Use signed INT64_MAX as the maximum even for uint64_t header fields, @@ -467,6 +455,11 @@ static QemuOptsList qcow2_runtime_opts = { .type = QEMU_OPT_SIZE, .help = "Maximum refcount block cache size", }, + { + .name = QCOW2_OPT_CACHE_CLEAN_INTERVAL, + .type = QEMU_OPT_NUMBER, + .help = "Clean unused cache entries after this time (in seconds)", + }, { /* end of list */ } }, }; @@ -482,11 +475,54 @@ static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = { [QCOW2_OL_INACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L2, }; +static void cache_clean_timer_cb(void *opaque) +{ + BlockDriverState *bs = opaque; + BDRVQcow2State *s = bs->opaque; + qcow2_cache_clean_unused(bs, s->l2_table_cache); + qcow2_cache_clean_unused(bs, s->refcount_block_cache); + timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + + (int64_t) s->cache_clean_interval * 1000); +} + +static void cache_clean_timer_init(BlockDriverState *bs, AioContext *context) +{ + BDRVQcow2State *s = bs->opaque; + if (s->cache_clean_interval > 0) { + s->cache_clean_timer = aio_timer_new(context, QEMU_CLOCK_VIRTUAL, + SCALE_MS, cache_clean_timer_cb, + bs); + timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + + (int64_t) s->cache_clean_interval * 1000); + } +} + +static void cache_clean_timer_del(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + if (s->cache_clean_timer) { + timer_del(s->cache_clean_timer); + timer_free(s->cache_clean_timer); + s->cache_clean_timer = NULL; + } +} + +static void qcow2_detach_aio_context(BlockDriverState *bs) +{ + cache_clean_timer_del(bs); +} + +static void qcow2_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + cache_clean_timer_init(bs, new_context); +} + static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts, uint64_t *l2_cache_size, uint64_t *refcount_cache_size, Error **errp) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t combined_cache_size; bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set; @@ -541,22 +577,246 @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts, } } +typedef struct Qcow2ReopenState { + Qcow2Cache *l2_table_cache; + Qcow2Cache *refcount_block_cache; + bool use_lazy_refcounts; + int overlap_check; + bool discard_passthrough[QCOW2_DISCARD_MAX]; + uint64_t cache_clean_interval; +} Qcow2ReopenState; + +static int qcow2_update_options_prepare(BlockDriverState *bs, + Qcow2ReopenState *r, + QDict *options, int flags, + Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + QemuOpts *opts = NULL; + const char *opt_overlap_check, *opt_overlap_check_template; + int overlap_check_template = 0; + uint64_t l2_cache_size, refcount_cache_size; + int i; + Error *local_err = NULL; + int ret; + + opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + /* get L2 table/refcount block cache size from command line options */ + read_cache_sizes(bs, opts, &l2_cache_size, &refcount_cache_size, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + l2_cache_size /= s->cluster_size; + if (l2_cache_size < MIN_L2_CACHE_SIZE) { + l2_cache_size = MIN_L2_CACHE_SIZE; + } + if (l2_cache_size > INT_MAX) { + error_setg(errp, "L2 cache size too big"); + ret = -EINVAL; + goto fail; + } + + refcount_cache_size /= s->cluster_size; + if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) { + refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE; + } + if (refcount_cache_size > INT_MAX) { + error_setg(errp, "Refcount cache size too big"); + ret = -EINVAL; + goto fail; + } + + /* alloc new L2 table/refcount block cache, flush old one */ + if (s->l2_table_cache) { + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret) { + error_setg_errno(errp, -ret, "Failed to flush the L2 table cache"); + goto fail; + } + } + + if (s->refcount_block_cache) { + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret) { + error_setg_errno(errp, -ret, + "Failed to flush the refcount block cache"); + goto fail; + } + } + + r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size); + r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size); + if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) { + error_setg(errp, "Could not allocate metadata caches"); + ret = -ENOMEM; + goto fail; + } + + /* New interval for cache cleanup timer */ + r->cache_clean_interval = + qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL, + s->cache_clean_interval); + if (r->cache_clean_interval > UINT_MAX) { + error_setg(errp, "Cache clean interval too big"); + ret = -EINVAL; + goto fail; + } + + /* lazy-refcounts; flush if going from enabled to disabled */ + r->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS, + (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)); + if (r->use_lazy_refcounts && s->qcow_version < 3) { + error_setg(errp, "Lazy refcounts require a qcow2 image with at least " + "qemu 1.1 compatibility level"); + ret = -EINVAL; + goto fail; + } + + if (s->use_lazy_refcounts && !r->use_lazy_refcounts) { + ret = qcow2_mark_clean(bs); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to disable lazy refcounts"); + goto fail; + } + } + + /* Overlap check options */ + opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP); + opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE); + if (opt_overlap_check_template && opt_overlap_check && + strcmp(opt_overlap_check_template, opt_overlap_check)) + { + error_setg(errp, "Conflicting values for qcow2 options '" + QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE + "' ('%s')", opt_overlap_check, opt_overlap_check_template); + ret = -EINVAL; + goto fail; + } + if (!opt_overlap_check) { + opt_overlap_check = opt_overlap_check_template ?: "cached"; + } + + if (!strcmp(opt_overlap_check, "none")) { + overlap_check_template = 0; + } else if (!strcmp(opt_overlap_check, "constant")) { + overlap_check_template = QCOW2_OL_CONSTANT; + } else if (!strcmp(opt_overlap_check, "cached")) { + overlap_check_template = QCOW2_OL_CACHED; + } else if (!strcmp(opt_overlap_check, "all")) { + overlap_check_template = QCOW2_OL_ALL; + } else { + error_setg(errp, "Unsupported value '%s' for qcow2 option " + "'overlap-check'. Allowed are any of the following: " + "none, constant, cached, all", opt_overlap_check); + ret = -EINVAL; + goto fail; + } + + r->overlap_check = 0; + for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) { + /* overlap-check defines a template bitmask, but every flag may be + * overwritten through the associated boolean option */ + r->overlap_check |= + qemu_opt_get_bool(opts, overlap_bool_option_names[i], + overlap_check_template & (1 << i)) << i; + } + + r->discard_passthrough[QCOW2_DISCARD_NEVER] = false; + r->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true; + r->discard_passthrough[QCOW2_DISCARD_REQUEST] = + qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST, + flags & BDRV_O_UNMAP); + r->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] = + qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true); + r->discard_passthrough[QCOW2_DISCARD_OTHER] = + qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); + + ret = 0; +fail: + qemu_opts_del(opts); + opts = NULL; + return ret; +} + +static void qcow2_update_options_commit(BlockDriverState *bs, + Qcow2ReopenState *r) +{ + BDRVQcow2State *s = bs->opaque; + int i; + + if (s->l2_table_cache) { + qcow2_cache_destroy(bs, s->l2_table_cache); + } + if (s->refcount_block_cache) { + qcow2_cache_destroy(bs, s->refcount_block_cache); + } + s->l2_table_cache = r->l2_table_cache; + s->refcount_block_cache = r->refcount_block_cache; + + s->overlap_check = r->overlap_check; + s->use_lazy_refcounts = r->use_lazy_refcounts; + + for (i = 0; i < QCOW2_DISCARD_MAX; i++) { + s->discard_passthrough[i] = r->discard_passthrough[i]; + } + + if (s->cache_clean_interval != r->cache_clean_interval) { + cache_clean_timer_del(bs); + s->cache_clean_interval = r->cache_clean_interval; + cache_clean_timer_init(bs, bdrv_get_aio_context(bs)); + } +} + +static void qcow2_update_options_abort(BlockDriverState *bs, + Qcow2ReopenState *r) +{ + if (r->l2_table_cache) { + qcow2_cache_destroy(bs, r->l2_table_cache); + } + if (r->refcount_block_cache) { + qcow2_cache_destroy(bs, r->refcount_block_cache); + } +} + +static int qcow2_update_options(BlockDriverState *bs, QDict *options, + int flags, Error **errp) +{ + Qcow2ReopenState r = {}; + int ret; + + ret = qcow2_update_options_prepare(bs, &r, options, flags, errp); + if (ret >= 0) { + qcow2_update_options_commit(bs, &r); + } else { + qcow2_update_options_abort(bs, &r); + } + + return ret; +} + static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; unsigned int len, i; int ret = 0; QCowHeader header; - QemuOpts *opts = NULL; Error *local_err = NULL; uint64_t ext_end; uint64_t l1_vm_state_index; - const char *opt_overlap_check, *opt_overlap_check_template; - int overlap_check_template = 0; - uint64_t l2_cache_size, refcount_cache_size; - ret = bdrv_pread(bs->file, 0, &header, sizeof(header)); + ret = bdrv_pread(bs->file->bs, 0, &header, sizeof(header)); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read qcow2 header"); goto fail; @@ -581,7 +841,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } if (header.version < 2 || header.version > 3) { - report_unsupported(bs, errp, "QCOW version %" PRIu32, header.version); + error_setg(errp, "Unsupported qcow2 version %" PRIu32, header.version); ret = -ENOTSUP; goto fail; } @@ -631,7 +891,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, if (header.header_length > sizeof(header)) { s->unknown_header_fields_size = header.header_length - sizeof(header); s->unknown_header_fields = g_malloc(s->unknown_header_fields_size); - ret = bdrv_pread(bs->file, sizeof(header), s->unknown_header_fields, + ret = bdrv_pread(bs->file->bs, sizeof(header), s->unknown_header_fields, s->unknown_header_fields_size); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read unknown qcow2 header " @@ -661,7 +921,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, void *feature_table = NULL; qcow2_read_extensions(bs, header.header_length, ext_end, &feature_table, NULL); - report_unsupported_feature(bs, errp, feature_table, + report_unsupported_feature(errp, feature_table, s->incompatible_features & ~QCOW2_INCOMPAT_MASK); ret = -ENOTSUP; @@ -705,6 +965,14 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, } s->crypt_method_header = header.crypt_method; if (s->crypt_method_header) { + if (bdrv_uses_whitelist() && + s->crypt_method_header == QCOW_CRYPT_AES) { + error_report("qcow2 built-in AES encryption is deprecated"); + error_printf("Support for it will be removed in a future release.\n" + "You can use 'qemu-img convert' to switch to an\n" + "unencrypted qcow2 image, or a LUKS raw image.\n"); + } + bs->encrypted = 1; } @@ -784,14 +1052,14 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, if (s->l1_size > 0) { - s->l1_table = qemu_try_blockalign(bs->file, + s->l1_table = qemu_try_blockalign(bs->file->bs, align_offset(s->l1_size * sizeof(uint64_t), 512)); if (s->l1_table == NULL) { error_setg(errp, "Could not allocate L1 table"); ret = -ENOMEM; goto fail; } - ret = bdrv_pread(bs->file, s->l1_table_offset, s->l1_table, + ret = bdrv_pread(bs->file->bs, s->l1_table_offset, s->l1_table, s->l1_size * sizeof(uint64_t)); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read L1 table"); @@ -802,55 +1070,15 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, } } - /* get L2 table/refcount block cache size from command line options */ - opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort); - qemu_opts_absorb_qdict(opts, options, &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; - } - - read_cache_sizes(bs, opts, &l2_cache_size, &refcount_cache_size, - &local_err); - if (local_err) { - error_propagate(errp, local_err); - ret = -EINVAL; - goto fail; - } - - l2_cache_size /= s->cluster_size; - if (l2_cache_size < MIN_L2_CACHE_SIZE) { - l2_cache_size = MIN_L2_CACHE_SIZE; - } - if (l2_cache_size > INT_MAX) { - error_setg(errp, "L2 cache size too big"); - ret = -EINVAL; - goto fail; - } - - refcount_cache_size /= s->cluster_size; - if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) { - refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE; - } - if (refcount_cache_size > INT_MAX) { - error_setg(errp, "Refcount cache size too big"); - ret = -EINVAL; - goto fail; - } - - /* alloc L2 table/refcount block cache */ - s->l2_table_cache = qcow2_cache_create(bs, l2_cache_size); - s->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size); - if (s->l2_table_cache == NULL || s->refcount_block_cache == NULL) { - error_setg(errp, "Could not allocate metadata caches"); - ret = -ENOMEM; + /* Parse driver-specific options */ + ret = qcow2_update_options(bs, options, flags, errp); + if (ret < 0) { goto fail; } s->cluster_cache = g_malloc(s->cluster_size); /* one more sector for decompressed data alignment */ - s->cluster_data = qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS + s->cluster_data = qemu_try_blockalign(bs->file->bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size + 512); if (s->cluster_data == NULL) { error_setg(errp, "Could not allocate temporary cluster buffer"); @@ -887,7 +1115,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, ret = -EINVAL; goto fail; } - ret = bdrv_pread(bs->file, header.backing_file_offset, + ret = bdrv_pread(bs->file->bs, header.backing_file_offset, bs->backing_file, len); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read backing file name"); @@ -908,7 +1136,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, } /* Clear unknown autoclear feature bits */ - if (!bs->read_only && !(flags & BDRV_O_INCOMING) && s->autoclear_features) { + if (!bs->read_only && !(flags & BDRV_O_INACTIVE) && s->autoclear_features) { s->autoclear_features = 0; ret = qcow2_update_header(bs); if (ret < 0) { @@ -921,7 +1149,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, qemu_co_mutex_init(&s->lock); /* Repair image if dirty */ - if (!(flags & (BDRV_O_CHECK | BDRV_O_INCOMING)) && !bs->read_only && + if (!(flags & (BDRV_O_CHECK | BDRV_O_INACTIVE)) && !bs->read_only && (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) { BdrvCheckResult result = {0}; @@ -932,70 +1160,6 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, } } - /* Enable lazy_refcounts according to image and command line options */ - s->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS, - (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)); - - s->discard_passthrough[QCOW2_DISCARD_NEVER] = false; - s->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true; - s->discard_passthrough[QCOW2_DISCARD_REQUEST] = - qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST, - flags & BDRV_O_UNMAP); - s->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] = - qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true); - s->discard_passthrough[QCOW2_DISCARD_OTHER] = - qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); - - opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP); - opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE); - if (opt_overlap_check_template && opt_overlap_check && - strcmp(opt_overlap_check_template, opt_overlap_check)) - { - error_setg(errp, "Conflicting values for qcow2 options '" - QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE - "' ('%s')", opt_overlap_check, opt_overlap_check_template); - ret = -EINVAL; - goto fail; - } - if (!opt_overlap_check) { - opt_overlap_check = opt_overlap_check_template ?: "cached"; - } - - if (!strcmp(opt_overlap_check, "none")) { - overlap_check_template = 0; - } else if (!strcmp(opt_overlap_check, "constant")) { - overlap_check_template = QCOW2_OL_CONSTANT; - } else if (!strcmp(opt_overlap_check, "cached")) { - overlap_check_template = QCOW2_OL_CACHED; - } else if (!strcmp(opt_overlap_check, "all")) { - overlap_check_template = QCOW2_OL_ALL; - } else { - error_setg(errp, "Unsupported value '%s' for qcow2 option " - "'overlap-check'. Allowed are either of the following: " - "none, constant, cached, all", opt_overlap_check); - ret = -EINVAL; - goto fail; - } - - s->overlap_check = 0; - for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) { - /* overlap-check defines a template bitmask, but every flag may be - * overwritten through the associated boolean option */ - s->overlap_check |= - qemu_opt_get_bool(opts, overlap_bool_option_names[i], - overlap_check_template & (1 << i)) << i; - } - - qemu_opts_del(opts); - opts = NULL; - - if (s->use_lazy_refcounts && s->qcow_version < 3) { - error_setg(errp, "Lazy refcounts require a qcow2 image with at least " - "qemu 1.1 compatibility level"); - ret = -EINVAL; - goto fail; - } - #ifdef DEBUG_ALLOC { BdrvCheckResult result = {0}; @@ -1005,7 +1169,6 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, return ret; fail: - qemu_opts_del(opts); g_free(s->unknown_header_fields); cleanup_unknown_header_ext(bs); qcow2_free_snapshots(bs); @@ -1013,6 +1176,7 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, qemu_vfree(s->l1_table); /* else pre-write overlap checks in cache_destroy may crash */ s->l1_table = NULL; + cache_clean_timer_del(bs); if (s->l2_table_cache) { qcow2_cache_destroy(bs, s->l2_table_cache); } @@ -1026,14 +1190,14 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; bs->bl.write_zeroes_alignment = s->cluster_sectors; } static int qcow2_set_key(BlockDriverState *bs, const char *key) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint8_t keybuf[16]; int len, i; Error *err = NULL; @@ -1066,32 +1230,104 @@ static int qcow2_set_key(BlockDriverState *bs, const char *key) return 0; } -/* We have no actual commit/abort logic for qcow2, but we need to write out any - * unwritten data if we reopen read-only. */ static int qcow2_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue, Error **errp) { + Qcow2ReopenState *r; int ret; + r = g_new0(Qcow2ReopenState, 1); + state->opaque = r; + + ret = qcow2_update_options_prepare(state->bs, r, state->options, + state->flags, errp); + if (ret < 0) { + goto fail; + } + + /* We need to write out any unwritten data if we reopen read-only. */ if ((state->flags & BDRV_O_RDWR) == 0) { ret = bdrv_flush(state->bs); if (ret < 0) { - return ret; + goto fail; } ret = qcow2_mark_clean(state->bs); if (ret < 0) { - return ret; + goto fail; } } return 0; + +fail: + qcow2_update_options_abort(state->bs, r); + g_free(r); + return ret; +} + +static void qcow2_reopen_commit(BDRVReopenState *state) +{ + qcow2_update_options_commit(state->bs, state->opaque); + g_free(state->opaque); +} + +static void qcow2_reopen_abort(BDRVReopenState *state) +{ + qcow2_update_options_abort(state->bs, state->opaque); + g_free(state->opaque); +} + +static void qcow2_join_options(QDict *options, QDict *old_options) +{ + bool has_new_overlap_template = + qdict_haskey(options, QCOW2_OPT_OVERLAP) || + qdict_haskey(options, QCOW2_OPT_OVERLAP_TEMPLATE); + bool has_new_total_cache_size = + qdict_haskey(options, QCOW2_OPT_CACHE_SIZE); + bool has_all_cache_options; + + /* New overlap template overrides all old overlap options */ + if (has_new_overlap_template) { + qdict_del(old_options, QCOW2_OPT_OVERLAP); + qdict_del(old_options, QCOW2_OPT_OVERLAP_TEMPLATE); + qdict_del(old_options, QCOW2_OPT_OVERLAP_MAIN_HEADER); + qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L1); + qdict_del(old_options, QCOW2_OPT_OVERLAP_ACTIVE_L2); + qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_TABLE); + qdict_del(old_options, QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK); + qdict_del(old_options, QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE); + qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L1); + qdict_del(old_options, QCOW2_OPT_OVERLAP_INACTIVE_L2); + } + + /* New total cache size overrides all old options */ + if (qdict_haskey(options, QCOW2_OPT_CACHE_SIZE)) { + qdict_del(old_options, QCOW2_OPT_L2_CACHE_SIZE); + qdict_del(old_options, QCOW2_OPT_REFCOUNT_CACHE_SIZE); + } + + qdict_join(options, old_options, false); + + /* + * If after merging all cache size options are set, an old total size is + * overwritten. Do keep all options, however, if all three are new. The + * resulting error message is what we want to happen. + */ + has_all_cache_options = + qdict_haskey(options, QCOW2_OPT_CACHE_SIZE) || + qdict_haskey(options, QCOW2_OPT_L2_CACHE_SIZE) || + qdict_haskey(options, QCOW2_OPT_REFCOUNT_CACHE_SIZE); + + if (has_all_cache_options && !has_new_total_cache_size) { + qdict_del(options, QCOW2_OPT_CACHE_SIZE); + } } static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum) + int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t cluster_offset; int index_in_cluster, ret; int64_t status = 0; @@ -1108,6 +1344,7 @@ static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, !s->cipher) { index_in_cluster = sector_num & (s->cluster_sectors - 1); cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); + *file = bs->file->bs; status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset; } if (ret == QCOW2_CLUSTER_ZERO) { @@ -1138,7 +1375,7 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, int remaining_sectors, QEMUIOVector *qiov) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int index_in_cluster, n1; int ret; int cur_nr_sectors; /* number of sectors in current iteration */ @@ -1175,9 +1412,9 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, switch (ret) { case QCOW2_CLUSTER_UNALLOCATED: - if (bs->backing_hd) { + if (bs->backing) { /* read from the base image */ - n1 = qcow2_backing_read1(bs->backing_hd, &hd_qiov, + n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov, sector_num, cur_nr_sectors); if (n1 > 0) { QEMUIOVector local_qiov; @@ -1188,7 +1425,7 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->backing_hd, sector_num, + ret = bdrv_co_readv(bs->backing->bs, sector_num, n1, &local_qiov); qemu_co_mutex_lock(&s->lock); @@ -1235,8 +1472,9 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, */ if (!cluster_data) { cluster_data = - qemu_try_blockalign(bs->file, QCOW_MAX_CRYPT_CLUSTERS - * s->cluster_size); + qemu_try_blockalign(bs->file->bs, + QCOW_MAX_CRYPT_CLUSTERS + * s->cluster_size); if (cluster_data == NULL) { ret = -ENOMEM; goto fail; @@ -1252,7 +1490,7 @@ static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file, + ret = bdrv_co_readv(bs->file->bs, (cluster_offset >> 9) + index_in_cluster, cur_nr_sectors, &hd_qiov); qemu_co_mutex_lock(&s->lock); @@ -1300,7 +1538,7 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, int remaining_sectors, QEMUIOVector *qiov) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int index_in_cluster; int ret; int cur_nr_sectors; /* number of sectors in current iteration */ @@ -1349,7 +1587,7 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, Error *err = NULL; assert(s->cipher); if (!cluster_data) { - cluster_data = qemu_try_blockalign(bs->file, + cluster_data = qemu_try_blockalign(bs->file->bs, QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); if (cluster_data == NULL) { @@ -1386,7 +1624,7 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); trace_qcow2_writev_data(qemu_coroutine_self(), (cluster_offset >> 9) + index_in_cluster); - ret = bdrv_co_writev(bs->file, + ret = bdrv_co_writev(bs->file->bs, (cluster_offset >> 9) + index_in_cluster, cur_nr_sectors, &hd_qiov); qemu_co_mutex_lock(&s->lock); @@ -1444,33 +1682,44 @@ fail: return ret; } +static int qcow2_inactivate(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + int ret, result = 0; + + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret) { + result = ret; + error_report("Failed to flush the L2 table cache: %s", + strerror(-ret)); + } + + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret) { + result = ret; + error_report("Failed to flush the refcount block cache: %s", + strerror(-ret)); + } + + if (result == 0) { + qcow2_mark_clean(bs); + } + + return result; +} + static void qcow2_close(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; qemu_vfree(s->l1_table); /* else pre-write overlap checks in cache_destroy may crash */ s->l1_table = NULL; - if (!(bs->open_flags & BDRV_O_INCOMING)) { - int ret1, ret2; - - ret1 = qcow2_cache_flush(bs, s->l2_table_cache); - ret2 = qcow2_cache_flush(bs, s->refcount_block_cache); - - if (ret1) { - error_report("Failed to flush the L2 table cache: %s", - strerror(-ret1)); - } - if (ret2) { - error_report("Failed to flush the refcount block cache: %s", - strerror(-ret2)); - } - - if (!ret1 && !ret2) { - qcow2_mark_clean(bs); - } + if (!(s->flags & BDRV_O_INACTIVE)) { + qcow2_inactivate(bs); } + cache_clean_timer_del(bs); qcow2_cache_destroy(bs, s->l2_table_cache); qcow2_cache_destroy(bs, s->refcount_block_cache); @@ -1491,7 +1740,7 @@ static void qcow2_close(BlockDriverState *bs) static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int flags = s->flags; QCryptoCipher *cipher = NULL; QDict *options; @@ -1508,24 +1757,27 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) qcow2_close(bs); - bdrv_invalidate_cache(bs->file, &local_err); + bdrv_invalidate_cache(bs->file->bs, &local_err); if (local_err) { error_propagate(errp, local_err); + bs->drv = NULL; return; } - memset(s, 0, sizeof(BDRVQcowState)); + memset(s, 0, sizeof(BDRVQcow2State)); options = qdict_clone_shallow(bs->options); + flags &= ~BDRV_O_INACTIVE; ret = qcow2_open(bs, options, flags, &local_err); QDECREF(options); if (local_err) { - error_setg(errp, "Could not reopen qcow2 layer: %s", - error_get_pretty(local_err)); - error_free(local_err); + error_propagate(errp, local_err); + error_prepend(errp, "Could not reopen qcow2 layer: "); + bs->drv = NULL; return; } else if (ret < 0) { error_setg_errno(errp, -ret, "Could not reopen qcow2 layer"); + bs->drv = NULL; return; } @@ -1561,7 +1813,7 @@ static size_t header_ext_add(char *buf, uint32_t magic, const void *s, */ int qcow2_update_header(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; QCowHeader *header; char *buf; size_t buflen = s->cluster_size; @@ -1653,31 +1905,33 @@ int qcow2_update_header(BlockDriverState *bs) } /* Feature table */ - Qcow2Feature features[] = { - { - .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, - .bit = QCOW2_INCOMPAT_DIRTY_BITNR, - .name = "dirty bit", - }, - { - .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, - .bit = QCOW2_INCOMPAT_CORRUPT_BITNR, - .name = "corrupt bit", - }, - { - .type = QCOW2_FEAT_TYPE_COMPATIBLE, - .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, - .name = "lazy refcounts", - }, - }; + if (s->qcow_version >= 3) { + Qcow2Feature features[] = { + { + .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, + .bit = QCOW2_INCOMPAT_DIRTY_BITNR, + .name = "dirty bit", + }, + { + .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, + .bit = QCOW2_INCOMPAT_CORRUPT_BITNR, + .name = "corrupt bit", + }, + { + .type = QCOW2_FEAT_TYPE_COMPATIBLE, + .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, + .name = "lazy refcounts", + }, + }; - ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE, - features, sizeof(features), buflen); - if (ret < 0) { - goto fail; + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE, + features, sizeof(features), buflen); + if (ret < 0) { + goto fail; + } + buf += ret; + buflen -= ret; } - buf += ret; - buflen -= ret; /* Keep unknown header extensions */ QLIST_FOREACH(uext, &s->unknown_header_ext, next) { @@ -1716,7 +1970,7 @@ int qcow2_update_header(BlockDriverState *bs) } /* Write the new header */ - ret = bdrv_pwrite(bs->file, 0, header, s->cluster_size); + ret = bdrv_pwrite(bs->file->bs, 0, header, s->cluster_size); if (ret < 0) { goto fail; } @@ -1730,7 +1984,11 @@ fail: static int qcow2_change_backing_file(BlockDriverState *bs, const char *backing_file, const char *backing_fmt) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; + + if (backing_file && strlen(backing_file) > 1023) { + return -EINVAL; + } pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); @@ -1796,7 +2054,8 @@ static int preallocate(BlockDriverState *bs) if (host_offset != 0) { uint8_t buf[BDRV_SECTOR_SIZE]; memset(buf, 0, BDRV_SECTOR_SIZE); - ret = bdrv_write(bs->file, (host_offset >> BDRV_SECTOR_BITS) + num - 1, + ret = bdrv_write(bs->file->bs, + (host_offset >> BDRV_SECTOR_BITS) + num - 1, buf, 1); if (ret < 0) { return ret; @@ -1812,8 +2071,10 @@ static int qcow2_create2(const char *filename, int64_t total_size, QemuOpts *opts, int version, int refcount_order, Error **errp) { - /* Calculate cluster_bits */ int cluster_bits; + QDict *options; + + /* Calculate cluster_bits */ cluster_bits = ctz32(cluster_size); if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || (1 << cluster_bits) != cluster_size) @@ -1835,7 +2096,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file * size for any qcow2 image. */ - BlockDriverState* bs; + BlockBackend *blk; QCowHeader *header; uint64_t* refcount_table; Error *local_err = NULL; @@ -1910,14 +2171,15 @@ static int qcow2_create2(const char *filename, int64_t total_size, return ret; } - bs = NULL; - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, - NULL, &local_err); - if (ret < 0) { + blk = blk_new_open(filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); + if (blk == NULL) { error_propagate(errp, local_err); - return ret; + return -EIO; } + blk_set_allow_write_beyond_eof(blk, true); + /* Write the header */ QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header)); header = g_malloc0(cluster_size); @@ -1945,7 +2207,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); } - ret = bdrv_pwrite(bs, 0, header, cluster_size); + ret = blk_pwrite(blk, 0, header, cluster_size); g_free(header); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write qcow2 header"); @@ -1955,7 +2217,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, /* Write a refcount table with one refcount block */ refcount_table = g_malloc0(2 * cluster_size); refcount_table[0] = cpu_to_be64(2 * cluster_size); - ret = bdrv_pwrite(bs, cluster_size, refcount_table, 2 * cluster_size); + ret = blk_pwrite(blk, cluster_size, refcount_table, 2 * cluster_size); g_free(refcount_table); if (ret < 0) { @@ -1963,23 +2225,25 @@ static int qcow2_create2(const char *filename, int64_t total_size, goto out; } - bdrv_unref(bs); - bs = NULL; + blk_unref(blk); + blk = NULL; /* * And now open the image and make it consistent first (i.e. increase the * refcount of the cluster that is occupied by the header and the refcount * table) */ - ret = bdrv_open(&bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, - &bdrv_qcow2, &local_err); - if (ret < 0) { + options = qdict_new(); + qdict_put(options, "driver", qstring_from_str("qcow2")); + blk = blk_new_open(filename, NULL, options, + BDRV_O_RDWR | BDRV_O_NO_FLUSH, &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto out; } - ret = qcow2_alloc_clusters(bs, 3 * cluster_size); + ret = qcow2_alloc_clusters(blk_bs(blk), 3 * cluster_size); if (ret < 0) { error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 " "header and refcount table"); @@ -1990,8 +2254,15 @@ static int qcow2_create2(const char *filename, int64_t total_size, abort(); } + /* Create a full header (including things like feature table) */ + ret = qcow2_update_header(blk_bs(blk)); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not update qcow2 header"); + goto out; + } + /* Okay, now that we have a valid image, let's give it the right size */ - ret = bdrv_truncate(bs, total_size); + ret = blk_truncate(blk, total_size); if (ret < 0) { error_setg_errno(errp, -ret, "Could not resize image"); goto out; @@ -1999,7 +2270,7 @@ static int qcow2_create2(const char *filename, int64_t total_size, /* Want a backing file? There you go.*/ if (backing_file) { - ret = bdrv_change_backing_file(bs, backing_file, backing_format); + ret = bdrv_change_backing_file(blk_bs(blk), backing_file, backing_format); if (ret < 0) { error_setg_errno(errp, -ret, "Could not assign backing file '%s' " "with format '%s'", backing_file, backing_format); @@ -2009,9 +2280,9 @@ static int qcow2_create2(const char *filename, int64_t total_size, /* And if we're supposed to preallocate metadata, do that now */ if (prealloc != PREALLOC_MODE_OFF) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = blk_bs(blk)->opaque; qemu_co_mutex_lock(&s->lock); - ret = preallocate(bs); + ret = preallocate(blk_bs(blk)); qemu_co_mutex_unlock(&s->lock); if (ret < 0) { error_setg_errno(errp, -ret, "Could not preallocate metadata"); @@ -2019,22 +2290,24 @@ static int qcow2_create2(const char *filename, int64_t total_size, } } - bdrv_unref(bs); - bs = NULL; + blk_unref(blk); + blk = NULL; /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */ - ret = bdrv_open(&bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING, - &bdrv_qcow2, &local_err); - if (local_err) { + options = qdict_new(); + qdict_put(options, "driver", qstring_from_str("qcow2")); + blk = blk_new_open(filename, NULL, options, + BDRV_O_RDWR | BDRV_O_NO_BACKING, &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto out; } ret = 0; out: - if (bs) { - bdrv_unref(bs); + if (blk) { + blk_unref(blk); } return ret; } @@ -2066,7 +2339,7 @@ static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) DEFAULT_CLUSTER_SIZE); buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); prealloc = qapi_enum_parse(PreallocMode_lookup, buf, - PREALLOC_MODE_MAX, PREALLOC_MODE_OFF, + PREALLOC_MODE__MAX, PREALLOC_MODE_OFF, &local_err); if (local_err) { error_propagate(errp, local_err); @@ -2142,7 +2415,7 @@ static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) { int ret; - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; /* Emulate misaligned zero writes */ if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) { @@ -2162,7 +2435,7 @@ static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { int ret; - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; qemu_co_mutex_lock(&s->lock); ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, @@ -2173,7 +2446,7 @@ static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, static int qcow2_truncate(BlockDriverState *bs, int64_t offset) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int64_t new_l1_size; int ret; @@ -2202,7 +2475,7 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset) /* write updated header.size */ offset = cpu_to_be64(offset); - ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, size), + ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, size), &offset, sizeof(uint64_t)); if (ret < 0) { return ret; @@ -2217,7 +2490,7 @@ static int qcow2_truncate(BlockDriverState *bs, int64_t offset) static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, const uint8_t *buf, int nb_sectors) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; z_stream strm; int ret, out_len; uint8_t *out_buf; @@ -2226,8 +2499,8 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, if (nb_sectors == 0) { /* align end of file to a sector boundary to ease reading with sector based I/Os */ - cluster_offset = bdrv_getlength(bs->file); - return bdrv_truncate(bs->file, cluster_offset); + cluster_offset = bdrv_getlength(bs->file->bs); + return bdrv_truncate(bs->file->bs, cluster_offset); } if (nb_sectors != s->cluster_sectors) { @@ -2294,7 +2567,7 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, } BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); - ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len); + ret = bdrv_pwrite(bs->file->bs, cluster_offset, out_buf, out_len); if (ret < 0) { goto fail; } @@ -2308,7 +2581,7 @@ fail: static int make_completely_empty(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int ret, l1_clusters; int64_t offset; uint64_t *new_reftable = NULL; @@ -2343,7 +2616,7 @@ static int make_completely_empty(BlockDriverState *bs) /* After this call, neither the in-memory nor the on-disk refcount * information accurately describe the actual references */ - ret = bdrv_write_zeroes(bs->file, s->l1_table_offset / BDRV_SECTOR_SIZE, + ret = bdrv_write_zeroes(bs->file->bs, s->l1_table_offset / BDRV_SECTOR_SIZE, l1_clusters * s->cluster_sectors, 0); if (ret < 0) { goto fail_broken_refcounts; @@ -2357,7 +2630,7 @@ static int make_completely_empty(BlockDriverState *bs) * overwrite parts of the existing refcount and L1 table, which is not * an issue because the dirty flag is set, complete data loss is in fact * desired and partial data loss is consequently fine as well */ - ret = bdrv_write_zeroes(bs->file, s->cluster_size / BDRV_SECTOR_SIZE, + ret = bdrv_write_zeroes(bs->file->bs, s->cluster_size / BDRV_SECTOR_SIZE, (2 + l1_clusters) * s->cluster_size / BDRV_SECTOR_SIZE, 0); /* This call (even if it failed overall) may have overwritten on-disk @@ -2377,7 +2650,7 @@ static int make_completely_empty(BlockDriverState *bs) cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size); cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size); cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1); - ret = bdrv_pwrite_sync(bs->file, offsetof(QCowHeader, l1_table_offset), + ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_table_offset), &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls)); if (ret < 0) { goto fail_broken_refcounts; @@ -2408,7 +2681,7 @@ static int make_completely_empty(BlockDriverState *bs) /* Enter the first refblock into the reftable */ rt_entry = cpu_to_be64(2 * s->cluster_size); - ret = bdrv_pwrite_sync(bs->file, s->cluster_size, + ret = bdrv_pwrite_sync(bs->file->bs, s->cluster_size, &rt_entry, sizeof(rt_entry)); if (ret < 0) { goto fail_broken_refcounts; @@ -2433,7 +2706,7 @@ static int make_completely_empty(BlockDriverState *bs) goto fail; } - ret = bdrv_truncate(bs->file, (3 + l1_clusters) * s->cluster_size); + ret = bdrv_truncate(bs->file->bs, (3 + l1_clusters) * s->cluster_size); if (ret < 0) { goto fail; } @@ -2456,7 +2729,7 @@ fail: static int qcow2_make_empty(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; uint64_t start_sector; int sector_step = INT_MAX / BDRV_SECTOR_SIZE; int l1_clusters, ret = 0; @@ -2497,7 +2770,7 @@ static int qcow2_make_empty(BlockDriverState *bs) static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int ret; qemu_co_mutex_lock(&s->lock); @@ -2521,7 +2794,7 @@ static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; bdi->unallocated_blocks_are_zero = true; bdi->can_write_zeroes_with_unmap = (s->qcow_version >= 3); bdi->cluster_size = s->cluster_size; @@ -2531,22 +2804,20 @@ static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1); *spec_info = (ImageInfoSpecific){ - .kind = IMAGE_INFO_SPECIFIC_KIND_QCOW2, - { - .qcow2 = g_new(ImageInfoSpecificQCow2, 1), - }, + .type = IMAGE_INFO_SPECIFIC_KIND_QCOW2, + .u.qcow2.data = g_new(ImageInfoSpecificQCow2, 1), }; if (s->qcow_version == 2) { - *spec_info->qcow2 = (ImageInfoSpecificQCow2){ + *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){ .compat = g_strdup("0.10"), .refcount_bits = s->refcount_bits, }; } else if (s->qcow_version == 3) { - *spec_info->qcow2 = (ImageInfoSpecificQCow2){ + *spec_info->u.qcow2.data = (ImageInfoSpecificQCow2){ .compat = g_strdup("1.1"), .lazy_refcounts = s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS, @@ -2556,6 +2827,10 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs) .has_corrupt = true, .refcount_bits = s->refcount_bits, }; + } else { + /* if this assertion fails, this probably means a new version was + * added without having it covered here */ + assert(false); } return spec_info; @@ -2564,11 +2839,11 @@ static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs) #if 0 static void dump_refcounts(BlockDriverState *bs) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int64_t nb_clusters, k, k1, size; int refcount; - size = bdrv_getlength(bs->file); + size = bdrv_getlength(bs->file->bs); nb_clusters = size_to_clusters(s, size); for(k = 0; k < nb_clusters;) { k1 = k; @@ -2585,7 +2860,7 @@ static void dump_refcounts(BlockDriverState *bs) static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int64_t total_sectors = bs->total_sectors; bool zero_beyond_eof = bs->zero_beyond_eof; int ret; @@ -2606,7 +2881,7 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, int64_t pos, int size) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; bool zero_beyond_eof = bs->zero_beyond_eof; int ret; @@ -2623,9 +2898,9 @@ static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, * have to be removed. */ static int qcow2_downgrade(BlockDriverState *bs, int target_version, - BlockDriverAmendStatusCB *status_cb) + BlockDriverAmendStatusCB *status_cb, void *cb_opaque) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int current_version = s->qcow_version; int ret; @@ -2638,13 +2913,7 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version, } if (s->refcount_order != 4) { - /* we would have to convert the image to a refcount_order == 4 image - * here; however, since qemu (at the time of writing this) does not - * support anything different than 4 anyway, there is no point in doing - * so right now; however, we should error out (if qemu supports this in - * the future and this code has not been adapted) */ - error_report("qcow2_downgrade: Image refcount orders other than 4 are " - "currently not supported."); + error_report("compat=0.10 requires refcount_bits=16"); return -ENOTSUP; } @@ -2672,7 +2941,7 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version, /* clearing autoclear features is trivial */ s->autoclear_features = 0; - ret = qcow2_expand_zero_clusters(bs, status_cb); + ret = qcow2_expand_zero_clusters(bs, status_cb, cb_opaque); if (ret < 0) { return ret; } @@ -2686,10 +2955,81 @@ static int qcow2_downgrade(BlockDriverState *bs, int target_version, return 0; } +typedef enum Qcow2AmendOperation { + /* This is the value Qcow2AmendHelperCBInfo::last_operation will be + * statically initialized to so that the helper CB can discern the first + * invocation from an operation change */ + QCOW2_NO_OPERATION = 0, + + QCOW2_CHANGING_REFCOUNT_ORDER, + QCOW2_DOWNGRADING, +} Qcow2AmendOperation; + +typedef struct Qcow2AmendHelperCBInfo { + /* The code coordinating the amend operations should only modify + * these four fields; the rest will be managed by the CB */ + BlockDriverAmendStatusCB *original_status_cb; + void *original_cb_opaque; + + Qcow2AmendOperation current_operation; + + /* Total number of operations to perform (only set once) */ + int total_operations; + + /* The following fields are managed by the CB */ + + /* Number of operations completed */ + int operations_completed; + + /* Cumulative offset of all completed operations */ + int64_t offset_completed; + + Qcow2AmendOperation last_operation; + int64_t last_work_size; +} Qcow2AmendHelperCBInfo; + +static void qcow2_amend_helper_cb(BlockDriverState *bs, + int64_t operation_offset, + int64_t operation_work_size, void *opaque) +{ + Qcow2AmendHelperCBInfo *info = opaque; + int64_t current_work_size; + int64_t projected_work_size; + + if (info->current_operation != info->last_operation) { + if (info->last_operation != QCOW2_NO_OPERATION) { + info->offset_completed += info->last_work_size; + info->operations_completed++; + } + + info->last_operation = info->current_operation; + } + + assert(info->total_operations > 0); + assert(info->operations_completed < info->total_operations); + + info->last_work_size = operation_work_size; + + current_work_size = info->offset_completed + operation_work_size; + + /* current_work_size is the total work size for (operations_completed + 1) + * operations (which includes this one), so multiply it by the number of + * operations not covered and divide it by the number of operations + * covered to get a projection for the operations not covered */ + projected_work_size = current_work_size * (info->total_operations - + info->operations_completed - 1) + / (info->operations_completed + 1); + + info->original_status_cb(bs, info->offset_completed + operation_offset, + current_work_size + projected_work_size, + info->original_cb_opaque); +} + static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, - BlockDriverAmendStatusCB *status_cb) + BlockDriverAmendStatusCB *status_cb, + void *cb_opaque) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; int old_version = s->qcow_version, new_version = old_version; uint64_t new_size = 0; const char *backing_file = NULL, *backing_format = NULL; @@ -2697,8 +3037,10 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, const char *compat = NULL; uint64_t cluster_size = s->cluster_size; bool encrypt; + int refcount_bits = s->refcount_bits; int ret; QemuOptDesc *desc = opts->list->desc; + Qcow2AmendHelperCBInfo helper_cb_info; while (desc && desc->name) { if (!qemu_opt_find(opts, desc->name)) { @@ -2716,11 +3058,11 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, } else if (!strcmp(compat, "1.1")) { new_version = 3; } else { - fprintf(stderr, "Unknown compatibility level %s.\n", compat); + error_report("Unknown compatibility level %s", compat); return -EINVAL; } } else if (!strcmp(desc->name, BLOCK_OPT_PREALLOC)) { - fprintf(stderr, "Cannot change preallocation mode.\n"); + error_report("Cannot change preallocation mode"); return -ENOTSUP; } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) { new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0); @@ -2733,47 +3075,74 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, !!s->cipher); if (encrypt != !!s->cipher) { - fprintf(stderr, "Changing the encryption flag is not " - "supported.\n"); + error_report("Changing the encryption flag is not supported"); return -ENOTSUP; } } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) { cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, cluster_size); if (cluster_size != s->cluster_size) { - fprintf(stderr, "Changing the cluster size is not " - "supported.\n"); + error_report("Changing the cluster size is not supported"); return -ENOTSUP; } } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) { lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS, lazy_refcounts); } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) { - error_report("Cannot change refcount entry width"); - return -ENOTSUP; + refcount_bits = qemu_opt_get_number(opts, BLOCK_OPT_REFCOUNT_BITS, + refcount_bits); + + if (refcount_bits <= 0 || refcount_bits > 64 || + !is_power_of_2(refcount_bits)) + { + error_report("Refcount width must be a power of two and may " + "not exceed 64 bits"); + return -EINVAL; + } } else { - /* if this assertion fails, this probably means a new option was + /* if this point is reached, this probably means a new option was * added without having it covered here */ - assert(false); + abort(); } desc++; } - if (new_version != old_version) { - if (new_version > old_version) { - /* Upgrade */ - s->qcow_version = new_version; - ret = qcow2_update_header(bs); - if (ret < 0) { - s->qcow_version = old_version; - return ret; - } - } else { - ret = qcow2_downgrade(bs, new_version, status_cb); - if (ret < 0) { - return ret; - } + helper_cb_info = (Qcow2AmendHelperCBInfo){ + .original_status_cb = status_cb, + .original_cb_opaque = cb_opaque, + .total_operations = (new_version < old_version) + + (s->refcount_bits != refcount_bits) + }; + + /* Upgrade first (some features may require compat=1.1) */ + if (new_version > old_version) { + s->qcow_version = new_version; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->qcow_version = old_version; + return ret; + } + } + + if (s->refcount_bits != refcount_bits) { + int refcount_order = ctz32(refcount_bits); + Error *local_error = NULL; + + if (new_version < 3 && refcount_bits != 16) { + error_report("Different refcount widths than 16 bits require " + "compatibility level 1.1 or above (use compat=1.1 or " + "greater)"); + return -EINVAL; + } + + helper_cb_info.current_operation = QCOW2_CHANGING_REFCOUNT_ORDER; + ret = qcow2_change_refcount_order(bs, refcount_order, + &qcow2_amend_helper_cb, + &helper_cb_info, &local_error); + if (ret < 0) { + error_report_err(local_error); + return ret; } } @@ -2788,9 +3157,9 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, if (s->use_lazy_refcounts != lazy_refcounts) { if (lazy_refcounts) { - if (s->qcow_version < 3) { - fprintf(stderr, "Lazy refcounts only supported with compatibility " - "level 1.1 and above (use compat=1.1 or greater)\n"); + if (new_version < 3) { + error_report("Lazy refcounts only supported with compatibility " + "level 1.1 and above (use compat=1.1 or greater)"); return -EINVAL; } s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS; @@ -2824,6 +3193,16 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, } } + /* Downgrade last (so unsupported features can be removed before) */ + if (new_version < old_version) { + helper_cb_info.current_operation = QCOW2_DOWNGRADING; + ret = qcow2_downgrade(bs, new_version, &qcow2_amend_helper_cb, + &helper_cb_info); + if (ret < 0) { + return ret; + } + } + return 0; } @@ -2836,7 +3215,7 @@ static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, int64_t size, const char *message_format, ...) { - BDRVQcowState *s = bs->opaque; + BDRVQcow2State *s = bs->opaque; const char *node_name; char *message; va_list ap; @@ -2937,11 +3316,14 @@ static QemuOptsList qcow2_create_opts = { BlockDriver bdrv_qcow2 = { .format_name = "qcow2", - .instance_size = sizeof(BDRVQcowState), + .instance_size = sizeof(BDRVQcow2State), .bdrv_probe = qcow2_probe, .bdrv_open = qcow2_open, .bdrv_close = qcow2_close, .bdrv_reopen_prepare = qcow2_reopen_prepare, + .bdrv_reopen_commit = qcow2_reopen_commit, + .bdrv_reopen_abort = qcow2_reopen_abort, + .bdrv_join_options = qcow2_join_options, .bdrv_create = qcow2_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, .bdrv_co_get_block_status = qcow2_co_get_block_status, @@ -2973,10 +3355,14 @@ BlockDriver bdrv_qcow2 = { .bdrv_refresh_limits = qcow2_refresh_limits, .bdrv_invalidate_cache = qcow2_invalidate_cache, + .bdrv_inactivate = qcow2_inactivate, .create_opts = &qcow2_create_opts, .bdrv_check = qcow2_check, .bdrv_amend_options = qcow2_amend_options, + + .bdrv_detach_aio_context = qcow2_detach_aio_context, + .bdrv_attach_aio_context = qcow2_attach_aio_context, }; static void bdrv_qcow2_init(void) diff --git a/qemu/block/qcow2.h b/qemu/block/qcow2.h index 72e132838..a063a3c1a 100644 --- a/qemu/block/qcow2.h +++ b/qemu/block/qcow2.h @@ -26,7 +26,7 @@ #define BLOCK_QCOW2_H #include "crypto/cipher.h" -#include "block/coroutine.h" +#include "qemu/coroutine.h" //#define DEBUG_ALLOC //#define DEBUG_ALLOC2 @@ -96,6 +96,7 @@ #define QCOW2_OPT_CACHE_SIZE "cache-size" #define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size" #define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size" +#define QCOW2_OPT_CACHE_CLEAN_INTERVAL "cache-clean-interval" typedef struct QCowHeader { uint32_t magic; @@ -221,7 +222,7 @@ typedef uint64_t Qcow2GetRefcountFunc(const void *refcount_array, typedef void Qcow2SetRefcountFunc(void *refcount_array, uint64_t index, uint64_t value); -typedef struct BDRVQcowState { +typedef struct BDRVQcow2State { int cluster_bits; int cluster_size; int cluster_sectors; @@ -239,6 +240,8 @@ typedef struct BDRVQcowState { Qcow2Cache* l2_table_cache; Qcow2Cache* refcount_block_cache; + QEMUTimer *cache_clean_timer; + unsigned cache_clean_interval; uint8_t *cluster_cache; uint8_t *cluster_data; @@ -290,9 +293,7 @@ typedef struct BDRVQcowState { * override) */ char *image_backing_file; char *image_backing_format; -} BDRVQcowState; - -struct QCowAIOCB; +} BDRVQcow2State; typedef struct Qcow2COWRegion { /** @@ -402,28 +403,28 @@ typedef enum QCow2MetadataOverlap { #define REFT_OFFSET_MASK 0xfffffffffffffe00ULL -static inline int64_t start_of_cluster(BDRVQcowState *s, int64_t offset) +static inline int64_t start_of_cluster(BDRVQcow2State *s, int64_t offset) { return offset & ~(s->cluster_size - 1); } -static inline int64_t offset_into_cluster(BDRVQcowState *s, int64_t offset) +static inline int64_t offset_into_cluster(BDRVQcow2State *s, int64_t offset) { return offset & (s->cluster_size - 1); } -static inline int size_to_clusters(BDRVQcowState *s, int64_t size) +static inline uint64_t size_to_clusters(BDRVQcow2State *s, uint64_t size) { return (size + (s->cluster_size - 1)) >> s->cluster_bits; } -static inline int64_t size_to_l1(BDRVQcowState *s, int64_t size) +static inline int64_t size_to_l1(BDRVQcow2State *s, int64_t size) { int shift = s->cluster_bits + s->l2_bits; return (size + (1ULL << shift) - 1) >> shift; } -static inline int offset_to_l2_index(BDRVQcowState *s, int64_t offset) +static inline int offset_to_l2_index(BDRVQcow2State *s, int64_t offset) { return (offset >> s->cluster_bits) & (s->l2_size - 1); } @@ -434,12 +435,12 @@ static inline int64_t align_offset(int64_t offset, int n) return offset; } -static inline int64_t qcow2_vm_state_offset(BDRVQcowState *s) +static inline int64_t qcow2_vm_state_offset(BDRVQcow2State *s) { return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); } -static inline uint64_t qcow2_max_refcount_clusters(BDRVQcowState *s) +static inline uint64_t qcow2_max_refcount_clusters(BDRVQcow2State *s) { return QCOW_MAX_REFTABLE_SIZE >> s->cluster_bits; } @@ -458,7 +459,7 @@ static inline int qcow2_get_cluster_type(uint64_t l2_entry) } /* Check whether refcounts are eager or lazy */ -static inline bool qcow2_need_accurate_refcounts(BDRVQcowState *s) +static inline bool qcow2_need_accurate_refcounts(BDRVQcow2State *s) { return !(s->incompatible_features & QCOW2_INCOMPAT_DIRTY); } @@ -506,8 +507,8 @@ int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, enum qcow2_discard_type type); int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size); -int qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, - int nb_clusters); +int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int64_t nb_clusters); int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); void qcow2_free_clusters(BlockDriverState *bs, int64_t offset, int64_t size, @@ -528,13 +529,17 @@ int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, int64_t size); +int qcow2_change_refcount_order(BlockDriverState *bs, int refcount_order, + BlockDriverAmendStatusCB *status_cb, + void *cb_opaque, Error **errp); + /* qcow2-cluster.c functions */ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, bool exact_size); int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index); void qcow2_l2_cache_reset(BlockDriverState *bs); int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); -int qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num, +int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, uint8_t *out_buf, const uint8_t *in_buf, int nb_sectors, bool enc, Error **errp); @@ -552,7 +557,8 @@ int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors); int qcow2_expand_zero_clusters(BlockDriverState *bs, - BlockDriverAmendStatusCB *status_cb); + BlockDriverAmendStatusCB *status_cb, + void *cb_opaque); /* qcow2-snapshot.c functions */ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); @@ -581,6 +587,7 @@ int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, Qcow2Cache *dependency); void qcow2_cache_depends_on_flush(Qcow2Cache *c); +void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c); int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c); int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, diff --git a/qemu/block/qed-check.c b/qemu/block/qed-check.c index 36ecd290d..622f30897 100644 --- a/qemu/block/qed-check.c +++ b/qemu/block/qed-check.c @@ -11,6 +11,7 @@ * */ +#include "qemu/osdep.h" #include "qed.h" typedef struct { diff --git a/qemu/block/qed-cluster.c b/qemu/block/qed-cluster.c index f64b2af8f..c24e75616 100644 --- a/qemu/block/qed-cluster.c +++ b/qemu/block/qed-cluster.c @@ -12,6 +12,7 @@ * */ +#include "qemu/osdep.h" #include "qed.h" /** diff --git a/qemu/block/qed-gencb.c b/qemu/block/qed-gencb.c index b817a8bf5..faf8ecc84 100644 --- a/qemu/block/qed-gencb.c +++ b/qemu/block/qed-gencb.c @@ -11,6 +11,7 @@ * */ +#include "qemu/osdep.h" #include "qed.h" void *gencb_alloc(size_t len, BlockCompletionFunc *cb, void *opaque) diff --git a/qemu/block/qed-l2-cache.c b/qemu/block/qed-l2-cache.c index e9b2aae44..5cba79465 100644 --- a/qemu/block/qed-l2-cache.c +++ b/qemu/block/qed-l2-cache.c @@ -50,6 +50,7 @@ * table will be deleted in favor of the existing cache entry. */ +#include "qemu/osdep.h" #include "trace.h" #include "qed.h" diff --git a/qemu/block/qed-table.c b/qemu/block/qed-table.c index 513aa872c..802945f5e 100644 --- a/qemu/block/qed-table.c +++ b/qemu/block/qed-table.c @@ -12,6 +12,7 @@ * */ +#include "qemu/osdep.h" #include "trace.h" #include "qemu/sockets.h" /* for EINPROGRESS on Windows */ #include "qed.h" @@ -63,7 +64,7 @@ static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size, qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); - bdrv_aio_readv(s->bs->file, offset / BDRV_SECTOR_SIZE, qiov, + bdrv_aio_readv(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, qiov, qiov->size / BDRV_SECTOR_SIZE, qed_read_table_cb, read_table_cb); } @@ -152,7 +153,7 @@ static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, /* Adjust for offset into table */ offset += start * sizeof(uint64_t); - bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, + bdrv_aio_writev(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, &write_table_cb->qiov, write_table_cb->qiov.size / BDRV_SECTOR_SIZE, qed_write_table_cb, write_table_cb); diff --git a/qemu/block/qed.c b/qemu/block/qed.c index 954ed007c..0af52741d 100644 --- a/qemu/block/qed.c +++ b/qemu/block/qed.c @@ -12,11 +12,14 @@ * */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "qemu/timer.h" #include "trace.h" #include "qed.h" #include "qapi/qmp/qerror.h" #include "migration/migration.h" +#include "sysemu/block-backend.h" static const AIOCBInfo qed_aiocb_info = { .aiocb_size = sizeof(QEDAIOCB), @@ -82,7 +85,7 @@ int qed_write_header_sync(BDRVQEDState *s) int ret; qed_header_cpu_to_le(&s->header, &le); - ret = bdrv_pwrite(s->bs->file, 0, &le, sizeof(le)); + ret = bdrv_pwrite(s->bs->file->bs, 0, &le, sizeof(le)); if (ret != sizeof(le)) { return ret; } @@ -119,7 +122,7 @@ static void qed_write_header_read_cb(void *opaque, int ret) /* Update header */ qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf); - bdrv_aio_writev(s->bs->file, 0, &write_header_cb->qiov, + bdrv_aio_writev(s->bs->file->bs, 0, &write_header_cb->qiov, write_header_cb->nsectors, qed_write_header_cb, write_header_cb); } @@ -152,7 +155,7 @@ static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb, write_header_cb->iov.iov_len = len; qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1); - bdrv_aio_readv(s->bs->file, 0, &write_header_cb->qiov, nsectors, + bdrv_aio_readv(s->bs->file->bs, 0, &write_header_cb->qiov, nsectors, qed_write_header_read_cb, write_header_cb); } @@ -344,7 +347,7 @@ static void qed_start_need_check_timer(BDRVQEDState *s) * migration. */ timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + - get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT); + NANOSECONDS_PER_SECOND * QED_NEED_CHECK_TIMEOUT); } /* It's okay to call this multiple times or when no timer is started */ @@ -354,12 +357,6 @@ static void qed_cancel_need_check_timer(BDRVQEDState *s) timer_del(s->need_check_timer); } -static void bdrv_qed_rebind(BlockDriverState *bs) -{ - BDRVQEDState *s = bs->opaque; - s->bs = bs; -} - static void bdrv_qed_detach_aio_context(BlockDriverState *bs) { BDRVQEDState *s = bs->opaque; @@ -392,7 +389,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, s->bs = bs; QSIMPLEQ_INIT(&s->allocating_write_reqs); - ret = bdrv_pread(bs->file, 0, &le_header, sizeof(le_header)); + ret = bdrv_pread(bs->file->bs, 0, &le_header, sizeof(le_header)); if (ret < 0) { return ret; } @@ -404,11 +401,8 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, } if (s->header.features & ~QED_FEATURE_MASK) { /* image uses unsupported feature bits */ - char buf[64]; - snprintf(buf, sizeof(buf), "%" PRIx64, - s->header.features & ~QED_FEATURE_MASK); - error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_or_node_name(bs), "QED", buf); + error_setg(errp, "Unsupported QED features: %" PRIx64, + s->header.features & ~QED_FEATURE_MASK); return -ENOTSUP; } if (!qed_is_cluster_size_valid(s->header.cluster_size)) { @@ -416,7 +410,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, } /* Round down file size to the last cluster */ - file_size = bdrv_getlength(bs->file); + file_size = bdrv_getlength(bs->file->bs); if (file_size < 0) { return file_size; } @@ -452,7 +446,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, return -EINVAL; } - ret = qed_read_string(bs->file, s->header.backing_filename_offset, + ret = qed_read_string(bs->file->bs, s->header.backing_filename_offset, s->header.backing_filename_size, bs->backing_file, sizeof(bs->backing_file)); if (ret < 0) { @@ -471,7 +465,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, * feature is no longer valid. */ if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 && - !bdrv_is_read_only(bs->file) && !(flags & BDRV_O_INCOMING)) { + !bdrv_is_read_only(bs->file->bs) && !(flags & BDRV_O_INACTIVE)) { s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK; ret = qed_write_header_sync(s); @@ -480,7 +474,7 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, } /* From here on only known autoclear feature bits are valid */ - bdrv_flush(bs->file); + bdrv_flush(bs->file->bs); } s->l1_table = qed_alloc_table(s); @@ -498,8 +492,8 @@ static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, * potentially inconsistent images to be opened read-only. This can * aid data recovery from an otherwise inconsistent image. */ - if (!bdrv_is_read_only(bs->file) && - !(flags & BDRV_O_INCOMING)) { + if (!bdrv_is_read_only(bs->file->bs) && + !(flags & BDRV_O_INACTIVE)) { BdrvCheckResult result = {0}; ret = qed_check(s, &result, true); @@ -541,7 +535,7 @@ static void bdrv_qed_close(BlockDriverState *bs) bdrv_qed_detach_aio_context(bs); /* Ensure writes reach stable storage */ - bdrv_flush(bs->file); + bdrv_flush(bs->file->bs); /* Clean shutdown, no check required on next open */ if (s->header.features & QED_F_NEED_CHECK) { @@ -573,7 +567,7 @@ static int qed_create(const char *filename, uint32_t cluster_size, size_t l1_size = header.cluster_size * header.table_size; Error *local_err = NULL; int ret = 0; - BlockDriverState *bs; + BlockBackend *blk; ret = bdrv_create_file(filename, opts, &local_err); if (ret < 0) { @@ -581,17 +575,17 @@ static int qed_create(const char *filename, uint32_t cluster_size, return ret; } - bs = NULL; - ret = bdrv_open(&bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, NULL, - &local_err); - if (ret < 0) { + blk = blk_new_open(filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); + if (blk == NULL) { error_propagate(errp, local_err); - return ret; + return -EIO; } + blk_set_allow_write_beyond_eof(blk, true); + /* File must start empty and grow, check truncate is supported */ - ret = bdrv_truncate(bs, 0); + ret = blk_truncate(blk, 0); if (ret < 0) { goto out; } @@ -607,18 +601,18 @@ static int qed_create(const char *filename, uint32_t cluster_size, } qed_header_cpu_to_le(&header, &le_header); - ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header)); + ret = blk_pwrite(blk, 0, &le_header, sizeof(le_header)); if (ret < 0) { goto out; } - ret = bdrv_pwrite(bs, sizeof(le_header), backing_file, - header.backing_filename_size); + ret = blk_pwrite(blk, sizeof(le_header), backing_file, + header.backing_filename_size); if (ret < 0) { goto out; } l1_table = g_malloc0(l1_size); - ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size); + ret = blk_pwrite(blk, header.l1_table_offset, l1_table, l1_size); if (ret < 0) { goto out; } @@ -626,7 +620,7 @@ static int qed_create(const char *filename, uint32_t cluster_size, ret = 0; /* success */ out: g_free(l1_table); - bdrv_unref(bs); + blk_unref(blk); return ret; } @@ -686,6 +680,7 @@ typedef struct { uint64_t pos; int64_t status; int *pnum; + BlockDriverState **file; } QEDIsAllocatedCB; static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len) @@ -697,6 +692,7 @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l case QED_CLUSTER_FOUND: offset |= qed_offset_into_cluster(s, cb->pos); cb->status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; + *cb->file = cb->bs->file->bs; break; case QED_CLUSTER_ZERO: cb->status = BDRV_BLOCK_ZERO; @@ -718,7 +714,8 @@ static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t l static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) + int nb_sectors, int *pnum, + BlockDriverState **file) { BDRVQEDState *s = bs->opaque; size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE; @@ -727,6 +724,7 @@ static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs, .pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE, .status = BDRV_BLOCK_OFFSET_MASK, .pnum = pnum, + .file = file, }; QEDRequest request = { .l2_table = NULL }; @@ -772,8 +770,8 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, /* If there is a backing file, get its length. Treat the absence of a * backing file like a zero length backing file. */ - if (s->bs->backing_hd) { - int64_t l = bdrv_getlength(s->bs->backing_hd); + if (s->bs->backing) { + int64_t l = bdrv_getlength(s->bs->backing->bs); if (l < 0) { cb(opaque, l); return; @@ -802,7 +800,7 @@ static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, qemu_iovec_concat(*backing_qiov, qiov, 0, size); BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); - bdrv_aio_readv(s->bs->backing_hd, pos / BDRV_SECTOR_SIZE, + bdrv_aio_readv(s->bs->backing->bs, pos / BDRV_SECTOR_SIZE, *backing_qiov, size / BDRV_SECTOR_SIZE, cb, opaque); } @@ -839,7 +837,7 @@ static void qed_copy_from_backing_file_write(void *opaque, int ret) } BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE); - bdrv_aio_writev(s->bs->file, copy_cb->offset / BDRV_SECTOR_SIZE, + bdrv_aio_writev(s->bs->file->bs, copy_cb->offset / BDRV_SECTOR_SIZE, ©_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE, qed_copy_from_backing_file_cb, copy_cb); } @@ -1055,7 +1053,7 @@ static void qed_aio_write_flush_before_l2_update(void *opaque, int ret) QEDAIOCB *acb = opaque; BDRVQEDState *s = acb_to_s(acb); - if (!bdrv_aio_flush(s->bs->file, qed_aio_write_l2_update_cb, opaque)) { + if (!bdrv_aio_flush(s->bs->file->bs, qed_aio_write_l2_update_cb, opaque)) { qed_aio_complete(acb, -EIO); } } @@ -1081,7 +1079,7 @@ static void qed_aio_write_main(void *opaque, int ret) if (acb->find_cluster_ret == QED_CLUSTER_FOUND) { next_fn = qed_aio_next_io; } else { - if (s->bs->backing_hd) { + if (s->bs->backing) { next_fn = qed_aio_write_flush_before_l2_update; } else { next_fn = qed_aio_write_l2_update_cb; @@ -1089,7 +1087,7 @@ static void qed_aio_write_main(void *opaque, int ret) } BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); - bdrv_aio_writev(s->bs->file, offset / BDRV_SECTOR_SIZE, + bdrv_aio_writev(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, next_fn, acb); } @@ -1139,7 +1137,7 @@ static void qed_aio_write_prefill(void *opaque, int ret) static bool qed_should_set_need_check(BDRVQEDState *s) { /* The flush before L2 update path ensures consistency */ - if (s->bs->backing_hd) { + if (s->bs->backing) { return false; } @@ -1321,7 +1319,7 @@ static void qed_aio_read_data(void *opaque, int ret, } BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - bdrv_aio_readv(bs->file, offset / BDRV_SECTOR_SIZE, + bdrv_aio_readv(bs->file->bs, offset / BDRV_SECTOR_SIZE, &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, qed_aio_next_io, acb); return; @@ -1443,7 +1441,7 @@ static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, struct iovec iov; /* Refuse if there are untouched backing file sectors */ - if (bs->backing_hd) { + if (bs->backing) { if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) { return -ENOTSUP; } @@ -1580,7 +1578,7 @@ static int bdrv_qed_change_backing_file(BlockDriverState *bs, } /* Write new header */ - ret = bdrv_pwrite_sync(bs->file, 0, buffer, buffer_len); + ret = bdrv_pwrite_sync(bs->file->bs, 0, buffer, buffer_len); g_free(buffer); if (ret == 0) { memcpy(&s->header, &new_header, sizeof(new_header)); @@ -1596,7 +1594,7 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp) bdrv_qed_close(bs); - bdrv_invalidate_cache(bs->file, &local_err); + bdrv_invalidate_cache(bs->file->bs, &local_err); if (local_err) { error_propagate(errp, local_err); return; @@ -1605,9 +1603,8 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp) memset(s, 0, sizeof(BDRVQEDState)); ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err); if (local_err) { - error_setg(errp, "Could not reopen qed layer: %s", - error_get_pretty(local_err)); - error_free(local_err); + error_propagate(errp, local_err); + error_prepend(errp, "Could not reopen qed layer: "); return; } else if (ret < 0) { error_setg_errno(errp, -ret, "Could not reopen qed layer"); @@ -1664,7 +1661,6 @@ static BlockDriver bdrv_qed = { .supports_backing = true, .bdrv_probe = bdrv_qed_probe, - .bdrv_rebind = bdrv_qed_rebind, .bdrv_open = bdrv_qed_open, .bdrv_close = bdrv_qed_close, .bdrv_reopen_prepare = bdrv_qed_reopen_prepare, diff --git a/qemu/block/qed.h b/qemu/block/qed.h index 615e676fc..22b319875 100644 --- a/qemu/block/qed.h +++ b/qemu/block/qed.h @@ -16,6 +16,7 @@ #define BLOCK_QED_H #include "block/block_int.h" +#include "qemu/cutils.h" /* The layout of a QED file is as follows: * diff --git a/qemu/block/quorum.c b/qemu/block/quorum.c index 2f6c45f76..da15465a9 100644 --- a/qemu/block/quorum.c +++ b/qemu/block/quorum.c @@ -13,6 +13,7 @@ * See the COPYING file in the top-level directory. */ +#include "qemu/osdep.h" #include "block/block_int.h" #include "qapi/qmp/qbool.h" #include "qapi/qmp/qdict.h" @@ -64,7 +65,7 @@ typedef struct QuorumVotes { /* the following structure holds the state of one quorum instance */ typedef struct BDRVQuorumState { - BlockDriverState **bs; /* children BlockDriverStates */ + BdrvChild **children; /* children BlockDriverStates */ int num_children; /* children count */ int threshold; /* if less than threshold children reads gave the * same result a quorum error occurs. @@ -214,14 +215,16 @@ static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, return acb; } -static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret) +static void quorum_report_bad(QuorumOpType type, uint64_t sector_num, + int nb_sectors, char *node_name, int ret) { const char *msg = NULL; if (ret < 0) { msg = strerror(-ret); } - qapi_event_send_quorum_report_bad(!!msg, msg, node_name, - acb->sector_num, acb->nb_sectors, &error_abort); + + qapi_event_send_quorum_report_bad(type, !!msg, msg, node_name, + sector_num, nb_sectors, &error_abort); } static void quorum_report_failure(QuorumAIOCB *acb) @@ -283,9 +286,19 @@ static void quorum_aio_cb(void *opaque, int ret) BDRVQuorumState *s = acb->common.bs->opaque; bool rewrite = false; + if (ret == 0) { + acb->success_count++; + } else { + QuorumOpType type; + type = acb->is_read ? QUORUM_OP_TYPE_READ : QUORUM_OP_TYPE_WRITE; + quorum_report_bad(type, acb->sector_num, acb->nb_sectors, + sacb->aiocb->bs->node_name, ret); + } + if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) { /* We try to read next child in FIFO order if we fail to read */ - if (ret < 0 && ++acb->child_iter < s->num_children) { + if (ret < 0 && (acb->child_iter + 1) < s->num_children) { + acb->child_iter++; read_fifo_child(acb); return; } @@ -300,11 +313,6 @@ static void quorum_aio_cb(void *opaque, int ret) sacb->ret = ret; acb->count++; - if (ret == 0) { - acb->success_count++; - } else { - quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret); - } assert(acb->count <= s->num_children); assert(acb->success_count <= s->num_children); if (acb->count < s->num_children) { @@ -336,7 +344,9 @@ static void quorum_report_bad_versions(BDRVQuorumState *s, continue; } QLIST_FOREACH(item, &version->items, next) { - quorum_report_bad(acb, s->bs[item->index]->node_name, 0); + quorum_report_bad(QUORUM_OP_TYPE_READ, acb->sector_num, + acb->nb_sectors, + s->children[item->index]->bs->node_name, 0); } } } @@ -369,8 +379,9 @@ static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, continue; } QLIST_FOREACH(item, &version->items, next) { - bdrv_aio_writev(s->bs[item->index], acb->sector_num, acb->qiov, - acb->nb_sectors, quorum_rewrite_aio_cb, acb); + bdrv_aio_writev(s->children[item->index]->bs, acb->sector_num, + acb->qiov, acb->nb_sectors, quorum_rewrite_aio_cb, + acb); } } @@ -639,14 +650,15 @@ static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb) int i; for (i = 0; i < s->num_children; i++) { - acb->qcrs[i].buf = qemu_blockalign(s->bs[i], acb->qiov->size); + acb->qcrs[i].buf = qemu_blockalign(s->children[i]->bs, acb->qiov->size); qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov); qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf); } for (i = 0; i < s->num_children; i++) { - bdrv_aio_readv(s->bs[i], acb->sector_num, &acb->qcrs[i].qiov, - acb->nb_sectors, quorum_aio_cb, &acb->qcrs[i]); + acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i]->bs, acb->sector_num, + &acb->qcrs[i].qiov, acb->nb_sectors, + quorum_aio_cb, &acb->qcrs[i]); } return &acb->common; @@ -656,14 +668,15 @@ static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb) { BDRVQuorumState *s = acb->common.bs->opaque; - acb->qcrs[acb->child_iter].buf = qemu_blockalign(s->bs[acb->child_iter], - acb->qiov->size); + acb->qcrs[acb->child_iter].buf = + qemu_blockalign(s->children[acb->child_iter]->bs, acb->qiov->size); qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov); qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov, acb->qcrs[acb->child_iter].buf); - bdrv_aio_readv(s->bs[acb->child_iter], acb->sector_num, - &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors, - quorum_aio_cb, &acb->qcrs[acb->child_iter]); + acb->qcrs[acb->child_iter].aiocb = + bdrv_aio_readv(s->children[acb->child_iter]->bs, acb->sector_num, + &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors, + quorum_aio_cb, &acb->qcrs[acb->child_iter]); return &acb->common; } @@ -702,8 +715,8 @@ static BlockAIOCB *quorum_aio_writev(BlockDriverState *bs, int i; for (i = 0; i < s->num_children; i++) { - acb->qcrs[i].aiocb = bdrv_aio_writev(s->bs[i], sector_num, qiov, - nb_sectors, &quorum_aio_cb, + acb->qcrs[i].aiocb = bdrv_aio_writev(s->children[i]->bs, sector_num, + qiov, nb_sectors, &quorum_aio_cb, &acb->qcrs[i]); } @@ -717,12 +730,12 @@ static int64_t quorum_getlength(BlockDriverState *bs) int i; /* check that all file have the same length */ - result = bdrv_getlength(s->bs[0]); + result = bdrv_getlength(s->children[0]->bs); if (result < 0) { return result; } for (i = 1; i < s->num_children; i++) { - int64_t value = bdrv_getlength(s->bs[i]); + int64_t value = bdrv_getlength(s->children[i]->bs); if (value < 0) { return value; } @@ -741,7 +754,7 @@ static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp) int i; for (i = 0; i < s->num_children; i++) { - bdrv_invalidate_cache(s->bs[i], &local_err); + bdrv_invalidate_cache(s->children[i]->bs, &local_err); if (local_err) { error_propagate(errp, local_err); return; @@ -757,19 +770,30 @@ static coroutine_fn int quorum_co_flush(BlockDriverState *bs) QuorumVoteValue result_value; int i; int result = 0; + int success_count = 0; QLIST_INIT(&error_votes.vote_list); error_votes.compare = quorum_64bits_compare; for (i = 0; i < s->num_children; i++) { - result = bdrv_co_flush(s->bs[i]); - result_value.l = result; - quorum_count_vote(&error_votes, &result_value, i); + result = bdrv_co_flush(s->children[i]->bs); + if (result) { + quorum_report_bad(QUORUM_OP_TYPE_FLUSH, 0, + bdrv_nb_sectors(s->children[i]->bs), + s->children[i]->bs->node_name, result); + result_value.l = result; + quorum_count_vote(&error_votes, &result_value, i); + } else { + success_count++; + } } - winner = quorum_get_vote_winner(&error_votes); - result = winner->value.l; - + if (success_count >= s->threshold) { + result = 0; + } else { + winner = quorum_get_vote_winner(&error_votes); + result = winner->value.l; + } quorum_free_vote_list(&error_votes); return result; @@ -782,7 +806,7 @@ static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs, int i; for (i = 0; i < s->num_children; i++) { - bool perm = bdrv_recurse_is_first_non_filter(s->bs[i], + bool perm = bdrv_recurse_is_first_non_filter(s->children[i]->bs, candidate); if (perm) { return true; @@ -846,7 +870,7 @@ static int parse_read_pattern(const char *opt) return QUORUM_READ_PATTERN_QUORUM; } - for (i = 0; i < QUORUM_READ_PATTERN_MAX; i++) { + for (i = 0; i < QUORUM_READ_PATTERN__MAX; i++) { if (!strcmp(opt, QuorumReadPattern_lookup[i])) { return i; } @@ -889,6 +913,12 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, } s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0); + /* and validate it against s->num_children */ + ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err); + if (ret < 0) { + goto exit; + } + ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN)); if (ret < 0) { error_setg(&local_err, "Please set read-pattern as fifo or quorum"); @@ -897,12 +927,6 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, s->read_pattern = ret; if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) { - /* and validate it against s->num_children */ - ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err); - if (ret < 0) { - goto exit; - } - /* is the driver in blkverify mode */ if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) && s->num_children == 2 && s->threshold == 2) { @@ -922,8 +946,8 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, } } - /* allocate the children BlockDriverState array */ - s->bs = g_new0(BlockDriverState *, s->num_children); + /* allocate the children array */ + s->children = g_new0(BdrvChild *, s->num_children); opened = g_new0(bool, s->num_children); for (i = 0; i < s->num_children; i++) { @@ -931,9 +955,10 @@ static int quorum_open(BlockDriverState *bs, QDict *options, int flags, ret = snprintf(indexstr, 32, "children.%d", i); assert(ret < 32); - ret = bdrv_open_image(&s->bs[i], NULL, options, indexstr, bs, - &child_format, false, &local_err); - if (ret < 0) { + s->children[i] = bdrv_open_child(NULL, options, indexstr, bs, + &child_format, false, &local_err); + if (local_err) { + ret = -EINVAL; goto close_exit; } @@ -949,9 +974,9 @@ close_exit: if (!opened[i]) { continue; } - bdrv_unref(s->bs[i]); + bdrv_unref_child(bs, s->children[i]); } - g_free(s->bs); + g_free(s->children); g_free(opened); exit: qemu_opts_del(opts); @@ -968,10 +993,10 @@ static void quorum_close(BlockDriverState *bs) int i; for (i = 0; i < s->num_children; i++) { - bdrv_unref(s->bs[i]); + bdrv_unref_child(bs, s->children[i]); } - g_free(s->bs); + g_free(s->children); } static void quorum_detach_aio_context(BlockDriverState *bs) @@ -980,7 +1005,7 @@ static void quorum_detach_aio_context(BlockDriverState *bs) int i; for (i = 0; i < s->num_children; i++) { - bdrv_detach_aio_context(s->bs[i]); + bdrv_detach_aio_context(s->children[i]->bs); } } @@ -991,11 +1016,11 @@ static void quorum_attach_aio_context(BlockDriverState *bs, int i; for (i = 0; i < s->num_children; i++) { - bdrv_attach_aio_context(s->bs[i], new_context); + bdrv_attach_aio_context(s->children[i]->bs, new_context); } } -static void quorum_refresh_filename(BlockDriverState *bs) +static void quorum_refresh_filename(BlockDriverState *bs, QDict *options) { BDRVQuorumState *s = bs->opaque; QDict *opts; @@ -1003,16 +1028,17 @@ static void quorum_refresh_filename(BlockDriverState *bs) int i; for (i = 0; i < s->num_children; i++) { - bdrv_refresh_filename(s->bs[i]); - if (!s->bs[i]->full_open_options) { + bdrv_refresh_filename(s->children[i]->bs); + if (!s->children[i]->bs->full_open_options) { return; } } children = qlist_new(); for (i = 0; i < s->num_children; i++) { - QINCREF(s->bs[i]->full_open_options); - qlist_append_obj(children, QOBJECT(s->bs[i]->full_open_options)); + QINCREF(s->children[i]->bs->full_open_options); + qlist_append_obj(children, + QOBJECT(s->children[i]->bs->full_open_options)); } opts = qdict_new(); diff --git a/qemu/block/raw-aio.h b/qemu/block/raw-aio.h index 31d791fe6..811e37501 100644 --- a/qemu/block/raw-aio.h +++ b/qemu/block/raw-aio.h @@ -15,6 +15,8 @@ #ifndef QEMU_RAW_AIO_H #define QEMU_RAW_AIO_H +#include "qemu/iov.h" + /* AIO request types */ #define QEMU_AIO_READ 0x0001 #define QEMU_AIO_WRITE 0x0002 diff --git a/qemu/block/raw-posix.c b/qemu/block/raw-posix.c index 855febed5..906d5c941 100644 --- a/qemu/block/raw-posix.c +++ b/qemu/block/raw-posix.c @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ -#include "qemu-common.h" +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/cutils.h" #include "qemu/error-report.h" #include "qemu/timer.h" #include "qemu/log.h" @@ -43,6 +45,7 @@ #include <IOKit/storage/IOMedia.h> #include <IOKit/storage/IOCDMedia.h> //#include <IOKit/storage/IOCDTypes.h> +#include <IOKit/storage/IODVDMedia.h> #include <CoreFoundation/CoreFoundation.h> #endif @@ -51,8 +54,6 @@ #include <sys/dkio.h> #endif #ifdef __linux__ -#include <sys/types.h> -#include <sys/stat.h> #include <sys/ioctl.h> #include <sys/param.h> #include <linux/cdrom.h> @@ -127,11 +128,6 @@ do { \ #define FTYPE_FILE 0 #define FTYPE_CD 1 -#define FTYPE_FD 2 - -/* if the FD is not accessed during that time (in ns), we try to - reopen it to see if the disk has been changed */ -#define FD_OPEN_TIMEOUT (1000000000) #define MAX_BLOCKSIZE 4096 @@ -141,13 +137,6 @@ typedef struct BDRVRawState { int open_flags; size_t buf_align; -#if defined(__linux__) - /* linux floppy specific */ - int64_t fd_open_time; - int64_t fd_error_time; - int fd_got_error; - int fd_media_changed; -#endif #ifdef CONFIG_LINUX_AIO int use_aio; void *aio_ctx; @@ -512,14 +501,19 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, goto fail; } if (!s->use_aio && (bdrv_flags & BDRV_O_NATIVE_AIO)) { - error_printf("WARNING: aio=native was specified for '%s', but " - "it requires cache.direct=on, which was not " - "specified. Falling back to aio=threads.\n" - " This will become an error condition in " - "future QEMU versions.\n", - bs->filename); + error_setg(errp, "aio=native was specified, but it requires " + "cache.direct=on, which was not specified."); + ret = -EINVAL; + goto fail; } -#endif +#else + if (bdrv_flags & BDRV_O_NATIVE_AIO) { + error_setg(errp, "aio=native was specified, but is not supported " + "in this build."); + ret = -EINVAL; + goto fail; + } +#endif /* !defined(CONFIG_LINUX_AIO) */ s->has_discard = true; s->has_write_zeroes = true; @@ -626,7 +620,7 @@ static int raw_reopen_prepare(BDRVReopenState *state, } #endif - if (s->type == FTYPE_FD || s->type == FTYPE_CD) { + if (s->type == FTYPE_CD) { raw_s->open_flags |= O_NONBLOCK; } @@ -670,11 +664,17 @@ static int raw_reopen_prepare(BDRVReopenState *state, /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */ if (raw_s->fd == -1) { - assert(!(raw_s->open_flags & O_CREAT)); - raw_s->fd = qemu_open(state->bs->filename, raw_s->open_flags); - if (raw_s->fd == -1) { - error_setg_errno(errp, errno, "Could not reopen file"); - ret = -1; + const char *normalized_filename = state->bs->filename; + ret = raw_normalize_devicepath(&normalized_filename); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not normalize device path"); + } else { + assert(!(raw_s->open_flags & O_CREAT)); + raw_s->fd = qemu_open(normalized_filename, raw_s->open_flags); + if (raw_s->fd == -1) { + error_setg_errno(errp, errno, "Could not reopen file"); + ret = -1; + } } } @@ -780,7 +780,6 @@ static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) { BDRVRawState *s = bs->opaque; struct hd_geometry ioctl_geo = {0}; - uint32_t blksize; /* If DASD, get its geometry */ if (check_for_dasd(s->fd) < 0) { @@ -800,12 +799,6 @@ static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) } geo->heads = ioctl_geo.heads; geo->sectors = ioctl_geo.sectors; - if (!probe_physical_blocksize(s->fd, &blksize)) { - /* overwrite cyls: HDIO_GETGEO result is incorrect for big drives */ - geo->cylinders = bdrv_nb_sectors(bs) / (blksize / BDRV_SECTOR_SIZE) - / (geo->heads * geo->sectors); - return 0; - } geo->cylinders = ioctl_geo.cylinders; return 0; @@ -1253,7 +1246,7 @@ static int aio_worker(void *arg) break; } - g_slice_free(RawPosixAIOData, aiocb); + g_free(aiocb); return ret; } @@ -1261,7 +1254,7 @@ static int paio_submit_co(BlockDriverState *bs, int fd, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, int type) { - RawPosixAIOData *acb = g_slice_new(RawPosixAIOData); + RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); ThreadPool *pool; acb->bs = bs; @@ -1286,7 +1279,7 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockCompletionFunc *cb, void *opaque, int type) { - RawPosixAIOData *acb = g_slice_new(RawPosixAIOData); + RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); ThreadPool *pool; acb->bs = bs; @@ -1633,7 +1626,7 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp) nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false); buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); prealloc = qapi_enum_parse(PreallocMode_lookup, buf, - PREALLOC_MODE_MAX, PREALLOC_MODE_OFF, + PREALLOC_MODE__MAX, PREALLOC_MODE_OFF, &local_err); g_free(buf); if (local_err) { @@ -1642,7 +1635,7 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp) goto out; } - fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, + fd = qemu_open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY, 0644); if (fd < 0) { result = -errno; @@ -1827,7 +1820,8 @@ static int find_allocation(BlockDriverState *bs, off_t start, */ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) + int nb_sectors, int *pnum, + BlockDriverState **file) { off_t start, data = 0, hole = 0; int64_t total_size; @@ -1869,6 +1863,7 @@ static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); ret = BDRV_BLOCK_ZERO; } + *file = bs; return ret | BDRV_BLOCK_OFFSET_VALID | start; } @@ -1972,36 +1967,51 @@ BlockDriver bdrv_file = { /* host device */ #if defined(__APPLE__) && defined(__MACH__) -static kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator ); -static kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize ); - -kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator ) +static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, + CFIndex maxPathSize, int flags); +static char *FindEjectableOpticalMedia(io_iterator_t *mediaIterator) { - kern_return_t kernResult; + kern_return_t kernResult = KERN_FAILURE; mach_port_t masterPort; CFMutableDictionaryRef classesToMatch; + const char *matching_array[] = {kIODVDMediaClass, kIOCDMediaClass}; + char *mediaType = NULL; kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort ); if ( KERN_SUCCESS != kernResult ) { printf( "IOMasterPort returned %d\n", kernResult ); } - classesToMatch = IOServiceMatching( kIOCDMediaClass ); - if ( classesToMatch == NULL ) { - printf( "IOServiceMatching returned a NULL dictionary.\n" ); - } else { - CFDictionarySetValue( classesToMatch, CFSTR( kIOMediaEjectableKey ), kCFBooleanTrue ); - } - kernResult = IOServiceGetMatchingServices( masterPort, classesToMatch, mediaIterator ); - if ( KERN_SUCCESS != kernResult ) - { - printf( "IOServiceGetMatchingServices returned %d\n", kernResult ); - } + int index; + for (index = 0; index < ARRAY_SIZE(matching_array); index++) { + classesToMatch = IOServiceMatching(matching_array[index]); + if (classesToMatch == NULL) { + error_report("IOServiceMatching returned NULL for %s", + matching_array[index]); + continue; + } + CFDictionarySetValue(classesToMatch, CFSTR(kIOMediaEjectableKey), + kCFBooleanTrue); + kernResult = IOServiceGetMatchingServices(masterPort, classesToMatch, + mediaIterator); + if (kernResult != KERN_SUCCESS) { + error_report("Note: IOServiceGetMatchingServices returned %d", + kernResult); + continue; + } - return kernResult; + /* If a match was found, leave the loop */ + if (*mediaIterator != 0) { + DPRINTF("Matching using %s\n", matching_array[index]); + mediaType = g_strdup(matching_array[index]); + break; + } + } + return mediaType; } -kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex maxPathSize ) +kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, + CFIndex maxPathSize, int flags) { io_object_t nextMedia; kern_return_t kernResult = KERN_FAILURE; @@ -2014,7 +2024,9 @@ kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex ma if ( bsdPathAsCFString ) { size_t devPathLength; strcpy( bsdPath, _PATH_DEV ); - strcat( bsdPath, "r" ); + if (flags & BDRV_O_NOCACHE) { + strcat(bsdPath, "r"); + } devPathLength = strlen( bsdPath ); if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) { kernResult = KERN_SUCCESS; @@ -2027,7 +2039,46 @@ kern_return_t GetBSDPath( io_iterator_t mediaIterator, char *bsdPath, CFIndex ma return kernResult; } -#endif +/* Sets up a real cdrom for use in QEMU */ +static bool setup_cdrom(char *bsd_path, Error **errp) +{ + int index, num_of_test_partitions = 2, fd; + char test_partition[MAXPATHLEN]; + bool partition_found = false; + + /* look for a working partition */ + for (index = 0; index < num_of_test_partitions; index++) { + snprintf(test_partition, sizeof(test_partition), "%ss%d", bsd_path, + index); + fd = qemu_open(test_partition, O_RDONLY | O_BINARY | O_LARGEFILE); + if (fd >= 0) { + partition_found = true; + qemu_close(fd); + break; + } + } + + /* if a working partition on the device was not found */ + if (partition_found == false) { + error_setg(errp, "Failed to find a working partition on disc"); + } else { + DPRINTF("Using %s as optical disc\n", test_partition); + pstrcpy(bsd_path, MAXPATHLEN, test_partition); + } + return partition_found; +} + +/* Prints directions on mounting and unmounting a device */ +static void print_unmounting_directions(const char *file_name) +{ + error_report("If device %s is mounted on the desktop, unmount" + " it first before using it in QEMU", file_name); + error_report("Command to unmount device: diskutil unmountDisk %s", + file_name); + error_report("Command to mount device: diskutil mountDisk %s", file_name); +} + +#endif /* defined(__APPLE__) && defined(__MACH__) */ static int hdev_probe_device(const char *filename) { @@ -2118,33 +2169,57 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags, #if defined(__APPLE__) && defined(__MACH__) const char *filename = qdict_get_str(options, "filename"); + char bsd_path[MAXPATHLEN] = ""; + bool error_occurred = false; + + /* If using a real cdrom */ + if (strcmp(filename, "/dev/cdrom") == 0) { + char *mediaType = NULL; + kern_return_t ret_val; + io_iterator_t mediaIterator = 0; + + mediaType = FindEjectableOpticalMedia(&mediaIterator); + if (mediaType == NULL) { + error_setg(errp, "Please make sure your CD/DVD is in the optical" + " drive"); + error_occurred = true; + goto hdev_open_Mac_error; + } - if (strstart(filename, "/dev/cdrom", NULL)) { - kern_return_t kernResult; - io_iterator_t mediaIterator; - char bsdPath[ MAXPATHLEN ]; - int fd; - - kernResult = FindEjectableCDMedia( &mediaIterator ); - kernResult = GetBSDPath( mediaIterator, bsdPath, sizeof( bsdPath ) ); - - if ( bsdPath[ 0 ] != '\0' ) { - strcat(bsdPath,"s0"); - /* some CDs don't have a partition 0 */ - fd = qemu_open(bsdPath, O_RDONLY | O_BINARY | O_LARGEFILE); - if (fd < 0) { - bsdPath[strlen(bsdPath)-1] = '1'; - } else { - qemu_close(fd); - } - filename = bsdPath; - qdict_put(options, "filename", qstring_from_str(filename)); + ret_val = GetBSDPath(mediaIterator, bsd_path, sizeof(bsd_path), flags); + if (ret_val != KERN_SUCCESS) { + error_setg(errp, "Could not get BSD path for optical drive"); + error_occurred = true; + goto hdev_open_Mac_error; + } + + /* If a real optical drive was not found */ + if (bsd_path[0] == '\0') { + error_setg(errp, "Failed to obtain bsd path for optical drive"); + error_occurred = true; + goto hdev_open_Mac_error; + } + + /* If using a cdrom disc and finding a partition on the disc failed */ + if (strncmp(mediaType, kIOCDMediaClass, 9) == 0 && + setup_cdrom(bsd_path, errp) == false) { + print_unmounting_directions(bsd_path); + error_occurred = true; + goto hdev_open_Mac_error; } - if ( mediaIterator ) - IOObjectRelease( mediaIterator ); + qdict_put(options, "filename", qstring_from_str(bsd_path)); + +hdev_open_Mac_error: + g_free(mediaType); + if (mediaIterator) { + IOObjectRelease(mediaIterator); + } + if (error_occurred) { + return -ENOENT; + } } -#endif +#endif /* defined(__APPLE__) && defined(__MACH__) */ s->type = FTYPE_FILE; @@ -2153,6 +2228,15 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags, if (local_err) { error_propagate(errp, local_err); } +#if defined(__APPLE__) && defined(__MACH__) + if (*bsd_path) { + filename = bsd_path; + } + /* if a physical device experienced an error while being opened */ + if (strncmp(filename, "/dev/", 5) == 0) { + print_unmounting_directions(filename); + } +#endif /* defined(__APPLE__) && defined(__MACH__) */ return ret; } @@ -2172,53 +2256,6 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags, } #if defined(__linux__) -/* Note: we do not have a reliable method to detect if the floppy is - present. The current method is to try to open the floppy at every - I/O and to keep it opened during a few hundreds of ms. */ -static int fd_open(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; - int last_media_present; - - if (s->type != FTYPE_FD) - return 0; - last_media_present = (s->fd >= 0); - if (s->fd >= 0 && - (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_open_time) >= FD_OPEN_TIMEOUT) { - qemu_close(s->fd); - s->fd = -1; - DPRINTF("Floppy closed\n"); - } - if (s->fd < 0) { - if (s->fd_got_error && - (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - s->fd_error_time) < FD_OPEN_TIMEOUT) { - DPRINTF("No floppy (open delayed)\n"); - return -EIO; - } - s->fd = qemu_open(bs->filename, s->open_flags & ~O_NONBLOCK); - if (s->fd < 0) { - s->fd_error_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); - s->fd_got_error = 1; - if (last_media_present) - s->fd_media_changed = 1; - DPRINTF("No floppy\n"); - return -EIO; - } - DPRINTF("Floppy opened\n"); - } - if (!last_media_present) - s->fd_media_changed = 1; - s->fd_open_time = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); - s->fd_got_error = 0; - return 0; -} - -static int hdev_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) -{ - BDRVRawState *s = bs->opaque; - - return ioctl(s->fd, req, buf); -} static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, unsigned long int req, void *buf, @@ -2231,7 +2268,7 @@ static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, if (fd_open(bs) < 0) return NULL; - acb = g_slice_new(RawPosixAIOData); + acb = g_new(RawPosixAIOData, 1); acb->bs = bs; acb->aio_type = QEMU_AIO_IOCTL; acb->aio_fildes = s->fd; @@ -2241,8 +2278,8 @@ static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); } +#endif /* linux */ -#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) static int fd_open(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; @@ -2252,14 +2289,6 @@ static int fd_open(BlockDriverState *bs) return 0; return -EIO; } -#else /* !linux && !FreeBSD */ - -static int fd_open(BlockDriverState *bs) -{ - return 0; -} - -#endif /* !linux && !FreeBSD */ static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors, @@ -2303,17 +2332,22 @@ static int hdev_create(const char *filename, QemuOpts *opts, int64_t total_size = 0; bool has_prefix; - /* This function is used by all three protocol block drivers and therefore - * any of these three prefixes may be given. + /* This function is used by both protocol block drivers and therefore either + * of these prefixes may be given. * The return value has to be stored somewhere, otherwise this is an error * due to -Werror=unused-value. */ has_prefix = strstart(filename, "host_device:", &filename) || - strstart(filename, "host_cdrom:" , &filename) || - strstart(filename, "host_floppy:", &filename); + strstart(filename, "host_cdrom:" , &filename); (void)has_prefix; + ret = raw_normalize_devicepath(&filename); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not normalize device path"); + return ret; + } + /* Read out options */ total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), BDRV_SECTOR_SIZE); @@ -2379,160 +2413,10 @@ static BlockDriver bdrv_host_device = { /* generic scsi device */ #ifdef __linux__ - .bdrv_ioctl = hdev_ioctl, .bdrv_aio_ioctl = hdev_aio_ioctl, #endif }; -#ifdef __linux__ -static void floppy_parse_filename(const char *filename, QDict *options, - Error **errp) -{ - /* The prefix is optional, just as for "file". */ - strstart(filename, "host_floppy:", &filename); - - qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); -} - -static int floppy_open(BlockDriverState *bs, QDict *options, int flags, - Error **errp) -{ - BDRVRawState *s = bs->opaque; - Error *local_err = NULL; - int ret; - - s->type = FTYPE_FD; - - /* open will not fail even if no floppy is inserted, so add O_NONBLOCK */ - ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err); - if (ret) { - if (local_err) { - error_propagate(errp, local_err); - } - return ret; - } - - /* close fd so that we can reopen it as needed */ - qemu_close(s->fd); - s->fd = -1; - s->fd_media_changed = 1; - - error_report("Host floppy pass-through is deprecated"); - error_printf("Support for it will be removed in a future release.\n"); - return 0; -} - -static int floppy_probe_device(const char *filename) -{ - int fd, ret; - int prio = 0; - struct floppy_struct fdparam; - struct stat st; - - if (strstart(filename, "/dev/fd", NULL) && - !strstart(filename, "/dev/fdset/", NULL) && - !strstart(filename, "/dev/fd/", NULL)) { - prio = 50; - } - - fd = qemu_open(filename, O_RDONLY | O_NONBLOCK); - if (fd < 0) { - goto out; - } - ret = fstat(fd, &st); - if (ret == -1 || !S_ISBLK(st.st_mode)) { - goto outc; - } - - /* Attempt to detect via a floppy specific ioctl */ - ret = ioctl(fd, FDGETPRM, &fdparam); - if (ret >= 0) - prio = 100; - -outc: - qemu_close(fd); -out: - return prio; -} - - -static int floppy_is_inserted(BlockDriverState *bs) -{ - return fd_open(bs) >= 0; -} - -static int floppy_media_changed(BlockDriverState *bs) -{ - BDRVRawState *s = bs->opaque; - int ret; - - /* - * XXX: we do not have a true media changed indication. - * It does not work if the floppy is changed without trying to read it. - */ - fd_open(bs); - ret = s->fd_media_changed; - s->fd_media_changed = 0; - DPRINTF("Floppy changed=%d\n", ret); - return ret; -} - -static void floppy_eject(BlockDriverState *bs, bool eject_flag) -{ - BDRVRawState *s = bs->opaque; - int fd; - - if (s->fd >= 0) { - qemu_close(s->fd); - s->fd = -1; - } - fd = qemu_open(bs->filename, s->open_flags | O_NONBLOCK); - if (fd >= 0) { - if (ioctl(fd, FDEJECT, 0) < 0) - perror("FDEJECT"); - qemu_close(fd); - } -} - -static BlockDriver bdrv_host_floppy = { - .format_name = "host_floppy", - .protocol_name = "host_floppy", - .instance_size = sizeof(BDRVRawState), - .bdrv_needs_filename = true, - .bdrv_probe_device = floppy_probe_device, - .bdrv_parse_filename = floppy_parse_filename, - .bdrv_file_open = floppy_open, - .bdrv_close = raw_close, - .bdrv_reopen_prepare = raw_reopen_prepare, - .bdrv_reopen_commit = raw_reopen_commit, - .bdrv_reopen_abort = raw_reopen_abort, - .bdrv_create = hdev_create, - .create_opts = &raw_create_opts, - - .bdrv_aio_readv = raw_aio_readv, - .bdrv_aio_writev = raw_aio_writev, - .bdrv_aio_flush = raw_aio_flush, - .bdrv_refresh_limits = raw_refresh_limits, - .bdrv_io_plug = raw_aio_plug, - .bdrv_io_unplug = raw_aio_unplug, - .bdrv_flush_io_queue = raw_aio_flush_io_queue, - - .bdrv_truncate = raw_truncate, - .bdrv_getlength = raw_getlength, - .has_variable_length = true, - .bdrv_get_allocated_file_size - = raw_get_allocated_file_size, - - .bdrv_detach_aio_context = raw_detach_aio_context, - .bdrv_attach_aio_context = raw_attach_aio_context, - - /* removable device support */ - .bdrv_is_inserted = floppy_is_inserted, - .bdrv_media_changed = floppy_media_changed, - .bdrv_eject = floppy_eject, -}; -#endif - #if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) static void cdrom_parse_filename(const char *filename, QDict *options, Error **errp) @@ -2588,15 +2472,13 @@ out: return prio; } -static int cdrom_is_inserted(BlockDriverState *bs) +static bool cdrom_is_inserted(BlockDriverState *bs) { BDRVRawState *s = bs->opaque; int ret; ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); - if (ret == CDS_DISC_OK) - return 1; - return 0; + return ret == CDS_DISC_OK; } static void cdrom_eject(BlockDriverState *bs, bool eject_flag) @@ -2663,7 +2545,6 @@ static BlockDriver bdrv_host_cdrom = { .bdrv_lock_medium = cdrom_lock_medium, /* generic scsi device */ - .bdrv_ioctl = hdev_ioctl, .bdrv_aio_ioctl = hdev_aio_ioctl, }; #endif /* __linux__ */ @@ -2722,7 +2603,7 @@ static int cdrom_reopen(BlockDriverState *bs) return 0; } -static int cdrom_is_inserted(BlockDriverState *bs) +static bool cdrom_is_inserted(BlockDriverState *bs) { return raw_getlength(bs) > 0; } @@ -2810,7 +2691,6 @@ static void bdrv_file_init(void) bdrv_register(&bdrv_file); bdrv_register(&bdrv_host_device); #ifdef __linux__ - bdrv_register(&bdrv_host_floppy); bdrv_register(&bdrv_host_cdrom); #endif #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) diff --git a/qemu/block/raw-win32.c b/qemu/block/raw-win32.c index 68f2338ac..fd2389153 100644 --- a/qemu/block/raw-win32.c +++ b/qemu/block/raw-win32.c @@ -21,7 +21,9 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ -#include "qemu-common.h" +#include "qemu/osdep.h" +#include "qapi/error.h" +#include "qemu/cutils.h" #include "qemu/timer.h" #include "block/block_int.h" #include "qemu/module.h" @@ -119,9 +121,9 @@ static int aio_worker(void *arg) case QEMU_AIO_WRITE: count = handle_aiocb_rw(aiocb); if (count == aiocb->aio_nbytes) { - count = 0; + ret = 0; } else { - count = -EINVAL; + ret = -EINVAL; } break; case QEMU_AIO_FLUSH: @@ -135,7 +137,7 @@ static int aio_worker(void *arg) break; } - g_slice_free(RawWin32AIOData, aiocb); + g_free(aiocb); return ret; } @@ -143,7 +145,7 @@ static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, BlockCompletionFunc *cb, void *opaque, int type) { - RawWin32AIOData *acb = g_slice_new(RawWin32AIOData); + RawWin32AIOData *acb = g_new(RawWin32AIOData, 1); ThreadPool *pool; acb->bs = bs; diff --git a/qemu/block/raw_bsd.c b/qemu/block/raw_bsd.c index e3d2d0468..a6cc7e991 100644 --- a/qemu/block/raw_bsd.c +++ b/qemu/block/raw_bsd.c @@ -26,7 +26,9 @@ * IN THE SOFTWARE. */ +#include "qemu/osdep.h" #include "block/block_int.h" +#include "qapi/error.h" #include "qemu/option.h" static QemuOptsList raw_create_opts = { @@ -52,11 +54,12 @@ static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) { BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); - return bdrv_co_readv(bs->file, sector_num, nb_sectors, qiov); + return bdrv_co_readv(bs->file->bs, sector_num, nb_sectors, qiov); } -static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, QEMUIOVector *qiov) +static int coroutine_fn +raw_co_writev_flags(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + QEMUIOVector *qiov, int flags) { void *buf = NULL; BlockDriver *drv; @@ -75,7 +78,7 @@ static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num, return 0; } - buf = qemu_try_blockalign(bs->file, 512); + buf = qemu_try_blockalign(bs->file->bs, 512); if (!buf) { ret = -ENOMEM; goto fail; @@ -102,7 +105,8 @@ static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num, } BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); - ret = bdrv_co_writev(bs->file, sector_num, nb_sectors, qiov); + ret = bdrv_co_do_pwritev(bs->file->bs, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, qiov, flags); fail: if (qiov == &local_qiov) { @@ -112,11 +116,20 @@ fail: return ret; } +static int coroutine_fn +raw_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + QEMUIOVector *qiov) +{ + return raw_co_writev_flags(bs, sector_num, nb_sectors, qiov, 0); +} + static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, int64_t sector_num, - int nb_sectors, int *pnum) + int nb_sectors, int *pnum, + BlockDriverState **file) { *pnum = nb_sectors; + *file = bs->file->bs; return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA | (sector_num << BDRV_SECTOR_BITS); } @@ -125,58 +138,48 @@ static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) { - return bdrv_co_write_zeroes(bs->file, sector_num, nb_sectors, flags); + return bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags); } static int coroutine_fn raw_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { - return bdrv_co_discard(bs->file, sector_num, nb_sectors); + return bdrv_co_discard(bs->file->bs, sector_num, nb_sectors); } static int64_t raw_getlength(BlockDriverState *bs) { - return bdrv_getlength(bs->file); + return bdrv_getlength(bs->file->bs); } static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) { - return bdrv_get_info(bs->file, bdi); + return bdrv_get_info(bs->file->bs, bdi); } static void raw_refresh_limits(BlockDriverState *bs, Error **errp) { - bs->bl = bs->file->bl; + bs->bl = bs->file->bs->bl; } static int raw_truncate(BlockDriverState *bs, int64_t offset) { - return bdrv_truncate(bs->file, offset); -} - -static int raw_is_inserted(BlockDriverState *bs) -{ - return bdrv_is_inserted(bs->file); + return bdrv_truncate(bs->file->bs, offset); } static int raw_media_changed(BlockDriverState *bs) { - return bdrv_media_changed(bs->file); + return bdrv_media_changed(bs->file->bs); } static void raw_eject(BlockDriverState *bs, bool eject_flag) { - bdrv_eject(bs->file, eject_flag); + bdrv_eject(bs->file->bs, eject_flag); } static void raw_lock_medium(BlockDriverState *bs, bool locked) { - bdrv_lock_medium(bs->file, locked); -} - -static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) -{ - return bdrv_ioctl(bs->file, req, buf); + bdrv_lock_medium(bs->file->bs, locked); } static BlockAIOCB *raw_aio_ioctl(BlockDriverState *bs, @@ -184,12 +187,12 @@ static BlockAIOCB *raw_aio_ioctl(BlockDriverState *bs, BlockCompletionFunc *cb, void *opaque) { - return bdrv_aio_ioctl(bs->file, req, buf, cb, opaque); + return bdrv_aio_ioctl(bs->file->bs, req, buf, cb, opaque); } static int raw_has_zero_init(BlockDriverState *bs) { - return bdrv_has_zero_init(bs->file); + return bdrv_has_zero_init(bs->file->bs); } static int raw_create(const char *filename, QemuOpts *opts, Error **errp) @@ -207,7 +210,7 @@ static int raw_create(const char *filename, QemuOpts *opts, Error **errp) static int raw_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { - bs->sg = bs->file->sg; + bs->sg = bs->file->bs->sg; if (bs->probed && !bdrv_is_read_only(bs)) { fprintf(stderr, @@ -217,7 +220,7 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, "raw images, write operations on block 0 will be restricted.\n" " Specify the 'raw' format explicitly to remove the " "restrictions.\n", - bs->file->filename); + bs->file->bs->filename); } return 0; @@ -237,12 +240,12 @@ static int raw_probe(const uint8_t *buf, int buf_size, const char *filename) static int raw_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) { - return bdrv_probe_blocksizes(bs->file, bsz); + return bdrv_probe_blocksizes(bs->file->bs, bsz); } static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo) { - return bdrv_probe_geometry(bs->file, geo); + return bdrv_probe_geometry(bs->file->bs, geo); } BlockDriver bdrv_raw = { @@ -254,6 +257,8 @@ BlockDriver bdrv_raw = { .bdrv_create = &raw_create, .bdrv_co_readv = &raw_co_readv, .bdrv_co_writev = &raw_co_writev, + .bdrv_co_writev_flags = &raw_co_writev_flags, + .supported_write_flags = BDRV_REQ_FUA, .bdrv_co_write_zeroes = &raw_co_write_zeroes, .bdrv_co_discard = &raw_co_discard, .bdrv_co_get_block_status = &raw_co_get_block_status, @@ -264,11 +269,9 @@ BlockDriver bdrv_raw = { .bdrv_refresh_limits = &raw_refresh_limits, .bdrv_probe_blocksizes = &raw_probe_blocksizes, .bdrv_probe_geometry = &raw_probe_geometry, - .bdrv_is_inserted = &raw_is_inserted, .bdrv_media_changed = &raw_media_changed, .bdrv_eject = &raw_eject, .bdrv_lock_medium = &raw_lock_medium, - .bdrv_ioctl = &raw_ioctl, .bdrv_aio_ioctl = &raw_aio_ioctl, .create_opts = &raw_create_opts, .bdrv_has_zero_init = &raw_has_zero_init diff --git a/qemu/block/rbd.c b/qemu/block/rbd.c index a60a19d58..5bc5b3253 100644 --- a/qemu/block/rbd.c +++ b/qemu/block/rbd.c @@ -11,11 +11,13 @@ * GNU GPL, version 2 or (at your option) any later version. */ -#include <inttypes.h> +#include "qemu/osdep.h" -#include "qemu-common.h" +#include "qapi/error.h" #include "qemu/error-report.h" #include "block/block_int.h" +#include "crypto/secret.h" +#include "qemu/cutils.h" #include <rbd/librbd.h> @@ -228,6 +230,27 @@ static char *qemu_rbd_parse_clientname(const char *conf, char *clientname) return NULL; } + +static int qemu_rbd_set_auth(rados_t cluster, const char *secretid, + Error **errp) +{ + if (secretid == 0) { + return 0; + } + + gchar *secret = qcrypto_secret_lookup_as_base64(secretid, + errp); + if (!secret) { + return -1; + } + + rados_conf_set(cluster, "key", secret); + g_free(secret); + + return 0; +} + + static int qemu_rbd_set_conf(rados_t cluster, const char *conf, bool only_read_conf_file, Error **errp) @@ -299,10 +322,13 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) char conf[RBD_MAX_CONF_SIZE]; char clientname_buf[RBD_MAX_CONF_SIZE]; char *clientname; + const char *secretid; rados_t cluster; rados_ioctx_t io_ctx; int ret; + secretid = qemu_opt_get(opts, "password-secret"); + if (qemu_rbd_parsename(filename, pool, sizeof(pool), snap_buf, sizeof(snap_buf), name, sizeof(name), @@ -350,6 +376,11 @@ static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) return -EIO; } + if (qemu_rbd_set_auth(cluster, secretid, errp) < 0) { + rados_shutdown(cluster); + return -EIO; + } + if (rados_connect(cluster) < 0) { error_setg(errp, "error connecting"); rados_shutdown(cluster); @@ -423,6 +454,11 @@ static QemuOptsList runtime_opts = { .type = QEMU_OPT_STRING, .help = "Specification of the rbd image", }, + { + .name = "password-secret", + .type = QEMU_OPT_STRING, + .help = "ID of secret providing the password", + }, { /* end of list */ } }, }; @@ -436,6 +472,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, char conf[RBD_MAX_CONF_SIZE]; char clientname_buf[RBD_MAX_CONF_SIZE]; char *clientname; + const char *secretid; QemuOpts *opts; Error *local_err = NULL; const char *filename; @@ -450,6 +487,7 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, } filename = qemu_opt_get(opts, "filename"); + secretid = qemu_opt_get(opts, "password-secret"); if (qemu_rbd_parsename(filename, pool, sizeof(pool), snap_buf, sizeof(snap_buf), @@ -488,6 +526,11 @@ static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, } } + if (qemu_rbd_set_auth(s->cluster, secretid, errp) < 0) { + r = -EIO; + goto failed_shutdown; + } + /* * Fallback to more conservative semantics if setting cache * options fails. Ignore errors from setting rbd_cache because the @@ -919,6 +962,11 @@ static QemuOptsList qemu_rbd_create_opts = { .type = QEMU_OPT_SIZE, .help = "RBD object size" }, + { + .name = "password-secret", + .type = QEMU_OPT_STRING, + .help = "ID of secret providing the password", + }, { /* end of list */ } } }; diff --git a/qemu/block/sheepdog.c b/qemu/block/sheepdog.c index 9585beb73..33e0a3382 100644 --- a/qemu/block/sheepdog.c +++ b/qemu/block/sheepdog.c @@ -12,12 +12,15 @@ * GNU GPL, version 2 or (at your option) any later version. */ -#include "qemu-common.h" +#include "qemu/osdep.h" +#include "qapi/error.h" #include "qemu/uri.h" #include "qemu/error-report.h" #include "qemu/sockets.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/bitops.h" +#include "qemu/cutils.h" #define SD_PROTO_VER 0x01 @@ -28,7 +31,6 @@ #define SD_OP_READ_OBJ 0x02 #define SD_OP_WRITE_OBJ 0x03 /* 0x04 is used internally by Sheepdog */ -#define SD_OP_DISCARD_OBJ 0x05 #define SD_OP_NEW_VDI 0x11 #define SD_OP_LOCK_VDI 0x12 @@ -284,6 +286,12 @@ static inline bool is_snapshot(struct SheepdogInode *inode) return !!inode->snap_ctime; } +static inline size_t count_data_objs(const struct SheepdogInode *inode) +{ + return DIV_ROUND_UP(inode->vdi_size, + (1UL << inode->block_size_shift)); +} + #undef DPRINTF #ifdef DEBUG_SDOG #define DPRINTF(fmt, args...) \ @@ -318,7 +326,7 @@ enum AIOCBState { AIOCB_DISCARD_OBJ, }; -#define AIOCBOverwrapping(x, y) \ +#define AIOCBOverlapping(x, y) \ (!(x->max_affect_data_idx < y->min_affect_data_idx \ || y->max_affect_data_idx < x->min_affect_data_idx)) @@ -342,6 +350,15 @@ struct SheepdogAIOCB { uint32_t min_affect_data_idx; uint32_t max_affect_data_idx; + /* + * The difference between affect_data_idx and dirty_data_idx: + * affect_data_idx represents range of index of all request types. + * dirty_data_idx represents range of index updated by COW requests. + * dirty_data_idx is used for updating an inode object. + */ + uint32_t min_dirty_data_idx; + uint32_t max_dirty_data_idx; + QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings; }; @@ -351,9 +368,6 @@ typedef struct BDRVSheepdogState { SheepdogInode inode; - uint32_t min_dirty_data_idx; - uint32_t max_dirty_data_idx; - char name[SD_MAX_VDI_LEN]; bool is_snapshot; uint32_t cache_flags; @@ -373,10 +387,15 @@ typedef struct BDRVSheepdogState { QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head; QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head; - CoQueue overwrapping_queue; + CoQueue overlapping_queue; QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head; } BDRVSheepdogState; +typedef struct BDRVSheepdogReopenState { + int fd; + int cache_flags; +} BDRVSheepdogReopenState; + static const char * sd_strerror(int err) { int i; @@ -556,6 +575,9 @@ static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE + acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size; + acb->min_dirty_data_idx = UINT32_MAX; + acb->max_dirty_data_idx = 0; + return acb; } @@ -595,14 +617,13 @@ static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, ret = qemu_co_send(sockfd, hdr, sizeof(*hdr)); if (ret != sizeof(*hdr)) { error_report("failed to send a req, %s", strerror(errno)); - ret = -socket_error(); - return ret; + return -errno; } ret = qemu_co_send(sockfd, data, *wlen); if (ret != *wlen) { - ret = -socket_error(); error_report("failed to send a req, %s", strerror(errno)); + return -errno; } return ret; @@ -638,14 +659,16 @@ static coroutine_fn void do_co_req(void *opaque) unsigned int *rlen = srco->rlen; co = qemu_coroutine_self(); - aio_set_fd_handler(srco->aio_context, sockfd, NULL, restart_co_req, co); + aio_set_fd_handler(srco->aio_context, sockfd, false, + NULL, restart_co_req, co); ret = send_co_req(sockfd, hdr, data, wlen); if (ret < 0) { goto out; } - aio_set_fd_handler(srco->aio_context, sockfd, restart_co_req, NULL, co); + aio_set_fd_handler(srco->aio_context, sockfd, false, + restart_co_req, NULL, co); ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr)); if (ret != sizeof(*hdr)) { @@ -670,7 +693,8 @@ static coroutine_fn void do_co_req(void *opaque) out: /* there is at most one request for this sockfd, so it is safe to * set each handler to NULL. */ - aio_set_fd_handler(srco->aio_context, sockfd, NULL, NULL, NULL); + aio_set_fd_handler(srco->aio_context, sockfd, false, + NULL, NULL, NULL); srco->ret = ret; srco->finished = true; @@ -722,7 +746,8 @@ static coroutine_fn void reconnect_to_sdog(void *opaque) BDRVSheepdogState *s = opaque; AIOReq *aio_req, *next; - aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL); + aio_set_fd_handler(s->aio_context, s->fd, false, NULL, + NULL, NULL); close(s->fd); s->fd = -1; @@ -819,8 +844,8 @@ static void coroutine_fn aio_read_response(void *opaque) */ if (rsp.result == SD_RES_SUCCESS) { s->inode.data_vdi_id[idx] = s->inode.vdi_id; - s->max_dirty_data_idx = MAX(idx, s->max_dirty_data_idx); - s->min_dirty_data_idx = MIN(idx, s->min_dirty_data_idx); + acb->max_dirty_data_idx = MAX(idx, acb->max_dirty_data_idx); + acb->min_dirty_data_idx = MIN(idx, acb->min_dirty_data_idx); } } break; @@ -847,10 +872,6 @@ static void coroutine_fn aio_read_response(void *opaque) rsp.result = SD_RES_SUCCESS; s->discard_supported = false; break; - case SD_RES_SUCCESS: - idx = data_oid_to_idx(aio_req->oid); - s->inode.data_vdi_id[idx] = 0; - break; default: break; } @@ -929,7 +950,8 @@ static int get_sheep_fd(BDRVSheepdogState *s, Error **errp) return fd; } - aio_set_fd_handler(s->aio_context, fd, co_read_response, NULL, s); + aio_set_fd_handler(s->aio_context, fd, false, + co_read_response, NULL, s); return fd; } @@ -1165,7 +1187,13 @@ static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, hdr.flags = SD_FLAG_CMD_WRITE | flags; break; case AIOCB_DISCARD_OBJ: - hdr.opcode = SD_OP_DISCARD_OBJ; + hdr.opcode = SD_OP_WRITE_OBJ; + hdr.flags = SD_FLAG_CMD_WRITE | flags; + s->inode.data_vdi_id[data_oid_to_idx(oid)] = 0; + offset = offsetof(SheepdogInode, + data_vdi_id[data_oid_to_idx(oid)]); + oid = vid_to_vdi_oid(s->inode.vdi_id); + wlen = datalen = sizeof(uint32_t); break; } @@ -1184,7 +1212,7 @@ static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, qemu_co_mutex_lock(&s->lock); s->co_send = qemu_coroutine_self(); - aio_set_fd_handler(s->aio_context, s->fd, + aio_set_fd_handler(s->aio_context, s->fd, false, co_read_response, co_write_request, s); socket_set_cork(s->fd, 1); @@ -1203,7 +1231,8 @@ static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, } out: socket_set_cork(s->fd, 0); - aio_set_fd_handler(s->aio_context, s->fd, co_read_response, NULL, s); + aio_set_fd_handler(s->aio_context, s->fd, false, + co_read_response, NULL, s); s->co_send = NULL; qemu_co_mutex_unlock(&s->lock); } @@ -1353,7 +1382,8 @@ static void sd_detach_aio_context(BlockDriverState *bs) { BDRVSheepdogState *s = bs->opaque; - aio_set_fd_handler(s->aio_context, s->fd, NULL, NULL, NULL); + aio_set_fd_handler(s->aio_context, s->fd, false, NULL, + NULL, NULL); } static void sd_attach_aio_context(BlockDriverState *bs, @@ -1362,7 +1392,8 @@ static void sd_attach_aio_context(BlockDriverState *bs, BDRVSheepdogState *s = bs->opaque; s->aio_context = new_context; - aio_set_fd_handler(new_context, s->fd, co_read_response, NULL, s); + aio_set_fd_handler(new_context, s->fd, false, + co_read_response, NULL, s); } /* TODO Convert to fine grained options */ @@ -1466,18 +1497,17 @@ static int sd_open(BlockDriverState *bs, QDict *options, int flags, } memcpy(&s->inode, buf, sizeof(s->inode)); - s->min_dirty_data_idx = UINT32_MAX; - s->max_dirty_data_idx = 0; bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE; pstrcpy(s->name, sizeof(s->name), vdi); qemu_co_mutex_init(&s->lock); - qemu_co_queue_init(&s->overwrapping_queue); + qemu_co_queue_init(&s->overlapping_queue); qemu_opts_del(opts); g_free(buf); return 0; out: - aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, NULL, NULL, NULL); + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, + false, NULL, NULL, NULL); if (s->fd >= 0) { closesocket(s->fd); } @@ -1486,6 +1516,70 @@ out: return ret; } +static int sd_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue, + Error **errp) +{ + BDRVSheepdogState *s = state->bs->opaque; + BDRVSheepdogReopenState *re_s; + int ret = 0; + + re_s = state->opaque = g_new0(BDRVSheepdogReopenState, 1); + + re_s->cache_flags = SD_FLAG_CMD_CACHE; + if (state->flags & BDRV_O_NOCACHE) { + re_s->cache_flags = SD_FLAG_CMD_DIRECT; + } + + re_s->fd = get_sheep_fd(s, errp); + if (re_s->fd < 0) { + ret = re_s->fd; + return ret; + } + + return ret; +} + +static void sd_reopen_commit(BDRVReopenState *state) +{ + BDRVSheepdogReopenState *re_s = state->opaque; + BDRVSheepdogState *s = state->bs->opaque; + + if (s->fd) { + aio_set_fd_handler(s->aio_context, s->fd, false, + NULL, NULL, NULL); + closesocket(s->fd); + } + + s->fd = re_s->fd; + s->cache_flags = re_s->cache_flags; + + g_free(state->opaque); + state->opaque = NULL; + + return; +} + +static void sd_reopen_abort(BDRVReopenState *state) +{ + BDRVSheepdogReopenState *re_s = state->opaque; + BDRVSheepdogState *s = state->bs->opaque; + + if (re_s == NULL) { + return; + } + + if (re_s->fd) { + aio_set_fd_handler(s->aio_context, re_s->fd, false, + NULL, NULL, NULL); + closesocket(re_s->fd); + } + + g_free(state->opaque); + state->opaque = NULL; + + return; +} + static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, Error **errp) { @@ -1544,7 +1638,7 @@ static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, static int sd_prealloc(const char *filename, Error **errp) { - BlockDriverState *bs = NULL; + BlockBackend *blk = NULL; BDRVSheepdogState *base = NULL; unsigned long buf_size; uint32_t idx, max_idx; @@ -1553,19 +1647,22 @@ static int sd_prealloc(const char *filename, Error **errp) void *buf = NULL; int ret; - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, - NULL, errp); - if (ret < 0) { + blk = blk_new_open(filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, errp); + if (blk == NULL) { + ret = -EIO; goto out_with_err_set; } - vdi_size = bdrv_getlength(bs); + blk_set_allow_write_beyond_eof(blk, true); + + vdi_size = blk_getlength(blk); if (vdi_size < 0) { ret = vdi_size; goto out; } - base = bs->opaque; + base = blk_bs(blk)->opaque; object_size = (UINT32_C(1) << base->inode.block_size_shift); buf_size = MIN(object_size, SD_DATA_OBJ_SIZE); buf = g_malloc0(buf_size); @@ -1577,23 +1674,24 @@ static int sd_prealloc(const char *filename, Error **errp) * The created image can be a cloned image, so we need to read * a data from the source image. */ - ret = bdrv_pread(bs, idx * buf_size, buf, buf_size); + ret = blk_pread(blk, idx * buf_size, buf, buf_size); if (ret < 0) { goto out; } - ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size); + ret = blk_pwrite(blk, idx * buf_size, buf, buf_size); if (ret < 0) { goto out; } } + ret = 0; out: if (ret < 0) { error_setg_errno(errp, -ret, "Can't pre-allocate"); } out_with_err_set: - if (bs) { - bdrv_unref(bs); + if (blk) { + blk_unref(blk); } g_free(buf); @@ -1733,7 +1831,7 @@ static int sd_create(const char *filename, QemuOpts *opts, } if (backing_file) { - BlockDriverState *bs; + BlockBackend *blk; BDRVSheepdogState *base; BlockDriver *drv; @@ -1745,23 +1843,23 @@ static int sd_create(const char *filename, QemuOpts *opts, goto out; } - bs = NULL; - ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_PROTOCOL, NULL, - errp); - if (ret < 0) { + blk = blk_new_open(backing_file, NULL, NULL, + BDRV_O_PROTOCOL, errp); + if (blk == NULL) { + ret = -EIO; goto out; } - base = bs->opaque; + base = blk_bs(blk)->opaque; if (!is_snapshot(&base->inode)) { error_setg(errp, "cannot clone from a non snapshot vdi"); - bdrv_unref(bs); + blk_unref(blk); ret = -EINVAL; goto out; } s->inode.vdi_id = base->inode.vdi_id; - bdrv_unref(bs); + blk_unref(blk); } s->aio_context = qemu_get_aio_context(); @@ -1776,8 +1874,7 @@ static int sd_create(const char *filename, QemuOpts *opts, fd = connect_to_sdog(s, &local_err); if (fd < 0) { - error_report("%s", error_get_pretty(local_err)); - error_free(local_err); + error_report_err(local_err); ret = -EIO; goto out; } @@ -1861,7 +1958,8 @@ static void sd_close(BlockDriverState *bs) error_report("%s, %s", sd_strerror(rsp->result), s->name); } - aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, NULL, NULL, NULL); + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, + false, NULL, NULL, NULL); closesocket(s->fd); g_free(s->host_spec); } @@ -1923,16 +2021,16 @@ static void coroutine_fn sd_write_done(SheepdogAIOCB *acb) AIOReq *aio_req; uint32_t offset, data_len, mn, mx; - mn = s->min_dirty_data_idx; - mx = s->max_dirty_data_idx; + mn = acb->min_dirty_data_idx; + mx = acb->max_dirty_data_idx; if (mn <= mx) { /* we need to update the vdi object. */ offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) + mn * sizeof(s->inode.data_vdi_id[0]); data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]); - s->min_dirty_data_idx = UINT32_MAX; - s->max_dirty_data_idx = 0; + acb->min_dirty_data_idx = UINT32_MAX; + acb->max_dirty_data_idx = 0; iov.iov_base = &s->inode; iov.iov_len = sizeof(s->inode); @@ -2141,7 +2239,9 @@ static int coroutine_fn sd_co_rw_vector(void *p) } aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create, - old_oid, done); + old_oid, + acb->aiocb_type == AIOCB_DISCARD_OBJ ? + 0 : done); QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, @@ -2158,12 +2258,12 @@ out: return 1; } -static bool check_overwrapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb) +static bool check_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb) { SheepdogAIOCB *cb; QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) { - if (AIOCBOverwrapping(aiocb, cb)) { + if (AIOCBOverlapping(aiocb, cb)) { return true; } } @@ -2192,15 +2292,15 @@ static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, acb->aiocb_type = AIOCB_WRITE_UDATA; retry: - if (check_overwrapping_aiocb(s, acb)) { - qemu_co_queue_wait(&s->overwrapping_queue); + if (check_overlapping_aiocb(s, acb)) { + qemu_co_queue_wait(&s->overlapping_queue); goto retry; } ret = sd_co_rw_vector(acb); if (ret <= 0) { QLIST_REMOVE(acb, aiocb_siblings); - qemu_co_queue_restart_all(&s->overwrapping_queue); + qemu_co_queue_restart_all(&s->overlapping_queue); qemu_aio_unref(acb); return ret; } @@ -2208,7 +2308,7 @@ retry: qemu_coroutine_yield(); QLIST_REMOVE(acb, aiocb_siblings); - qemu_co_queue_restart_all(&s->overwrapping_queue); + qemu_co_queue_restart_all(&s->overlapping_queue); return acb->ret; } @@ -2225,15 +2325,15 @@ static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num, acb->aio_done_func = sd_finish_aiocb; retry: - if (check_overwrapping_aiocb(s, acb)) { - qemu_co_queue_wait(&s->overwrapping_queue); + if (check_overlapping_aiocb(s, acb)) { + qemu_co_queue_wait(&s->overlapping_queue); goto retry; } ret = sd_co_rw_vector(acb); if (ret <= 0) { QLIST_REMOVE(acb, aiocb_siblings); - qemu_co_queue_restart_all(&s->overwrapping_queue); + qemu_co_queue_restart_all(&s->overlapping_queue); qemu_aio_unref(acb); return ret; } @@ -2241,7 +2341,7 @@ retry: qemu_coroutine_yield(); QLIST_REMOVE(acb, aiocb_siblings); - qemu_co_queue_restart_all(&s->overwrapping_queue); + qemu_co_queue_restart_all(&s->overlapping_queue); return acb->ret; } @@ -2318,9 +2418,8 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) ret = do_sd_create(s, &new_vid, 1, &local_err); if (ret < 0) { - error_report("failed to create inode for snapshot: %s", - error_get_pretty(local_err)); - error_free(local_err); + error_reportf_err(local_err, + "failed to create inode for snapshot: "); goto cleanup; } @@ -2391,13 +2490,131 @@ out: return ret; } +#define NR_BATCHED_DISCARD 128 + +static bool remove_objects(BDRVSheepdogState *s) +{ + int fd, i = 0, nr_objs = 0; + Error *local_err = NULL; + int ret = 0; + bool result = true; + SheepdogInode *inode = &s->inode; + + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report_err(local_err); + return false; + } + + nr_objs = count_data_objs(inode); + while (i < nr_objs) { + int start_idx, nr_filled_idx; + + while (i < nr_objs && !inode->data_vdi_id[i]) { + i++; + } + start_idx = i; + + nr_filled_idx = 0; + while (i < nr_objs && nr_filled_idx < NR_BATCHED_DISCARD) { + if (inode->data_vdi_id[i]) { + inode->data_vdi_id[i] = 0; + nr_filled_idx++; + } + + i++; + } + + ret = write_object(fd, s->aio_context, + (char *)&inode->data_vdi_id[start_idx], + vid_to_vdi_oid(s->inode.vdi_id), inode->nr_copies, + (i - start_idx) * sizeof(uint32_t), + offsetof(struct SheepdogInode, + data_vdi_id[start_idx]), + false, s->cache_flags); + if (ret < 0) { + error_report("failed to discard snapshot inode."); + result = false; + goto out; + } + } + +out: + closesocket(fd); + return result; +} + static int sd_snapshot_delete(BlockDriverState *bs, const char *snapshot_id, const char *name, Error **errp) { - /* FIXME: Delete specified snapshot id. */ - return 0; + unsigned long snap_id = 0; + char snap_tag[SD_MAX_VDI_TAG_LEN]; + Error *local_err = NULL; + int fd, ret; + char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN]; + BDRVSheepdogState *s = bs->opaque; + unsigned int wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN, rlen = 0; + uint32_t vid; + SheepdogVdiReq hdr = { + .opcode = SD_OP_DEL_VDI, + .data_length = wlen, + .flags = SD_FLAG_CMD_WRITE, + }; + SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; + + if (!remove_objects(s)) { + return -1; + } + + memset(buf, 0, sizeof(buf)); + memset(snap_tag, 0, sizeof(snap_tag)); + pstrcpy(buf, SD_MAX_VDI_LEN, s->name); + ret = qemu_strtoul(snapshot_id, NULL, 10, &snap_id); + if (ret || snap_id > UINT32_MAX) { + error_setg(errp, "Invalid snapshot ID: %s", + snapshot_id ? snapshot_id : "<null>"); + return -EINVAL; + } + + if (snap_id) { + hdr.snapid = (uint32_t) snap_id; + } else { + pstrcpy(snap_tag, sizeof(snap_tag), snapshot_id); + pstrcpy(buf + SD_MAX_VDI_LEN, SD_MAX_VDI_TAG_LEN, snap_tag); + } + + ret = find_vdi_name(s, s->name, snap_id, snap_tag, &vid, true, + &local_err); + if (ret) { + return ret; + } + + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report_err(local_err); + return -1; + } + + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, + buf, &wlen, &rlen); + closesocket(fd); + if (ret) { + return ret; + } + + switch (rsp->result) { + case SD_RES_NO_VDI: + error_report("%s was already deleted", s->name); + case SD_RES_SUCCESS: + break; + default: + error_report("%s, %s", sd_strerror(rsp->result), s->name); + return -1; + } + + return ret; } static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) @@ -2577,28 +2794,36 @@ static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) { SheepdogAIOCB *acb; - QEMUIOVector dummy; BDRVSheepdogState *s = bs->opaque; int ret; + QEMUIOVector discard_iov; + struct iovec iov; + uint32_t zero = 0; if (!s->discard_supported) { return 0; } - acb = sd_aio_setup(bs, &dummy, sector_num, nb_sectors); + memset(&discard_iov, 0, sizeof(discard_iov)); + memset(&iov, 0, sizeof(iov)); + iov.iov_base = &zero; + iov.iov_len = sizeof(zero); + discard_iov.iov = &iov; + discard_iov.niov = 1; + acb = sd_aio_setup(bs, &discard_iov, sector_num, nb_sectors); acb->aiocb_type = AIOCB_DISCARD_OBJ; acb->aio_done_func = sd_finish_aiocb; retry: - if (check_overwrapping_aiocb(s, acb)) { - qemu_co_queue_wait(&s->overwrapping_queue); + if (check_overlapping_aiocb(s, acb)) { + qemu_co_queue_wait(&s->overlapping_queue); goto retry; } ret = sd_co_rw_vector(acb); if (ret <= 0) { QLIST_REMOVE(acb, aiocb_siblings); - qemu_co_queue_restart_all(&s->overwrapping_queue); + qemu_co_queue_restart_all(&s->overlapping_queue); qemu_aio_unref(acb); return ret; } @@ -2606,14 +2831,14 @@ retry: qemu_coroutine_yield(); QLIST_REMOVE(acb, aiocb_siblings); - qemu_co_queue_restart_all(&s->overwrapping_queue); + qemu_co_queue_restart_all(&s->overlapping_queue); return acb->ret; } static coroutine_fn int64_t sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, - int *pnum) + int *pnum, BlockDriverState **file) { BDRVSheepdogState *s = bs->opaque; SheepdogInode *inode = &s->inode; @@ -2644,6 +2869,9 @@ sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, if (*pnum > nb_sectors) { *pnum = nb_sectors; } + if (ret > 0 && ret & BDRV_BLOCK_OFFSET_VALID) { + *file = bs; + } return ret; } @@ -2703,6 +2931,9 @@ static BlockDriver bdrv_sheepdog = { .instance_size = sizeof(BDRVSheepdogState), .bdrv_needs_filename = true, .bdrv_file_open = sd_open, + .bdrv_reopen_prepare = sd_reopen_prepare, + .bdrv_reopen_commit = sd_reopen_commit, + .bdrv_reopen_abort = sd_reopen_abort, .bdrv_close = sd_close, .bdrv_create = sd_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, @@ -2736,6 +2967,9 @@ static BlockDriver bdrv_sheepdog_tcp = { .instance_size = sizeof(BDRVSheepdogState), .bdrv_needs_filename = true, .bdrv_file_open = sd_open, + .bdrv_reopen_prepare = sd_reopen_prepare, + .bdrv_reopen_commit = sd_reopen_commit, + .bdrv_reopen_abort = sd_reopen_abort, .bdrv_close = sd_close, .bdrv_create = sd_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, @@ -2769,6 +3003,9 @@ static BlockDriver bdrv_sheepdog_unix = { .instance_size = sizeof(BDRVSheepdogState), .bdrv_needs_filename = true, .bdrv_file_open = sd_open, + .bdrv_reopen_prepare = sd_reopen_prepare, + .bdrv_reopen_commit = sd_reopen_commit, + .bdrv_reopen_abort = sd_reopen_abort, .bdrv_close = sd_close, .bdrv_create = sd_create, .bdrv_has_zero_init = bdrv_has_zero_init_1, diff --git a/qemu/block/snapshot.c b/qemu/block/snapshot.c index 49e143e99..e9d721df6 100644 --- a/qemu/block/snapshot.c +++ b/qemu/block/snapshot.c @@ -22,8 +22,10 @@ * THE SOFTWARE. */ +#include "qemu/osdep.h" #include "block/snapshot.h" #include "block/block_int.h" +#include "qapi/error.h" #include "qapi/qmp/qerror.h" QemuOptsList internal_snapshot_opts = { @@ -149,7 +151,7 @@ int bdrv_can_snapshot(BlockDriverState *bs) if (!drv->bdrv_snapshot_create) { if (bs->file != NULL) { - return bdrv_can_snapshot(bs->file); + return bdrv_can_snapshot(bs->file->bs); } return 0; } @@ -168,7 +170,7 @@ int bdrv_snapshot_create(BlockDriverState *bs, return drv->bdrv_snapshot_create(bs, sn_info); } if (bs->file) { - return bdrv_snapshot_create(bs->file, sn_info); + return bdrv_snapshot_create(bs->file->bs, sn_info); } return -ENOTSUP; } @@ -188,10 +190,10 @@ int bdrv_snapshot_goto(BlockDriverState *bs, if (bs->file) { drv->bdrv_close(bs); - ret = bdrv_snapshot_goto(bs->file, snapshot_id); + ret = bdrv_snapshot_goto(bs->file->bs, snapshot_id); open_ret = drv->bdrv_open(bs, NULL, bs->open_flags, NULL); if (open_ret < 0) { - bdrv_unref(bs->file); + bdrv_unref(bs->file->bs); bs->drv = NULL; return open_ret; } @@ -229,6 +231,8 @@ int bdrv_snapshot_delete(BlockDriverState *bs, Error **errp) { BlockDriver *drv = bs->drv; + int ret; + if (!drv) { error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); return -ENOMEDIUM; @@ -239,23 +243,26 @@ int bdrv_snapshot_delete(BlockDriverState *bs, } /* drain all pending i/o before deleting snapshot */ - bdrv_drain(bs); + bdrv_drained_begin(bs); if (drv->bdrv_snapshot_delete) { - return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); - } - if (bs->file) { - return bdrv_snapshot_delete(bs->file, snapshot_id, name, errp); + ret = drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); + } else if (bs->file) { + ret = bdrv_snapshot_delete(bs->file->bs, snapshot_id, name, errp); + } else { + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support internal snapshot deletion", + drv->format_name, bdrv_get_device_name(bs)); + ret = -ENOTSUP; } - error_setg(errp, "Block format '%s' used by device '%s' " - "does not support internal snapshot deletion", - drv->format_name, bdrv_get_device_name(bs)); - return -ENOTSUP; + + bdrv_drained_end(bs); + return ret; } -void bdrv_snapshot_delete_by_id_or_name(BlockDriverState *bs, - const char *id_or_name, - Error **errp) +int bdrv_snapshot_delete_by_id_or_name(BlockDriverState *bs, + const char *id_or_name, + Error **errp) { int ret; Error *local_err = NULL; @@ -270,6 +277,7 @@ void bdrv_snapshot_delete_by_id_or_name(BlockDriverState *bs, if (ret < 0) { error_propagate(errp, local_err); } + return ret; } int bdrv_snapshot_list(BlockDriverState *bs, @@ -283,7 +291,7 @@ int bdrv_snapshot_list(BlockDriverState *bs, return drv->bdrv_snapshot_list(bs, psn_info); } if (bs->file) { - return bdrv_snapshot_list(bs->file, psn_info); + return bdrv_snapshot_list(bs->file->bs, psn_info); } return -ENOTSUP; } @@ -356,3 +364,130 @@ int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, return ret; } + + +/* Group operations. All block drivers are involved. + * These functions will properly handle dataplane (take aio_context_acquire + * when appropriate for appropriate block drivers) */ + +bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs) +{ + bool ok = true; + BlockDriverState *bs = NULL; + + while (ok && (bs = bdrv_next(bs))) { + AioContext *ctx = bdrv_get_aio_context(bs); + + aio_context_acquire(ctx); + if (bdrv_is_inserted(bs) && !bdrv_is_read_only(bs)) { + ok = bdrv_can_snapshot(bs); + } + aio_context_release(ctx); + } + + *first_bad_bs = bs; + return ok; +} + +int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, + Error **err) +{ + int ret = 0; + BlockDriverState *bs = NULL; + QEMUSnapshotInfo sn1, *snapshot = &sn1; + + while (ret == 0 && (bs = bdrv_next(bs))) { + AioContext *ctx = bdrv_get_aio_context(bs); + + aio_context_acquire(ctx); + if (bdrv_can_snapshot(bs) && + bdrv_snapshot_find(bs, snapshot, name) >= 0) { + ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err); + } + aio_context_release(ctx); + } + + *first_bad_bs = bs; + return ret; +} + + +int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs) +{ + int err = 0; + BlockDriverState *bs = NULL; + + while (err == 0 && (bs = bdrv_next(bs))) { + AioContext *ctx = bdrv_get_aio_context(bs); + + aio_context_acquire(ctx); + if (bdrv_can_snapshot(bs)) { + err = bdrv_snapshot_goto(bs, name); + } + aio_context_release(ctx); + } + + *first_bad_bs = bs; + return err; +} + +int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs) +{ + QEMUSnapshotInfo sn; + int err = 0; + BlockDriverState *bs = NULL; + + while (err == 0 && (bs = bdrv_next(bs))) { + AioContext *ctx = bdrv_get_aio_context(bs); + + aio_context_acquire(ctx); + if (bdrv_can_snapshot(bs)) { + err = bdrv_snapshot_find(bs, &sn, name); + } + aio_context_release(ctx); + } + + *first_bad_bs = bs; + return err; +} + +int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, + BlockDriverState *vm_state_bs, + uint64_t vm_state_size, + BlockDriverState **first_bad_bs) +{ + int err = 0; + BlockDriverState *bs = NULL; + + while (err == 0 && (bs = bdrv_next(bs))) { + AioContext *ctx = bdrv_get_aio_context(bs); + + aio_context_acquire(ctx); + if (bs == vm_state_bs) { + sn->vm_state_size = vm_state_size; + err = bdrv_snapshot_create(bs, sn); + } else if (bdrv_can_snapshot(bs)) { + sn->vm_state_size = 0; + err = bdrv_snapshot_create(bs, sn); + } + aio_context_release(ctx); + } + + *first_bad_bs = bs; + return err; +} + +BlockDriverState *bdrv_all_find_vmstate_bs(void) +{ + bool not_found = true; + BlockDriverState *bs = NULL; + + while (not_found && (bs = bdrv_next(bs))) { + AioContext *ctx = bdrv_get_aio_context(bs); + + aio_context_acquire(ctx); + not_found = !bdrv_can_snapshot(bs); + aio_context_release(ctx); + } + return bs; +} diff --git a/qemu/block/ssh.c b/qemu/block/ssh.c index 8d0673903..06928ed93 100644 --- a/qemu/block/ssh.c +++ b/qemu/block/ssh.c @@ -22,14 +22,13 @@ * THE SOFTWARE. */ -#include <stdio.h> -#include <stdlib.h> -#include <stdarg.h> +#include "qemu/osdep.h" #include <libssh2.h> #include <libssh2_sftp.h> #include "block/block_int.h" +#include "qapi/error.h" #include "qemu/error-report.h" #include "qemu/sockets.h" #include "qemu/uri.h" @@ -193,7 +192,7 @@ sftp_error_report(BDRVSSHState *s, const char *fs, ...) static int parse_uri(const char *filename, QDict *options, Error **errp) { URI *uri = NULL; - QueryParams *qp = NULL; + QueryParams *qp; int i; uri = uri_parse(filename); @@ -249,9 +248,6 @@ static int parse_uri(const char *filename, QDict *options, Error **errp) return 0; err: - if (qp) { - query_params_free(qp); - } if (uri) { uri_free(uri); } @@ -803,14 +799,15 @@ static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs) rd_handler, wr_handler); aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, - rd_handler, wr_handler, co); + false, rd_handler, wr_handler, co); } static coroutine_fn void clear_fd_handler(BDRVSSHState *s, BlockDriverState *bs) { DPRINTF("s->sock=%d", s->sock); - aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, NULL, NULL, NULL); + aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, + false, NULL, NULL, NULL); } /* A non-blocking call returned EAGAIN, so yield, ensuring the diff --git a/qemu/block/stream.c b/qemu/block/stream.c index ab0bd057f..332b9a183 100644 --- a/qemu/block/stream.c +++ b/qemu/block/stream.c @@ -11,11 +11,14 @@ * */ +#include "qemu/osdep.h" #include "trace.h" #include "block/block_int.h" #include "block/blockjob.h" +#include "qapi/error.h" #include "qapi/qmp/qerror.h" #include "qemu/ratelimit.h" +#include "sysemu/block-backend.h" enum { /* @@ -52,34 +55,6 @@ static int coroutine_fn stream_populate(BlockDriverState *bs, return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov); } -static void close_unused_images(BlockDriverState *top, BlockDriverState *base, - const char *base_id) -{ - BlockDriverState *intermediate; - intermediate = top->backing_hd; - - /* Must assign before bdrv_delete() to prevent traversing dangling pointer - * while we delete backing image instances. - */ - bdrv_set_backing_hd(top, base); - - while (intermediate) { - BlockDriverState *unused; - - /* reached base */ - if (intermediate == base) { - break; - } - - unused = intermediate; - intermediate = intermediate->backing_hd; - bdrv_set_backing_hd(unused, NULL); - bdrv_unref(unused); - } - - bdrv_refresh_limits(top, NULL); -} - typedef struct { int ret; bool reached_end; @@ -101,7 +76,7 @@ static void stream_complete(BlockJob *job, void *opaque) } } data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt); - close_unused_images(job->bs, base, base_id); + bdrv_set_backing_hd(job->bs, base); } g_free(s->backing_file_str); @@ -115,21 +90,21 @@ static void coroutine_fn stream_run(void *opaque) StreamCompleteData *data; BlockDriverState *bs = s->common.bs; BlockDriverState *base = s->base; - int64_t sector_num, end; + int64_t sector_num = 0; + int64_t end = -1; int error = 0; int ret = 0; int n = 0; void *buf; - if (!bs->backing_hd) { - block_job_completed(&s->common, 0); - return; + if (!bs->backing) { + goto out; } s->common.len = bdrv_getlength(bs); if (s->common.len < 0) { - block_job_completed(&s->common, s->common.len); - return; + ret = s->common.len; + goto out; } end = s->common.len >> BDRV_SECTOR_BITS; @@ -166,7 +141,7 @@ wait: } else if (ret >= 0) { /* Copy if allocated in the intermediate images. Limit to the * known-unallocated area [sector_num, sector_num+n). */ - ret = bdrv_is_allocated_above(bs->backing_hd, base, + ret = bdrv_is_allocated_above(backing_bs(bs), base, sector_num, n, &n); /* Finish early if end of backing file has been reached */ @@ -216,6 +191,7 @@ wait: qemu_vfree(buf); +out: /* Modify backing chain and close BDSes in main loop */ data = g_malloc(sizeof(*data)); data->ret = ret; @@ -250,7 +226,7 @@ void stream_start(BlockDriverState *bs, BlockDriverState *base, if ((on_error == BLOCKDEV_ON_ERROR_STOP || on_error == BLOCKDEV_ON_ERROR_ENOSPC) && - !bdrv_iostatus_is_enabled(bs)) { + (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { error_setg(errp, QERR_INVALID_PARAMETER, "on-error"); return; } diff --git a/qemu/block/throttle-groups.c b/qemu/block/throttle-groups.c index 1abc6fcae..4920e0949 100644 --- a/qemu/block/throttle-groups.c +++ b/qemu/block/throttle-groups.c @@ -22,6 +22,7 @@ * along with this program; if not, see <http://www.gnu.org/licenses/>. */ +#include "qemu/osdep.h" #include "block/throttle-groups.h" #include "qemu/queue.h" #include "qemu/thread.h" @@ -33,8 +34,7 @@ * its own locking. * * This locking is however handled internally in this file, so it's - * mostly transparent to outside users (but see the documentation in - * throttle_groups_lock()). + * transparent to outside users. * * The whole ThrottleGroup structure is private and invisible to * outside users, that only use it through its ThrottleState. @@ -76,9 +76,9 @@ static QTAILQ_HEAD(, ThrottleGroup) throttle_groups = * created. * * @name: the name of the ThrottleGroup - * @ret: the ThrottleGroup + * @ret: the ThrottleState member of the ThrottleGroup */ -static ThrottleGroup *throttle_group_incref(const char *name) +ThrottleState *throttle_group_incref(const char *name) { ThrottleGroup *tg = NULL; ThrottleGroup *iter; @@ -108,7 +108,7 @@ static ThrottleGroup *throttle_group_incref(const char *name) qemu_mutex_unlock(&throttle_groups_lock); - return tg; + return &tg->ts; } /* Decrease the reference count of a ThrottleGroup. @@ -116,10 +116,12 @@ static ThrottleGroup *throttle_group_incref(const char *name) * When the reference count reaches zero the ThrottleGroup is * destroyed. * - * @tg: The ThrottleGroup to unref + * @ts: The ThrottleGroup to unref, given by its ThrottleState member */ -static void throttle_group_unref(ThrottleGroup *tg) +void throttle_group_unref(ThrottleState *ts) { + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + qemu_mutex_lock(&throttle_groups_lock); if (--tg->refcount == 0) { QTAILQ_REMOVE(&throttle_groups, tg, list); @@ -401,7 +403,8 @@ static void write_timer_cb(void *opaque) void throttle_group_register_bs(BlockDriverState *bs, const char *groupname) { int i; - ThrottleGroup *tg = throttle_group_incref(groupname); + ThrottleState *ts = throttle_group_incref(groupname); + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); int clock_type = QEMU_CLOCK_REALTIME; if (qtest_enabled()) { @@ -409,7 +412,7 @@ void throttle_group_register_bs(BlockDriverState *bs, const char *groupname) clock_type = QEMU_CLOCK_VIRTUAL; } - bs->throttle_state = &tg->ts; + bs->throttle_state = ts; qemu_mutex_lock(&tg->lock); /* If the ThrottleGroup is new set this BlockDriverState as the token */ @@ -435,6 +438,9 @@ void throttle_group_register_bs(BlockDriverState *bs, const char *groupname) * list, destroying the timers and setting the throttle_state pointer * to NULL. * + * The BlockDriverState must not have pending throttled requests, so + * the caller has to drain them first. + * * The group will be destroyed if it's empty after this operation. * * @bs: the BlockDriverState to remove @@ -444,6 +450,10 @@ void throttle_group_unregister_bs(BlockDriverState *bs) ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); int i; + assert(bs->pending_reqs[0] == 0 && bs->pending_reqs[1] == 0); + assert(qemu_co_queue_empty(&bs->throttled_reqs[0])); + assert(qemu_co_queue_empty(&bs->throttled_reqs[1])); + qemu_mutex_lock(&tg->lock); for (i = 0; i < 2; i++) { if (tg->tokens[i] == bs) { @@ -461,38 +471,10 @@ void throttle_group_unregister_bs(BlockDriverState *bs) throttle_timers_destroy(&bs->throttle_timers); qemu_mutex_unlock(&tg->lock); - throttle_group_unref(tg); + throttle_group_unref(&tg->ts); bs->throttle_state = NULL; } -/* Acquire the lock of this throttling group. - * - * You won't normally need to use this. None of the functions from the - * ThrottleGroup API require you to acquire the lock since all of them - * deal with it internally. - * - * This should only be used in exceptional cases when you want to - * access the protected fields of a BlockDriverState directly - * (e.g. bdrv_swap()). - * - * @bs: a BlockDriverState that is member of the group - */ -void throttle_group_lock(BlockDriverState *bs) -{ - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); - qemu_mutex_lock(&tg->lock); -} - -/* Release the lock of this throttling group. - * - * See the comments in throttle_group_lock(). - */ -void throttle_group_unlock(BlockDriverState *bs) -{ - ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); - qemu_mutex_unlock(&tg->lock); -} - static void throttle_groups_init(void) { qemu_mutex_init(&throttle_groups_lock); diff --git a/qemu/block/vdi.c b/qemu/block/vdi.c index 7642ef359..75d4819ed 100644 --- a/qemu/block/vdi.c +++ b/qemu/block/vdi.c @@ -49,11 +49,14 @@ * so this seems to be reasonable. */ -#include "qemu-common.h" +#include "qemu/osdep.h" +#include "qapi/error.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include "migration/migration.h" -#include "block/coroutine.h" +#include "qemu/coroutine.h" +#include "qemu/cutils.h" #if defined(CONFIG_UUID) #include <uuid/uuid.h> @@ -399,7 +402,7 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, logout("\n"); - ret = bdrv_read(bs->file, 0, (uint8_t *)&header, 1); + ret = bdrv_read(bs->file->bs, 0, (uint8_t *)&header, 1); if (ret < 0) { goto fail; } @@ -490,13 +493,14 @@ static int vdi_open(BlockDriverState *bs, QDict *options, int flags, bmap_size = header.blocks_in_image * sizeof(uint32_t); bmap_size = DIV_ROUND_UP(bmap_size, SECTOR_SIZE); - s->bmap = qemu_try_blockalign(bs->file, bmap_size * SECTOR_SIZE); + s->bmap = qemu_try_blockalign(bs->file->bs, bmap_size * SECTOR_SIZE); if (s->bmap == NULL) { ret = -ENOMEM; goto fail; } - ret = bdrv_read(bs->file, s->bmap_sector, (uint8_t *)s->bmap, bmap_size); + ret = bdrv_read(bs->file->bs, s->bmap_sector, (uint8_t *)s->bmap, + bmap_size); if (ret < 0) { goto fail_free_bmap; } @@ -525,7 +529,7 @@ static int vdi_reopen_prepare(BDRVReopenState *state, } static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum) + int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) { /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). */ BDRVVdiState *s = (BDRVVdiState *)bs->opaque; @@ -549,6 +553,7 @@ static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs, offset = s->header.offset_data + (uint64_t)bmap_entry * s->block_size + sector_in_block * SECTOR_SIZE; + *file = bs->file->bs; return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; } @@ -585,7 +590,7 @@ static int vdi_co_read(BlockDriverState *bs, uint64_t offset = s->header.offset_data / SECTOR_SIZE + (uint64_t)bmap_entry * s->block_sectors + sector_in_block; - ret = bdrv_read(bs->file, offset, buf, n_sectors); + ret = bdrv_read(bs->file->bs, offset, buf, n_sectors); } logout("%u sectors read\n", n_sectors); @@ -653,7 +658,7 @@ static int vdi_co_write(BlockDriverState *bs, * acquire the lock and thus the padded cluster is written before * the other coroutines can write to the affected area. */ qemu_co_mutex_lock(&s->write_lock); - ret = bdrv_write(bs->file, offset, block, s->block_sectors); + ret = bdrv_write(bs->file->bs, offset, block, s->block_sectors); qemu_co_mutex_unlock(&s->write_lock); } else { uint64_t offset = s->header.offset_data / SECTOR_SIZE + @@ -669,7 +674,7 @@ static int vdi_co_write(BlockDriverState *bs, * that that write operation has returned (there may be other writes * in flight, but they do not concern this very operation). */ qemu_co_mutex_unlock(&s->write_lock); - ret = bdrv_write(bs->file, offset, buf, n_sectors); + ret = bdrv_write(bs->file->bs, offset, buf, n_sectors); } nb_sectors -= n_sectors; @@ -694,7 +699,7 @@ static int vdi_co_write(BlockDriverState *bs, assert(VDI_IS_ALLOCATED(bmap_first)); *header = s->header; vdi_header_to_le(header); - ret = bdrv_write(bs->file, 0, block, 1); + ret = bdrv_write(bs->file->bs, 0, block, 1); g_free(block); block = NULL; @@ -712,7 +717,7 @@ static int vdi_co_write(BlockDriverState *bs, base = ((uint8_t *)&s->bmap[0]) + bmap_first * SECTOR_SIZE; logout("will write %u block map sectors starting from entry %u\n", n_sectors, bmap_first); - ret = bdrv_write(bs->file, offset, base, n_sectors); + ret = bdrv_write(bs->file->bs, offset, base, n_sectors); } return ret; @@ -730,7 +735,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) size_t bmap_size; int64_t offset = 0; Error *local_err = NULL; - BlockDriverState *bs = NULL; + BlockBackend *blk = NULL; uint32_t *bmap = NULL; logout("\n"); @@ -763,13 +768,17 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) error_propagate(errp, local_err); goto exit; } - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, - NULL, &local_err); - if (ret < 0) { + + blk = blk_new_open(filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto exit; } + blk_set_allow_write_beyond_eof(blk, true); + /* We need enough blocks to store the given disk size, so always round up. */ blocks = DIV_ROUND_UP(bytes, block_size); @@ -799,7 +808,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) vdi_header_print(&header); #endif vdi_header_to_le(&header); - ret = bdrv_pwrite_sync(bs, offset, &header, sizeof(header)); + ret = blk_pwrite(blk, offset, &header, sizeof(header)); if (ret < 0) { error_setg(errp, "Error writing header to %s", filename); goto exit; @@ -820,7 +829,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) bmap[i] = VDI_UNALLOCATED; } } - ret = bdrv_pwrite_sync(bs, offset, bmap, bmap_size); + ret = blk_pwrite(blk, offset, bmap, bmap_size); if (ret < 0) { error_setg(errp, "Error writing bmap to %s", filename); goto exit; @@ -829,7 +838,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) } if (image_type == VDI_TYPE_STATIC) { - ret = bdrv_truncate(bs, offset + blocks * block_size); + ret = blk_truncate(blk, offset + blocks * block_size); if (ret < 0) { error_setg(errp, "Failed to statically allocate %s", filename); goto exit; @@ -837,7 +846,7 @@ static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) } exit: - bdrv_unref(bs); + blk_unref(blk); g_free(bmap); return ret; } diff --git a/qemu/block/vhdx-endian.c b/qemu/block/vhdx-endian.c index 0640d3f4a..da33cd38e 100644 --- a/qemu/block/vhdx-endian.c +++ b/qemu/block/vhdx-endian.c @@ -15,6 +15,7 @@ * */ +#include "qemu/osdep.h" #include "qemu-common.h" #include "block/block_int.h" #include "block/vhdx.h" diff --git a/qemu/block/vhdx-log.c b/qemu/block/vhdx-log.c index 47fec63c6..7ea7187fc 100644 --- a/qemu/block/vhdx-log.c +++ b/qemu/block/vhdx-log.c @@ -17,6 +17,8 @@ * See the COPYING.LIB file in the top-level directory. * */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "qemu-common.h" #include "block/block_int.h" #include "qemu/error-report.h" @@ -81,7 +83,7 @@ static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log, offset = log->offset + read; - ret = bdrv_pread(bs->file, offset, hdr, sizeof(VHDXLogEntryHeader)); + ret = bdrv_pread(bs->file->bs, offset, hdr, sizeof(VHDXLogEntryHeader)); if (ret < 0) { goto exit; } @@ -141,7 +143,7 @@ static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log, } offset = log->offset + read; - ret = bdrv_pread(bs->file, offset, buffer, VHDX_LOG_SECTOR_SIZE); + ret = bdrv_pread(bs->file->bs, offset, buffer, VHDX_LOG_SECTOR_SIZE); if (ret < 0) { goto exit; } @@ -191,7 +193,8 @@ static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log, /* full */ break; } - ret = bdrv_pwrite(bs->file, offset, buffer_tmp, VHDX_LOG_SECTOR_SIZE); + ret = bdrv_pwrite(bs->file->bs, offset, buffer_tmp, + VHDX_LOG_SECTOR_SIZE); if (ret < 0) { goto exit; } @@ -353,7 +356,7 @@ static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s, } desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); - desc_entries = qemu_try_blockalign(bs->file, + desc_entries = qemu_try_blockalign(bs->file->bs, desc_sectors * VHDX_LOG_SECTOR_SIZE); if (desc_entries == NULL) { ret = -ENOMEM; @@ -462,7 +465,7 @@ static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc, /* count is only > 1 if we are writing zeroes */ for (i = 0; i < count; i++) { - ret = bdrv_pwrite_sync(bs->file, file_offset, buffer, + ret = bdrv_pwrite_sync(bs->file->bs, file_offset, buffer, VHDX_LOG_SECTOR_SIZE); if (ret < 0) { goto exit; @@ -509,7 +512,7 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, /* if the log shows a FlushedFileOffset larger than our current file * size, then that means the file has been truncated / corrupted, and * we must refused to open it / use it */ - if (hdr_tmp.flushed_file_offset > bdrv_getlength(bs->file)) { + if (hdr_tmp.flushed_file_offset > bdrv_getlength(bs->file->bs)) { ret = -EINVAL; goto exit; } @@ -539,12 +542,12 @@ static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, goto exit; } } - if (bdrv_getlength(bs->file) < desc_entries->hdr.last_file_offset) { + if (bdrv_getlength(bs->file->bs) < desc_entries->hdr.last_file_offset) { new_file_size = desc_entries->hdr.last_file_offset; if (new_file_size % (1024*1024)) { /* round up to nearest 1MB boundary */ new_file_size = ((new_file_size >> 20) + 1) << 20; - bdrv_truncate(bs->file, new_file_size); + bdrv_truncate(bs->file->bs, new_file_size); } } qemu_vfree(desc_entries); @@ -783,12 +786,13 @@ int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, if (logs.valid) { if (bs->read_only) { ret = -EPERM; - error_setg_errno(errp, EPERM, - "VHDX image file '%s' opened read-only, but " - "contains a log that needs to be replayed. To " - "replay the log, execute:\n qemu-img check -r " - "all '%s'", - bs->filename, bs->filename); + error_setg(errp, + "VHDX image file '%s' opened read-only, but " + "contains a log that needs to be replayed", + bs->filename); + error_append_hint(errp, "To replay the log, run:\n" + "qemu-img check -r all '%s'\n", + bs->filename); goto exit; } /* now flush the log */ @@ -908,8 +912,8 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, .sequence_number = s->log.sequence, .descriptor_count = sectors, .reserved = 0, - .flushed_file_offset = bdrv_getlength(bs->file), - .last_file_offset = bdrv_getlength(bs->file), + .flushed_file_offset = bdrv_getlength(bs->file->bs), + .last_file_offset = bdrv_getlength(bs->file->bs), }; new_hdr.log_guid = header->log_guid; @@ -940,7 +944,7 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, if (i == 0 && leading_length) { /* partial sector at the front of the buffer */ - ret = bdrv_pread(bs->file, file_offset, merged_sector, + ret = bdrv_pread(bs->file->bs, file_offset, merged_sector, VHDX_LOG_SECTOR_SIZE); if (ret < 0) { goto exit; @@ -950,7 +954,7 @@ static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, sector_write = merged_sector; } else if (i == sectors - 1 && trailing_length) { /* partial sector at the end of the buffer */ - ret = bdrv_pread(bs->file, + ret = bdrv_pread(bs->file->bs, file_offset, merged_sector + trailing_length, VHDX_LOG_SECTOR_SIZE - trailing_length); diff --git a/qemu/block/vhdx.c b/qemu/block/vhdx.c index 0776de717..2b7b33240 100644 --- a/qemu/block/vhdx.c +++ b/qemu/block/vhdx.c @@ -15,8 +15,11 @@ * */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include "qemu/crc32c.h" #include "block/vhdx.h" @@ -263,10 +266,10 @@ static void vhdx_region_unregister_all(BDRVVHDXState *s) static void vhdx_set_shift_bits(BDRVVHDXState *s) { - s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size); - s->sectors_per_block_bits = 31 - clz32(s->sectors_per_block); - s->chunk_ratio_bits = 63 - clz64(s->chunk_ratio); - s->block_size_bits = 31 - clz32(s->block_size); + s->logical_sector_size_bits = ctz32(s->logical_sector_size); + s->sectors_per_block_bits = ctz32(s->sectors_per_block); + s->chunk_ratio_bits = ctz64(s->chunk_ratio); + s->block_size_bits = ctz32(s->block_size); } /* @@ -375,7 +378,7 @@ static int vhdx_update_header(BlockDriverState *bs, BDRVVHDXState *s, inactive_header->log_guid = *log_guid; } - ret = vhdx_write_header(bs->file, inactive_header, header_offset, true); + ret = vhdx_write_header(bs->file->bs, inactive_header, header_offset, true); if (ret < 0) { goto exit; } @@ -427,7 +430,8 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, /* We have to read the whole VHDX_HEADER_SIZE instead of * sizeof(VHDXHeader), because the checksum is over the whole * region */ - ret = bdrv_pread(bs->file, VHDX_HEADER1_OFFSET, buffer, VHDX_HEADER_SIZE); + ret = bdrv_pread(bs->file->bs, VHDX_HEADER1_OFFSET, buffer, + VHDX_HEADER_SIZE); if (ret < 0) { goto fail; } @@ -443,7 +447,8 @@ static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, } } - ret = bdrv_pread(bs->file, VHDX_HEADER2_OFFSET, buffer, VHDX_HEADER_SIZE); + ret = bdrv_pread(bs->file->bs, VHDX_HEADER2_OFFSET, buffer, + VHDX_HEADER_SIZE); if (ret < 0) { goto fail; } @@ -516,7 +521,7 @@ static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) * whole block */ buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE); - ret = bdrv_pread(bs->file, VHDX_REGION_TABLE_OFFSET, buffer, + ret = bdrv_pread(bs->file->bs, VHDX_REGION_TABLE_OFFSET, buffer, VHDX_HEADER_BLOCK_SIZE); if (ret < 0) { goto fail; @@ -629,7 +634,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE); - ret = bdrv_pread(bs->file, s->metadata_rt.file_offset, buffer, + ret = bdrv_pread(bs->file->bs, s->metadata_rt.file_offset, buffer, VHDX_METADATA_TABLE_MAX_SIZE); if (ret < 0) { goto exit; @@ -732,7 +737,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) goto exit; } - ret = bdrv_pread(bs->file, + ret = bdrv_pread(bs->file->bs, s->metadata_entries.file_parameters_entry.offset + s->metadata_rt.file_offset, &s->params, @@ -767,7 +772,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) /* determine virtual disk size, logical sector size, * and phys sector size */ - ret = bdrv_pread(bs->file, + ret = bdrv_pread(bs->file->bs, s->metadata_entries.virtual_disk_size_entry.offset + s->metadata_rt.file_offset, &s->virtual_disk_size, @@ -775,7 +780,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) if (ret < 0) { goto exit; } - ret = bdrv_pread(bs->file, + ret = bdrv_pread(bs->file->bs, s->metadata_entries.logical_sector_size_entry.offset + s->metadata_rt.file_offset, &s->logical_sector_size, @@ -783,7 +788,7 @@ static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) if (ret < 0) { goto exit; } - ret = bdrv_pread(bs->file, + ret = bdrv_pread(bs->file->bs, s->metadata_entries.phys_sector_size_entry.offset + s->metadata_rt.file_offset, &s->physical_sector_size, @@ -854,14 +859,8 @@ static void vhdx_calc_bat_entries(BDRVVHDXState *s) { uint32_t data_blocks_cnt, bitmap_blocks_cnt; - data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits; - if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) { - data_blocks_cnt++; - } - bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits; - if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) { - bitmap_blocks_cnt++; - } + data_blocks_cnt = DIV_ROUND_UP(s->virtual_disk_size, s->block_size); + bitmap_blocks_cnt = DIV_ROUND_UP(data_blocks_cnt, s->chunk_ratio); if (s->parent_entries) { s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1); @@ -906,7 +905,7 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, QLIST_INIT(&s->regions); /* validate the file signature */ - ret = bdrv_pread(bs->file, 0, &signature, sizeof(uint64_t)); + ret = bdrv_pread(bs->file->bs, 0, &signature, sizeof(uint64_t)); if (ret < 0) { goto fail; } @@ -959,13 +958,13 @@ static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, } /* s->bat is freed in vhdx_close() */ - s->bat = qemu_try_blockalign(bs->file, s->bat_rt.length); + s->bat = qemu_try_blockalign(bs->file->bs, s->bat_rt.length); if (s->bat == NULL) { ret = -ENOMEM; goto fail; } - ret = bdrv_pread(bs->file, s->bat_offset, s->bat, s->bat_rt.length); + ret = bdrv_pread(bs->file->bs, s->bat_offset, s->bat, s->bat_rt.length); if (ret < 0) { goto fail; } @@ -1118,7 +1117,7 @@ static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num, break; case PAYLOAD_BLOCK_FULLY_PRESENT: qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_readv(bs->file, + ret = bdrv_co_readv(bs->file->bs, sinfo.file_offset >> BDRV_SECTOR_BITS, sinfo.sectors_avail, &hd_qiov); qemu_co_mutex_lock(&s->lock); @@ -1156,12 +1155,12 @@ exit: static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s, uint64_t *new_offset) { - *new_offset = bdrv_getlength(bs->file); + *new_offset = bdrv_getlength(bs->file->bs); /* per the spec, the address for a block is in units of 1MB */ *new_offset = ROUND_UP(*new_offset, 1024 * 1024); - return bdrv_truncate(bs->file, *new_offset + s->block_size); + return bdrv_truncate(bs->file->bs, *new_offset + s->block_size); } /* @@ -1260,7 +1259,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, /* Queue another write of zero buffers if the underlying file * does not zero-fill on file extension */ - if (bdrv_has_zero_init(bs->file) == 0) { + if (bdrv_has_zero_init(bs->file->bs) == 0) { use_zero_buffers = true; /* zero fill the front, if any */ @@ -1327,7 +1326,7 @@ static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, } /* block exists, so we can just overwrite it */ qemu_co_mutex_unlock(&s->lock); - ret = bdrv_co_writev(bs->file, + ret = bdrv_co_writev(bs->file->bs, sinfo.file_offset >> BDRV_SECTOR_BITS, sectors_to_write, &hd_qiov); qemu_co_mutex_lock(&s->lock); @@ -1454,7 +1453,7 @@ static int vhdx_create_new_metadata(BlockDriverState *bs, uint32_t offset = 0; void *buffer = NULL; void *entry_buffer; - VHDXMetadataTableHeader *md_table;; + VHDXMetadataTableHeader *md_table; VHDXMetadataTableEntry *md_table_entry; /* Metadata entries */ @@ -1775,7 +1774,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) gunichar2 *creator = NULL; glong creator_items; - BlockDriverState *bs; + BlockBackend *blk; char *type = NULL; VHDXImageType image_type; Error *local_err = NULL; @@ -1840,14 +1839,16 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) goto exit; } - bs = NULL; - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, - NULL, &local_err); - if (ret < 0) { + blk = blk_new_open(filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto exit; } + blk_set_allow_write_beyond_eof(blk, true); + /* Create (A) */ /* The creator field is optional, but may be useful for @@ -1855,13 +1856,13 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL, &creator_items, NULL); signature = cpu_to_le64(VHDX_FILE_SIGNATURE); - ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature)); + ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature)); if (ret < 0) { goto delete_and_exit; } if (creator) { - ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET + sizeof(signature), - creator, creator_items * sizeof(gunichar2)); + ret = blk_pwrite(blk, VHDX_FILE_ID_OFFSET + sizeof(signature), + creator, creator_items * sizeof(gunichar2)); if (ret < 0) { goto delete_and_exit; } @@ -1869,13 +1870,13 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) /* Creates (B),(C) */ - ret = vhdx_create_new_headers(bs, image_size, log_size); + ret = vhdx_create_new_headers(blk_bs(blk), image_size, log_size); if (ret < 0) { goto delete_and_exit; } /* Creates (D),(E),(G) explicitly. (F) created as by-product */ - ret = vhdx_create_new_region_table(bs, image_size, block_size, 512, + ret = vhdx_create_new_region_table(blk_bs(blk), image_size, block_size, 512, log_size, use_zero_blocks, image_type, &metadata_offset); if (ret < 0) { @@ -1883,7 +1884,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) } /* Creates (H) */ - ret = vhdx_create_new_metadata(bs, image_size, block_size, 512, + ret = vhdx_create_new_metadata(blk_bs(blk), image_size, block_size, 512, metadata_offset, image_type); if (ret < 0) { goto delete_and_exit; @@ -1891,7 +1892,7 @@ static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) delete_and_exit: - bdrv_unref(bs); + blk_unref(blk); exit: g_free(type); g_free(creator); diff --git a/qemu/block/vmdk.c b/qemu/block/vmdk.c index fbaab67c8..45f9d3c5b 100644 --- a/qemu/block/vmdk.c +++ b/qemu/block/vmdk.c @@ -23,12 +23,15 @@ * THE SOFTWARE. */ -#include "qemu-common.h" +#include "qemu/osdep.h" +#include "qapi/error.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qapi/qmp/qerror.h" #include "qemu/error-report.h" #include "qemu/module.h" #include "migration/migration.h" +#include "qemu/cutils.h" #include <zlib.h> #include <glib.h> @@ -87,7 +90,7 @@ typedef struct { #define L2_CACHE_SIZE 16 typedef struct VmdkExtent { - BlockDriverState *file; + BdrvChild *file; bool flat; bool compressed; bool has_marker; @@ -222,7 +225,7 @@ static void vmdk_free_extents(BlockDriverState *bs) g_free(e->l1_backup_table); g_free(e->type); if (e->file != bs->file) { - bdrv_unref(e->file); + bdrv_unref_child(bs, e->file); } } g_free(s->extents); @@ -241,15 +244,17 @@ static void vmdk_free_last_extent(BlockDriverState *bs) static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { - char desc[DESC_SIZE]; + char *desc; uint32_t cid = 0xffffffff; const char *p_name, *cid_str; size_t cid_str_size; BDRVVmdkState *s = bs->opaque; int ret; - ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); + desc = g_malloc0(DESC_SIZE); + ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { + g_free(desc); return 0; } @@ -268,50 +273,55 @@ static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) sscanf(p_name, "%" SCNx32, &cid); } + g_free(desc); return cid; } static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) { - char desc[DESC_SIZE], tmp_desc[DESC_SIZE]; + char *desc, *tmp_desc; char *p_name, *tmp_str; BDRVVmdkState *s = bs->opaque; - int ret; + int ret = 0; - ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); + desc = g_malloc0(DESC_SIZE); + tmp_desc = g_malloc0(DESC_SIZE); + ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { - return ret; + goto out; } desc[DESC_SIZE - 1] = '\0'; tmp_str = strstr(desc, "parentCID"); if (tmp_str == NULL) { - return -EINVAL; + ret = -EINVAL; + goto out; } - pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str); + pstrcpy(tmp_desc, DESC_SIZE, tmp_str); p_name = strstr(desc, "CID"); if (p_name != NULL) { p_name += sizeof("CID"); - snprintf(p_name, sizeof(desc) - (p_name - desc), "%" PRIx32 "\n", cid); - pstrcat(desc, sizeof(desc), tmp_desc); + snprintf(p_name, DESC_SIZE - (p_name - desc), "%" PRIx32 "\n", cid); + pstrcat(desc, DESC_SIZE, tmp_desc); } - ret = bdrv_pwrite_sync(bs->file, s->desc_offset, desc, DESC_SIZE); - if (ret < 0) { - return ret; - } + ret = bdrv_pwrite_sync(bs->file->bs, s->desc_offset, desc, DESC_SIZE); - return 0; +out: + g_free(desc); + g_free(tmp_desc); + return ret; } static int vmdk_is_cid_valid(BlockDriverState *bs) { BDRVVmdkState *s = bs->opaque; - BlockDriverState *p_bs = bs->backing_hd; uint32_t cur_pcid; - if (!s->cid_checked && p_bs) { + if (!s->cid_checked && bs->backing) { + BlockDriverState *p_bs = bs->backing->bs; + cur_pcid = vmdk_read_cid(p_bs, 0); if (s->parent_cid != cur_pcid) { /* CID not valid */ @@ -335,15 +345,16 @@ static int vmdk_reopen_prepare(BDRVReopenState *state, static int vmdk_parent_open(BlockDriverState *bs) { char *p_name; - char desc[DESC_SIZE + 1]; + char *desc; BDRVVmdkState *s = bs->opaque; int ret; - desc[DESC_SIZE] = '\0'; - ret = bdrv_pread(bs->file, s->desc_offset, desc, DESC_SIZE); + desc = g_malloc0(DESC_SIZE + 1); + ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); if (ret < 0) { - return ret; + goto out; } + ret = 0; p_name = strstr(desc, "parentFileNameHint"); if (p_name != NULL) { @@ -352,22 +363,26 @@ static int vmdk_parent_open(BlockDriverState *bs) p_name += sizeof("parentFileNameHint") + 1; end_name = strchr(p_name, '\"'); if (end_name == NULL) { - return -EINVAL; + ret = -EINVAL; + goto out; } if ((end_name - p_name) > sizeof(bs->backing_file) - 1) { - return -EINVAL; + ret = -EINVAL; + goto out; } pstrcpy(bs->backing_file, end_name - p_name + 1, p_name); } - return 0; +out: + g_free(desc); + return ret; } /* Create and append extent to the extent array. Return the added VmdkExtent * address. return NULL if allocation failed. */ static int vmdk_add_extent(BlockDriverState *bs, - BlockDriverState *file, bool flat, int64_t sectors, + BdrvChild *file, bool flat, int64_t sectors, int64_t l1_offset, int64_t l1_backup_offset, uint32_t l1_size, int l2_size, uint64_t cluster_sectors, @@ -392,7 +407,7 @@ static int vmdk_add_extent(BlockDriverState *bs, return -EFBIG; } - nb_sectors = bdrv_nb_sectors(file); + nb_sectors = bdrv_nb_sectors(file->bs); if (nb_sectors < 0) { return nb_sectors; } @@ -439,14 +454,14 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, return -ENOMEM; } - ret = bdrv_pread(extent->file, + ret = bdrv_pread(extent->file->bs, extent->l1_table_offset, extent->l1_table, l1_size); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read l1 table from extent '%s'", - extent->file->filename); + extent->file->bs->filename); goto fail_l1; } for (i = 0; i < extent->l1_size; i++) { @@ -459,14 +474,14 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, ret = -ENOMEM; goto fail_l1; } - ret = bdrv_pread(extent->file, + ret = bdrv_pread(extent->file->bs, extent->l1_backup_table_offset, extent->l1_backup_table, l1_size); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read l1 backup table from extent '%s'", - extent->file->filename); + extent->file->bs->filename); goto fail_l1b; } for (i = 0; i < extent->l1_size; i++) { @@ -485,7 +500,7 @@ static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, } static int vmdk_open_vmfs_sparse(BlockDriverState *bs, - BlockDriverState *file, + BdrvChild *file, int flags, Error **errp) { int ret; @@ -493,11 +508,11 @@ static int vmdk_open_vmfs_sparse(BlockDriverState *bs, VMDK3Header header; VmdkExtent *extent; - ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); + ret = bdrv_pread(file->bs, sizeof(magic), &header, sizeof(header)); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read header from file '%s'", - file->filename); + file->bs->filename); return ret; } ret = vmdk_add_extent(bs, file, false, @@ -559,7 +574,7 @@ static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset, } static int vmdk_open_vmdk4(BlockDriverState *bs, - BlockDriverState *file, + BdrvChild *file, int flags, QDict *options, Error **errp) { int ret; @@ -569,18 +584,19 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, VmdkExtent *extent; BDRVVmdkState *s = bs->opaque; int64_t l1_backup_offset = 0; + bool compressed; - ret = bdrv_pread(file, sizeof(magic), &header, sizeof(header)); + ret = bdrv_pread(file->bs, sizeof(magic), &header, sizeof(header)); if (ret < 0) { error_setg_errno(errp, -ret, "Could not read header from file '%s'", - file->filename); + file->bs->filename); return -EINVAL; } if (header.capacity == 0) { uint64_t desc_offset = le64_to_cpu(header.desc_offset); if (desc_offset) { - char *buf = vmdk_read_desc(file, desc_offset << 9, errp); + char *buf = vmdk_read_desc(file->bs, desc_offset << 9, errp); if (!buf) { return -EINVAL; } @@ -620,8 +636,8 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, } QEMU_PACKED eos_marker; } QEMU_PACKED footer; - ret = bdrv_pread(file, - bs->file->total_sectors * 512 - 1536, + ret = bdrv_pread(file->bs, + bs->file->bs->total_sectors * 512 - 1536, &footer, sizeof(footer)); if (ret < 0) { error_setg_errno(errp, -ret, "Failed to read footer"); @@ -643,14 +659,14 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, header = footer.header; } + compressed = + le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; if (le32_to_cpu(header.version) > 3) { - char buf[64]; - snprintf(buf, sizeof(buf), "VMDK version %" PRId32, - le32_to_cpu(header.version)); - error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, - bdrv_get_device_or_node_name(bs), "vmdk", buf); + error_setg(errp, "Unsupported VMDK version %" PRIu32, + le32_to_cpu(header.version)); return -ENOTSUP; - } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR)) { + } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR) && + !compressed) { /* VMware KB 2064959 explains that version 3 added support for * persistent changed block tracking (CBT), and backup software can * read it as version=1 if it doesn't care about the changed area @@ -675,7 +691,7 @@ static int vmdk_open_vmdk4(BlockDriverState *bs, if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) { l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9; } - if (bdrv_nb_sectors(file) < le64_to_cpu(header.grain_offset)) { + if (bdrv_nb_sectors(file->bs) < le64_to_cpu(header.grain_offset)) { error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes", (int64_t)(le64_to_cpu(header.grain_offset) * BDRV_SECTOR_SIZE)); @@ -739,8 +755,7 @@ static int vmdk_parse_description(const char *desc, const char *opt_name, } /* Open an extent file and append to bs array */ -static int vmdk_open_sparse(BlockDriverState *bs, - BlockDriverState *file, int flags, +static int vmdk_open_sparse(BlockDriverState *bs, BdrvChild *file, int flags, char *buf, QDict *options, Error **errp) { uint32_t magic; @@ -760,6 +775,17 @@ static int vmdk_open_sparse(BlockDriverState *bs, } } +static const char *next_line(const char *s) +{ + while (*s) { + if (*s == '\n') { + return s + 1; + } + s++; + } + return s; +} + static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, const char *desc_file_path, QDict *options, Error **errp) @@ -769,16 +795,17 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, char access[11]; char type[11]; char fname[512]; - const char *p = desc; + const char *p, *np; int64_t sectors = 0; int64_t flat_offset; char *extent_path; - BlockDriverState *extent_file; + BdrvChild *extent_file; BDRVVmdkState *s = bs->opaque; VmdkExtent *extent; char extent_opt_prefix[32]; + Error *local_err = NULL; - while (*p) { + for (p = desc; *p; p = next_line(p)) { /* parse extent line in one of below formats: * * RW [size in sectors] FLAT "file-name.vmdk" OFFSET @@ -790,51 +817,48 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, matches = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64, access, §ors, type, fname, &flat_offset); if (matches < 4 || strcmp(access, "RW")) { - goto next_line; + continue; } else if (!strcmp(type, "FLAT")) { if (matches != 5 || flat_offset < 0) { - error_setg(errp, "Invalid extent lines: \n%s", p); - return -EINVAL; + goto invalid; } } else if (!strcmp(type, "VMFS")) { if (matches == 4) { flat_offset = 0; } else { - error_setg(errp, "Invalid extent lines:\n%s", p); - return -EINVAL; + goto invalid; } } else if (matches != 4) { - error_setg(errp, "Invalid extent lines:\n%s", p); - return -EINVAL; + goto invalid; } if (sectors <= 0 || (strcmp(type, "FLAT") && strcmp(type, "SPARSE") && strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE")) || (strcmp(access, "RW"))) { - goto next_line; + continue; } if (!path_is_absolute(fname) && !path_has_protocol(fname) && !desc_file_path[0]) { error_setg(errp, "Cannot use relative extent paths with VMDK " - "descriptor file '%s'", bs->file->filename); + "descriptor file '%s'", bs->file->bs->filename); return -EINVAL; } extent_path = g_malloc0(PATH_MAX); path_combine(extent_path, PATH_MAX, desc_file_path, fname); - extent_file = NULL; ret = snprintf(extent_opt_prefix, 32, "extents.%d", s->num_extents); assert(ret < 32); - ret = bdrv_open_image(&extent_file, extent_path, options, - extent_opt_prefix, bs, &child_file, false, errp); + extent_file = bdrv_open_child(extent_path, options, extent_opt_prefix, + bs, &child_file, false, &local_err); g_free(extent_path); - if (ret) { - return ret; + if (local_err) { + error_propagate(errp, local_err); + return -EINVAL; } /* save to extents array */ @@ -844,13 +868,13 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, ret = vmdk_add_extent(bs, extent_file, true, sectors, 0, 0, 0, 0, 0, &extent, errp); if (ret < 0) { - bdrv_unref(extent_file); + bdrv_unref_child(bs, extent_file); return ret; } extent->flat_start_offset = flat_offset << 9; } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) { /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/ - char *buf = vmdk_read_desc(extent_file, 0, errp); + char *buf = vmdk_read_desc(extent_file->bs, 0, errp); if (!buf) { ret = -EINVAL; } else { @@ -859,27 +883,27 @@ static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, } g_free(buf); if (ret) { - bdrv_unref(extent_file); + bdrv_unref_child(bs, extent_file); return ret; } extent = &s->extents[s->num_extents - 1]; } else { error_setg(errp, "Unsupported extent type '%s'", type); - bdrv_unref(extent_file); + bdrv_unref_child(bs, extent_file); return -ENOTSUP; } extent->type = g_strdup(type); -next_line: - /* move to next line */ - while (*p) { - if (*p == '\n') { - p++; - break; - } - p++; - } } return 0; + +invalid: + np = next_line(p); + assert(np != p); + if (np[-1] == '\n') { + np--; + } + error_setg(errp, "Invalid extent line: %.*s", (int)(np - p), p); + return -EINVAL; } static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, @@ -905,7 +929,8 @@ static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, } s->create_type = g_strdup(ct); s->desc_offset = 0; - ret = vmdk_parse_extents(buf, bs, bs->file->exact_filename, options, errp); + ret = vmdk_parse_extents(buf, bs, bs->file->bs->exact_filename, options, + errp); exit: return ret; } @@ -918,7 +943,7 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, BDRVVmdkState *s = bs->opaque; uint32_t magic; - buf = vmdk_read_desc(bs->file, 0, errp); + buf = vmdk_read_desc(bs->file->bs, 0, errp); if (!buf) { return -EINVAL; } @@ -927,7 +952,8 @@ static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, switch (magic) { case VMDK3_MAGIC: case VMDK4_MAGIC: - ret = vmdk_open_sparse(bs, bs->file, flags, buf, options, errp); + ret = vmdk_open_sparse(bs, bs->file, flags, buf, options, + errp); s->desc_offset = 0x200; break; default: @@ -1004,7 +1030,7 @@ static int get_whole_cluster(BlockDriverState *bs, cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS; whole_grain = qemu_blockalign(bs, cluster_bytes); - if (!bs->backing_hd) { + if (!bs->backing) { memset(whole_grain, 0, skip_start_sector << BDRV_SECTOR_BITS); memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0, cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS)); @@ -1013,22 +1039,22 @@ static int get_whole_cluster(BlockDriverState *bs, assert(skip_end_sector <= extent->cluster_sectors); /* we will be here if it's first write on non-exist grain(cluster). * try to read from parent image, if exist */ - if (bs->backing_hd && !vmdk_is_cid_valid(bs)) { + if (bs->backing && !vmdk_is_cid_valid(bs)) { ret = VMDK_ERROR; goto exit; } /* Read backing data before skip range */ if (skip_start_sector > 0) { - if (bs->backing_hd) { - ret = bdrv_read(bs->backing_hd, sector_num, + if (bs->backing) { + ret = bdrv_read(bs->backing->bs, sector_num, whole_grain, skip_start_sector); if (ret < 0) { ret = VMDK_ERROR; goto exit; } } - ret = bdrv_write(extent->file, cluster_sector_num, whole_grain, + ret = bdrv_write(extent->file->bs, cluster_sector_num, whole_grain, skip_start_sector); if (ret < 0) { ret = VMDK_ERROR; @@ -1037,8 +1063,8 @@ static int get_whole_cluster(BlockDriverState *bs, } /* Read backing data after skip range */ if (skip_end_sector < extent->cluster_sectors) { - if (bs->backing_hd) { - ret = bdrv_read(bs->backing_hd, sector_num + skip_end_sector, + if (bs->backing) { + ret = bdrv_read(bs->backing->bs, sector_num + skip_end_sector, whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), extent->cluster_sectors - skip_end_sector); if (ret < 0) { @@ -1046,7 +1072,7 @@ static int get_whole_cluster(BlockDriverState *bs, goto exit; } } - ret = bdrv_write(extent->file, cluster_sector_num + skip_end_sector, + ret = bdrv_write(extent->file->bs, cluster_sector_num + skip_end_sector, whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), extent->cluster_sectors - skip_end_sector); if (ret < 0) { @@ -1066,7 +1092,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, offset = cpu_to_le32(offset); /* update L2 table */ if (bdrv_pwrite_sync( - extent->file, + extent->file->bs, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(offset)), &offset, sizeof(offset)) < 0) { @@ -1076,7 +1102,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; if (bdrv_pwrite_sync( - extent->file, + extent->file->bs, ((int64_t)m_data->l2_offset * 512) + (m_data->l2_index * sizeof(offset)), &offset, sizeof(offset)) < 0) { @@ -1166,7 +1192,7 @@ static int get_cluster_offset(BlockDriverState *bs, } l2_table = extent->l2_cache + (min_index * extent->l2_size); if (bdrv_pread( - extent->file, + extent->file->bs, (int64_t)l2_offset * 512, l2_table, extent->l2_size * sizeof(uint32_t) @@ -1245,7 +1271,7 @@ static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, } static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum) + int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) { BDRVVmdkState *s = bs->opaque; int64_t index_in_cluster, n, ret; @@ -1262,6 +1288,7 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, 0, 0); qemu_co_mutex_unlock(&s->lock); + index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); switch (ret) { case VMDK_ERROR: ret = -EIO; @@ -1274,14 +1301,15 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, break; case VMDK_OK: ret = BDRV_BLOCK_DATA; - if (extent->file == bs->file && !extent->compressed) { - ret |= BDRV_BLOCK_OFFSET_VALID | offset; + if (!extent->compressed) { + ret |= BDRV_BLOCK_OFFSET_VALID; + ret |= (offset + (index_in_cluster << BDRV_SECTOR_BITS)) + & BDRV_BLOCK_OFFSET_MASK; } - + *file = extent->file->bs; break; } - index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); n = extent->cluster_sectors - index_in_cluster; if (n > nb_sectors) { n = nb_sectors; @@ -1320,12 +1348,16 @@ static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset, write_len = buf_len + sizeof(VmdkGrainMarker); } write_offset = cluster_offset + offset_in_cluster, - ret = bdrv_pwrite(extent->file, write_offset, write_buf, write_len); + ret = bdrv_pwrite(extent->file->bs, write_offset, write_buf, write_len); write_end_sector = DIV_ROUND_UP(write_offset + write_len, BDRV_SECTOR_SIZE); - extent->next_cluster_sector = MAX(extent->next_cluster_sector, - write_end_sector); + if (extent->compressed) { + extent->next_cluster_sector = write_end_sector; + } else { + extent->next_cluster_sector = MAX(extent->next_cluster_sector, + write_end_sector); + } if (ret != write_len) { ret = ret < 0 ? ret : -EIO; @@ -1351,7 +1383,7 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, if (!extent->compressed) { - ret = bdrv_pread(extent->file, + ret = bdrv_pread(extent->file->bs, cluster_offset + offset_in_cluster, buf, nb_sectors * 512); if (ret == nb_sectors * 512) { @@ -1365,7 +1397,7 @@ static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, buf_bytes = cluster_bytes * 2; cluster_buf = g_malloc(buf_bytes); uncomp_buf = g_malloc(cluster_bytes); - ret = bdrv_pread(extent->file, + ret = bdrv_pread(extent->file->bs, cluster_offset, cluster_buf, buf_bytes); if (ret < 0) { @@ -1427,11 +1459,11 @@ static int vmdk_read(BlockDriverState *bs, int64_t sector_num, } if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ - if (bs->backing_hd && ret != VMDK_ZEROED) { + if (bs->backing && ret != VMDK_ZEROED) { if (!vmdk_is_cid_valid(bs)) { return -EINVAL; } - ret = bdrv_read(bs->backing_hd, sector_num, buf, n); + ret = bdrv_read(bs->backing->bs, sector_num, buf, n); if (ret < 0) { return ret; } @@ -1487,8 +1519,8 @@ static int vmdk_write(BlockDriverState *bs, int64_t sector_num, if (sector_num > bs->total_sectors) { error_report("Wrong offset: sector_num=0x%" PRIx64 - " total_sectors=0x%" PRIx64 "\n", - sector_num, bs->total_sectors); + " total_sectors=0x%" PRIx64, + sector_num, bs->total_sectors); return -EIO; } @@ -1617,7 +1649,7 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, QemuOpts *opts, Error **errp) { int ret, i; - BlockDriverState *bs = NULL; + BlockBackend *blk = NULL; VMDK4Header header; Error *local_err = NULL; uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count; @@ -1630,16 +1662,18 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, goto exit; } - assert(bs == NULL); - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, - NULL, &local_err); - if (ret < 0) { + blk = blk_new_open(filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto exit; } + blk_set_allow_write_beyond_eof(blk, true); + if (flat) { - ret = bdrv_truncate(bs, filesize); + ret = blk_truncate(blk, filesize); if (ret < 0) { error_setg_errno(errp, -ret, "Could not truncate file"); } @@ -1647,7 +1681,13 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, } magic = cpu_to_be32(VMDK4_MAGIC); memset(&header, 0, sizeof(header)); - header.version = zeroed_grain ? 2 : 1; + if (compress) { + header.version = 3; + } else if (zeroed_grain) { + header.version = 2; + } else { + header.version = 1; + } header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0) | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0); @@ -1688,18 +1728,18 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, header.check_bytes[3] = 0xa; /* write all the data */ - ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic)); + ret = blk_pwrite(blk, 0, &magic, sizeof(magic)); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; } - ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header)); + ret = blk_pwrite(blk, sizeof(magic), &header, sizeof(header)); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; } - ret = bdrv_truncate(bs, le64_to_cpu(header.grain_offset) << 9); + ret = blk_truncate(blk, le64_to_cpu(header.grain_offset) << 9); if (ret < 0) { error_setg_errno(errp, -ret, "Could not truncate file"); goto exit; @@ -1712,8 +1752,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, i < gt_count; i++, tmp += gt_size) { gd_buf[i] = cpu_to_le32(tmp); } - ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, - gd_buf, gd_buf_size); + ret = blk_pwrite(blk, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; @@ -1724,8 +1764,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, i < gt_count; i++, tmp += gt_size) { gd_buf[i] = cpu_to_le32(tmp); } - ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, - gd_buf, gd_buf_size); + ret = blk_pwrite(blk, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); if (ret < 0) { error_setg(errp, QERR_IO_ERROR); goto exit; @@ -1733,8 +1773,8 @@ static int vmdk_create_extent(const char *filename, int64_t filesize, ret = 0; exit: - if (bs) { - bdrv_unref(bs); + if (blk) { + blk_unref(blk); } g_free(gd_buf); return ret; @@ -1783,7 +1823,7 @@ static int filename_decompose(const char *filename, char *path, char *prefix, static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) { int idx = 0; - BlockDriverState *new_bs = NULL; + BlockBackend *new_blk = NULL; Error *local_err = NULL; char *desc = NULL; int64_t total_size = 0, filesize; @@ -1894,7 +1934,7 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) goto exit; } if (backing_file) { - BlockDriverState *bs = NULL; + BlockBackend *blk; char *full_backing = g_new0(char, PATH_MAX); bdrv_get_full_backing_filename_from_filename(filename, backing_file, full_backing, PATH_MAX, @@ -1905,19 +1945,21 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) ret = -ENOENT; goto exit; } - ret = bdrv_open(&bs, full_backing, NULL, NULL, BDRV_O_NO_BACKING, NULL, - errp); + + blk = blk_new_open(full_backing, NULL, NULL, + BDRV_O_NO_BACKING, errp); g_free(full_backing); - if (ret != 0) { + if (blk == NULL) { + ret = -EIO; goto exit; } - if (strcmp(bs->drv->format_name, "vmdk")) { - bdrv_unref(bs); + if (strcmp(blk_bs(blk)->drv->format_name, "vmdk")) { + blk_unref(blk); ret = -EINVAL; goto exit; } - parent_cid = vmdk_read_cid(bs, 0); - bdrv_unref(bs); + parent_cid = vmdk_read_cid(blk_bs(blk), 0); + blk_unref(blk); snprintf(parent_desc_line, BUF_SIZE, "parentFileNameHint=\"%s\"", backing_file); } @@ -1975,14 +2017,18 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) goto exit; } } - assert(new_bs == NULL); - ret = bdrv_open(&new_bs, filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_PROTOCOL, NULL, &local_err); - if (ret < 0) { + + new_blk = blk_new_open(filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); + if (new_blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto exit; } - ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len); + + blk_set_allow_write_beyond_eof(new_blk, true); + + ret = blk_pwrite(new_blk, desc_offset, desc, desc_len); if (ret < 0) { error_setg_errno(errp, -ret, "Could not write description"); goto exit; @@ -1990,14 +2036,14 @@ static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) /* bdrv_pwrite write padding zeros to align to sector, we don't need that * for description file */ if (desc_offset == 0) { - ret = bdrv_truncate(new_bs, desc_len); + ret = blk_truncate(new_blk, desc_len); if (ret < 0) { error_setg_errno(errp, -ret, "Could not truncate file"); } } exit: - if (new_bs) { - bdrv_unref(new_bs); + if (new_blk) { + blk_unref(new_blk); } g_free(adapter_type); g_free(backing_file); @@ -2032,7 +2078,7 @@ static coroutine_fn int vmdk_co_flush(BlockDriverState *bs) int ret = 0; for (i = 0; i < s->num_extents; i++) { - err = bdrv_co_flush(s->extents[i].file); + err = bdrv_co_flush(s->extents[i].file->bs); if (err < 0) { ret = err; } @@ -2047,7 +2093,7 @@ static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs) int64_t r; BDRVVmdkState *s = bs->opaque; - ret = bdrv_get_allocated_file_size(bs->file); + ret = bdrv_get_allocated_file_size(bs->file->bs); if (ret < 0) { return ret; } @@ -2055,7 +2101,7 @@ static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs) if (s->extents[i].file == bs->file) { continue; } - r = bdrv_get_allocated_file_size(s->extents[i].file); + r = bdrv_get_allocated_file_size(s->extents[i].file->bs); if (r < 0) { return r; } @@ -2073,7 +2119,7 @@ static int vmdk_has_zero_init(BlockDriverState *bs) * return 0. */ for (i = 0; i < s->num_extents; i++) { if (s->extents[i].flat) { - if (!bdrv_has_zero_init(s->extents[i].file)) { + if (!bdrv_has_zero_init(s->extents[i].file->bs)) { return 0; } } @@ -2086,7 +2132,7 @@ static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent) ImageInfo *info = g_new0(ImageInfo, 1); *info = (ImageInfo){ - .filename = g_strdup(extent->file->filename), + .filename = g_strdup(extent->file->bs->filename), .format = g_strdup(extent->type), .virtual_size = extent->sectors * BDRV_SECTOR_SIZE, .compressed = extent->compressed, @@ -2132,7 +2178,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, PRId64 "\n", sector_num); break; } - if (ret == VMDK_OK && cluster_offset >= bdrv_getlength(extent->file)) { + if (ret == VMDK_OK && + cluster_offset >= bdrv_getlength(extent->file->bs)) + { fprintf(stderr, "ERROR: cluster offset for sector %" PRId64 " points after EOF\n", sector_num); @@ -2153,19 +2201,19 @@ static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs) ImageInfoList **next; *spec_info = (ImageInfoSpecific){ - .kind = IMAGE_INFO_SPECIFIC_KIND_VMDK, - { - .vmdk = g_new0(ImageInfoSpecificVmdk, 1), + .type = IMAGE_INFO_SPECIFIC_KIND_VMDK, + .u = { + .vmdk.data = g_new0(ImageInfoSpecificVmdk, 1), }, }; - *spec_info->vmdk = (ImageInfoSpecificVmdk) { + *spec_info->u.vmdk.data = (ImageInfoSpecificVmdk) { .create_type = g_strdup(s->create_type), .cid = s->cid, .parent_cid = s->parent_cid, }; - next = &spec_info->vmdk->extents; + next = &spec_info->u.vmdk.data->extents; for (i = 0; i < s->num_extents; i++) { *next = g_new0(ImageInfoList, 1); (*next)->value = vmdk_get_extent_info(&s->extents[i]); @@ -2208,7 +2256,7 @@ static void vmdk_detach_aio_context(BlockDriverState *bs) int i; for (i = 0; i < s->num_extents; i++) { - bdrv_detach_aio_context(s->extents[i].file); + bdrv_detach_aio_context(s->extents[i].file->bs); } } @@ -2219,7 +2267,7 @@ static void vmdk_attach_aio_context(BlockDriverState *bs, int i; for (i = 0; i < s->num_extents; i++) { - bdrv_attach_aio_context(s->extents[i].file, new_context); + bdrv_attach_aio_context(s->extents[i].file->bs, new_context); } } diff --git a/qemu/block/vpc.c b/qemu/block/vpc.c index 3e385d9fb..3e2ea698d 100644 --- a/qemu/block/vpc.c +++ b/qemu/block/vpc.c @@ -22,8 +22,11 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ +#include "qemu/osdep.h" +#include "qapi/error.h" #include "qemu-common.h" #include "block/block_int.h" +#include "sysemu/block-backend.h" #include "qemu/module.h" #include "migration/migration.h" #if defined(CONFIG_UUID) @@ -42,28 +45,34 @@ enum vhd_type { VHD_DIFFERENCING = 4, }; -// Seconds since Jan 1, 2000 0:00:00 (UTC) +/* Seconds since Jan 1, 2000 0:00:00 (UTC) */ #define VHD_TIMESTAMP_BASE 946684800 -#define VHD_MAX_SECTORS (65535LL * 255 * 255) -#define VHD_MAX_GEOMETRY (65535LL * 16 * 255) +#define VHD_CHS_MAX_C 65535LL +#define VHD_CHS_MAX_H 16 +#define VHD_CHS_MAX_S 255 -// always big-endian +#define VHD_MAX_SECTORS 0xff000000 /* 2040 GiB max image size */ +#define VHD_MAX_GEOMETRY (VHD_CHS_MAX_C * VHD_CHS_MAX_H * VHD_CHS_MAX_S) + +#define VPC_OPT_FORCE_SIZE "force_size" + +/* always big-endian */ typedef struct vhd_footer { - char creator[8]; // "conectix" + char creator[8]; /* "conectix" */ uint32_t features; uint32_t version; - // Offset of next header structure, 0xFFFFFFFF if none + /* Offset of next header structure, 0xFFFFFFFF if none */ uint64_t data_offset; - // Seconds since Jan 1, 2000 0:00:00 (UTC) + /* Seconds since Jan 1, 2000 0:00:00 (UTC) */ uint32_t timestamp; - char creator_app[4]; // "vpc " + char creator_app[4]; /* e.g., "vpc " */ uint16_t major; uint16_t minor; - char creator_os[4]; // "Wi2k" + char creator_os[4]; /* "Wi2k" */ uint64_t orig_size; uint64_t current_size; @@ -74,29 +83,29 @@ typedef struct vhd_footer { uint32_t type; - // Checksum of the Hard Disk Footer ("one's complement of the sum of all - // the bytes in the footer without the checksum field") + /* Checksum of the Hard Disk Footer ("one's complement of the sum of all + the bytes in the footer without the checksum field") */ uint32_t checksum; - // UUID used to identify a parent hard disk (backing file) + /* UUID used to identify a parent hard disk (backing file) */ uint8_t uuid[16]; uint8_t in_saved_state; } QEMU_PACKED VHDFooter; typedef struct vhd_dyndisk_header { - char magic[8]; // "cxsparse" + char magic[8]; /* "cxsparse" */ - // Offset of next header structure, 0xFFFFFFFF if none + /* Offset of next header structure, 0xFFFFFFFF if none */ uint64_t data_offset; - // Offset of the Block Allocation Table (BAT) + /* Offset of the Block Allocation Table (BAT) */ uint64_t table_offset; uint32_t version; - uint32_t max_table_entries; // 32bit/entry + uint32_t max_table_entries; /* 32bit/entry */ - // 2 MB by default, must be a power of two + /* 2 MB by default, must be a power of two */ uint32_t block_size; uint32_t checksum; @@ -104,7 +113,7 @@ typedef struct vhd_dyndisk_header { uint32_t parent_timestamp; uint32_t reserved; - // Backing file name (in UTF-16) + /* Backing file name (in UTF-16) */ uint8_t parent_name[512]; struct { @@ -127,6 +136,8 @@ typedef struct BDRVVPCState { uint32_t block_size; uint32_t bitmap_size; + bool force_use_chs; + bool force_use_sz; #ifdef CACHE uint8_t *pageentry_u8; @@ -139,6 +150,22 @@ typedef struct BDRVVPCState { Error *migration_blocker; } BDRVVPCState; +#define VPC_OPT_SIZE_CALC "force_size_calc" +static QemuOptsList vpc_runtime_opts = { + .name = "vpc-runtime-opts", + .head = QTAILQ_HEAD_INITIALIZER(vpc_runtime_opts.head), + .desc = { + { + .name = VPC_OPT_SIZE_CALC, + .type = QEMU_OPT_STRING, + .help = "Force disk size calculation to use either CHS geometry, " + "or use the disk current_size specified in the VHD footer. " + "{chs, current_size}" + }, + { /* end of list */ } + } +}; + static uint32_t vpc_checksum(uint8_t* buf, size_t size) { uint32_t res = 0; @@ -158,6 +185,25 @@ static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename) return 0; } +static void vpc_parse_options(BlockDriverState *bs, QemuOpts *opts, + Error **errp) +{ + BDRVVPCState *s = bs->opaque; + const char *size_calc; + + size_calc = qemu_opt_get(opts, VPC_OPT_SIZE_CALC); + + if (!size_calc) { + /* no override, use autodetect only */ + } else if (!strcmp(size_calc, "current_size")) { + s->force_use_sz = true; + } else if (!strcmp(size_calc, "chs")) { + s->force_use_chs = true; + } else { + error_setg(errp, "Invalid size calculation mode: '%s'", size_calc); + } +} + static int vpc_open(BlockDriverState *bs, QDict *options, int flags, Error **errp) { @@ -165,6 +211,9 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, int i; VHDFooter *footer; VHDDynDiskHeader *dyndisk_header; + QemuOpts *opts = NULL; + Error *local_err = NULL; + bool use_chs; uint8_t buf[HEADER_SIZE]; uint32_t checksum; uint64_t computed_size; @@ -172,24 +221,42 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, int disk_type = VHD_DYNAMIC; int ret; - ret = bdrv_pread(bs->file, 0, s->footer_buf, HEADER_SIZE); + opts = qemu_opts_create(&vpc_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + vpc_parse_options(bs, opts, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + ret = bdrv_pread(bs->file->bs, 0, s->footer_buf, HEADER_SIZE); if (ret < 0) { + error_setg(errp, "Unable to read VHD header"); goto fail; } footer = (VHDFooter *) s->footer_buf; if (strncmp(footer->creator, "conectix", 8)) { - int64_t offset = bdrv_getlength(bs->file); + int64_t offset = bdrv_getlength(bs->file->bs); if (offset < 0) { ret = offset; + error_setg(errp, "Invalid file size"); goto fail; } else if (offset < HEADER_SIZE) { ret = -EINVAL; + error_setg(errp, "File too small for a VHD header"); goto fail; } /* If a fixed disk, the footer is found only at the end of the file */ - ret = bdrv_pread(bs->file, offset-HEADER_SIZE, s->footer_buf, + ret = bdrv_pread(bs->file->bs, offset-HEADER_SIZE, s->footer_buf, HEADER_SIZE); if (ret < 0) { goto fail; @@ -211,36 +278,66 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, /* Write 'checksum' back to footer, or else will leave it with zero. */ footer->checksum = cpu_to_be32(checksum); - // The visible size of a image in Virtual PC depends on the geometry - // rather than on the size stored in the footer (the size in the footer - // is too large usually) + /* The visible size of a image in Virtual PC depends on the geometry + rather than on the size stored in the footer (the size in the footer + is too large usually) */ bs->total_sectors = (int64_t) be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl; - /* Images that have exactly the maximum geometry are probably bigger and - * would be truncated if we adhered to the geometry for them. Rely on - * footer->current_size for them. */ - if (bs->total_sectors == VHD_MAX_GEOMETRY) { + /* Microsoft Virtual PC and Microsoft Hyper-V produce and read + * VHD image sizes differently. VPC will rely on CHS geometry, + * while Hyper-V and disk2vhd use the size specified in the footer. + * + * We use a couple of approaches to try and determine the correct method: + * look at the Creator App field, and look for images that have CHS + * geometry that is the maximum value. + * + * If the CHS geometry is the maximum CHS geometry, then we assume that + * the size is the footer->current_size to avoid truncation. Otherwise, + * we follow the table based on footer->creator_app: + * + * Known creator apps: + * 'vpc ' : CHS Virtual PC (uses disk geometry) + * 'qemu' : CHS QEMU (uses disk geometry) + * 'qem2' : current_size QEMU (uses current_size) + * 'win ' : current_size Hyper-V + * 'd2v ' : current_size Disk2vhd + * 'tap\0' : current_size XenServer + * 'CTXS' : current_size XenConverter + * + * The user can override the table values via drive options, however + * even with an override we will still use current_size for images + * that have CHS geometry of the maximum size. + */ + use_chs = (!!strncmp(footer->creator_app, "win ", 4) && + !!strncmp(footer->creator_app, "qem2", 4) && + !!strncmp(footer->creator_app, "d2v ", 4) && + !!strncmp(footer->creator_app, "CTXS", 4) && + !!memcmp(footer->creator_app, "tap", 4)) || s->force_use_chs; + + if (!use_chs || bs->total_sectors == VHD_MAX_GEOMETRY || s->force_use_sz) { bs->total_sectors = be64_to_cpu(footer->current_size) / - BDRV_SECTOR_SIZE; + BDRV_SECTOR_SIZE; } - /* Allow a maximum disk size of approximately 2 TB */ - if (bs->total_sectors >= VHD_MAX_SECTORS) { + /* Allow a maximum disk size of 2040 GiB */ + if (bs->total_sectors > VHD_MAX_SECTORS) { ret = -EFBIG; goto fail; } if (disk_type == VHD_DYNAMIC) { - ret = bdrv_pread(bs->file, be64_to_cpu(footer->data_offset), buf, + ret = bdrv_pread(bs->file->bs, be64_to_cpu(footer->data_offset), buf, HEADER_SIZE); if (ret < 0) { + error_setg(errp, "Error reading dynamic VHD header"); goto fail; } dyndisk_header = (VHDDynDiskHeader *) buf; if (strncmp(dyndisk_header->magic, "cxsparse", 8)) { + error_setg(errp, "Invalid header magic"); ret = -EINVAL; goto fail; } @@ -256,16 +353,14 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries); if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) { - ret = -EINVAL; - goto fail; - } - if (s->max_table_entries > (VHD_MAX_SECTORS * 512) / s->block_size) { + error_setg(errp, "Too many blocks"); ret = -EINVAL; goto fail; } computed_size = (uint64_t) s->max_table_entries * s->block_size; if (computed_size < bs->total_sectors * 512) { + error_setg(errp, "Page table too small"); ret = -EINVAL; goto fail; } @@ -280,16 +375,19 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, pagetable_size = (uint64_t) s->max_table_entries * 4; - s->pagetable = qemu_try_blockalign(bs->file, pagetable_size); + s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size); if (s->pagetable == NULL) { + error_setg(errp, "Unable to allocate memory for page table"); ret = -ENOMEM; goto fail; } s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); - ret = bdrv_pread(bs->file, s->bat_offset, s->pagetable, pagetable_size); + ret = bdrv_pread(bs->file->bs, s->bat_offset, s->pagetable, + pagetable_size); if (ret < 0) { + error_setg(errp, "Error reading pagetable"); goto fail; } @@ -308,7 +406,7 @@ static int vpc_open(BlockDriverState *bs, QDict *options, int flags, } } - if (s->free_data_block_offset > bdrv_getlength(bs->file)) { + if (s->free_data_block_offset > bdrv_getlength(bs->file->bs)) { error_setg(errp, "block-vpc: free_data_block_offset points after " "the end of file. The image has been truncated."); ret = -EINVAL; @@ -368,22 +466,22 @@ static inline int64_t get_sector_offset(BlockDriverState *bs, pageentry_index = (offset % s->block_size) / 512; if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff) - return -1; // not allocated + return -1; /* not allocated */ bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index]; block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index); - // We must ensure that we don't write to any sectors which are marked as - // unused in the bitmap. We get away with setting all bits in the block - // bitmap each time we write to a new block. This might cause Virtual PC to - // miss sparse read optimization, but it's not a problem in terms of - // correctness. + /* We must ensure that we don't write to any sectors which are marked as + unused in the bitmap. We get away with setting all bits in the block + bitmap each time we write to a new block. This might cause Virtual PC to + miss sparse read optimization, but it's not a problem in terms of + correctness. */ if (write && (s->last_bitmap_offset != bitmap_offset)) { uint8_t bitmap[s->bitmap_size]; s->last_bitmap_offset = bitmap_offset; memset(bitmap, 0xff, s->bitmap_size); - bdrv_pwrite_sync(bs->file, bitmap_offset, bitmap, s->bitmap_size); + bdrv_pwrite_sync(bs->file->bs, bitmap_offset, bitmap, s->bitmap_size); } return block_offset; @@ -401,7 +499,7 @@ static int rewrite_footer(BlockDriverState* bs) BDRVVPCState *s = bs->opaque; int64_t offset = s->free_data_block_offset; - ret = bdrv_pwrite_sync(bs->file, offset, s->footer_buf, HEADER_SIZE); + ret = bdrv_pwrite_sync(bs->file->bs, offset, s->footer_buf, HEADER_SIZE); if (ret < 0) return ret; @@ -423,35 +521,35 @@ static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) int ret; uint8_t bitmap[s->bitmap_size]; - // Check if sector_num is valid + /* Check if sector_num is valid */ if ((sector_num < 0) || (sector_num > bs->total_sectors)) return -1; - // Write entry into in-memory BAT + /* Write entry into in-memory BAT */ index = (sector_num * 512) / s->block_size; if (s->pagetable[index] != 0xFFFFFFFF) return -1; s->pagetable[index] = s->free_data_block_offset / 512; - // Initialize the block's bitmap + /* Initialize the block's bitmap */ memset(bitmap, 0xff, s->bitmap_size); - ret = bdrv_pwrite_sync(bs->file, s->free_data_block_offset, bitmap, + ret = bdrv_pwrite_sync(bs->file->bs, s->free_data_block_offset, bitmap, s->bitmap_size); if (ret < 0) { return ret; } - // Write new footer (the old one will be overwritten) + /* Write new footer (the old one will be overwritten) */ s->free_data_block_offset += s->block_size + s->bitmap_size; ret = rewrite_footer(bs); if (ret < 0) goto fail; - // Write BAT entry to disk + /* Write BAT entry to disk */ bat_offset = s->bat_offset + (4 * index); bat_value = cpu_to_be32(s->pagetable[index]); - ret = bdrv_pwrite_sync(bs->file, bat_offset, &bat_value, 4); + ret = bdrv_pwrite_sync(bs->file->bs, bat_offset, &bat_value, 4); if (ret < 0) goto fail; @@ -485,7 +583,7 @@ static int vpc_read(BlockDriverState *bs, int64_t sector_num, VHDFooter *footer = (VHDFooter *) s->footer_buf; if (be32_to_cpu(footer->type) == VHD_FIXED) { - return bdrv_read(bs->file, sector_num, buf, nb_sectors); + return bdrv_read(bs->file->bs, sector_num, buf, nb_sectors); } while (nb_sectors > 0) { offset = get_sector_offset(bs, sector_num, 0); @@ -499,7 +597,7 @@ static int vpc_read(BlockDriverState *bs, int64_t sector_num, if (offset == -1) { memset(buf, 0, sectors * BDRV_SECTOR_SIZE); } else { - ret = bdrv_pread(bs->file, offset, buf, + ret = bdrv_pread(bs->file->bs, offset, buf, sectors * BDRV_SECTOR_SIZE); if (ret != sectors * BDRV_SECTOR_SIZE) { return -1; @@ -534,7 +632,7 @@ static int vpc_write(BlockDriverState *bs, int64_t sector_num, VHDFooter *footer = (VHDFooter *) s->footer_buf; if (be32_to_cpu(footer->type) == VHD_FIXED) { - return bdrv_write(bs->file, sector_num, buf, nb_sectors); + return bdrv_write(bs->file->bs, sector_num, buf, nb_sectors); } while (nb_sectors > 0) { offset = get_sector_offset(bs, sector_num, 1); @@ -551,7 +649,8 @@ static int vpc_write(BlockDriverState *bs, int64_t sector_num, return -1; } - ret = bdrv_pwrite(bs->file, offset, buf, sectors * BDRV_SECTOR_SIZE); + ret = bdrv_pwrite(bs->file->bs, offset, buf, + sectors * BDRV_SECTOR_SIZE); if (ret != sectors * BDRV_SECTOR_SIZE) { return -1; } @@ -576,7 +675,7 @@ static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num, } static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int *pnum) + int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file) { BDRVVPCState *s = bs->opaque; VHDFooter *footer = (VHDFooter*) s->footer_buf; @@ -586,6 +685,7 @@ static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs, if (be32_to_cpu(footer->type) == VHD_FIXED) { *pnum = nb_sectors; + *file = bs->file->bs; return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA | (sector_num << BDRV_SECTOR_BITS); } @@ -607,6 +707,7 @@ static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs, /* *pnum can't be greater than one block for allocated * sectors since there is always a bitmap in between. */ if (allocated) { + *file = bs->file->bs; return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; } if (nb_sectors == 0) { @@ -626,7 +727,7 @@ static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs, * Note that the geometry doesn't always exactly match total_sectors but * may round it down. * - * Returns 0 on success, -EFBIG if the size is larger than ~2 TB. Override + * Returns 0 on success, -EFBIG if the size is larger than 2040 GiB. Override * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB) * and instead allow up to 255 heads. */ @@ -668,7 +769,7 @@ static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, return 0; } -static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, +static int create_dynamic_disk(BlockBackend *blk, uint8_t *buf, int64_t total_sectors) { VHDDynDiskHeader *dyndisk_header = @@ -678,34 +779,34 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, int ret; int64_t offset = 0; - // Write the footer (twice: at the beginning and at the end) + /* Write the footer (twice: at the beginning and at the end) */ block_size = 0x200000; num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512); - ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE); - if (ret) { + ret = blk_pwrite(blk, offset, buf, HEADER_SIZE); + if (ret < 0) { goto fail; } offset = 1536 + ((num_bat_entries * 4 + 511) & ~511); - ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE); + ret = blk_pwrite(blk, offset, buf, HEADER_SIZE); if (ret < 0) { goto fail; } - // Write the initial BAT + /* Write the initial BAT */ offset = 3 * 512; memset(buf, 0xFF, 512); for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) { - ret = bdrv_pwrite_sync(bs, offset, buf, 512); + ret = blk_pwrite(blk, offset, buf, 512); if (ret < 0) { goto fail; } offset += 512; } - // Prepare the Dynamic Disk Header + /* Prepare the Dynamic Disk Header */ memset(buf, 0, 1024); memcpy(dyndisk_header->magic, "cxsparse", 8); @@ -722,10 +823,10 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024)); - // Write the header + /* Write the header */ offset = 512; - ret = bdrv_pwrite_sync(bs, offset, buf, 1024); + ret = blk_pwrite(blk, offset, buf, 1024); if (ret < 0) { goto fail; } @@ -734,7 +835,7 @@ static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, return ret; } -static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf, +static int create_fixed_disk(BlockBackend *blk, uint8_t *buf, int64_t total_size) { int ret; @@ -742,12 +843,12 @@ static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf, /* Add footer to total size */ total_size += HEADER_SIZE; - ret = bdrv_truncate(bs, total_size); + ret = blk_truncate(blk, total_size); if (ret < 0) { return ret; } - ret = bdrv_pwrite_sync(bs, total_size - HEADER_SIZE, buf, HEADER_SIZE); + ret = blk_pwrite(blk, total_size - HEADER_SIZE, buf, HEADER_SIZE); if (ret < 0) { return ret; } @@ -768,8 +869,9 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) int64_t total_size; int disk_type; int ret = -EIO; + bool force_size; Error *local_err = NULL; - BlockDriverState *bs = NULL; + BlockBackend *blk = NULL; /* Read out options */ total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), @@ -781,6 +883,7 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) } else if (!strcmp(disk_type_param, "fixed")) { disk_type = VHD_FIXED; } else { + error_setg(errp, "Invalid disk type, %s", disk_type_param); ret = -EINVAL; goto out; } @@ -788,36 +891,50 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) disk_type = VHD_DYNAMIC; } + force_size = qemu_opt_get_bool_del(opts, VPC_OPT_FORCE_SIZE, false); + ret = bdrv_create_file(filename, opts, &local_err); if (ret < 0) { error_propagate(errp, local_err); goto out; } - ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, - NULL, &local_err); - if (ret < 0) { + + blk = blk_new_open(filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); + if (blk == NULL) { error_propagate(errp, local_err); + ret = -EIO; goto out; } + blk_set_allow_write_beyond_eof(blk, true); + /* * Calculate matching total_size and geometry. Increase the number of * sectors requested until we get enough (or fail). This ensures that * qemu-img convert doesn't truncate images, but rather rounds up. * - * If the image size can't be represented by a spec conform CHS geometry, + * If the image size can't be represented by a spec conformant CHS geometry, * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use * the image size from the VHD footer to calculate total_sectors. */ - total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE); - for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) { - calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl); + if (force_size) { + /* This will force the use of total_size for sector count, below */ + cyls = VHD_CHS_MAX_C; + heads = VHD_CHS_MAX_H; + secs_per_cyl = VHD_CHS_MAX_S; + } else { + total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE); + for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) { + calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl); + } } if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) { total_sectors = total_size / BDRV_SECTOR_SIZE; - /* Allow a maximum disk size of approximately 2 TB */ + /* Allow a maximum disk size of 2040 GiB */ if (total_sectors > VHD_MAX_SECTORS) { + error_setg(errp, "Disk size is too large, max size is 2040 GiB"); ret = -EFBIG; goto out; } @@ -830,8 +947,11 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) memset(buf, 0, 1024); memcpy(footer->creator, "conectix", 8); - /* TODO Check if "qemu" creator_app is ok for VPC */ - memcpy(footer->creator_app, "qemu", 4); + if (force_size) { + memcpy(footer->creator_app, "qem2", 4); + } else { + memcpy(footer->creator_app, "qemu", 4); + } memcpy(footer->creator_os, "Wi2k", 4); footer->features = cpu_to_be32(0x02); @@ -861,13 +981,16 @@ static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE)); if (disk_type == VHD_DYNAMIC) { - ret = create_dynamic_disk(bs, buf, total_sectors); + ret = create_dynamic_disk(blk, buf, total_sectors); } else { - ret = create_fixed_disk(bs, buf, total_size); + ret = create_fixed_disk(blk, buf, total_size); + } + if (ret < 0) { + error_setg(errp, "Unable to create or write VHD header"); } out: - bdrv_unref(bs); + blk_unref(blk); g_free(disk_type_param); return ret; } @@ -878,7 +1001,7 @@ static int vpc_has_zero_init(BlockDriverState *bs) VHDFooter *footer = (VHDFooter *) s->footer_buf; if (be32_to_cpu(footer->type) == VHD_FIXED) { - return bdrv_has_zero_init(bs->file); + return bdrv_has_zero_init(bs->file->bs); } else { return 1; } @@ -912,6 +1035,13 @@ static QemuOptsList vpc_create_opts = { "Type of virtual hard disk format. Supported formats are " "{dynamic (default) | fixed} " }, + { + .name = VPC_OPT_FORCE_SIZE, + .type = QEMU_OPT_BOOL, + .help = "Force disk size calculation to use the actual size " + "specified, rather than using the nearest CHS-based " + "calculation" + }, { /* end of list */ } } }; diff --git a/qemu/block/vvfat.c b/qemu/block/vvfat.c index 206869712..183fc4f04 100644 --- a/qemu/block/vvfat.c +++ b/qemu/block/vvfat.c @@ -22,15 +22,16 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ -#include <sys/stat.h> +#include "qemu/osdep.h" #include <dirent.h> -#include "qemu-common.h" +#include "qapi/error.h" #include "block/block_int.h" #include "qemu/module.h" #include "migration/migration.h" #include "qapi/qmp/qint.h" #include "qapi/qmp/qbool.h" #include "qapi/qmp/qstring.h" +#include "qemu/cutils.h" #ifndef S_IWGRP #define S_IWGRP 0 @@ -985,12 +986,6 @@ static BDRVVVFATState *vvv = NULL; static int enable_write_target(BDRVVVFATState *s, Error **errp); static int is_consistent(BDRVVVFATState *s); -static void vvfat_rebind(BlockDriverState *bs) -{ - BDRVVVFATState *s = bs->opaque; - s->bs = bs; -} - static QemuOptsList runtime_opts = { .name = "vvfat", .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), @@ -1114,6 +1109,8 @@ static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } memcpy(s->volume_label, label, label_length); + } else { + memcpy(s->volume_label, "QEMU VVFAT", 10); } if (floppy) { @@ -2288,12 +2285,17 @@ DLOG(fprintf(stderr, "commit_direntries for %s, parent_mapping_index %d\n", mapp factor * (old_cluster_count - new_cluster_count)); for (c = first_cluster; !fat_eof(s, c); c = modified_fat_get(s, c)) { + direntry_t *first_direntry; void* direntry = array_get(&(s->directory), current_dir_index); int ret = vvfat_read(s->bs, cluster2sector(s, c), direntry, s->sectors_per_cluster); if (ret) return ret; - assert(!strncmp(s->directory.pointer, "QEMU", 4)); + + /* The first directory entry on the filesystem is the volume name */ + first_direntry = (direntry_t*) s->directory.pointer; + assert(!memcmp(first_direntry->name, s->volume_label, 11)); + current_dir_index += factor; } @@ -2890,7 +2892,7 @@ static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num, } static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs, - int64_t sector_num, int nb_sectors, int* n) + int64_t sector_num, int nb_sectors, int *n, BlockDriverState **file) { BDRVVVFATState* s = bs->opaque; *n = s->sector_count - sector_num; @@ -2923,9 +2925,12 @@ static BlockDriver vvfat_write_target = { static int enable_write_target(BDRVVVFATState *s, Error **errp) { BlockDriver *bdrv_qcow = NULL; + BlockDriverState *backing; QemuOpts *opts = NULL; int ret; int size = sector2cluster(s, s->sector_count); + QDict *options; + s->used_clusters = calloc(size, 1); array_init(&(s->commits), sizeof(commit_t)); @@ -2956,9 +2961,10 @@ static int enable_write_target(BDRVVVFATState *s, Error **errp) } s->qcow = NULL; - ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, NULL, - BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, - bdrv_qcow, errp); + options = qdict_new(); + qdict_put(options, "driver", qstring_from_str("qcow")); + ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, options, + BDRV_O_RDWR | BDRV_O_NO_FLUSH, errp); if (ret < 0) { goto err; } @@ -2967,10 +2973,13 @@ static int enable_write_target(BDRVVVFATState *s, Error **errp) unlink(s->qcow_filename); #endif - bdrv_set_backing_hd(s->bs, bdrv_new()); - s->bs->backing_hd->drv = &vvfat_write_target; - s->bs->backing_hd->opaque = g_new(void *, 1); - *(void**)s->bs->backing_hd->opaque = s; + backing = bdrv_new(); + bdrv_set_backing_hd(s->bs, backing); + bdrv_unref(backing); + + s->bs->backing->bs->drv = &vvfat_write_target; + s->bs->backing->bs->opaque = g_new(void *, 1); + *(void**)s->bs->backing->bs->opaque = s; return 0; @@ -3004,7 +3013,6 @@ static BlockDriver bdrv_vvfat = { .bdrv_parse_filename = vvfat_parse_filename, .bdrv_file_open = vvfat_open, .bdrv_close = vvfat_close, - .bdrv_rebind = vvfat_rebind, .bdrv_read = vvfat_co_read, .bdrv_write = vvfat_co_write, diff --git a/qemu/block/win32-aio.c b/qemu/block/win32-aio.c index 64e86827b..2d509a9a7 100644 --- a/qemu/block/win32-aio.c +++ b/qemu/block/win32-aio.c @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ +#include "qemu/osdep.h" #include "qemu-common.h" #include "qemu/timer.h" #include "block/block_int.h" @@ -174,7 +175,7 @@ int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile) void win32_aio_detach_aio_context(QEMUWin32AIOState *aio, AioContext *old_context) { - aio_set_event_notifier(old_context, &aio->e, NULL); + aio_set_event_notifier(old_context, &aio->e, false, NULL); aio->is_aio_context_attached = false; } @@ -182,7 +183,8 @@ void win32_aio_attach_aio_context(QEMUWin32AIOState *aio, AioContext *new_context) { aio->is_aio_context_attached = true; - aio_set_event_notifier(new_context, &aio->e, win32_aio_completion_cb); + aio_set_event_notifier(new_context, &aio->e, false, + win32_aio_completion_cb); } QEMUWin32AIOState *win32_aio_init(void) diff --git a/qemu/block/write-threshold.c b/qemu/block/write-threshold.c index a53c1f5e6..cc2ca7183 100644 --- a/qemu/block/write-threshold.c +++ b/qemu/block/write-threshold.c @@ -10,8 +10,9 @@ * See the COPYING.LIB file in the top-level directory. */ +#include "qemu/osdep.h" #include "block/block_int.h" -#include "block/coroutine.h" +#include "qemu/coroutine.h" #include "block/write-threshold.h" #include "qemu/notify.h" #include "qapi-event.h" |