Diffstat (limited to 'qemu/migration')
-rw-r--r-- | qemu/migration/Makefile.objs        |   10
-rw-r--r-- | qemu/migration/block.c              |  950
-rw-r--r-- | qemu/migration/exec.c               |   70
-rw-r--r-- | qemu/migration/fd.c                 |   90
-rw-r--r-- | qemu/migration/migration.c          | 1810
-rw-r--r-- | qemu/migration/postcopy-ram.c       |  761
-rw-r--r-- | qemu/migration/qemu-file-buf.c      |  464
-rw-r--r-- | qemu/migration/qemu-file-internal.h |   53
-rw-r--r-- | qemu/migration/qemu-file-stdio.c    |  196
-rw-r--r-- | qemu/migration/qemu-file-unix.c     |  323
-rw-r--r-- | qemu/migration/qemu-file.c          |  678
-rw-r--r-- | qemu/migration/ram.c                | 2561
-rw-r--r-- | qemu/migration/rdma.c               | 3516
-rw-r--r-- | qemu/migration/savevm.c             | 2243
-rw-r--r-- | qemu/migration/tcp.c                |  102
-rw-r--r-- | qemu/migration/unix.c               |  103
-rw-r--r-- | qemu/migration/vmstate.c            |  918
-rw-r--r-- | qemu/migration/xbzrle.c             |  176
18 files changed, 0 insertions, 15024 deletions
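
A recurring computation in the removed block.c below is the packed in-flight bitmap: one bit per 1 MiB dirty chunk, indexed by sector (see bmds_aio_inflight and bmds_set_aio_inflight). The following is a minimal standalone sketch of that sector-to-chunk-to-bit mapping, not QEMU code: the helper names chunk_bit_set/chunk_bit_test are illustrative, and the constants (512-byte sectors, 1 MiB chunks) are hard-coded from the definitions in the patch.

/* Standalone sketch of the aio_bitmap indexing used by the removed
 * block.c (bmds_aio_inflight / bmds_set_aio_inflight). Constants
 * mirror the source: BDRV_SECTOR_BITS = 9, BLOCK_SIZE = 1 MiB. */
#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define BDRV_SECTOR_BITS 9
#define BLOCK_SIZE (1 << 20)
#define SECTORS_PER_CHUNK (BLOCK_SIZE >> BDRV_SECTOR_BITS) /* 2048 */
#define BITS_PER_LONG (sizeof(unsigned long) * 8)

/* Set or clear the bit covering the chunk that contains 'sector'. */
static void chunk_bit_set(unsigned long *bitmap, int64_t sector, bool set)
{
    int64_t chunk = sector / SECTORS_PER_CHUNK;
    size_t idx = chunk / BITS_PER_LONG;
    unsigned long bit = 1UL << (chunk % BITS_PER_LONG);

    if (set) {
        bitmap[idx] |= bit;
    } else {
        bitmap[idx] &= ~bit;
    }
}

static bool chunk_bit_test(const unsigned long *bitmap, int64_t sector)
{
    int64_t chunk = sector / SECTORS_PER_CHUNK;
    return bitmap[chunk / BITS_PER_LONG] & (1UL << (chunk % BITS_PER_LONG));
}

int main(void)
{
    unsigned long bitmap[4] = { 0 };

    chunk_bit_set(bitmap, 3000, true);     /* sector 3000 -> chunk 1 */
    assert(chunk_bit_test(bitmap, 2048));  /* same chunk is marked  */
    assert(!chunk_bit_test(bitmap, 2047)); /* chunk 0 untouched     */
    chunk_bit_set(bitmap, 3000, false);
    assert(!chunk_bit_test(bitmap, 2048));
    return 0;
}

Note how every sector in the same 1 MiB chunk maps to the same bit, which is why the bulk-copy path below rounds cur_sector down to a chunk boundary before reading.
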
diff --git a/qemu/migration/Makefile.objs b/qemu/migration/Makefile.objs deleted file mode 100644 index 0cac6d707..000000000 --- a/qemu/migration/Makefile.objs +++ /dev/null @@ -1,10 +0,0 @@ -common-obj-y += migration.o tcp.o -common-obj-y += vmstate.o -common-obj-y += qemu-file.o qemu-file-buf.o qemu-file-unix.o qemu-file-stdio.o -common-obj-y += xbzrle.o postcopy-ram.o - -common-obj-$(CONFIG_RDMA) += rdma.o -common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o - -common-obj-y += block.o - diff --git a/qemu/migration/block.c b/qemu/migration/block.c deleted file mode 100644 index 174331728..000000000 --- a/qemu/migration/block.c +++ /dev/null @@ -1,950 +0,0 @@ -/* - * QEMU live block migration - * - * Copyright IBM, Corp. 2009 - * - * Authors: - * Liran Schour <lirans@il.ibm.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. - */ - -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu-common.h" -#include "block/block.h" -#include "qemu/error-report.h" -#include "qemu/main-loop.h" -#include "hw/hw.h" -#include "qemu/cutils.h" -#include "qemu/queue.h" -#include "qemu/timer.h" -#include "migration/block.h" -#include "migration/migration.h" -#include "sysemu/blockdev.h" -#include "sysemu/block-backend.h" - -#define BLOCK_SIZE (1 << 20) -#define BDRV_SECTORS_PER_DIRTY_CHUNK (BLOCK_SIZE >> BDRV_SECTOR_BITS) - -#define BLK_MIG_FLAG_DEVICE_BLOCK 0x01 -#define BLK_MIG_FLAG_EOS 0x02 -#define BLK_MIG_FLAG_PROGRESS 0x04 -#define BLK_MIG_FLAG_ZERO_BLOCK 0x08 - -#define MAX_IS_ALLOCATED_SEARCH 65536 - -#define MAX_INFLIGHT_IO 512 - -//#define DEBUG_BLK_MIGRATION - -#ifdef DEBUG_BLK_MIGRATION -#define DPRINTF(fmt, ...) \ - do { printf("blk_migration: " fmt, ## __VA_ARGS__); } while (0) -#else -#define DPRINTF(fmt, ...) \ - do { } while (0) -#endif - -typedef struct BlkMigDevState { - /* Written during setup phase. Can be read without a lock. */ - BlockDriverState *bs; - int shared_base; - int64_t total_sectors; - QSIMPLEQ_ENTRY(BlkMigDevState) entry; - Error *blocker; - - /* Only used by migration thread. Does not need a lock. */ - int bulk_completed; - int64_t cur_sector; - int64_t cur_dirty; - - /* Data in the aio_bitmap is protected by block migration lock. - * Allocation and free happen during setup and cleanup respectively. - */ - unsigned long *aio_bitmap; - - /* Protected by block migration lock. */ - int64_t completed_sectors; - - /* During migration this is protected by iothread lock / AioContext. - * Allocation and free happen during setup and cleanup respectively. - */ - BdrvDirtyBitmap *dirty_bitmap; -} BlkMigDevState; - -typedef struct BlkMigBlock { - /* Only used by migration thread. */ - uint8_t *buf; - BlkMigDevState *bmds; - int64_t sector; - int nr_sectors; - struct iovec iov; - QEMUIOVector qiov; - BlockAIOCB *aiocb; - - /* Protected by block migration lock. */ - int ret; - QSIMPLEQ_ENTRY(BlkMigBlock) entry; -} BlkMigBlock; - -typedef struct BlkMigState { - /* Written during setup phase. Can be read without a lock. */ - int blk_enable; - int shared_base; - QSIMPLEQ_HEAD(bmds_list, BlkMigDevState) bmds_list; - int64_t total_sector_sum; - bool zero_blocks; - - /* Protected by lock. */ - QSIMPLEQ_HEAD(blk_list, BlkMigBlock) blk_list; - int submitted; - int read_done; - - /* Only used by migration thread. Does not need a lock. 
*/ - int transferred; - int prev_progress; - int bulk_completed; - - /* Lock must be taken _inside_ the iothread lock and any AioContexts. */ - QemuMutex lock; -} BlkMigState; - -static BlkMigState block_mig_state; - -static void blk_mig_lock(void) -{ - qemu_mutex_lock(&block_mig_state.lock); -} - -static void blk_mig_unlock(void) -{ - qemu_mutex_unlock(&block_mig_state.lock); -} - -/* Must run outside of the iothread lock during the bulk phase, - * or the VM will stall. - */ - -static void blk_send(QEMUFile *f, BlkMigBlock * blk) -{ - int len; - uint64_t flags = BLK_MIG_FLAG_DEVICE_BLOCK; - - if (block_mig_state.zero_blocks && - buffer_is_zero(blk->buf, BLOCK_SIZE)) { - flags |= BLK_MIG_FLAG_ZERO_BLOCK; - } - - /* sector number and flags */ - qemu_put_be64(f, (blk->sector << BDRV_SECTOR_BITS) - | flags); - - /* device name */ - len = strlen(bdrv_get_device_name(blk->bmds->bs)); - qemu_put_byte(f, len); - qemu_put_buffer(f, (uint8_t *)bdrv_get_device_name(blk->bmds->bs), len); - - /* if a block is zero we need to flush here since the network - * bandwidth is now a lot higher than the storage device bandwidth. - * thus if we queue zero blocks we slow down the migration */ - if (flags & BLK_MIG_FLAG_ZERO_BLOCK) { - qemu_fflush(f); - return; - } - - qemu_put_buffer(f, blk->buf, BLOCK_SIZE); -} - -int blk_mig_active(void) -{ - return !QSIMPLEQ_EMPTY(&block_mig_state.bmds_list); -} - -uint64_t blk_mig_bytes_transferred(void) -{ - BlkMigDevState *bmds; - uint64_t sum = 0; - - blk_mig_lock(); - QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - sum += bmds->completed_sectors; - } - blk_mig_unlock(); - return sum << BDRV_SECTOR_BITS; -} - -uint64_t blk_mig_bytes_remaining(void) -{ - return blk_mig_bytes_total() - blk_mig_bytes_transferred(); -} - -uint64_t blk_mig_bytes_total(void) -{ - BlkMigDevState *bmds; - uint64_t sum = 0; - - QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - sum += bmds->total_sectors; - } - return sum << BDRV_SECTOR_BITS; -} - - -/* Called with migration lock held. */ - -static int bmds_aio_inflight(BlkMigDevState *bmds, int64_t sector) -{ - int64_t chunk = sector / (int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK; - - if (sector < bdrv_nb_sectors(bmds->bs)) { - return !!(bmds->aio_bitmap[chunk / (sizeof(unsigned long) * 8)] & - (1UL << (chunk % (sizeof(unsigned long) * 8)))); - } else { - return 0; - } -} - -/* Called with migration lock held. */ - -static void bmds_set_aio_inflight(BlkMigDevState *bmds, int64_t sector_num, - int nb_sectors, int set) -{ - int64_t start, end; - unsigned long val, idx, bit; - - start = sector_num / BDRV_SECTORS_PER_DIRTY_CHUNK; - end = (sector_num + nb_sectors - 1) / BDRV_SECTORS_PER_DIRTY_CHUNK; - - for (; start <= end; start++) { - idx = start / (sizeof(unsigned long) * 8); - bit = start % (sizeof(unsigned long) * 8); - val = bmds->aio_bitmap[idx]; - if (set) { - val |= 1UL << bit; - } else { - val &= ~(1UL << bit); - } - bmds->aio_bitmap[idx] = val; - } -} - -static void alloc_aio_bitmap(BlkMigDevState *bmds) -{ - BlockDriverState *bs = bmds->bs; - int64_t bitmap_size; - - bitmap_size = bdrv_nb_sectors(bs) + BDRV_SECTORS_PER_DIRTY_CHUNK * 8 - 1; - bitmap_size /= BDRV_SECTORS_PER_DIRTY_CHUNK * 8; - - bmds->aio_bitmap = g_malloc0(bitmap_size); -} - -/* Never hold migration lock when yielding to the main loop! 
*/ - -static void blk_mig_read_cb(void *opaque, int ret) -{ - BlkMigBlock *blk = opaque; - - blk_mig_lock(); - blk->ret = ret; - - QSIMPLEQ_INSERT_TAIL(&block_mig_state.blk_list, blk, entry); - bmds_set_aio_inflight(blk->bmds, blk->sector, blk->nr_sectors, 0); - - block_mig_state.submitted--; - block_mig_state.read_done++; - assert(block_mig_state.submitted >= 0); - blk_mig_unlock(); -} - -/* Called with no lock taken. */ - -static int mig_save_device_bulk(QEMUFile *f, BlkMigDevState *bmds) -{ - int64_t total_sectors = bmds->total_sectors; - int64_t cur_sector = bmds->cur_sector; - BlockDriverState *bs = bmds->bs; - BlkMigBlock *blk; - int nr_sectors; - - if (bmds->shared_base) { - qemu_mutex_lock_iothread(); - aio_context_acquire(bdrv_get_aio_context(bs)); - while (cur_sector < total_sectors && - !bdrv_is_allocated(bs, cur_sector, MAX_IS_ALLOCATED_SEARCH, - &nr_sectors)) { - cur_sector += nr_sectors; - } - aio_context_release(bdrv_get_aio_context(bs)); - qemu_mutex_unlock_iothread(); - } - - if (cur_sector >= total_sectors) { - bmds->cur_sector = bmds->completed_sectors = total_sectors; - return 1; - } - - bmds->completed_sectors = cur_sector; - - cur_sector &= ~((int64_t)BDRV_SECTORS_PER_DIRTY_CHUNK - 1); - - /* we are going to transfer a full block even if it is not allocated */ - nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; - - if (total_sectors - cur_sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { - nr_sectors = total_sectors - cur_sector; - } - - blk = g_new(BlkMigBlock, 1); - blk->buf = g_malloc(BLOCK_SIZE); - blk->bmds = bmds; - blk->sector = cur_sector; - blk->nr_sectors = nr_sectors; - - blk->iov.iov_base = blk->buf; - blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; - qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); - - blk_mig_lock(); - block_mig_state.submitted++; - blk_mig_unlock(); - - /* We do not know if bs is under the main thread (and thus does - * not acquire the AioContext when doing AIO) or rather under - * dataplane. Thus acquire both the iothread mutex and the - * AioContext. - * - * This is ugly and will disappear when we make bdrv_* thread-safe, - * without the need to acquire the AioContext. - */ - qemu_mutex_lock_iothread(); - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); - blk->aiocb = bdrv_aio_readv(bs, cur_sector, &blk->qiov, - nr_sectors, blk_mig_read_cb, blk); - - bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, cur_sector, nr_sectors); - aio_context_release(bdrv_get_aio_context(bmds->bs)); - qemu_mutex_unlock_iothread(); - - bmds->cur_sector = cur_sector + nr_sectors; - return (bmds->cur_sector >= total_sectors); -} - -/* Called with iothread lock taken. */ - -static int set_dirty_tracking(void) -{ - BlkMigDevState *bmds; - int ret; - - QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); - bmds->dirty_bitmap = bdrv_create_dirty_bitmap(bmds->bs, BLOCK_SIZE, - NULL, NULL); - aio_context_release(bdrv_get_aio_context(bmds->bs)); - if (!bmds->dirty_bitmap) { - ret = -errno; - goto fail; - } - } - return 0; - -fail: - QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - if (bmds->dirty_bitmap) { - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); - bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap); - aio_context_release(bdrv_get_aio_context(bmds->bs)); - } - } - return ret; -} - -/* Called with iothread lock taken. 
*/ - -static void unset_dirty_tracking(void) -{ - BlkMigDevState *bmds; - - QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); - bdrv_release_dirty_bitmap(bmds->bs, bmds->dirty_bitmap); - aio_context_release(bdrv_get_aio_context(bmds->bs)); - } -} - -static void init_blk_migration(QEMUFile *f) -{ - BlockDriverState *bs; - BlkMigDevState *bmds; - int64_t sectors; - - block_mig_state.submitted = 0; - block_mig_state.read_done = 0; - block_mig_state.transferred = 0; - block_mig_state.total_sector_sum = 0; - block_mig_state.prev_progress = -1; - block_mig_state.bulk_completed = 0; - block_mig_state.zero_blocks = migrate_zero_blocks(); - - for (bs = bdrv_next(NULL); bs; bs = bdrv_next(bs)) { - if (bdrv_is_read_only(bs)) { - continue; - } - - sectors = bdrv_nb_sectors(bs); - if (sectors <= 0) { - return; - } - - bmds = g_new0(BlkMigDevState, 1); - bmds->bs = bs; - bmds->bulk_completed = 0; - bmds->total_sectors = sectors; - bmds->completed_sectors = 0; - bmds->shared_base = block_mig_state.shared_base; - alloc_aio_bitmap(bmds); - error_setg(&bmds->blocker, "block device is in use by migration"); - bdrv_op_block_all(bs, bmds->blocker); - bdrv_ref(bs); - - block_mig_state.total_sector_sum += sectors; - - if (bmds->shared_base) { - DPRINTF("Start migration for %s with shared base image\n", - bdrv_get_device_name(bs)); - } else { - DPRINTF("Start full migration for %s\n", bdrv_get_device_name(bs)); - } - - QSIMPLEQ_INSERT_TAIL(&block_mig_state.bmds_list, bmds, entry); - } -} - -/* Called with no lock taken. */ - -static int blk_mig_save_bulked_block(QEMUFile *f) -{ - int64_t completed_sector_sum = 0; - BlkMigDevState *bmds; - int progress; - int ret = 0; - - QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - if (bmds->bulk_completed == 0) { - if (mig_save_device_bulk(f, bmds) == 1) { - /* completed bulk section for this device */ - bmds->bulk_completed = 1; - } - completed_sector_sum += bmds->completed_sectors; - ret = 1; - break; - } else { - completed_sector_sum += bmds->completed_sectors; - } - } - - if (block_mig_state.total_sector_sum != 0) { - progress = completed_sector_sum * 100 / - block_mig_state.total_sector_sum; - } else { - progress = 100; - } - if (progress != block_mig_state.prev_progress) { - block_mig_state.prev_progress = progress; - qemu_put_be64(f, (progress << BDRV_SECTOR_BITS) - | BLK_MIG_FLAG_PROGRESS); - DPRINTF("Completed %d %%\r", progress); - } - - return ret; -} - -static void blk_mig_reset_dirty_cursor(void) -{ - BlkMigDevState *bmds; - - QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - bmds->cur_dirty = 0; - } -} - -/* Called with iothread lock and AioContext taken. 
*/ - -static int mig_save_device_dirty(QEMUFile *f, BlkMigDevState *bmds, - int is_async) -{ - BlkMigBlock *blk; - int64_t total_sectors = bmds->total_sectors; - int64_t sector; - int nr_sectors; - int ret = -EIO; - - for (sector = bmds->cur_dirty; sector < bmds->total_sectors;) { - blk_mig_lock(); - if (bmds_aio_inflight(bmds, sector)) { - blk_mig_unlock(); - bdrv_drain(bmds->bs); - } else { - blk_mig_unlock(); - } - if (bdrv_get_dirty(bmds->bs, bmds->dirty_bitmap, sector)) { - - if (total_sectors - sector < BDRV_SECTORS_PER_DIRTY_CHUNK) { - nr_sectors = total_sectors - sector; - } else { - nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; - } - blk = g_new(BlkMigBlock, 1); - blk->buf = g_malloc(BLOCK_SIZE); - blk->bmds = bmds; - blk->sector = sector; - blk->nr_sectors = nr_sectors; - - if (is_async) { - blk->iov.iov_base = blk->buf; - blk->iov.iov_len = nr_sectors * BDRV_SECTOR_SIZE; - qemu_iovec_init_external(&blk->qiov, &blk->iov, 1); - - blk->aiocb = bdrv_aio_readv(bmds->bs, sector, &blk->qiov, - nr_sectors, blk_mig_read_cb, blk); - - blk_mig_lock(); - block_mig_state.submitted++; - bmds_set_aio_inflight(bmds, sector, nr_sectors, 1); - blk_mig_unlock(); - } else { - ret = bdrv_read(bmds->bs, sector, blk->buf, nr_sectors); - if (ret < 0) { - goto error; - } - blk_send(f, blk); - - g_free(blk->buf); - g_free(blk); - } - - bdrv_reset_dirty_bitmap(bmds->dirty_bitmap, sector, nr_sectors); - break; - } - sector += BDRV_SECTORS_PER_DIRTY_CHUNK; - bmds->cur_dirty = sector; - } - - return (bmds->cur_dirty >= bmds->total_sectors); - -error: - DPRINTF("Error reading sector %" PRId64 "\n", sector); - g_free(blk->buf); - g_free(blk); - return ret; -} - -/* Called with iothread lock taken. - * - * return value: - * 0: too much data for max_downtime - * 1: few enough data for max_downtime -*/ -static int blk_mig_save_dirty_block(QEMUFile *f, int is_async) -{ - BlkMigDevState *bmds; - int ret = 1; - - QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); - ret = mig_save_device_dirty(f, bmds, is_async); - aio_context_release(bdrv_get_aio_context(bmds->bs)); - if (ret <= 0) { - break; - } - } - - return ret; -} - -/* Called with no locks taken. */ - -static int flush_blks(QEMUFile *f) -{ - BlkMigBlock *blk; - int ret = 0; - - DPRINTF("%s Enter submitted %d read_done %d transferred %d\n", - __FUNCTION__, block_mig_state.submitted, block_mig_state.read_done, - block_mig_state.transferred); - - blk_mig_lock(); - while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) { - if (qemu_file_rate_limit(f)) { - break; - } - if (blk->ret < 0) { - ret = blk->ret; - break; - } - - QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry); - blk_mig_unlock(); - blk_send(f, blk); - blk_mig_lock(); - - g_free(blk->buf); - g_free(blk); - - block_mig_state.read_done--; - block_mig_state.transferred++; - assert(block_mig_state.read_done >= 0); - } - blk_mig_unlock(); - - DPRINTF("%s Exit submitted %d read_done %d transferred %d\n", __FUNCTION__, - block_mig_state.submitted, block_mig_state.read_done, - block_mig_state.transferred); - return ret; -} - -/* Called with iothread lock taken. 
*/ - -static int64_t get_remaining_dirty(void) -{ - BlkMigDevState *bmds; - int64_t dirty = 0; - - QSIMPLEQ_FOREACH(bmds, &block_mig_state.bmds_list, entry) { - aio_context_acquire(bdrv_get_aio_context(bmds->bs)); - dirty += bdrv_get_dirty_count(bmds->dirty_bitmap); - aio_context_release(bdrv_get_aio_context(bmds->bs)); - } - - return dirty << BDRV_SECTOR_BITS; -} - -/* Called with iothread lock taken. */ - -static void block_migration_cleanup(void *opaque) -{ - BlkMigDevState *bmds; - BlkMigBlock *blk; - AioContext *ctx; - - bdrv_drain_all(); - - unset_dirty_tracking(); - - while ((bmds = QSIMPLEQ_FIRST(&block_mig_state.bmds_list)) != NULL) { - QSIMPLEQ_REMOVE_HEAD(&block_mig_state.bmds_list, entry); - bdrv_op_unblock_all(bmds->bs, bmds->blocker); - error_free(bmds->blocker); - - /* Save ctx, because bmds->bs can disappear during bdrv_unref. */ - ctx = bdrv_get_aio_context(bmds->bs); - aio_context_acquire(ctx); - bdrv_unref(bmds->bs); - aio_context_release(ctx); - - g_free(bmds->aio_bitmap); - g_free(bmds); - } - - blk_mig_lock(); - while ((blk = QSIMPLEQ_FIRST(&block_mig_state.blk_list)) != NULL) { - QSIMPLEQ_REMOVE_HEAD(&block_mig_state.blk_list, entry); - g_free(blk->buf); - g_free(blk); - } - blk_mig_unlock(); -} - -static int block_save_setup(QEMUFile *f, void *opaque) -{ - int ret; - - DPRINTF("Enter save live setup submitted %d transferred %d\n", - block_mig_state.submitted, block_mig_state.transferred); - - qemu_mutex_lock_iothread(); - init_blk_migration(f); - - /* start track dirty blocks */ - ret = set_dirty_tracking(); - - qemu_mutex_unlock_iothread(); - - if (ret) { - return ret; - } - - ret = flush_blks(f); - blk_mig_reset_dirty_cursor(); - qemu_put_be64(f, BLK_MIG_FLAG_EOS); - - return ret; -} - -static int block_save_iterate(QEMUFile *f, void *opaque) -{ - int ret; - int64_t last_ftell = qemu_ftell(f); - int64_t delta_ftell; - - DPRINTF("Enter save live iterate submitted %d transferred %d\n", - block_mig_state.submitted, block_mig_state.transferred); - - ret = flush_blks(f); - if (ret) { - return ret; - } - - blk_mig_reset_dirty_cursor(); - - /* control the rate of transfer */ - blk_mig_lock(); - while ((block_mig_state.submitted + - block_mig_state.read_done) * BLOCK_SIZE < - qemu_file_get_rate_limit(f) && - (block_mig_state.submitted + - block_mig_state.read_done) < - MAX_INFLIGHT_IO) { - blk_mig_unlock(); - if (block_mig_state.bulk_completed == 0) { - /* first finish the bulk phase */ - if (blk_mig_save_bulked_block(f) == 0) { - /* finished saving bulk on all devices */ - block_mig_state.bulk_completed = 1; - } - ret = 0; - } else { - /* Always called with iothread lock taken for - * simplicity, block_save_complete also calls it. - */ - qemu_mutex_lock_iothread(); - ret = blk_mig_save_dirty_block(f, 1); - qemu_mutex_unlock_iothread(); - } - if (ret < 0) { - return ret; - } - blk_mig_lock(); - if (ret != 0) { - /* no more dirty blocks */ - break; - } - } - blk_mig_unlock(); - - ret = flush_blks(f); - if (ret) { - return ret; - } - - qemu_put_be64(f, BLK_MIG_FLAG_EOS); - delta_ftell = qemu_ftell(f) - last_ftell; - if (delta_ftell > 0) { - return 1; - } else if (delta_ftell < 0) { - return -1; - } else { - return 0; - } -} - -/* Called with iothread lock taken. 
*/ - -static int block_save_complete(QEMUFile *f, void *opaque) -{ - int ret; - - DPRINTF("Enter save live complete submitted %d transferred %d\n", - block_mig_state.submitted, block_mig_state.transferred); - - ret = flush_blks(f); - if (ret) { - return ret; - } - - blk_mig_reset_dirty_cursor(); - - /* we know for sure that save bulk is completed and - all async read completed */ - blk_mig_lock(); - assert(block_mig_state.submitted == 0); - blk_mig_unlock(); - - do { - ret = blk_mig_save_dirty_block(f, 0); - if (ret < 0) { - return ret; - } - } while (ret == 0); - - /* report completion */ - qemu_put_be64(f, (100 << BDRV_SECTOR_BITS) | BLK_MIG_FLAG_PROGRESS); - - DPRINTF("Block migration completed\n"); - - qemu_put_be64(f, BLK_MIG_FLAG_EOS); - - return 0; -} - -static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, - uint64_t *non_postcopiable_pending, - uint64_t *postcopiable_pending) -{ - /* Estimate pending number of bytes to send */ - uint64_t pending; - - qemu_mutex_lock_iothread(); - pending = get_remaining_dirty(); - qemu_mutex_unlock_iothread(); - - blk_mig_lock(); - pending += block_mig_state.submitted * BLOCK_SIZE + - block_mig_state.read_done * BLOCK_SIZE; - blk_mig_unlock(); - - /* Report at least one block pending during bulk phase */ - if (pending <= max_size && !block_mig_state.bulk_completed) { - pending = max_size + BLOCK_SIZE; - } - - DPRINTF("Enter save live pending %" PRIu64 "\n", pending); - /* We don't do postcopy */ - *non_postcopiable_pending += pending; -} - -static int block_load(QEMUFile *f, void *opaque, int version_id) -{ - static int banner_printed; - int len, flags; - char device_name[256]; - int64_t addr; - BlockDriverState *bs, *bs_prev = NULL; - BlockBackend *blk; - Error *local_err = NULL; - uint8_t *buf; - int64_t total_sectors = 0; - int nr_sectors; - int ret; - - do { - addr = qemu_get_be64(f); - - flags = addr & ~BDRV_SECTOR_MASK; - addr >>= BDRV_SECTOR_BITS; - - if (flags & BLK_MIG_FLAG_DEVICE_BLOCK) { - /* get device name */ - len = qemu_get_byte(f); - qemu_get_buffer(f, (uint8_t *)device_name, len); - device_name[len] = '\0'; - - blk = blk_by_name(device_name); - if (!blk) { - fprintf(stderr, "Error unknown block device %s\n", - device_name); - return -EINVAL; - } - bs = blk_bs(blk); - if (!bs) { - fprintf(stderr, "Block device %s has no medium\n", - device_name); - return -EINVAL; - } - - if (bs != bs_prev) { - bs_prev = bs; - total_sectors = bdrv_nb_sectors(bs); - if (total_sectors <= 0) { - error_report("Error getting length of block device %s", - device_name); - return -EINVAL; - } - - bdrv_invalidate_cache(bs, &local_err); - if (local_err) { - error_report_err(local_err); - return -EINVAL; - } - } - - if (total_sectors - addr < BDRV_SECTORS_PER_DIRTY_CHUNK) { - nr_sectors = total_sectors - addr; - } else { - nr_sectors = BDRV_SECTORS_PER_DIRTY_CHUNK; - } - - if (flags & BLK_MIG_FLAG_ZERO_BLOCK) { - ret = bdrv_write_zeroes(bs, addr, nr_sectors, - BDRV_REQ_MAY_UNMAP); - } else { - buf = g_malloc(BLOCK_SIZE); - qemu_get_buffer(f, buf, BLOCK_SIZE); - ret = bdrv_write(bs, addr, buf, nr_sectors); - g_free(buf); - } - - if (ret < 0) { - return ret; - } - } else if (flags & BLK_MIG_FLAG_PROGRESS) { - if (!banner_printed) { - printf("Receiving block device images\n"); - banner_printed = 1; - } - printf("Completed %d %%%c", (int)addr, - (addr == 100) ? 
'\n' : '\r'); - fflush(stdout); - } else if (!(flags & BLK_MIG_FLAG_EOS)) { - fprintf(stderr, "Unknown block migration flags: %#x\n", flags); - return -EINVAL; - } - ret = qemu_file_get_error(f); - if (ret != 0) { - return ret; - } - } while (!(flags & BLK_MIG_FLAG_EOS)); - - return 0; -} - -static void block_set_params(const MigrationParams *params, void *opaque) -{ - block_mig_state.blk_enable = params->blk; - block_mig_state.shared_base = params->shared; - - /* shared base means that blk_enable = 1 */ - block_mig_state.blk_enable |= params->shared; -} - -static bool block_is_active(void *opaque) -{ - return block_mig_state.blk_enable == 1; -} - -static SaveVMHandlers savevm_block_handlers = { - .set_params = block_set_params, - .save_live_setup = block_save_setup, - .save_live_iterate = block_save_iterate, - .save_live_complete_precopy = block_save_complete, - .save_live_pending = block_save_pending, - .load_state = block_load, - .cleanup = block_migration_cleanup, - .is_active = block_is_active, -}; - -void blk_mig_init(void) -{ - QSIMPLEQ_INIT(&block_mig_state.bmds_list); - QSIMPLEQ_INIT(&block_mig_state.blk_list); - qemu_mutex_init(&block_mig_state.lock); - - register_savevm_live(NULL, "block", 0, 1, &savevm_block_handlers, - &block_mig_state); -} diff --git a/qemu/migration/exec.c b/qemu/migration/exec.c deleted file mode 100644 index 559420969..000000000 --- a/qemu/migration/exec.c +++ /dev/null @@ -1,70 +0,0 @@ -/* - * QEMU live migration - * - * Copyright IBM, Corp. 2008 - * Copyright Dell MessageOne 2008 - * - * Authors: - * Anthony Liguori <aliguori@us.ibm.com> - * Charles Duffy <charles_duffy@messageone.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. - */ - -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu-common.h" -#include "qemu/sockets.h" -#include "qemu/main-loop.h" -#include "migration/migration.h" -#include "migration/qemu-file.h" -#include "block/block.h" -#include <sys/wait.h> - -//#define DEBUG_MIGRATION_EXEC - -#ifdef DEBUG_MIGRATION_EXEC -#define DPRINTF(fmt, ...) \ - do { printf("migration-exec: " fmt, ## __VA_ARGS__); } while (0) -#else -#define DPRINTF(fmt, ...) \ - do { } while (0) -#endif - -void exec_start_outgoing_migration(MigrationState *s, const char *command, Error **errp) -{ - s->to_dst_file = qemu_popen_cmd(command, "w"); - if (s->to_dst_file == NULL) { - error_setg_errno(errp, errno, "failed to popen the migration target"); - return; - } - - migrate_fd_connect(s); -} - -static void exec_accept_incoming_migration(void *opaque) -{ - QEMUFile *f = opaque; - - qemu_set_fd_handler(qemu_get_fd(f), NULL, NULL, NULL); - process_incoming_migration(f); -} - -void exec_start_incoming_migration(const char *command, Error **errp) -{ - QEMUFile *f; - - DPRINTF("Attempting to start an incoming migration\n"); - f = qemu_popen_cmd(command, "r"); - if(f == NULL) { - error_setg_errno(errp, errno, "failed to popen the migration source"); - return; - } - - qemu_set_fd_handler(qemu_get_fd(f), exec_accept_incoming_migration, NULL, - f); -} diff --git a/qemu/migration/fd.c b/qemu/migration/fd.c deleted file mode 100644 index 3d788bb29..000000000 --- a/qemu/migration/fd.c +++ /dev/null @@ -1,90 +0,0 @@ -/* - * QEMU live migration via generic fd - * - * Copyright Red Hat, Inc. 
2009 - * - * Authors: - * Chris Lalancette <clalance@redhat.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. - */ - -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu-common.h" -#include "qemu/main-loop.h" -#include "qemu/sockets.h" -#include "migration/migration.h" -#include "monitor/monitor.h" -#include "migration/qemu-file.h" -#include "block/block.h" - -//#define DEBUG_MIGRATION_FD - -#ifdef DEBUG_MIGRATION_FD -#define DPRINTF(fmt, ...) \ - do { printf("migration-fd: " fmt, ## __VA_ARGS__); } while (0) -#else -#define DPRINTF(fmt, ...) \ - do { } while (0) -#endif - -static bool fd_is_socket(int fd) -{ - struct stat stat; - int ret = fstat(fd, &stat); - if (ret == -1) { - /* When in doubt say no */ - return false; - } - return S_ISSOCK(stat.st_mode); -} - -void fd_start_outgoing_migration(MigrationState *s, const char *fdname, Error **errp) -{ - int fd = monitor_get_fd(cur_mon, fdname, errp); - if (fd == -1) { - return; - } - - if (fd_is_socket(fd)) { - s->to_dst_file = qemu_fopen_socket(fd, "wb"); - } else { - s->to_dst_file = qemu_fdopen(fd, "wb"); - } - - migrate_fd_connect(s); -} - -static void fd_accept_incoming_migration(void *opaque) -{ - QEMUFile *f = opaque; - - qemu_set_fd_handler(qemu_get_fd(f), NULL, NULL, NULL); - process_incoming_migration(f); -} - -void fd_start_incoming_migration(const char *infd, Error **errp) -{ - int fd; - QEMUFile *f; - - DPRINTF("Attempting to start an incoming migration via fd\n"); - - fd = strtol(infd, NULL, 0); - if (fd_is_socket(fd)) { - f = qemu_fopen_socket(fd, "rb"); - } else { - f = qemu_fdopen(fd, "rb"); - } - if(f == NULL) { - error_setg_errno(errp, errno, "failed to open the source descriptor"); - return; - } - - qemu_set_fd_handler(fd, fd_accept_incoming_migration, NULL, f); -} diff --git a/qemu/migration/migration.c b/qemu/migration/migration.c deleted file mode 100644 index 991313a86..000000000 --- a/qemu/migration/migration.c +++ /dev/null @@ -1,1810 +0,0 @@ -/* - * QEMU live migration - * - * Copyright IBM, Corp. 2008 - * - * Authors: - * Anthony Liguori <aliguori@us.ibm.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. - */ - -#include "qemu/osdep.h" -#include "qemu/cutils.h" -#include "qemu/error-report.h" -#include "qemu/main-loop.h" -#include "migration/migration.h" -#include "migration/qemu-file.h" -#include "sysemu/sysemu.h" -#include "block/block.h" -#include "qapi/qmp/qerror.h" -#include "qapi/util.h" -#include "qemu/sockets.h" -#include "qemu/rcu.h" -#include "migration/block.h" -#include "migration/postcopy-ram.h" -#include "qemu/thread.h" -#include "qmp-commands.h" -#include "trace.h" -#include "qapi-event.h" -#include "qom/cpu.h" -#include "exec/memory.h" -#include "exec/address-spaces.h" - -#define MAX_THROTTLE (32 << 20) /* Migration transfer speed throttling */ - -/* Amount of time to allocate to each "chunk" of bandwidth-throttled - * data. 
*/ -#define BUFFER_DELAY 100 -#define XFER_LIMIT_RATIO (1000 / BUFFER_DELAY) - -/* Default compression thread count */ -#define DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT 8 -/* Default decompression thread count, usually decompression is at - * least 4 times as fast as compression.*/ -#define DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT 2 -/*0: means nocompress, 1: best speed, ... 9: best compress ratio */ -#define DEFAULT_MIGRATE_COMPRESS_LEVEL 1 -/* Define default autoconverge cpu throttle migration parameters */ -#define DEFAULT_MIGRATE_X_CPU_THROTTLE_INITIAL 20 -#define DEFAULT_MIGRATE_X_CPU_THROTTLE_INCREMENT 10 - -/* Migration XBZRLE default cache size */ -#define DEFAULT_MIGRATE_CACHE_SIZE (64 * 1024 * 1024) - -static NotifierList migration_state_notifiers = - NOTIFIER_LIST_INITIALIZER(migration_state_notifiers); - -static bool deferred_incoming; - -/* - * Current state of incoming postcopy; note this is not part of - * MigrationIncomingState since it's state is used during cleanup - * at the end as MIS is being freed. - */ -static PostcopyState incoming_postcopy_state; - -/* When we add fault tolerance, we could have several - migrations at once. For now we don't need to add - dynamic creation of migration */ - -/* For outgoing */ -MigrationState *migrate_get_current(void) -{ - static bool once; - static MigrationState current_migration = { - .state = MIGRATION_STATUS_NONE, - .bandwidth_limit = MAX_THROTTLE, - .xbzrle_cache_size = DEFAULT_MIGRATE_CACHE_SIZE, - .mbps = -1, - .parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL] = - DEFAULT_MIGRATE_COMPRESS_LEVEL, - .parameters[MIGRATION_PARAMETER_COMPRESS_THREADS] = - DEFAULT_MIGRATE_COMPRESS_THREAD_COUNT, - .parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] = - DEFAULT_MIGRATE_DECOMPRESS_THREAD_COUNT, - .parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL] = - DEFAULT_MIGRATE_X_CPU_THROTTLE_INITIAL, - .parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT] = - DEFAULT_MIGRATE_X_CPU_THROTTLE_INCREMENT, - }; - - if (!once) { - qemu_mutex_init(¤t_migration.src_page_req_mutex); - once = true; - } - return ¤t_migration; -} - -/* For incoming */ -static MigrationIncomingState *mis_current; - -MigrationIncomingState *migration_incoming_get_current(void) -{ - return mis_current; -} - -MigrationIncomingState *migration_incoming_state_new(QEMUFile* f) -{ - mis_current = g_new0(MigrationIncomingState, 1); - mis_current->from_src_file = f; - mis_current->state = MIGRATION_STATUS_NONE; - QLIST_INIT(&mis_current->loadvm_handlers); - qemu_mutex_init(&mis_current->rp_mutex); - qemu_event_init(&mis_current->main_thread_load_event, false); - - return mis_current; -} - -void migration_incoming_state_destroy(void) -{ - qemu_event_destroy(&mis_current->main_thread_load_event); - loadvm_free_handlers(mis_current); - g_free(mis_current); - mis_current = NULL; -} - - -typedef struct { - bool optional; - uint32_t size; - uint8_t runstate[100]; - RunState state; - bool received; -} GlobalState; - -static GlobalState global_state; - -int global_state_store(void) -{ - if (!runstate_store((char *)global_state.runstate, - sizeof(global_state.runstate))) { - error_report("runstate name too big: %s", global_state.runstate); - trace_migrate_state_too_big(); - return -EINVAL; - } - return 0; -} - -void global_state_store_running(void) -{ - const char *state = RunState_lookup[RUN_STATE_RUNNING]; - strncpy((char *)global_state.runstate, - state, sizeof(global_state.runstate)); -} - -static bool global_state_received(void) -{ - return global_state.received; -} - -static RunState 
global_state_get_runstate(void) -{ - return global_state.state; -} - -void global_state_set_optional(void) -{ - global_state.optional = true; -} - -static bool global_state_needed(void *opaque) -{ - GlobalState *s = opaque; - char *runstate = (char *)s->runstate; - - /* If it is not optional, it is mandatory */ - - if (s->optional == false) { - return true; - } - - /* If state is running or paused, it is not needed */ - - if (strcmp(runstate, "running") == 0 || - strcmp(runstate, "paused") == 0) { - return false; - } - - /* for any other state it is needed */ - return true; -} - -static int global_state_post_load(void *opaque, int version_id) -{ - GlobalState *s = opaque; - Error *local_err = NULL; - int r; - char *runstate = (char *)s->runstate; - - s->received = true; - trace_migrate_global_state_post_load(runstate); - - r = qapi_enum_parse(RunState_lookup, runstate, RUN_STATE__MAX, - -1, &local_err); - - if (r == -1) { - if (local_err) { - error_report_err(local_err); - } - return -EINVAL; - } - s->state = r; - - return 0; -} - -static void global_state_pre_save(void *opaque) -{ - GlobalState *s = opaque; - - trace_migrate_global_state_pre_save((char *)s->runstate); - s->size = strlen((char *)s->runstate) + 1; -} - -static const VMStateDescription vmstate_globalstate = { - .name = "globalstate", - .version_id = 1, - .minimum_version_id = 1, - .post_load = global_state_post_load, - .pre_save = global_state_pre_save, - .needed = global_state_needed, - .fields = (VMStateField[]) { - VMSTATE_UINT32(size, GlobalState), - VMSTATE_BUFFER(runstate, GlobalState), - VMSTATE_END_OF_LIST() - }, -}; - -void register_global_state(void) -{ - /* We would use it independently that we receive it */ - strcpy((char *)&global_state.runstate, ""); - global_state.received = false; - vmstate_register(NULL, 0, &vmstate_globalstate, &global_state); -} - -static void migrate_generate_event(int new_state) -{ - if (migrate_use_events()) { - qapi_event_send_migration(new_state, &error_abort); - } -} - -/* - * Called on -incoming with a defer: uri. - * The migration can be started later after any parameters have been - * changed. - */ -static void deferred_incoming_migration(Error **errp) -{ - if (deferred_incoming) { - error_setg(errp, "Incoming migration already deferred"); - } - deferred_incoming = true; -} - -/* Request a range of pages from the source VM at the given - * start address. 
- * rbname: Name of the RAMBlock to request the page in, if NULL it's the same - * as the last request (a name must have been given previously) - * Start: Address offset within the RB - * Len: Length in bytes required - must be a multiple of pagesize - */ -void migrate_send_rp_req_pages(MigrationIncomingState *mis, const char *rbname, - ram_addr_t start, size_t len) -{ - uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname upto 256 */ - size_t msglen = 12; /* start + len */ - - *(uint64_t *)bufc = cpu_to_be64((uint64_t)start); - *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len); - - if (rbname) { - int rbname_len = strlen(rbname); - assert(rbname_len < 256); - - bufc[msglen++] = rbname_len; - memcpy(bufc + msglen, rbname, rbname_len); - msglen += rbname_len; - migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES_ID, msglen, bufc); - } else { - migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES, msglen, bufc); - } -} - -void qemu_start_incoming_migration(const char *uri, Error **errp) -{ - const char *p; - - qapi_event_send_migration(MIGRATION_STATUS_SETUP, &error_abort); - if (!strcmp(uri, "defer")) { - deferred_incoming_migration(errp); - } else if (strstart(uri, "tcp:", &p)) { - tcp_start_incoming_migration(p, errp); -#ifdef CONFIG_RDMA - } else if (strstart(uri, "rdma:", &p)) { - rdma_start_incoming_migration(p, errp); -#endif -#if !defined(WIN32) - } else if (strstart(uri, "exec:", &p)) { - exec_start_incoming_migration(p, errp); - } else if (strstart(uri, "unix:", &p)) { - unix_start_incoming_migration(p, errp); - } else if (strstart(uri, "fd:", &p)) { - fd_start_incoming_migration(p, errp); -#endif - } else { - error_setg(errp, "unknown migration protocol: %s", uri); - } -} - -static void process_incoming_migration_bh(void *opaque) -{ - Error *local_err = NULL; - MigrationIncomingState *mis = opaque; - - /* Make sure all file formats flush their mutable metadata */ - bdrv_invalidate_cache_all(&local_err); - if (local_err) { - migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_FAILED); - error_report_err(local_err); - migrate_decompress_threads_join(); - exit(EXIT_FAILURE); - } - - /* - * This must happen after all error conditions are dealt with and - * we're sure the VM is going to be running on this host. - */ - qemu_announce_self(); - - /* If global state section was not received or we are in running - state, we need to obey autostart. Any other state is set with - runstate_set. */ - - if (!global_state_received() || - global_state_get_runstate() == RUN_STATE_RUNNING) { - if (autostart) { - vm_start(); - } else { - runstate_set(RUN_STATE_PAUSED); - } - } else { - runstate_set(global_state_get_runstate()); - } - migrate_decompress_threads_join(); - /* - * This must happen after any state changes since as soon as an external - * observer sees this event they might start to prod at the VM assuming - * it's ready to use. 
- */ - migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_COMPLETED); - qemu_bh_delete(mis->bh); - migration_incoming_state_destroy(); -} - -static void process_incoming_migration_co(void *opaque) -{ - QEMUFile *f = opaque; - MigrationIncomingState *mis; - PostcopyState ps; - int ret; - - mis = migration_incoming_state_new(f); - postcopy_state_set(POSTCOPY_INCOMING_NONE); - migrate_set_state(&mis->state, MIGRATION_STATUS_NONE, - MIGRATION_STATUS_ACTIVE); - ret = qemu_loadvm_state(f); - - ps = postcopy_state_get(); - trace_process_incoming_migration_co_end(ret, ps); - if (ps != POSTCOPY_INCOMING_NONE) { - if (ps == POSTCOPY_INCOMING_ADVISE) { - /* - * Where a migration had postcopy enabled (and thus went to advise) - * but managed to complete within the precopy period, we can use - * the normal exit. - */ - postcopy_ram_incoming_cleanup(mis); - } else if (ret >= 0) { - /* - * Postcopy was started, cleanup should happen at the end of the - * postcopy thread. - */ - trace_process_incoming_migration_co_postcopy_end_main(); - return; - } - /* Else if something went wrong then just fall out of the normal exit */ - } - - qemu_fclose(f); - free_xbzrle_decoded_buf(); - - if (ret < 0) { - migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_FAILED); - error_report("load of migration failed: %s", strerror(-ret)); - migrate_decompress_threads_join(); - exit(EXIT_FAILURE); - } - - mis->bh = qemu_bh_new(process_incoming_migration_bh, mis); - qemu_bh_schedule(mis->bh); -} - -void process_incoming_migration(QEMUFile *f) -{ - Coroutine *co = qemu_coroutine_create(process_incoming_migration_co); - int fd = qemu_get_fd(f); - - assert(fd != -1); - migrate_decompress_threads_create(); - qemu_set_nonblock(fd); - qemu_coroutine_enter(co, f); -} - -/* - * Send a message on the return channel back to the source - * of the migration. - */ -void migrate_send_rp_message(MigrationIncomingState *mis, - enum mig_rp_message_type message_type, - uint16_t len, void *data) -{ - trace_migrate_send_rp_message((int)message_type, len); - qemu_mutex_lock(&mis->rp_mutex); - qemu_put_be16(mis->to_src_file, (unsigned int)message_type); - qemu_put_be16(mis->to_src_file, len); - qemu_put_buffer(mis->to_src_file, data, len); - qemu_fflush(mis->to_src_file); - qemu_mutex_unlock(&mis->rp_mutex); -} - -/* - * Send a 'SHUT' message on the return channel with the given value - * to indicate that we've finished with the RP. Non-0 value indicates - * error. - */ -void migrate_send_rp_shut(MigrationIncomingState *mis, - uint32_t value) -{ - uint32_t buf; - - buf = cpu_to_be32(value); - migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf); -} - -/* - * Send a 'PONG' message on the return channel with the given value - * (normally in response to a 'PING') - */ -void migrate_send_rp_pong(MigrationIncomingState *mis, - uint32_t value) -{ - uint32_t buf; - - buf = cpu_to_be32(value); - migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf); -} - -/* amount of nanoseconds we are willing to wait for migration to be down. - * the choice of nanoseconds is because it is the maximum resolution that - * get_clock() can achieve. It is an internal measure. 
All user-visible - * units must be in seconds */ -static uint64_t max_downtime = 300000000; - -uint64_t migrate_max_downtime(void) -{ - return max_downtime; -} - -MigrationCapabilityStatusList *qmp_query_migrate_capabilities(Error **errp) -{ - MigrationCapabilityStatusList *head = NULL; - MigrationCapabilityStatusList *caps; - MigrationState *s = migrate_get_current(); - int i; - - caps = NULL; /* silence compiler warning */ - for (i = 0; i < MIGRATION_CAPABILITY__MAX; i++) { - if (head == NULL) { - head = g_malloc0(sizeof(*caps)); - caps = head; - } else { - caps->next = g_malloc0(sizeof(*caps)); - caps = caps->next; - } - caps->value = - g_malloc(sizeof(*caps->value)); - caps->value->capability = i; - caps->value->state = s->enabled_capabilities[i]; - } - - return head; -} - -MigrationParameters *qmp_query_migrate_parameters(Error **errp) -{ - MigrationParameters *params; - MigrationState *s = migrate_get_current(); - - params = g_malloc0(sizeof(*params)); - params->compress_level = s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL]; - params->compress_threads = - s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS]; - params->decompress_threads = - s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS]; - params->x_cpu_throttle_initial = - s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL]; - params->x_cpu_throttle_increment = - s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT]; - - return params; -} - -/* - * Return true if we're already in the middle of a migration - * (i.e. any of the active or setup states) - */ -static bool migration_is_setup_or_active(int state) -{ - switch (state) { - case MIGRATION_STATUS_ACTIVE: - case MIGRATION_STATUS_POSTCOPY_ACTIVE: - case MIGRATION_STATUS_SETUP: - return true; - - default: - return false; - - } -} - -static void get_xbzrle_cache_stats(MigrationInfo *info) -{ - if (migrate_use_xbzrle()) { - info->has_xbzrle_cache = true; - info->xbzrle_cache = g_malloc0(sizeof(*info->xbzrle_cache)); - info->xbzrle_cache->cache_size = migrate_xbzrle_cache_size(); - info->xbzrle_cache->bytes = xbzrle_mig_bytes_transferred(); - info->xbzrle_cache->pages = xbzrle_mig_pages_transferred(); - info->xbzrle_cache->cache_miss = xbzrle_mig_pages_cache_miss(); - info->xbzrle_cache->cache_miss_rate = xbzrle_mig_cache_miss_rate(); - info->xbzrle_cache->overflow = xbzrle_mig_pages_overflow(); - } -} - -MigrationInfo *qmp_query_migrate(Error **errp) -{ - MigrationInfo *info = g_malloc0(sizeof(*info)); - MigrationState *s = migrate_get_current(); - - switch (s->state) { - case MIGRATION_STATUS_NONE: - /* no migration has happened ever */ - break; - case MIGRATION_STATUS_SETUP: - info->has_status = true; - info->has_total_time = false; - break; - case MIGRATION_STATUS_ACTIVE: - case MIGRATION_STATUS_CANCELLING: - info->has_status = true; - info->has_total_time = true; - info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - - s->total_time; - info->has_expected_downtime = true; - info->expected_downtime = s->expected_downtime; - info->has_setup_time = true; - info->setup_time = s->setup_time; - - info->has_ram = true; - info->ram = g_malloc0(sizeof(*info->ram)); - info->ram->transferred = ram_bytes_transferred(); - info->ram->remaining = ram_bytes_remaining(); - info->ram->total = ram_bytes_total(); - info->ram->duplicate = dup_mig_pages_transferred(); - info->ram->skipped = skipped_mig_pages_transferred(); - info->ram->normal = norm_mig_pages_transferred(); - info->ram->normal_bytes = norm_mig_bytes_transferred(); - info->ram->dirty_pages_rate = 
s->dirty_pages_rate; - info->ram->mbps = s->mbps; - info->ram->dirty_sync_count = s->dirty_sync_count; - - if (blk_mig_active()) { - info->has_disk = true; - info->disk = g_malloc0(sizeof(*info->disk)); - info->disk->transferred = blk_mig_bytes_transferred(); - info->disk->remaining = blk_mig_bytes_remaining(); - info->disk->total = blk_mig_bytes_total(); - } - - if (cpu_throttle_active()) { - info->has_x_cpu_throttle_percentage = true; - info->x_cpu_throttle_percentage = cpu_throttle_get_percentage(); - } - - get_xbzrle_cache_stats(info); - break; - case MIGRATION_STATUS_POSTCOPY_ACTIVE: - /* Mostly the same as active; TODO add some postcopy stats */ - info->has_status = true; - info->has_total_time = true; - info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - - s->total_time; - info->has_expected_downtime = true; - info->expected_downtime = s->expected_downtime; - info->has_setup_time = true; - info->setup_time = s->setup_time; - - info->has_ram = true; - info->ram = g_malloc0(sizeof(*info->ram)); - info->ram->transferred = ram_bytes_transferred(); - info->ram->remaining = ram_bytes_remaining(); - info->ram->total = ram_bytes_total(); - info->ram->duplicate = dup_mig_pages_transferred(); - info->ram->skipped = skipped_mig_pages_transferred(); - info->ram->normal = norm_mig_pages_transferred(); - info->ram->normal_bytes = norm_mig_bytes_transferred(); - info->ram->dirty_pages_rate = s->dirty_pages_rate; - info->ram->mbps = s->mbps; - info->ram->dirty_sync_count = s->dirty_sync_count; - - if (blk_mig_active()) { - info->has_disk = true; - info->disk = g_malloc0(sizeof(*info->disk)); - info->disk->transferred = blk_mig_bytes_transferred(); - info->disk->remaining = blk_mig_bytes_remaining(); - info->disk->total = blk_mig_bytes_total(); - } - - get_xbzrle_cache_stats(info); - break; - case MIGRATION_STATUS_COMPLETED: - get_xbzrle_cache_stats(info); - - info->has_status = true; - info->has_total_time = true; - info->total_time = s->total_time; - info->has_downtime = true; - info->downtime = s->downtime; - info->has_setup_time = true; - info->setup_time = s->setup_time; - - info->has_ram = true; - info->ram = g_malloc0(sizeof(*info->ram)); - info->ram->transferred = ram_bytes_transferred(); - info->ram->remaining = 0; - info->ram->total = ram_bytes_total(); - info->ram->duplicate = dup_mig_pages_transferred(); - info->ram->skipped = skipped_mig_pages_transferred(); - info->ram->normal = norm_mig_pages_transferred(); - info->ram->normal_bytes = norm_mig_bytes_transferred(); - info->ram->mbps = s->mbps; - info->ram->dirty_sync_count = s->dirty_sync_count; - break; - case MIGRATION_STATUS_FAILED: - info->has_status = true; - break; - case MIGRATION_STATUS_CANCELLED: - info->has_status = true; - break; - } - info->status = s->state; - - return info; -} - -void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params, - Error **errp) -{ - MigrationState *s = migrate_get_current(); - MigrationCapabilityStatusList *cap; - - if (migration_is_setup_or_active(s->state)) { - error_setg(errp, QERR_MIGRATION_ACTIVE); - return; - } - - for (cap = params; cap; cap = cap->next) { - s->enabled_capabilities[cap->value->capability] = cap->value->state; - } - - if (migrate_postcopy_ram()) { - if (migrate_use_compression()) { - /* The decompression threads asynchronously write into RAM - * rather than use the atomic copies needed to avoid - * userfaulting. It should be possible to fix the decompression - * threads for compatibility in future. 
- */ - error_report("Postcopy is not currently compatible with " - "compression"); - s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM] = - false; - } - } -} - -void qmp_migrate_set_parameters(bool has_compress_level, - int64_t compress_level, - bool has_compress_threads, - int64_t compress_threads, - bool has_decompress_threads, - int64_t decompress_threads, - bool has_x_cpu_throttle_initial, - int64_t x_cpu_throttle_initial, - bool has_x_cpu_throttle_increment, - int64_t x_cpu_throttle_increment, Error **errp) -{ - MigrationState *s = migrate_get_current(); - - if (has_compress_level && (compress_level < 0 || compress_level > 9)) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "compress_level", - "is invalid, it should be in the range of 0 to 9"); - return; - } - if (has_compress_threads && - (compress_threads < 1 || compress_threads > 255)) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, - "compress_threads", - "is invalid, it should be in the range of 1 to 255"); - return; - } - if (has_decompress_threads && - (decompress_threads < 1 || decompress_threads > 255)) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, - "decompress_threads", - "is invalid, it should be in the range of 1 to 255"); - return; - } - if (has_x_cpu_throttle_initial && - (x_cpu_throttle_initial < 1 || x_cpu_throttle_initial > 99)) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, - "x_cpu_throttle_initial", - "an integer in the range of 1 to 99"); - } - if (has_x_cpu_throttle_increment && - (x_cpu_throttle_increment < 1 || x_cpu_throttle_increment > 99)) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, - "x_cpu_throttle_increment", - "an integer in the range of 1 to 99"); - } - - if (has_compress_level) { - s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL] = compress_level; - } - if (has_compress_threads) { - s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS] = compress_threads; - } - if (has_decompress_threads) { - s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] = - decompress_threads; - } - if (has_x_cpu_throttle_initial) { - s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL] = - x_cpu_throttle_initial; - } - - if (has_x_cpu_throttle_increment) { - s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT] = - x_cpu_throttle_increment; - } -} - -void qmp_migrate_start_postcopy(Error **errp) -{ - MigrationState *s = migrate_get_current(); - - if (!migrate_postcopy_ram()) { - error_setg(errp, "Enable postcopy with migrate_set_capability before" - " the start of migration"); - return; - } - - if (s->state == MIGRATION_STATUS_NONE) { - error_setg(errp, "Postcopy must be started after migration has been" - " started"); - return; - } - /* - * we don't error if migration has finished since that would be racy - * with issuing this command. 
- */ - atomic_set(&s->start_postcopy, true); -} - -/* shared migration helpers */ - -void migrate_set_state(int *state, int old_state, int new_state) -{ - if (atomic_cmpxchg(state, old_state, new_state) == old_state) { - trace_migrate_set_state(new_state); - migrate_generate_event(new_state); - } -} - -static void migrate_fd_cleanup(void *opaque) -{ - MigrationState *s = opaque; - - qemu_bh_delete(s->cleanup_bh); - s->cleanup_bh = NULL; - - flush_page_queue(s); - - if (s->to_dst_file) { - trace_migrate_fd_cleanup(); - qemu_mutex_unlock_iothread(); - if (s->migration_thread_running) { - qemu_thread_join(&s->thread); - s->migration_thread_running = false; - } - qemu_mutex_lock_iothread(); - - migrate_compress_threads_join(); - qemu_fclose(s->to_dst_file); - s->to_dst_file = NULL; - } - - assert((s->state != MIGRATION_STATUS_ACTIVE) && - (s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE)); - - if (s->state == MIGRATION_STATUS_CANCELLING) { - migrate_set_state(&s->state, MIGRATION_STATUS_CANCELLING, - MIGRATION_STATUS_CANCELLED); - } - - notifier_list_notify(&migration_state_notifiers, s); -} - -void migrate_fd_error(MigrationState *s) -{ - trace_migrate_fd_error(); - assert(s->to_dst_file == NULL); - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_FAILED); - notifier_list_notify(&migration_state_notifiers, s); -} - -static void migrate_fd_cancel(MigrationState *s) -{ - int old_state ; - QEMUFile *f = migrate_get_current()->to_dst_file; - trace_migrate_fd_cancel(); - - if (s->rp_state.from_dst_file) { - /* shutdown the rp socket, so causing the rp thread to shutdown */ - qemu_file_shutdown(s->rp_state.from_dst_file); - } - - do { - old_state = s->state; - if (!migration_is_setup_or_active(old_state)) { - break; - } - migrate_set_state(&s->state, old_state, MIGRATION_STATUS_CANCELLING); - } while (s->state != MIGRATION_STATUS_CANCELLING); - - /* - * If we're unlucky the migration code might be stuck somewhere in a - * send/write while the network has failed and is waiting to timeout; - * if we've got shutdown(2) available then we can force it to quit. - * The outgoing qemu file gets closed in migrate_fd_cleanup that is - * called in a bh, so there is no race against this cancel. - */ - if (s->state == MIGRATION_STATUS_CANCELLING && f) { - qemu_file_shutdown(f); - } -} - -void add_migration_state_change_notifier(Notifier *notify) -{ - notifier_list_add(&migration_state_notifiers, notify); -} - -void remove_migration_state_change_notifier(Notifier *notify) -{ - notifier_remove(notify); -} - -bool migration_in_setup(MigrationState *s) -{ - return s->state == MIGRATION_STATUS_SETUP; -} - -bool migration_has_finished(MigrationState *s) -{ - return s->state == MIGRATION_STATUS_COMPLETED; -} - -bool migration_has_failed(MigrationState *s) -{ - return (s->state == MIGRATION_STATUS_CANCELLED || - s->state == MIGRATION_STATUS_FAILED); -} - -bool migration_in_postcopy(MigrationState *s) -{ - return (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE); -} - -bool migration_in_postcopy_after_devices(MigrationState *s) -{ - return migration_in_postcopy(s) && s->postcopy_after_devices; -} - -MigrationState *migrate_init(const MigrationParams *params) -{ - MigrationState *s = migrate_get_current(); - - /* - * Reinitialise all migration state, except - * parameters/capabilities that the user set, and - * locks. 
- */ - s->bytes_xfer = 0; - s->xfer_limit = 0; - s->cleanup_bh = 0; - s->to_dst_file = NULL; - s->state = MIGRATION_STATUS_NONE; - s->params = *params; - s->rp_state.from_dst_file = NULL; - s->rp_state.error = false; - s->mbps = 0.0; - s->downtime = 0; - s->expected_downtime = 0; - s->dirty_pages_rate = 0; - s->dirty_bytes_rate = 0; - s->setup_time = 0; - s->dirty_sync_count = 0; - s->start_postcopy = false; - s->postcopy_after_devices = false; - s->migration_thread_running = false; - s->last_req_rb = NULL; - - migrate_set_state(&s->state, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP); - - QSIMPLEQ_INIT(&s->src_page_requests); - - s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); - return s; -} - -static GSList *migration_blockers; - -void migrate_add_blocker(Error *reason) -{ - migration_blockers = g_slist_prepend(migration_blockers, reason); -} - -void migrate_del_blocker(Error *reason) -{ - migration_blockers = g_slist_remove(migration_blockers, reason); -} - -void qmp_migrate_incoming(const char *uri, Error **errp) -{ - Error *local_err = NULL; - static bool once = true; - - if (!deferred_incoming) { - error_setg(errp, "For use with '-incoming defer'"); - return; - } - if (!once) { - error_setg(errp, "The incoming migration has already been started"); - } - - qemu_start_incoming_migration(uri, &local_err); - - if (local_err) { - error_propagate(errp, local_err); - return; - } - - once = false; -} - -void qmp_migrate(const char *uri, bool has_blk, bool blk, - bool has_inc, bool inc, bool has_detach, bool detach, - Error **errp) -{ - Error *local_err = NULL; - MigrationState *s = migrate_get_current(); - MigrationParams params; - const char *p; - - params.blk = has_blk && blk; - params.shared = has_inc && inc; - - if (migration_is_setup_or_active(s->state) || - s->state == MIGRATION_STATUS_CANCELLING) { - error_setg(errp, QERR_MIGRATION_ACTIVE); - return; - } - if (runstate_check(RUN_STATE_INMIGRATE)) { - error_setg(errp, "Guest is waiting for an incoming migration"); - return; - } - - if (qemu_savevm_state_blocked(errp)) { - return; - } - - if (migration_blockers) { - *errp = error_copy(migration_blockers->data); - return; - } - - s = migrate_init(¶ms); - - if (strstart(uri, "tcp:", &p)) { - tcp_start_outgoing_migration(s, p, &local_err); -#ifdef CONFIG_RDMA - } else if (strstart(uri, "rdma:", &p)) { - rdma_start_outgoing_migration(s, p, &local_err); -#endif -#if !defined(WIN32) - } else if (strstart(uri, "exec:", &p)) { - exec_start_outgoing_migration(s, p, &local_err); - } else if (strstart(uri, "unix:", &p)) { - unix_start_outgoing_migration(s, p, &local_err); - } else if (strstart(uri, "fd:", &p)) { - fd_start_outgoing_migration(s, p, &local_err); -#endif - } else { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "uri", - "a valid migration protocol"); - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_FAILED); - return; - } - - if (local_err) { - migrate_fd_error(s); - error_propagate(errp, local_err); - return; - } -} - -void qmp_migrate_cancel(Error **errp) -{ - migrate_fd_cancel(migrate_get_current()); -} - -void qmp_migrate_set_cache_size(int64_t value, Error **errp) -{ - MigrationState *s = migrate_get_current(); - int64_t new_size; - - /* Check for truncation */ - if (value != (size_t)value) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", - "exceeding address space"); - return; - } - - /* Cache should not be larger than guest ram size */ - if (value > ram_bytes_total()) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache 
size", - "exceeds guest ram size "); - return; - } - - new_size = xbzrle_cache_resize(value); - if (new_size < 0) { - error_setg(errp, QERR_INVALID_PARAMETER_VALUE, "cache size", - "is smaller than page size"); - return; - } - - s->xbzrle_cache_size = new_size; -} - -int64_t qmp_query_migrate_cache_size(Error **errp) -{ - return migrate_xbzrle_cache_size(); -} - -void qmp_migrate_set_speed(int64_t value, Error **errp) -{ - MigrationState *s; - - if (value < 0) { - value = 0; - } - if (value > SIZE_MAX) { - value = SIZE_MAX; - } - - s = migrate_get_current(); - s->bandwidth_limit = value; - if (s->to_dst_file) { - qemu_file_set_rate_limit(s->to_dst_file, - s->bandwidth_limit / XFER_LIMIT_RATIO); - } -} - -void qmp_migrate_set_downtime(double value, Error **errp) -{ - value *= 1e9; - value = MAX(0, MIN(UINT64_MAX, value)); - max_downtime = (uint64_t)value; -} - -bool migrate_postcopy_ram(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_POSTCOPY_RAM]; -} - -bool migrate_auto_converge(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE]; -} - -bool migrate_zero_blocks(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_ZERO_BLOCKS]; -} - -bool migrate_use_compression(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_COMPRESS]; -} - -int migrate_compress_level(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL]; -} - -int migrate_compress_threads(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS]; -} - -int migrate_decompress_threads(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS]; -} - -bool migrate_use_events(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_EVENTS]; -} - -int migrate_use_xbzrle(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->enabled_capabilities[MIGRATION_CAPABILITY_XBZRLE]; -} - -int64_t migrate_xbzrle_cache_size(void) -{ - MigrationState *s; - - s = migrate_get_current(); - - return s->xbzrle_cache_size; -} - -/* migration thread support */ -/* - * Something bad happened to the RP stream, mark an error - * The caller shall print or trace something to indicate why - */ -static void mark_source_rp_bad(MigrationState *s) -{ - s->rp_state.error = true; -} - -static struct rp_cmd_args { - ssize_t len; /* -1 = variable */ - const char *name; -} rp_cmd_args[] = { - [MIG_RP_MSG_INVALID] = { .len = -1, .name = "INVALID" }, - [MIG_RP_MSG_SHUT] = { .len = 4, .name = "SHUT" }, - [MIG_RP_MSG_PONG] = { .len = 4, .name = "PONG" }, - [MIG_RP_MSG_REQ_PAGES] = { .len = 12, .name = "REQ_PAGES" }, - [MIG_RP_MSG_REQ_PAGES_ID] = { .len = -1, .name = "REQ_PAGES_ID" }, - [MIG_RP_MSG_MAX] = { .len = -1, .name = "MAX" }, -}; - -/* - * Process a request for pages received on the return path, - * We're allowed to send more than requested (e.g. to round to our page size) - * and we don't need to send pages that have already been sent. 
- */
-static void migrate_handle_rp_req_pages(MigrationState *ms, const char *rbname,
-                                        ram_addr_t start, size_t len)
-{
-    long our_host_ps = getpagesize();
-
-    trace_migrate_handle_rp_req_pages(rbname, start, len);
-
-    /*
-     * Since we currently insist on matching page sizes, just sanity check
-     * we're being asked for whole host pages.
-     */
-    if ((start & (our_host_ps - 1)) ||
-        (len & (our_host_ps - 1))) {
-        error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
-                     " len: %zu", __func__, start, len);
-        mark_source_rp_bad(ms);
-        return;
-    }
-
-    if (ram_save_queue_pages(ms, rbname, start, len)) {
-        mark_source_rp_bad(ms);
-    }
-}
-
-/*
- * Handles messages sent on the return path towards the source VM
- *
- */
-static void *source_return_path_thread(void *opaque)
-{
-    MigrationState *ms = opaque;
-    QEMUFile *rp = ms->rp_state.from_dst_file;
-    uint16_t header_len, header_type;
-    uint8_t buf[512];
-    uint32_t tmp32, sibling_error;
-    ram_addr_t start = 0; /* =0 to silence warning */
-    size_t len = 0, expected_len;
-    int res;
-
-    trace_source_return_path_thread_entry();
-    while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
-           migration_is_setup_or_active(ms->state)) {
-        trace_source_return_path_thread_loop_top();
-        header_type = qemu_get_be16(rp);
-        header_len = qemu_get_be16(rp);
-
-        if (header_type >= MIG_RP_MSG_MAX ||
-            header_type == MIG_RP_MSG_INVALID) {
-            error_report("RP: Received invalid message 0x%04x length 0x%04x",
-                         header_type, header_len);
-            mark_source_rp_bad(ms);
-            goto out;
-        }
-
-        if ((rp_cmd_args[header_type].len != -1 &&
-             header_len != rp_cmd_args[header_type].len) ||
-            header_len > sizeof(buf)) {
-            error_report("RP: Received '%s' message (0x%04x) with "
-                         "incorrect length %d expecting %zu",
-                         rp_cmd_args[header_type].name, header_type, header_len,
-                         (size_t)rp_cmd_args[header_type].len);
-            mark_source_rp_bad(ms);
-            goto out;
-        }
-
-        /* We know we've got a valid header by this point */
-        res = qemu_get_buffer(rp, buf, header_len);
-        if (res != header_len) {
-            error_report("RP: Failed reading data for message 0x%04x"
-                         " read %d expected %d",
-                         header_type, res, header_len);
-            mark_source_rp_bad(ms);
-            goto out;
-        }
-
-        /* OK, we have the message and the data */
-        switch (header_type) {
-        case MIG_RP_MSG_SHUT:
-            sibling_error = be32_to_cpup((uint32_t *)buf);
-            trace_source_return_path_thread_shut(sibling_error);
-            if (sibling_error) {
-                error_report("RP: Sibling indicated error %u", sibling_error);
-                mark_source_rp_bad(ms);
-            }
-            /*
-             * We'll let the main thread deal with closing the RP;
-             * we could do a shutdown(2) on it, but we're the only user
-             * anyway, so there's nothing gained.
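-             * (SHUT is the destination's final message, on success or
-             * failure, so nothing further can arrive on this stream.)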
- */ - goto out; - - case MIG_RP_MSG_PONG: - tmp32 = be32_to_cpup((uint32_t *)buf); - trace_source_return_path_thread_pong(tmp32); - break; - - case MIG_RP_MSG_REQ_PAGES: - start = be64_to_cpup((uint64_t *)buf); - len = be32_to_cpup((uint32_t *)(buf + 8)); - migrate_handle_rp_req_pages(ms, NULL, start, len); - break; - - case MIG_RP_MSG_REQ_PAGES_ID: - expected_len = 12 + 1; /* header + termination */ - - if (header_len >= expected_len) { - start = be64_to_cpup((uint64_t *)buf); - len = be32_to_cpup((uint32_t *)(buf + 8)); - /* Now we expect an idstr */ - tmp32 = buf[12]; /* Length of the following idstr */ - buf[13 + tmp32] = '\0'; - expected_len += tmp32; - } - if (header_len != expected_len) { - error_report("RP: Req_Page_id with length %d expecting %zd", - header_len, expected_len); - mark_source_rp_bad(ms); - goto out; - } - migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len); - break; - - default: - break; - } - } - if (qemu_file_get_error(rp)) { - trace_source_return_path_thread_bad_end(); - mark_source_rp_bad(ms); - } - - trace_source_return_path_thread_end(); -out: - ms->rp_state.from_dst_file = NULL; - qemu_fclose(rp); - return NULL; -} - -static int open_return_path_on_source(MigrationState *ms) -{ - - ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->to_dst_file); - if (!ms->rp_state.from_dst_file) { - return -1; - } - - trace_open_return_path_on_source(); - qemu_thread_create(&ms->rp_state.rp_thread, "return path", - source_return_path_thread, ms, QEMU_THREAD_JOINABLE); - - trace_open_return_path_on_source_continue(); - - return 0; -} - -/* Returns 0 if the RP was ok, otherwise there was an error on the RP */ -static int await_return_path_close_on_source(MigrationState *ms) -{ - /* - * If this is a normal exit then the destination will send a SHUT and the - * rp_thread will exit, however if there's an error we need to cause - * it to exit. - */ - if (qemu_file_get_error(ms->to_dst_file) && ms->rp_state.from_dst_file) { - /* - * shutdown(2), if we have it, will cause it to unblock if it's stuck - * waiting for the destination. - */ - qemu_file_shutdown(ms->rp_state.from_dst_file); - mark_source_rp_bad(ms); - } - trace_await_return_path_close_on_source_joining(); - qemu_thread_join(&ms->rp_state.rp_thread); - trace_await_return_path_close_on_source_close(); - return ms->rp_state.error; -} - -/* - * Switch from normal iteration to postcopy - * Returns non-0 on error - */ -static int postcopy_start(MigrationState *ms, bool *old_vm_running) -{ - int ret; - const QEMUSizedBuffer *qsb; - int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); - migrate_set_state(&ms->state, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_POSTCOPY_ACTIVE); - - trace_postcopy_start(); - qemu_mutex_lock_iothread(); - trace_postcopy_start_set_run(); - - qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER); - *old_vm_running = runstate_is_running(); - global_state_store(); - ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE); - if (ret < 0) { - goto fail; - } - - ret = bdrv_inactivate_all(); - if (ret < 0) { - goto fail; - } - - /* - * Cause any non-postcopiable, but iterative devices to - * send out their final data. 
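-     * ('true' requests the iterable-only pass here; the device state
-     * proper is completed later, into the buffered package below.)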
- */ - qemu_savevm_state_complete_precopy(ms->to_dst_file, true); - - /* - * in Finish migrate and with the io-lock held everything should - * be quiet, but we've potentially still got dirty pages and we - * need to tell the destination to throw any pages it's already received - * that are dirty - */ - if (ram_postcopy_send_discard_bitmap(ms)) { - error_report("postcopy send discard bitmap failed"); - goto fail; - } - - /* - * send rest of state - note things that are doing postcopy - * will notice we're in POSTCOPY_ACTIVE and not actually - * wrap their state up here - */ - qemu_file_set_rate_limit(ms->to_dst_file, INT64_MAX); - /* Ping just for debugging, helps line traces up */ - qemu_savevm_send_ping(ms->to_dst_file, 2); - - /* - * While loading the device state we may trigger page transfer - * requests and the fd must be free to process those, and thus - * the destination must read the whole device state off the fd before - * it starts processing it. Unfortunately the ad-hoc migration format - * doesn't allow the destination to know the size to read without fully - * parsing it through each devices load-state code (especially the open - * coded devices that use get/put). - * So we wrap the device state up in a package with a length at the start; - * to do this we use a qemu_buf to hold the whole of the device state. - */ - QEMUFile *fb = qemu_bufopen("w", NULL); - if (!fb) { - error_report("Failed to create buffered file"); - goto fail; - } - - /* - * Make sure the receiver can get incoming pages before we send the rest - * of the state - */ - qemu_savevm_send_postcopy_listen(fb); - - qemu_savevm_state_complete_precopy(fb, false); - qemu_savevm_send_ping(fb, 3); - - qemu_savevm_send_postcopy_run(fb); - - /* <><> end of stuff going into the package */ - qsb = qemu_buf_get(fb); - - /* Now send that blob */ - if (qemu_savevm_send_packaged(ms->to_dst_file, qsb)) { - goto fail_closefb; - } - qemu_fclose(fb); - - /* Send a notify to give a chance for anything that needs to happen - * at the transition to postcopy and after the device state; in particular - * spice needs to trigger a transition now - */ - ms->postcopy_after_devices = true; - notifier_list_notify(&migration_state_notifiers, ms); - - ms->downtime = qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop; - - qemu_mutex_unlock_iothread(); - - /* - * Although this ping is just for debug, it could potentially be - * used for getting a better measurement of downtime at the source. - */ - qemu_savevm_send_ping(ms->to_dst_file, 4); - - ret = qemu_file_get_error(ms->to_dst_file); - if (ret) { - error_report("postcopy_start: Migration stream errored"); - migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, - MIGRATION_STATUS_FAILED); - } - - return ret; - -fail_closefb: - qemu_fclose(fb); -fail: - migrate_set_state(&ms->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, - MIGRATION_STATUS_FAILED); - qemu_mutex_unlock_iothread(); - return -1; -} - -/** - * migration_completion: Used by migration_thread when there's not much left. - * The caller 'breaks' the loop when this returns. 
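- * On success the state is moved to MIGRATION_STATUS_COMPLETED; on any
- * failure it is moved to MIGRATION_STATUS_FAILED.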
- *
- * @s: Current migration state
- * @current_active_state: The migration state we expect to be in
- * @*old_vm_running: Pointer to old_vm_running flag
- * @*start_time: Pointer to time to update
- */
-static void migration_completion(MigrationState *s, int current_active_state,
-                                 bool *old_vm_running,
-                                 int64_t *start_time)
-{
-    int ret;
-
-    if (s->state == MIGRATION_STATUS_ACTIVE) {
-        qemu_mutex_lock_iothread();
-        *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-        qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
-        *old_vm_running = runstate_is_running();
-        ret = global_state_store();
-
-        if (!ret) {
-            ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
-            if (ret >= 0) {
-                ret = bdrv_inactivate_all();
-            }
-            if (ret >= 0) {
-                qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
-                qemu_savevm_state_complete_precopy(s->to_dst_file, false);
-            }
-        }
-        qemu_mutex_unlock_iothread();
-
-        if (ret < 0) {
-            goto fail;
-        }
-    } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
-        trace_migration_completion_postcopy_end();
-
-        qemu_savevm_state_complete_postcopy(s->to_dst_file);
-        trace_migration_completion_postcopy_end_after_complete();
-    }
-
-    /*
-     * If rp was opened we must clean up the thread before
-     * cleaning everything else up (since if there are no failures
-     * it will wait for the destination to send its status in
-     * a SHUT command).
-     * Postcopy opens rp if enabled (even if it's not activated)
-     */
-    if (migrate_postcopy_ram()) {
-        int rp_error;
-        trace_migration_completion_postcopy_end_before_rp();
-        rp_error = await_return_path_close_on_source(s);
-        trace_migration_completion_postcopy_end_after_rp(rp_error);
-        if (rp_error) {
-            goto fail;
-        }
-    }
-
-    if (qemu_file_get_error(s->to_dst_file)) {
-        trace_migration_completion_file_err();
-        goto fail;
-    }
-
-    migrate_set_state(&s->state, current_active_state,
-                      MIGRATION_STATUS_COMPLETED);
-    return;
-
-fail:
-    migrate_set_state(&s->state, current_active_state,
-                      MIGRATION_STATUS_FAILED);
-}
-
-/*
- * Master migration thread on the source VM.
- * It drives the migration and pumps the data down the outgoing channel.
- */
-static void *migration_thread(void *opaque)
-{
-    MigrationState *s = opaque;
-    /* Used by the bandwidth calcs, updated later */
-    int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-    int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
-    int64_t initial_bytes = 0;
-    int64_t max_size = 0;
-    int64_t start_time = initial_time;
-    int64_t end_time;
-    bool old_vm_running = false;
-    bool entered_postcopy = false;
-    /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
-    enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
-
-    rcu_register_thread();
-
-    qemu_savevm_state_header(s->to_dst_file);
-
-    if (migrate_postcopy_ram()) {
-        /* Now tell the dest that it should open its end so it can reply */
-        qemu_savevm_send_open_return_path(s->to_dst_file);
-
-        /* And do a ping that will make stuff easier to debug */
-        qemu_savevm_send_ping(s->to_dst_file, 1);
-
-        /*
-         * Tell the destination that we *might* want to do postcopy later;
-         * if the other end can't do postcopy it should fail now, nice and
-         * early.
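-         * (An incapable destination can then fail the whole migration
-         * before any RAM has been transferred.)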
- */ - qemu_savevm_send_postcopy_advise(s->to_dst_file); - } - - qemu_savevm_state_begin(s->to_dst_file, &s->params); - - s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start; - current_active_state = MIGRATION_STATUS_ACTIVE; - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_ACTIVE); - - trace_migration_thread_setup_complete(); - - while (s->state == MIGRATION_STATUS_ACTIVE || - s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) { - int64_t current_time; - uint64_t pending_size; - - if (!qemu_file_rate_limit(s->to_dst_file)) { - uint64_t pend_post, pend_nonpost; - - qemu_savevm_state_pending(s->to_dst_file, max_size, &pend_nonpost, - &pend_post); - pending_size = pend_nonpost + pend_post; - trace_migrate_pending(pending_size, max_size, - pend_post, pend_nonpost); - if (pending_size && pending_size >= max_size) { - /* Still a significant amount to transfer */ - - if (migrate_postcopy_ram() && - s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE && - pend_nonpost <= max_size && - atomic_read(&s->start_postcopy)) { - - if (!postcopy_start(s, &old_vm_running)) { - current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE; - entered_postcopy = true; - } - - continue; - } - /* Just another iteration step */ - qemu_savevm_state_iterate(s->to_dst_file, entered_postcopy); - } else { - trace_migration_thread_low_pending(pending_size); - migration_completion(s, current_active_state, - &old_vm_running, &start_time); - break; - } - } - - if (qemu_file_get_error(s->to_dst_file)) { - migrate_set_state(&s->state, current_active_state, - MIGRATION_STATUS_FAILED); - trace_migration_thread_file_err(); - break; - } - current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); - if (current_time >= initial_time + BUFFER_DELAY) { - uint64_t transferred_bytes = qemu_ftell(s->to_dst_file) - - initial_bytes; - uint64_t time_spent = current_time - initial_time; - double bandwidth = (double)transferred_bytes / time_spent; - max_size = bandwidth * migrate_max_downtime() / 1000000; - - s->mbps = (((double) transferred_bytes * 8.0) / - ((double) time_spent / 1000.0)) / 1000.0 / 1000.0; - - trace_migrate_transferred(transferred_bytes, time_spent, - bandwidth, max_size); - /* if we haven't sent anything, we don't want to recalculate - 10000 is a small enough number for our purposes */ - if (s->dirty_bytes_rate && transferred_bytes > 10000) { - s->expected_downtime = s->dirty_bytes_rate / bandwidth; - } - - qemu_file_reset_rate_limit(s->to_dst_file); - initial_time = current_time; - initial_bytes = qemu_ftell(s->to_dst_file); - } - if (qemu_file_rate_limit(s->to_dst_file)) { - /* usleep expects microseconds */ - g_usleep((initial_time + BUFFER_DELAY - current_time)*1000); - } - } - - trace_migration_thread_after_loop(); - /* If we enabled cpu throttling for auto-converge, turn it off. 
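-     * (auto-converge starts it from the RAM save path when the guest
-     * dirties memory faster than we can transfer it)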
*/ - cpu_throttle_stop(); - end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME); - - qemu_mutex_lock_iothread(); - qemu_savevm_state_cleanup(); - if (s->state == MIGRATION_STATUS_COMPLETED) { - uint64_t transferred_bytes = qemu_ftell(s->to_dst_file); - s->total_time = end_time - s->total_time; - if (!entered_postcopy) { - s->downtime = end_time - start_time; - } - if (s->total_time) { - s->mbps = (((double) transferred_bytes * 8.0) / - ((double) s->total_time)) / 1000; - } - runstate_set(RUN_STATE_POSTMIGRATE); - } else { - if (old_vm_running && !entered_postcopy) { - vm_start(); - } - } - qemu_bh_schedule(s->cleanup_bh); - qemu_mutex_unlock_iothread(); - - rcu_unregister_thread(); - return NULL; -} - -void migrate_fd_connect(MigrationState *s) -{ - /* This is a best 1st approximation. ns to ms */ - s->expected_downtime = max_downtime/1000000; - s->cleanup_bh = qemu_bh_new(migrate_fd_cleanup, s); - - qemu_file_set_rate_limit(s->to_dst_file, - s->bandwidth_limit / XFER_LIMIT_RATIO); - - /* Notify before starting migration thread */ - notifier_list_notify(&migration_state_notifiers, s); - - /* - * Open the return path; currently for postcopy but other things might - * also want it. - */ - if (migrate_postcopy_ram()) { - if (open_return_path_on_source(s)) { - error_report("Unable to open return-path for postcopy"); - migrate_set_state(&s->state, MIGRATION_STATUS_SETUP, - MIGRATION_STATUS_FAILED); - migrate_fd_cleanup(s); - return; - } - } - - migrate_compress_threads_create(); - qemu_thread_create(&s->thread, "migration", migration_thread, s, - QEMU_THREAD_JOINABLE); - s->migration_thread_running = true; -} - -PostcopyState postcopy_state_get(void) -{ - return atomic_mb_read(&incoming_postcopy_state); -} - -/* Set the state and return the old state */ -PostcopyState postcopy_state_set(PostcopyState new_state) -{ - return atomic_xchg(&incoming_postcopy_state, new_state); -} - diff --git a/qemu/migration/postcopy-ram.c b/qemu/migration/postcopy-ram.c deleted file mode 100644 index fbd0064fc..000000000 --- a/qemu/migration/postcopy-ram.c +++ /dev/null @@ -1,761 +0,0 @@ -/* - * Postcopy migration for RAM - * - * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates - * - * Authors: - * Dave Gilbert <dgilbert@redhat.com> - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - * - */ - -/* - * Postcopy is a migration technique where the execution flips from the - * source to the destination before all the data has been copied. - */ - -#include "qemu/osdep.h" -#include <glib.h> - -#include "qemu-common.h" -#include "migration/migration.h" -#include "migration/postcopy-ram.h" -#include "sysemu/sysemu.h" -#include "sysemu/balloon.h" -#include "qemu/error-report.h" -#include "trace.h" - -/* Arbitrary limit on size of each discard command, - * keeps them around ~200 bytes - */ -#define MAX_DISCARDS_PER_COMMAND 12 - -struct PostcopyDiscardState { - const char *ramblock_name; - uint64_t offset; /* Bitmap entry for the 1st bit of this RAMBlock */ - uint16_t cur_entry; - /* - * Start and length of a discard range (bytes) - */ - uint64_t start_list[MAX_DISCARDS_PER_COMMAND]; - uint64_t length_list[MAX_DISCARDS_PER_COMMAND]; - unsigned int nsentwords; - unsigned int nsentcmds; -}; - -/* Postcopy needs to detect accesses to pages that haven't yet been copied - * across, and efficiently map new pages in, the techniques for doing this - * are target OS specific. 
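- * On Linux this is built on userfaultfd(2); hosts without support only
- * get the failing stubs at the end of this file.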
- */
-#if defined(__linux__)
-
-#include <poll.h>
-#include <sys/mman.h>
-#include <sys/ioctl.h>
-#include <sys/syscall.h>
-#include <asm/types.h> /* for __u64 */
-#endif
-
-#if defined(__linux__) && defined(__NR_userfaultfd) && defined(CONFIG_EVENTFD)
-#include <sys/eventfd.h>
-#include <linux/userfaultfd.h>
-
-static bool ufd_version_check(int ufd)
-{
-    struct uffdio_api api_struct;
-    uint64_t ioctl_mask;
-
-    api_struct.api = UFFD_API;
-    api_struct.features = 0;
-    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
-        error_report("postcopy_ram_supported_by_host: UFFDIO_API failed: %s",
-                     strerror(errno));
-        return false;
-    }
-
-    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
-                 (__u64)1 << _UFFDIO_UNREGISTER;
-    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
-        error_report("Missing userfault features: %" PRIx64,
-                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
-        return false;
-    }
-
-    return true;
-}
-
-/*
- * Note: This has the side effect of munlock'ing all of RAM, that's
- * normally fine since if the postcopy succeeds it gets turned back on at the
- * end.
- */
-bool postcopy_ram_supported_by_host(void)
-{
-    long pagesize = getpagesize();
-    int ufd = -1;
-    bool ret = false; /* Error unless we change it */
-    void *testarea = NULL;
-    struct uffdio_register reg_struct;
-    struct uffdio_range range_struct;
-    uint64_t feature_mask;
-
-    if ((1ul << qemu_target_page_bits()) > pagesize) {
-        error_report("Target page size bigger than host page size");
-        goto out;
-    }
-
-    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
-    if (ufd == -1) {
-        error_report("%s: userfaultfd not available: %s", __func__,
-                     strerror(errno));
-        goto out;
-    }
-
-    /* Version and features check */
-    if (!ufd_version_check(ufd)) {
-        goto out;
-    }
-
-    /*
-     * userfault and mlock don't go together; we'll put it back later if
-     * it was enabled.
-     */
-    if (munlockall()) {
-        error_report("%s: munlockall: %s", __func__, strerror(errno));
-        goto out;
-    }
-
-    /*
-     * We need to check that the ops we need are supported on anon memory
-     * To do that we need to register a chunk and see the flags that
-     * are returned.
-     */
-    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
-                                    MAP_ANONYMOUS, -1, 0);
-    if (testarea == MAP_FAILED) {
-        error_report("%s: Failed to map test area: %s", __func__,
-                     strerror(errno));
-        testarea = NULL;
-        goto out;
-    }
-    g_assert(((size_t)testarea & (pagesize - 1)) == 0);
-
-    reg_struct.range.start = (uintptr_t)testarea;
-    reg_struct.range.len = pagesize;
-    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
-
-    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
-        error_report("%s userfault register: %s", __func__, strerror(errno));
-        goto out;
-    }
-
-    range_struct.start = (uintptr_t)testarea;
-    range_struct.len = pagesize;
-    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
-        error_report("%s userfault unregister: %s", __func__, strerror(errno));
-        goto out;
-    }
-
-    feature_mask = (__u64)1 << _UFFDIO_WAKE |
-                   (__u64)1 << _UFFDIO_COPY |
-                   (__u64)1 << _UFFDIO_ZEROPAGE;
-    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
-        error_report("Missing userfault map features: %" PRIx64,
-                     (uint64_t)(~reg_struct.ioctls & feature_mask));
-        goto out;
-    }
-
-    /* Success! */
-    ret = true;
-out:
-    if (testarea) {
-        munmap(testarea, pagesize);
-    }
-    if (ufd != -1) {
-        close(ufd);
-    }
-    return ret;
-}
-
-/**
- * postcopy_ram_discard_range: Discard a range of memory.
- * We can assume that if we've been called
- * postcopy_ram_supported_by_host returned true.
- *
- * @mis: Current incoming migration state.
- * @start, @length: range of memory to discard.
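- *                  The pages are dropped with madvise(MADV_DONTNEED), so
- *                  later accesses fault and can be served through userfault.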
- * - * returns: 0 on success. - */ -int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start, - size_t length) -{ - trace_postcopy_ram_discard_range(start, length); - if (madvise(start, length, MADV_DONTNEED)) { - error_report("%s MADV_DONTNEED: %s", __func__, strerror(errno)); - return -1; - } - - return 0; -} - -/* - * Setup an area of RAM so that it *can* be used for postcopy later; this - * must be done right at the start prior to pre-copy. - * opaque should be the MIS. - */ -static int init_range(const char *block_name, void *host_addr, - ram_addr_t offset, ram_addr_t length, void *opaque) -{ - MigrationIncomingState *mis = opaque; - - trace_postcopy_init_range(block_name, host_addr, offset, length); - - /* - * We need the whole of RAM to be truly empty for postcopy, so things - * like ROMs and any data tables built during init must be zero'd - * - we're going to get the copy from the source anyway. - * (Precopy will just overwrite this data, so doesn't need the discard) - */ - if (postcopy_ram_discard_range(mis, host_addr, length)) { - return -1; - } - - return 0; -} - -/* - * At the end of migration, undo the effects of init_range - * opaque should be the MIS. - */ -static int cleanup_range(const char *block_name, void *host_addr, - ram_addr_t offset, ram_addr_t length, void *opaque) -{ - MigrationIncomingState *mis = opaque; - struct uffdio_range range_struct; - trace_postcopy_cleanup_range(block_name, host_addr, offset, length); - - /* - * We turned off hugepage for the precopy stage with postcopy enabled - * we can turn it back on now. - */ - qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE); - - /* - * We can also turn off userfault now since we should have all the - * pages. It can be useful to leave it on to debug postcopy - * if you're not sure it's always getting every page. - */ - range_struct.start = (uintptr_t)host_addr; - range_struct.len = length; - - if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) { - error_report("%s: userfault unregister %s", __func__, strerror(errno)); - - return -1; - } - - return 0; -} - -/* - * Initialise postcopy-ram, setting the RAM to a state where we can go into - * postcopy later; must be called prior to any precopy. - * called from arch_init's similarly named ram_postcopy_incoming_init - */ -int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages) -{ - if (qemu_ram_foreach_block(init_range, mis)) { - return -1; - } - - return 0; -} - -/* - * At the end of a migration where postcopy_ram_incoming_init was called. 
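- * Joins the fault thread (kicked via the quit eventfd), unregisters the
- * userfault ranges, and re-enables ballooning and (if requested) mlock.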
- */ -int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis) -{ - trace_postcopy_ram_incoming_cleanup_entry(); - - if (mis->have_fault_thread) { - uint64_t tmp64; - - if (qemu_ram_foreach_block(cleanup_range, mis)) { - return -1; - } - /* - * Tell the fault_thread to exit, it's an eventfd that should - * currently be at 0, we're going to increment it to 1 - */ - tmp64 = 1; - if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) { - trace_postcopy_ram_incoming_cleanup_join(); - qemu_thread_join(&mis->fault_thread); - } else { - /* Not much we can do here, but may as well report it */ - error_report("%s: incrementing userfault_quit_fd: %s", __func__, - strerror(errno)); - } - trace_postcopy_ram_incoming_cleanup_closeuf(); - close(mis->userfault_fd); - close(mis->userfault_quit_fd); - mis->have_fault_thread = false; - } - - qemu_balloon_inhibit(false); - - if (enable_mlock) { - if (os_mlock() < 0) { - error_report("mlock: %s", strerror(errno)); - /* - * It doesn't feel right to fail at this point, we have a valid - * VM state. - */ - } - } - - postcopy_state_set(POSTCOPY_INCOMING_END); - migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0); - - if (mis->postcopy_tmp_page) { - munmap(mis->postcopy_tmp_page, getpagesize()); - mis->postcopy_tmp_page = NULL; - } - trace_postcopy_ram_incoming_cleanup_exit(); - return 0; -} - -/* - * Disable huge pages on an area - */ -static int nhp_range(const char *block_name, void *host_addr, - ram_addr_t offset, ram_addr_t length, void *opaque) -{ - trace_postcopy_nhp_range(block_name, host_addr, offset, length); - - /* - * Before we do discards we need to ensure those discards really - * do delete areas of the page, even if THP thinks a hugepage would - * be a good idea, so force hugepages off. - */ - qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE); - - return 0; -} - -/* - * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard - * however leaving it until after precopy means that most of the precopy - * data is still THPd - */ -int postcopy_ram_prepare_discard(MigrationIncomingState *mis) -{ - if (qemu_ram_foreach_block(nhp_range, mis)) { - return -1; - } - - postcopy_state_set(POSTCOPY_INCOMING_DISCARD); - - return 0; -} - -/* - * Mark the given area of RAM as requiring notification to unwritten areas - * Used as a callback on qemu_ram_foreach_block. 
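- * block_name: Name of the RAMBlock (unused here)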
- * host_addr: Base of area to mark
- * offset: Offset in the whole ram arena
- * length: Length of the section
- * opaque: MigrationIncomingState pointer
- * Returns 0 on success
- */
-static int ram_block_enable_notify(const char *block_name, void *host_addr,
-                                   ram_addr_t offset, ram_addr_t length,
-                                   void *opaque)
-{
-    MigrationIncomingState *mis = opaque;
-    struct uffdio_register reg_struct;
-
-    reg_struct.range.start = (uintptr_t)host_addr;
-    reg_struct.range.len = length;
-    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
-
-    /* Now tell our userfault_fd that it's responsible for this area */
-    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
-        error_report("%s userfault register: %s", __func__, strerror(errno));
-        return -1;
-    }
-
-    return 0;
-}
-
-/*
- * Handle faults detected by the USERFAULT markings
- */
-static void *postcopy_ram_fault_thread(void *opaque)
-{
-    MigrationIncomingState *mis = opaque;
-    struct uffd_msg msg;
-    int ret;
-    size_t hostpagesize = getpagesize();
-    RAMBlock *rb = NULL;
-    RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */
-
-    trace_postcopy_ram_fault_thread_entry();
-    qemu_sem_post(&mis->fault_thread_sem);
-
-    while (true) {
-        ram_addr_t rb_offset;
-        ram_addr_t in_raspace;
-        struct pollfd pfd[2];
-
-        /*
-         * We're mainly waiting for the kernel to give us a faulting HVA,
-         * however we can be told to quit via userfault_quit_fd which is
-         * an eventfd
-         */
-        pfd[0].fd = mis->userfault_fd;
-        pfd[0].events = POLLIN;
-        pfd[0].revents = 0;
-        pfd[1].fd = mis->userfault_quit_fd;
-        pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
-        pfd[1].revents = 0;
-
-        if (poll(pfd, 2, -1 /* Wait forever */) == -1) {
-            error_report("%s: userfault poll: %s", __func__, strerror(errno));
-            break;
-        }
-
-        if (pfd[1].revents) {
-            trace_postcopy_ram_fault_thread_quit();
-            break;
-        }
-
-        ret = read(mis->userfault_fd, &msg, sizeof(msg));
-        if (ret != sizeof(msg)) {
-            if (errno == EAGAIN) {
-                /*
-                 * if a wake up happens on the other thread just after
-                 * the poll, there is nothing to read.
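-                 * (the userfault fd is opened O_NONBLOCK, so the
-                 * raced-away event simply reads as EAGAIN)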
-                 */
-                continue;
-            }
-            if (ret < 0) {
-                error_report("%s: Failed to read full userfault message: %s",
-                             __func__, strerror(errno));
-                break;
-            } else {
-                error_report("%s: Read %d bytes from userfaultfd expected %zu",
-                             __func__, ret, sizeof(msg));
-                break; /* Lost alignment, don't know what we'd read next */
-            }
-        }
-        if (msg.event != UFFD_EVENT_PAGEFAULT) {
-            error_report("%s: Read unexpected event %u from userfaultfd",
-                         __func__, msg.event);
-            continue; /* It's not a page fault, shouldn't happen */
-        }
-
-        rb = qemu_ram_block_from_host(
-                 (void *)(uintptr_t)msg.arg.pagefault.address,
-                 true, &in_raspace, &rb_offset);
-        if (!rb) {
-            error_report("postcopy_ram_fault_thread: Fault outside guest: %"
-                         PRIx64, (uint64_t)msg.arg.pagefault.address);
-            break;
-        }
-
-        rb_offset &= ~(hostpagesize - 1);
-        trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
-                                                qemu_ram_get_idstr(rb),
-                                                rb_offset);
-
-        /*
-         * Send the request to the source - we want to request one
-         * of our host page sizes (which is >= TPS)
-         */
-        if (rb != last_rb) {
-            last_rb = rb;
-            migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
-                                      rb_offset, hostpagesize);
-        } else {
-            /* Save some space */
-            migrate_send_rp_req_pages(mis, NULL,
-                                      rb_offset, hostpagesize);
-        }
-    }
-    trace_postcopy_ram_fault_thread_exit();
-    return NULL;
-}
-
-int postcopy_ram_enable_notify(MigrationIncomingState *mis)
-{
-    /* Open the fd for the kernel to give us userfaults */
-    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
-    if (mis->userfault_fd == -1) {
-        error_report("%s: Failed to open userfault fd: %s", __func__,
-                     strerror(errno));
-        return -1;
-    }
-
-    /*
-     * Although the host check already tested the API, we need to
-     * do the check again as an ABI handshake on the new fd.
-     */
-    if (!ufd_version_check(mis->userfault_fd)) {
-        return -1;
-    }
-
-    /* Now an eventfd we use to tell the fault-thread to quit */
-    mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC);
-    if (mis->userfault_quit_fd == -1) {
-        error_report("%s: Opening userfault_quit_fd: %s", __func__,
-                     strerror(errno));
-        close(mis->userfault_fd);
-        return -1;
-    }
-
-    qemu_sem_init(&mis->fault_thread_sem, 0);
-    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
-                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
-    qemu_sem_wait(&mis->fault_thread_sem);
-    qemu_sem_destroy(&mis->fault_thread_sem);
-    mis->have_fault_thread = true;
-
-    /* Mark so that we get notified of accesses to unwritten areas */
-    if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
-        return -1;
-    }
-
-    /*
-     * Ballooning can mark pages as absent while we're postcopying
-     * that would cause false userfaults.
-     */
-    qemu_balloon_inhibit(true);
-
-    trace_postcopy_ram_enable_notify();
-
-    return 0;
-}
-
-/*
- * Place a host page (from) at (host) atomically
- * returns 0 on success
- */
-int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
-{
-    struct uffdio_copy copy_struct;
-
-    copy_struct.dst = (uint64_t)(uintptr_t)host;
-    copy_struct.src = (uint64_t)(uintptr_t)from;
-    copy_struct.len = getpagesize();
-    copy_struct.mode = 0;
-
-    /* copy also acks to the kernel waking the stalled thread up
-     * TODO: We can inhibit that ack and only do it if it was requested
-     * which would be slightly cheaper, but we'd have to be careful
-     * of the order of updating our page state.
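-     * (that would mean passing UFFDIO_COPY_MODE_DONTWAKE here and issuing
-     * a separate UFFDIO_WAKE once our own bookkeeping is consistent)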
-     */
-    if (ioctl(mis->userfault_fd, UFFDIO_COPY, &copy_struct)) {
-        int e = errno;
-        error_report("%s: %s copy host: %p from: %p",
-                     __func__, strerror(e), host, from);
-
-        return -e;
-    }
-
-    trace_postcopy_place_page(host);
-    return 0;
-}
-
-/*
- * Place a zero page at (host) atomically
- * returns 0 on success
- */
-int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
-{
-    struct uffdio_zeropage zero_struct;
-
-    zero_struct.range.start = (uint64_t)(uintptr_t)host;
-    zero_struct.range.len = getpagesize();
-    zero_struct.mode = 0;
-
-    if (ioctl(mis->userfault_fd, UFFDIO_ZEROPAGE, &zero_struct)) {
-        int e = errno;
-        error_report("%s: %s zero host: %p",
-                     __func__, strerror(e), host);
-
-        return -e;
-    }
-
-    trace_postcopy_place_page_zero(host);
-    return 0;
-}
-
-/*
- * Returns a target page of memory that can be mapped at a later point in time
- * using postcopy_place_page
- * The same address is used repeatedly, postcopy_place_page just takes the
- * backing page away.
- * Returns: Pointer to allocated page
- *
- */
-void *postcopy_get_tmp_page(MigrationIncomingState *mis)
-{
-    if (!mis->postcopy_tmp_page) {
-        mis->postcopy_tmp_page = mmap(NULL, getpagesize(),
-                                      PROT_READ | PROT_WRITE, MAP_PRIVATE |
-                                      MAP_ANONYMOUS, -1, 0);
-        if (mis->postcopy_tmp_page == MAP_FAILED) {
-            /* mmap returns MAP_FAILED, not NULL, on failure */
-            mis->postcopy_tmp_page = NULL;
-            error_report("%s: %s", __func__, strerror(errno));
-            return NULL;
-        }
-    }
-
-    return mis->postcopy_tmp_page;
-}
-
-#else
-/* No target OS support, stubs just fail */
-bool postcopy_ram_supported_by_host(void)
-{
-    error_report("%s: No OS support", __func__);
-    return false;
-}
-
-int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
-{
-    error_report("postcopy_ram_incoming_init: No OS support");
-    return -1;
-}
-
-int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
-{
-    assert(0);
-    return -1;
-}
-
-int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
-                               size_t length)
-{
-    assert(0);
-    return -1;
-}
-
-int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
-{
-    assert(0);
-    return -1;
-}
-
-int postcopy_ram_enable_notify(MigrationIncomingState *mis)
-{
-    assert(0);
-    return -1;
-}
-
-int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
-{
-    assert(0);
-    return -1;
-}
-
-int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
-{
-    assert(0);
-    return -1;
-}
-
-void *postcopy_get_tmp_page(MigrationIncomingState *mis)
-{
-    assert(0);
-    return NULL;
-}
-
-#endif
-
-/* ------------------------------------------------------------------------- */
-
-/**
- * postcopy_discard_send_init: Called at the start of each RAMBlock before
- * asking to discard individual ranges.
- *
- * @ms: The current migration state.
- * @offset: the bitmap offset of the named RAMBlock in the migration
- *          bitmap.
- * @name: RAMBlock that discards will operate on.
- *
- * returns: a new PDS.
- */
-PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
-                                                 unsigned long offset,
-                                                 const char *name)
-{
-    PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));
-
-    /* g_malloc0() aborts on allocation failure, so res is never NULL */
-    res->ramblock_name = name;
-    res->offset = offset;
-
-    return res;
-}
-
-/**
- * postcopy_discard_send_range: Called by the bitmap code for each chunk to
- * discard. May send a discard message, may just leave it queued to
- * be sent later.
- *
- * @ms: Current migration state.
- * @pds: Structure initialised by postcopy_discard_send_init().
- * @start,@length: a range of pages in the migration bitmap in the - * RAM block passed to postcopy_discard_send_init() (length=1 is one page) - */ -void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds, - unsigned long start, unsigned long length) -{ - size_t tp_bits = qemu_target_page_bits(); - /* Convert to byte offsets within the RAM block */ - pds->start_list[pds->cur_entry] = (start - pds->offset) << tp_bits; - pds->length_list[pds->cur_entry] = length << tp_bits; - trace_postcopy_discard_send_range(pds->ramblock_name, start, length); - pds->cur_entry++; - pds->nsentwords++; - - if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) { - /* Full set, ship it! */ - qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file, - pds->ramblock_name, - pds->cur_entry, - pds->start_list, - pds->length_list); - pds->nsentcmds++; - pds->cur_entry = 0; - } -} - -/** - * postcopy_discard_send_finish: Called at the end of each RAMBlock by the - * bitmap code. Sends any outstanding discard messages, frees the PDS - * - * @ms: Current migration state. - * @pds: Structure initialised by postcopy_discard_send_init(). - */ -void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds) -{ - /* Anything unsent? */ - if (pds->cur_entry) { - qemu_savevm_send_postcopy_ram_discard(ms->to_dst_file, - pds->ramblock_name, - pds->cur_entry, - pds->start_list, - pds->length_list); - pds->nsentcmds++; - } - - trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords, - pds->nsentcmds); - - g_free(pds); -} diff --git a/qemu/migration/qemu-file-buf.c b/qemu/migration/qemu-file-buf.c deleted file mode 100644 index 7b8e78e99..000000000 --- a/qemu/migration/qemu-file-buf.c +++ /dev/null @@ -1,464 +0,0 @@ -/* - * QEMU System Emulator - * - * Copyright (c) 2003-2008 Fabrice Bellard - * Copyright (c) 2014 IBM Corp. - * - * Authors: - * Stefan Berger <stefanb@linux.vnet.ibm.com> - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu/osdep.h" -#include "qemu-common.h" -#include "qemu/error-report.h" -#include "qemu/iov.h" -#include "qemu/sockets.h" -#include "qemu/coroutine.h" -#include "migration/migration.h" -#include "migration/qemu-file.h" -#include "migration/qemu-file-internal.h" -#include "trace.h" - -#define QSB_CHUNK_SIZE (1 << 10) -#define QSB_MAX_CHUNK_SIZE (16 * QSB_CHUNK_SIZE) - -/** - * Create a QEMUSizedBuffer - * This type of buffer uses scatter-gather lists internally and - * can grow to any size. 
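- * Storage is allocated in fixed-size chunks (QSB_CHUNK_SIZE, up to
- * QSB_MAX_CHUNK_SIZE), so growing the buffer never moves bytes that
- * were already written.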
- * Any data array in the scatter-gather list can hold a different
- * number of bytes.
- *
- * @buffer: Optional buffer to copy into the QSB
- * @len: size of initial buffer; if @buffer is given, buffer must
- *       hold at least len bytes
- *
- * Returns a pointer to a QEMUSizedBuffer or NULL on allocation failure
- */
-QEMUSizedBuffer *qsb_create(const uint8_t *buffer, size_t len)
-{
-    QEMUSizedBuffer *qsb;
-    size_t alloc_len, num_chunks, i, to_copy;
-    size_t chunk_size = (len > QSB_MAX_CHUNK_SIZE)
-                        ? QSB_MAX_CHUNK_SIZE
-                        : QSB_CHUNK_SIZE;
-
-    num_chunks = DIV_ROUND_UP(len ? len : QSB_CHUNK_SIZE, chunk_size);
-    alloc_len = num_chunks * chunk_size;
-
-    qsb = g_try_new0(QEMUSizedBuffer, 1);
-    if (!qsb) {
-        return NULL;
-    }
-
-    qsb->iov = g_try_new0(struct iovec, num_chunks);
-    if (!qsb->iov) {
-        g_free(qsb);
-        return NULL;
-    }
-
-    qsb->n_iov = num_chunks;
-
-    for (i = 0; i < num_chunks; i++) {
-        qsb->iov[i].iov_base = g_try_malloc0(chunk_size);
-        if (!qsb->iov[i].iov_base) {
-            /* qsb_free is safe since g_free can cope with NULL */
-            qsb_free(qsb);
-            return NULL;
-        }
-
-        qsb->iov[i].iov_len = chunk_size;
-        if (buffer) {
-            to_copy = (len - qsb->used) > chunk_size
-                      ? chunk_size : (len - qsb->used);
-            memcpy(qsb->iov[i].iov_base, &buffer[qsb->used], to_copy);
-            qsb->used += to_copy;
-        }
-    }
-
-    qsb->size = alloc_len;
-
-    return qsb;
-}
-
-/**
- * Free the QEMUSizedBuffer
- *
- * @qsb: The QEMUSizedBuffer to free
- */
-void qsb_free(QEMUSizedBuffer *qsb)
-{
-    size_t i;
-
-    if (!qsb) {
-        return;
-    }
-
-    for (i = 0; i < qsb->n_iov; i++) {
-        g_free(qsb->iov[i].iov_base);
-    }
-    g_free(qsb->iov);
-    g_free(qsb);
-}
-
-/**
- * Get the number of used bytes in the QEMUSizedBuffer
- *
- * @qsb: A QEMUSizedBuffer
- *
- * Returns the number of bytes currently used in this buffer
- */
-size_t qsb_get_length(const QEMUSizedBuffer *qsb)
-{
-    return qsb->used;
-}
-
-/**
- * Set the length of the buffer; the primary usage of this
- * function is to truncate the number of used bytes in the buffer.
- * The size will not be extended beyond the current number of
- * allocated bytes in the QEMUSizedBuffer.
- *
- * @qsb: A QEMUSizedBuffer
- * @new_len: The new length of bytes in the buffer
- *
- * Returns the number of bytes the buffer was truncated or extended
- * to.
- */
-size_t qsb_set_length(QEMUSizedBuffer *qsb, size_t new_len)
-{
-    if (new_len <= qsb->size) {
-        qsb->used = new_len;
-    } else {
-        qsb->used = qsb->size;
-    }
-    return qsb->used;
-}
-
-/**
- * Get the iovec that holds the data for a given position @pos.
- *
- * @qsb: A QEMUSizedBuffer
- * @pos: The index of a byte in the buffer
- * @d_off: Pointer to an offset that this function will indicate
- *         at what position within the returned iovec the byte
- *         is to be found
- *
- * Returns the index of the iovec that holds the byte at the given
- * index @pos in the byte stream; a negative number if the iovec
- * for the given position @pos does not exist.
- */
-static ssize_t qsb_get_iovec(const QEMUSizedBuffer *qsb,
-                             off_t pos, off_t *d_off)
-{
-    ssize_t i;
-    off_t curr = 0;
-
-    if (pos > qsb->used) {
-        return -1;
-    }
-
-    for (i = 0; i < qsb->n_iov; i++) {
-        if (curr + qsb->iov[i].iov_len > pos) {
-            *d_off = pos - curr;
-            return i;
-        }
-        curr += qsb->iov[i].iov_len;
-    }
-    return -1;
-}
-
-/*
- * Convert the QEMUSizedBuffer into a flat buffer.
- *
- * Note: If at all possible, try to avoid this function since it
- * may unnecessarily copy memory around.
- * - * @qsb: pointer to QEMUSizedBuffer - * @start: offset to start at - * @count: number of bytes to copy - * @buf: a pointer to a buffer to write into (at least @count bytes) - * - * Returns the number of bytes copied into the output buffer - */ -ssize_t qsb_get_buffer(const QEMUSizedBuffer *qsb, off_t start, - size_t count, uint8_t *buffer) -{ - const struct iovec *iov; - size_t to_copy, all_copy; - ssize_t index; - off_t s_off; - off_t d_off = 0; - char *s; - - if (start > qsb->used) { - return 0; - } - - all_copy = qsb->used - start; - if (all_copy > count) { - all_copy = count; - } else { - count = all_copy; - } - - index = qsb_get_iovec(qsb, start, &s_off); - if (index < 0) { - return 0; - } - - while (all_copy > 0) { - iov = &qsb->iov[index]; - - s = iov->iov_base; - - to_copy = iov->iov_len - s_off; - if (to_copy > all_copy) { - to_copy = all_copy; - } - memcpy(&buffer[d_off], &s[s_off], to_copy); - - d_off += to_copy; - all_copy -= to_copy; - - s_off = 0; - index++; - } - - return count; -} - -/** - * Grow the QEMUSizedBuffer to the given size and allocate - * memory for it. - * - * @qsb: A QEMUSizedBuffer - * @new_size: The new size of the buffer - * - * Return: - * a negative error code in case of memory allocation failure - * or - * the new size of the buffer. The returned size may be greater or equal - * to @new_size. - */ -static ssize_t qsb_grow(QEMUSizedBuffer *qsb, size_t new_size) -{ - size_t needed_chunks, i; - - if (qsb->size < new_size) { - struct iovec *new_iov; - size_t size_diff = new_size - qsb->size; - size_t chunk_size = (size_diff > QSB_MAX_CHUNK_SIZE) - ? QSB_MAX_CHUNK_SIZE : QSB_CHUNK_SIZE; - - needed_chunks = DIV_ROUND_UP(size_diff, chunk_size); - - new_iov = g_try_new(struct iovec, qsb->n_iov + needed_chunks); - if (new_iov == NULL) { - return -ENOMEM; - } - - /* Allocate new chunks as needed into new_iov */ - for (i = qsb->n_iov; i < qsb->n_iov + needed_chunks; i++) { - new_iov[i].iov_base = g_try_malloc0(chunk_size); - new_iov[i].iov_len = chunk_size; - if (!new_iov[i].iov_base) { - size_t j; - - /* Free previously allocated new chunks */ - for (j = qsb->n_iov; j < i; j++) { - g_free(new_iov[j].iov_base); - } - g_free(new_iov); - - return -ENOMEM; - } - } - - /* - * Now we can't get any allocation errors, copy over to new iov - * and switch. - */ - for (i = 0; i < qsb->n_iov; i++) { - new_iov[i] = qsb->iov[i]; - } - - qsb->n_iov += needed_chunks; - g_free(qsb->iov); - qsb->iov = new_iov; - qsb->size += (needed_chunks * chunk_size); - } - - return qsb->size; -} - -/** - * Write into the QEMUSizedBuffer at a given position and a given - * number of bytes. This function will automatically grow the - * QEMUSizedBuffer. 
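- *
- * A typical round trip through this API, as a sketch (using only the
- * helpers defined in this file):
- *
- *     QEMUSizedBuffer *qsb = qsb_create(NULL, 0);
- *     qsb_write_at(qsb, data, 0, len);     (grows the buffer as needed)
- *     qsb_get_buffer(qsb, 0, len, out);    (flatten it back out)
- *     qsb_free(qsb);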
- * - * @qsb: A QEMUSizedBuffer - * @source: A byte array to copy data from - * @pos: The position within the @qsb to write data to - * @size: The number of bytes to copy into the @qsb - * - * Returns @size or a negative error code in case of memory allocation failure, - * or with an invalid 'pos' - */ -ssize_t qsb_write_at(QEMUSizedBuffer *qsb, const uint8_t *source, - off_t pos, size_t count) -{ - ssize_t rc = qsb_grow(qsb, pos + count); - size_t to_copy; - size_t all_copy = count; - const struct iovec *iov; - ssize_t index; - char *dest; - off_t d_off, s_off = 0; - - if (rc < 0) { - return rc; - } - - if (pos + count > qsb->used) { - qsb->used = pos + count; - } - - index = qsb_get_iovec(qsb, pos, &d_off); - if (index < 0) { - return -EINVAL; - } - - while (all_copy > 0) { - iov = &qsb->iov[index]; - - dest = iov->iov_base; - - to_copy = iov->iov_len - d_off; - if (to_copy > all_copy) { - to_copy = all_copy; - } - - memcpy(&dest[d_off], &source[s_off], to_copy); - - s_off += to_copy; - all_copy -= to_copy; - - d_off = 0; - index++; - } - - return count; -} - -typedef struct QEMUBuffer { - QEMUSizedBuffer *qsb; - QEMUFile *file; - bool qsb_allocated; -} QEMUBuffer; - -static ssize_t buf_get_buffer(void *opaque, uint8_t *buf, int64_t pos, - size_t size) -{ - QEMUBuffer *s = opaque; - ssize_t len = qsb_get_length(s->qsb) - pos; - - if (len <= 0) { - return 0; - } - - if (len > size) { - len = size; - } - return qsb_get_buffer(s->qsb, pos, len, buf); -} - -static ssize_t buf_put_buffer(void *opaque, const uint8_t *buf, - int64_t pos, size_t size) -{ - QEMUBuffer *s = opaque; - - return qsb_write_at(s->qsb, buf, pos, size); -} - -static int buf_close(void *opaque) -{ - QEMUBuffer *s = opaque; - - if (s->qsb_allocated) { - qsb_free(s->qsb); - } - - g_free(s); - - return 0; -} - -const QEMUSizedBuffer *qemu_buf_get(QEMUFile *f) -{ - QEMUBuffer *p; - - qemu_fflush(f); - - p = f->opaque; - - return p->qsb; -} - -static const QEMUFileOps buf_read_ops = { - .get_buffer = buf_get_buffer, - .close = buf_close, -}; - -static const QEMUFileOps buf_write_ops = { - .put_buffer = buf_put_buffer, - .close = buf_close, -}; - -QEMUFile *qemu_bufopen(const char *mode, QEMUSizedBuffer *input) -{ - QEMUBuffer *s; - - if (mode == NULL || (mode[0] != 'r' && mode[0] != 'w') || - mode[1] != '\0') { - error_report("qemu_bufopen: Argument validity check failed"); - return NULL; - } - - s = g_new0(QEMUBuffer, 1); - s->qsb = input; - - if (s->qsb == NULL) { - s->qsb = qsb_create(NULL, 0); - s->qsb_allocated = true; - } - if (!s->qsb) { - g_free(s); - error_report("qemu_bufopen: qsb_create failed"); - return NULL; - } - - - if (mode[0] == 'r') { - s->file = qemu_fopen_ops(s, &buf_read_ops); - } else { - s->file = qemu_fopen_ops(s, &buf_write_ops); - } - return s->file; -} diff --git a/qemu/migration/qemu-file-internal.h b/qemu/migration/qemu-file-internal.h deleted file mode 100644 index d95e8538e..000000000 --- a/qemu/migration/qemu-file-internal.h +++ /dev/null @@ -1,53 +0,0 @@ -/* - * QEMU System Emulator - * - * Copyright (c) 2003-2008 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The 
above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#ifndef QEMU_FILE_INTERNAL_H -#define QEMU_FILE_INTERNAL_H 1 - -#include "qemu-common.h" -#include "qemu/iov.h" - -#define IO_BUF_SIZE 32768 -#define MAX_IOV_SIZE MIN(IOV_MAX, 64) - -struct QEMUFile { - const QEMUFileOps *ops; - void *opaque; - - int64_t bytes_xfer; - int64_t xfer_limit; - - int64_t pos; /* start of buffer when writing, end of buffer - when reading */ - int buf_index; - int buf_size; /* 0 when writing */ - uint8_t buf[IO_BUF_SIZE]; - - struct iovec iov[MAX_IOV_SIZE]; - unsigned int iovcnt; - - int last_error; -}; - -#endif diff --git a/qemu/migration/qemu-file-stdio.c b/qemu/migration/qemu-file-stdio.c deleted file mode 100644 index f402e8f70..000000000 --- a/qemu/migration/qemu-file-stdio.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * QEMU System Emulator - * - * Copyright (c) 2003-2008 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. 
- */ -#include "qemu/osdep.h" -#include "qemu-common.h" -#include "qemu/coroutine.h" -#include "migration/qemu-file.h" - -typedef struct QEMUFileStdio { - FILE *stdio_file; - QEMUFile *file; -} QEMUFileStdio; - -static int stdio_get_fd(void *opaque) -{ - QEMUFileStdio *s = opaque; - - return fileno(s->stdio_file); -} - -static ssize_t stdio_put_buffer(void *opaque, const uint8_t *buf, int64_t pos, - size_t size) -{ - QEMUFileStdio *s = opaque; - size_t res; - - res = fwrite(buf, 1, size, s->stdio_file); - - if (res != size) { - return -errno; - } - return res; -} - -static ssize_t stdio_get_buffer(void *opaque, uint8_t *buf, int64_t pos, - size_t size) -{ - QEMUFileStdio *s = opaque; - FILE *fp = s->stdio_file; - ssize_t bytes; - - for (;;) { - clearerr(fp); - bytes = fread(buf, 1, size, fp); - if (bytes != 0 || !ferror(fp)) { - break; - } - if (errno == EAGAIN) { - yield_until_fd_readable(fileno(fp)); - } else if (errno != EINTR) { - break; - } - } - return bytes; -} - -static int stdio_pclose(void *opaque) -{ - QEMUFileStdio *s = opaque; - int ret; - ret = pclose(s->stdio_file); - if (ret == -1) { - ret = -errno; - } else if (!WIFEXITED(ret) || WEXITSTATUS(ret) != 0) { - /* close succeeded, but non-zero exit code: */ - ret = -EIO; /* fake errno value */ - } - g_free(s); - return ret; -} - -static int stdio_fclose(void *opaque) -{ - QEMUFileStdio *s = opaque; - int ret = 0; - - if (qemu_file_is_writable(s->file)) { - int fd = fileno(s->stdio_file); - struct stat st; - - ret = fstat(fd, &st); - if (ret == 0 && S_ISREG(st.st_mode)) { - /* - * If the file handle is a regular file make sure the - * data is flushed to disk before signaling success. - */ - ret = fsync(fd); - if (ret != 0) { - ret = -errno; - return ret; - } - } - } - if (fclose(s->stdio_file) == EOF) { - ret = -errno; - } - g_free(s); - return ret; -} - -static const QEMUFileOps stdio_pipe_read_ops = { - .get_fd = stdio_get_fd, - .get_buffer = stdio_get_buffer, - .close = stdio_pclose -}; - -static const QEMUFileOps stdio_pipe_write_ops = { - .get_fd = stdio_get_fd, - .put_buffer = stdio_put_buffer, - .close = stdio_pclose -}; - -QEMUFile *qemu_popen_cmd(const char *command, const char *mode) -{ - FILE *stdio_file; - QEMUFileStdio *s; - - if (mode == NULL || (mode[0] != 'r' && mode[0] != 'w') || mode[1] != 0) { - fprintf(stderr, "qemu_popen: Argument validity check failed\n"); - return NULL; - } - - stdio_file = popen(command, mode); - if (stdio_file == NULL) { - return NULL; - } - - s = g_new0(QEMUFileStdio, 1); - - s->stdio_file = stdio_file; - - if (mode[0] == 'r') { - s->file = qemu_fopen_ops(s, &stdio_pipe_read_ops); - } else { - s->file = qemu_fopen_ops(s, &stdio_pipe_write_ops); - } - return s->file; -} - -static const QEMUFileOps stdio_file_read_ops = { - .get_fd = stdio_get_fd, - .get_buffer = stdio_get_buffer, - .close = stdio_fclose -}; - -static const QEMUFileOps stdio_file_write_ops = { - .get_fd = stdio_get_fd, - .put_buffer = stdio_put_buffer, - .close = stdio_fclose -}; - -QEMUFile *qemu_fopen(const char *filename, const char *mode) -{ - QEMUFileStdio *s; - - if (qemu_file_mode_is_not_valid(mode)) { - return NULL; - } - - s = g_new0(QEMUFileStdio, 1); - - s->stdio_file = fopen(filename, mode); - if (!s->stdio_file) { - goto fail; - } - - if (mode[0] == 'w') { - s->file = qemu_fopen_ops(s, &stdio_file_write_ops); - } else { - s->file = qemu_fopen_ops(s, &stdio_file_read_ops); - } - return s->file; -fail: - g_free(s); - return NULL; -} diff --git a/qemu/migration/qemu-file-unix.c b/qemu/migration/qemu-file-unix.c 
deleted file mode 100644 index 4474e18ff..000000000 --- a/qemu/migration/qemu-file-unix.c +++ /dev/null @@ -1,323 +0,0 @@ -/* - * QEMU System Emulator - * - * Copyright (c) 2003-2008 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu/osdep.h" -#include "qemu-common.h" -#include "qemu/error-report.h" -#include "qemu/iov.h" -#include "qemu/sockets.h" -#include "qemu/coroutine.h" -#include "migration/qemu-file.h" -#include "migration/qemu-file-internal.h" - -typedef struct QEMUFileSocket { - int fd; - QEMUFile *file; -} QEMUFileSocket; - -static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, - int64_t pos) -{ - QEMUFileSocket *s = opaque; - ssize_t len; - ssize_t size = iov_size(iov, iovcnt); - ssize_t offset = 0; - int err; - - while (size > 0) { - len = iov_send(s->fd, iov, iovcnt, offset, size); - - if (len > 0) { - size -= len; - offset += len; - } - - if (size > 0) { - if (errno != EAGAIN && errno != EWOULDBLOCK) { - error_report("socket_writev_buffer: Got err=%d for (%zu/%zu)", - errno, (size_t)size, (size_t)len); - /* - * If I've already sent some but only just got the error, I - * could return the amount validly sent so far and wait for the - * next call to report the error, but I'd rather flag the error - * immediately. - */ - return -errno; - } - - /* Emulate blocking */ - GPollFD pfd; - - pfd.fd = s->fd; - pfd.events = G_IO_OUT | G_IO_ERR; - pfd.revents = 0; - TFR(err = g_poll(&pfd, 1, -1 /* no timeout */)); - /* Errors other than EINTR intentionally ignored */ - } - } - - return offset; -} - -static int socket_get_fd(void *opaque) -{ - QEMUFileSocket *s = opaque; - - return s->fd; -} - -static ssize_t socket_get_buffer(void *opaque, uint8_t *buf, int64_t pos, - size_t size) -{ - QEMUFileSocket *s = opaque; - ssize_t len; - - for (;;) { - len = qemu_recv(s->fd, buf, size, 0); - if (len != -1) { - break; - } - if (errno == EAGAIN) { - yield_until_fd_readable(s->fd); - } else if (errno != EINTR) { - break; - } - } - - if (len == -1) { - len = -errno; - } - return len; -} - -static int socket_close(void *opaque) -{ - QEMUFileSocket *s = opaque; - closesocket(s->fd); - g_free(s); - return 0; -} - -static int socket_shutdown(void *opaque, bool rd, bool wr) -{ - QEMUFileSocket *s = opaque; - - if (shutdown(s->fd, rd ? (wr ? 
SHUT_RDWR : SHUT_RD) : SHUT_WR)) { - return -errno; - } else { - return 0; - } -} - -static int socket_return_close(void *opaque) -{ - QEMUFileSocket *s = opaque; - /* - * Note: We don't close the socket, that should be done by the forward - * path. - */ - g_free(s); - return 0; -} - -static const QEMUFileOps socket_return_read_ops = { - .get_fd = socket_get_fd, - .get_buffer = socket_get_buffer, - .close = socket_return_close, - .shut_down = socket_shutdown, -}; - -static const QEMUFileOps socket_return_write_ops = { - .get_fd = socket_get_fd, - .writev_buffer = socket_writev_buffer, - .close = socket_return_close, - .shut_down = socket_shutdown, -}; - -/* - * Give a QEMUFile* off the same socket but data in the opposite - * direction. - */ -static QEMUFile *socket_get_return_path(void *opaque) -{ - QEMUFileSocket *forward = opaque; - QEMUFileSocket *reverse; - - if (qemu_file_get_error(forward->file)) { - /* If the forward file is in error, don't try and open a return */ - return NULL; - } - - reverse = g_malloc0(sizeof(QEMUFileSocket)); - reverse->fd = forward->fd; - /* I don't think there's a better way to tell which direction 'this' is */ - if (forward->file->ops->get_buffer != NULL) { - /* being called from the read side, so we need to be able to write */ - return qemu_fopen_ops(reverse, &socket_return_write_ops); - } else { - return qemu_fopen_ops(reverse, &socket_return_read_ops); - } -} - -static ssize_t unix_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, - int64_t pos) -{ - QEMUFileSocket *s = opaque; - ssize_t len, offset; - ssize_t size = iov_size(iov, iovcnt); - ssize_t total = 0; - - assert(iovcnt > 0); - offset = 0; - while (size > 0) { - /* Find the next start position; skip all full-sized vector elements */ - while (offset >= iov[0].iov_len) { - offset -= iov[0].iov_len; - iov++, iovcnt--; - } - - /* skip `offset' bytes from the (now) first element, undo it on exit */ - assert(iovcnt > 0); - iov[0].iov_base += offset; - iov[0].iov_len -= offset; - - do { - len = writev(s->fd, iov, iovcnt); - } while (len == -1 && errno == EINTR); - if (len == -1) { - return -errno; - } - - /* Undo the changes above */ - iov[0].iov_base -= offset; - iov[0].iov_len += offset; - - /* Prepare for the next iteration */ - offset += len; - total += len; - size -= len; - } - - return total; -} - -static ssize_t unix_get_buffer(void *opaque, uint8_t *buf, int64_t pos, - size_t size) -{ - QEMUFileSocket *s = opaque; - ssize_t len; - - for (;;) { - len = read(s->fd, buf, size); - if (len != -1) { - break; - } - if (errno == EAGAIN) { - yield_until_fd_readable(s->fd); - } else if (errno != EINTR) { - break; - } - } - - if (len == -1) { - len = -errno; - } - return len; -} - -static int unix_close(void *opaque) -{ - QEMUFileSocket *s = opaque; - close(s->fd); - g_free(s); - return 0; -} - -static const QEMUFileOps unix_read_ops = { - .get_fd = socket_get_fd, - .get_buffer = unix_get_buffer, - .close = unix_close -}; - -static const QEMUFileOps unix_write_ops = { - .get_fd = socket_get_fd, - .writev_buffer = unix_writev_buffer, - .close = unix_close -}; - -QEMUFile *qemu_fdopen(int fd, const char *mode) -{ - QEMUFileSocket *s; - - if (mode == NULL || - (mode[0] != 'r' && mode[0] != 'w') || - mode[1] != 'b' || mode[2] != 0) { - fprintf(stderr, "qemu_fdopen: Argument validity check failed\n"); - return NULL; - } - - s = g_new0(QEMUFileSocket, 1); - s->fd = fd; - - if (mode[0] == 'r') { - s->file = qemu_fopen_ops(s, &unix_read_ops); - } else { - s->file = qemu_fopen_ops(s, &unix_write_ops); - } 
- return s->file; -} - -static const QEMUFileOps socket_read_ops = { - .get_fd = socket_get_fd, - .get_buffer = socket_get_buffer, - .close = socket_close, - .shut_down = socket_shutdown, - .get_return_path = socket_get_return_path -}; - -static const QEMUFileOps socket_write_ops = { - .get_fd = socket_get_fd, - .writev_buffer = socket_writev_buffer, - .close = socket_close, - .shut_down = socket_shutdown, - .get_return_path = socket_get_return_path -}; - -QEMUFile *qemu_fopen_socket(int fd, const char *mode) -{ - QEMUFileSocket *s; - - if (qemu_file_mode_is_not_valid(mode)) { - return NULL; - } - - s = g_new0(QEMUFileSocket, 1); - s->fd = fd; - if (mode[0] == 'w') { - qemu_set_block(s->fd); - s->file = qemu_fopen_ops(s, &socket_write_ops); - } else { - s->file = qemu_fopen_ops(s, &socket_read_ops); - } - return s->file; -} diff --git a/qemu/migration/qemu-file.c b/qemu/migration/qemu-file.c deleted file mode 100644 index 6f4a1299b..000000000 --- a/qemu/migration/qemu-file.c +++ /dev/null @@ -1,678 +0,0 @@ -/* - * QEMU System Emulator - * - * Copyright (c) 2003-2008 Fabrice Bellard - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu/osdep.h" -#include <zlib.h> -#include "qemu-common.h" -#include "qemu/error-report.h" -#include "qemu/iov.h" -#include "qemu/sockets.h" -#include "qemu/coroutine.h" -#include "migration/migration.h" -#include "migration/qemu-file.h" -#include "migration/qemu-file-internal.h" -#include "trace.h" - -/* - * Stop a file from being read/written - not all backing files can do this - * typically only sockets can. 
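/*
 * Usage sketch (not from the deleted file): wiring a connected socket pair
 * into the API above.  qemu_fopen_socket() and qemu_file_get_return_path()
 * are the functions defined in this file; the socketpair() setup is only
 * for illustration.
 */
static void socket_stream_example(void)
{
    int fds[2];

    if (socketpair(AF_UNIX, SOCK_STREAM, 0, fds) < 0) {
        return;
    }
    /* "wb" marks the sending side; qemu_fopen_socket() makes it blocking */
    QEMUFile *to_dst = qemu_fopen_socket(fds[0], "wb");
    QEMUFile *from_src = qemu_fopen_socket(fds[1], "rb");
    /* the read side can open a write-direction channel on the same fd */
    QEMUFile *return_path = qemu_file_get_return_path(from_src);

    /* ... exchange data, then close each QEMUFile ... */
    qemu_fclose(return_path);
    qemu_fclose(to_dst);
    qemu_fclose(from_src);
}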
- */ -int qemu_file_shutdown(QEMUFile *f) -{ - if (!f->ops->shut_down) { - return -ENOSYS; - } - return f->ops->shut_down(f->opaque, true, true); -} - -/* - * Result: QEMUFile* for a 'return path' for comms in the opposite direction - * NULL if not available - */ -QEMUFile *qemu_file_get_return_path(QEMUFile *f) -{ - if (!f->ops->get_return_path) { - return NULL; - } - return f->ops->get_return_path(f->opaque); -} - -bool qemu_file_mode_is_not_valid(const char *mode) -{ - if (mode == NULL || - (mode[0] != 'r' && mode[0] != 'w') || - mode[1] != 'b' || mode[2] != 0) { - fprintf(stderr, "qemu_fopen: Argument validity check failed\n"); - return true; - } - - return false; -} - -QEMUFile *qemu_fopen_ops(void *opaque, const QEMUFileOps *ops) -{ - QEMUFile *f; - - f = g_new0(QEMUFile, 1); - - f->opaque = opaque; - f->ops = ops; - return f; -} - -/* - * Get last error for stream f - * - * Return negative error value if there has been an error on previous - * operations, return 0 if no error happened. - * - */ -int qemu_file_get_error(QEMUFile *f) -{ - return f->last_error; -} - -void qemu_file_set_error(QEMUFile *f, int ret) -{ - if (f->last_error == 0) { - f->last_error = ret; - } -} - -bool qemu_file_is_writable(QEMUFile *f) -{ - return f->ops->writev_buffer || f->ops->put_buffer; -} - -/** - * Flushes QEMUFile buffer - * - * If there is writev_buffer QEMUFileOps it uses it otherwise uses - * put_buffer ops. - */ -void qemu_fflush(QEMUFile *f) -{ - ssize_t ret = 0; - - if (!qemu_file_is_writable(f)) { - return; - } - - if (f->ops->writev_buffer) { - if (f->iovcnt > 0) { - ret = f->ops->writev_buffer(f->opaque, f->iov, f->iovcnt, f->pos); - } - } else { - if (f->buf_index > 0) { - ret = f->ops->put_buffer(f->opaque, f->buf, f->pos, f->buf_index); - } - } - if (ret >= 0) { - f->pos += ret; - } - f->buf_index = 0; - f->iovcnt = 0; - if (ret < 0) { - qemu_file_set_error(f, ret); - } -} - -void ram_control_before_iterate(QEMUFile *f, uint64_t flags) -{ - int ret = 0; - - if (f->ops->before_ram_iterate) { - ret = f->ops->before_ram_iterate(f, f->opaque, flags, NULL); - if (ret < 0) { - qemu_file_set_error(f, ret); - } - } -} - -void ram_control_after_iterate(QEMUFile *f, uint64_t flags) -{ - int ret = 0; - - if (f->ops->after_ram_iterate) { - ret = f->ops->after_ram_iterate(f, f->opaque, flags, NULL); - if (ret < 0) { - qemu_file_set_error(f, ret); - } - } -} - -void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data) -{ - int ret = -EINVAL; - - if (f->ops->hook_ram_load) { - ret = f->ops->hook_ram_load(f, f->opaque, flags, data); - if (ret < 0) { - qemu_file_set_error(f, ret); - } - } else { - /* - * Hook is a hook specifically requested by the source sending a flag - * that expects there to be a hook on the destination. - */ - if (flags == RAM_CONTROL_HOOK) { - qemu_file_set_error(f, ret); - } - } -} - -size_t ram_control_save_page(QEMUFile *f, ram_addr_t block_offset, - ram_addr_t offset, size_t size, - uint64_t *bytes_sent) -{ - if (f->ops->save_page) { - int ret = f->ops->save_page(f, f->opaque, block_offset, - offset, size, bytes_sent); - - if (ret != RAM_SAVE_CONTROL_DELAYED) { - if (bytes_sent && *bytes_sent > 0) { - qemu_update_position(f, *bytes_sent); - } else if (ret < 0) { - qemu_file_set_error(f, ret); - } - } - - return ret; - } - - return RAM_SAVE_CONTROL_NOT_SUPP; -} - -/* - * Attempt to fill the buffer from the underlying file - * Returns the number of bytes read, or negative value for an error. 
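/*
 * A usage sketch of the error model implemented above: because
 * qemu_file_set_error() only latches the *first* failure, a writer can
 * issue a batch of puts and test the stream once at the end.  The record
 * layout here is invented for illustration.
 */
static int put_example_record(QEMUFile *f)
{
    qemu_put_be32(f, 0x51454d55);                    /* arbitrary magic */
    qemu_put_be32(f, 1);                             /* version */
    qemu_put_buffer(f, (const uint8_t *)"hello", 5); /* payload */

    /* one check covers all three puts */
    return qemu_file_get_error(f);
}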
- * - * Note that it can return a partially full buffer even in a not error/not EOF - * case if the underlying file descriptor gives a short read, and that can - * happen even on a blocking fd. - */ -static ssize_t qemu_fill_buffer(QEMUFile *f) -{ - int len; - int pending; - - assert(!qemu_file_is_writable(f)); - - pending = f->buf_size - f->buf_index; - if (pending > 0) { - memmove(f->buf, f->buf + f->buf_index, pending); - } - f->buf_index = 0; - f->buf_size = pending; - - len = f->ops->get_buffer(f->opaque, f->buf + pending, f->pos, - IO_BUF_SIZE - pending); - if (len > 0) { - f->buf_size += len; - f->pos += len; - } else if (len == 0) { - qemu_file_set_error(f, -EIO); - } else if (len != -EAGAIN) { - qemu_file_set_error(f, len); - } - - return len; -} - -int qemu_get_fd(QEMUFile *f) -{ - if (f->ops->get_fd) { - return f->ops->get_fd(f->opaque); - } - return -1; -} - -void qemu_update_position(QEMUFile *f, size_t size) -{ - f->pos += size; -} - -/** Closes the file - * - * Returns negative error value if any error happened on previous operations or - * while closing the file. Returns 0 or positive number on success. - * - * The meaning of return value on success depends on the specific backend - * being used. - */ -int qemu_fclose(QEMUFile *f) -{ - int ret; - qemu_fflush(f); - ret = qemu_file_get_error(f); - - if (f->ops->close) { - int ret2 = f->ops->close(f->opaque); - if (ret >= 0) { - ret = ret2; - } - } - /* If any error was spotted before closing, we should report it - * instead of the close() return value. - */ - if (f->last_error) { - ret = f->last_error; - } - g_free(f); - trace_qemu_file_fclose(); - return ret; -} - -static void add_to_iovec(QEMUFile *f, const uint8_t *buf, size_t size) -{ - /* check for adjacent buffer and coalesce them */ - if (f->iovcnt > 0 && buf == f->iov[f->iovcnt - 1].iov_base + - f->iov[f->iovcnt - 1].iov_len) { - f->iov[f->iovcnt - 1].iov_len += size; - } else { - f->iov[f->iovcnt].iov_base = (uint8_t *)buf; - f->iov[f->iovcnt++].iov_len = size; - } - - if (f->iovcnt >= MAX_IOV_SIZE) { - qemu_fflush(f); - } -} - -void qemu_put_buffer_async(QEMUFile *f, const uint8_t *buf, size_t size) -{ - if (!f->ops->writev_buffer) { - qemu_put_buffer(f, buf, size); - return; - } - - if (f->last_error) { - return; - } - - f->bytes_xfer += size; - add_to_iovec(f, buf, size); -} - -void qemu_put_buffer(QEMUFile *f, const uint8_t *buf, size_t size) -{ - size_t l; - - if (f->last_error) { - return; - } - - while (size > 0) { - l = IO_BUF_SIZE - f->buf_index; - if (l > size) { - l = size; - } - memcpy(f->buf + f->buf_index, buf, l); - f->bytes_xfer += l; - if (f->ops->writev_buffer) { - add_to_iovec(f, f->buf + f->buf_index, l); - } - f->buf_index += l; - if (f->buf_index == IO_BUF_SIZE) { - qemu_fflush(f); - } - if (qemu_file_get_error(f)) { - break; - } - buf += l; - size -= l; - } -} - -void qemu_put_byte(QEMUFile *f, int v) -{ - if (f->last_error) { - return; - } - - f->buf[f->buf_index] = v; - f->bytes_xfer++; - if (f->ops->writev_buffer) { - add_to_iovec(f, f->buf + f->buf_index, 1); - } - f->buf_index++; - if (f->buf_index == IO_BUF_SIZE) { - qemu_fflush(f); - } -} - -void qemu_file_skip(QEMUFile *f, int size) -{ - if (f->buf_index + size <= f->buf_size) { - f->buf_index += size; - } -} - -/* - * Read 'size' bytes from file (at 'offset') without moving the - * pointer and set 'buf' to point to that data. 
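/*
 * Standalone sketch of the coalescing rule used by add_to_iovec() above:
 * a chunk that begins exactly where the previous iovec entry ended simply
 * extends that entry instead of consuming a new slot (struct iovec is the
 * usual <sys/uio.h> type; the helper itself is illustrative).
 */
static int coalesce_iov(struct iovec *iov, int iovcnt,
                        const void *buf, size_t size)
{
    if (iovcnt > 0 &&
        buf == (char *)iov[iovcnt - 1].iov_base + iov[iovcnt - 1].iov_len) {
        iov[iovcnt - 1].iov_len += size;    /* adjacent: merge */
        return iovcnt;
    }
    iov[iovcnt].iov_base = (void *)buf;     /* not adjacent: new entry */
    iov[iovcnt].iov_len = size;
    return iovcnt + 1;
}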
- * - * It will return size bytes unless there was an error, in which case it will - * return as many as it managed to read (assuming blocking fd's which - * all current QEMUFile are) - */ -size_t qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t size, size_t offset) -{ - ssize_t pending; - size_t index; - - assert(!qemu_file_is_writable(f)); - assert(offset < IO_BUF_SIZE); - assert(size <= IO_BUF_SIZE - offset); - - /* The 1st byte to read from */ - index = f->buf_index + offset; - /* The number of available bytes starting at index */ - pending = f->buf_size - index; - - /* - * qemu_fill_buffer might return just a few bytes, even when there isn't - * an error, so loop collecting them until we get enough. - */ - while (pending < size) { - int received = qemu_fill_buffer(f); - - if (received <= 0) { - break; - } - - index = f->buf_index + offset; - pending = f->buf_size - index; - } - - if (pending <= 0) { - return 0; - } - if (size > pending) { - size = pending; - } - - *buf = f->buf + index; - return size; -} - -/* - * Read 'size' bytes of data from the file into buf. - * 'size' can be larger than the internal buffer. - * - * It will return size bytes unless there was an error, in which case it will - * return as many as it managed to read (assuming blocking fd's which - * all current QEMUFile are) - */ -size_t qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size) -{ - size_t pending = size; - size_t done = 0; - - while (pending > 0) { - size_t res; - uint8_t *src; - - res = qemu_peek_buffer(f, &src, MIN(pending, IO_BUF_SIZE), 0); - if (res == 0) { - return done; - } - memcpy(buf, src, res); - qemu_file_skip(f, res); - buf += res; - pending -= res; - done += res; - } - return done; -} - -/* - * Read 'size' bytes of data from the file. - * 'size' can be larger than the internal buffer. - * - * The data: - * may be held on an internal buffer (in which case *buf is updated - * to point to it) that is valid until the next qemu_file operation. - * OR - * will be copied to the *buf that was passed in. - * - * The code tries to avoid the copy if possible. - * - * It will return size bytes unless there was an error, in which case it will - * return as many as it managed to read (assuming blocking fd's which - * all current QEMUFile are) - * - * Note: Since **buf may get changed, the caller should take care to - * keep a pointer to the original buffer if it needs to deallocate it. - */ -size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size) -{ - if (size < IO_BUF_SIZE) { - size_t res; - uint8_t *src; - - res = qemu_peek_buffer(f, &src, size, 0); - - if (res == size) { - qemu_file_skip(f, res); - *buf = src; - return res; - } - } - - return qemu_get_buffer(f, *buf, size); -} - -/* - * Peeks a single byte from the buffer; this isn't guaranteed to work if - * offset leaves a gap after the previous read/peeked data. 
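/*
 * Reader-side sketch for the functions above: qemu_get_buffer() returns a
 * short count only when the stream failed, so callers compare against the
 * requested size.  The helper name is invented.
 */
static int get_example_payload(QEMUFile *f, uint8_t *payload, size_t want)
{
    size_t got = qemu_get_buffer(f, payload, want);

    if (got != want) {
        int err = qemu_file_get_error(f);
        return err ? err : -EIO;    /* short read implies a latched error */
    }
    return 0;
}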
- */ -int qemu_peek_byte(QEMUFile *f, int offset) -{ - int index = f->buf_index + offset; - - assert(!qemu_file_is_writable(f)); - assert(offset < IO_BUF_SIZE); - - if (index >= f->buf_size) { - qemu_fill_buffer(f); - index = f->buf_index + offset; - if (index >= f->buf_size) { - return 0; - } - } - return f->buf[index]; -} - -int qemu_get_byte(QEMUFile *f) -{ - int result; - - result = qemu_peek_byte(f, 0); - qemu_file_skip(f, 1); - return result; -} - -int64_t qemu_ftell_fast(QEMUFile *f) -{ - int64_t ret = f->pos; - int i; - - if (f->ops->writev_buffer) { - for (i = 0; i < f->iovcnt; i++) { - ret += f->iov[i].iov_len; - } - } else { - ret += f->buf_index; - } - - return ret; -} - -int64_t qemu_ftell(QEMUFile *f) -{ - qemu_fflush(f); - return f->pos; -} - -int qemu_file_rate_limit(QEMUFile *f) -{ - if (qemu_file_get_error(f)) { - return 1; - } - if (f->xfer_limit > 0 && f->bytes_xfer > f->xfer_limit) { - return 1; - } - return 0; -} - -int64_t qemu_file_get_rate_limit(QEMUFile *f) -{ - return f->xfer_limit; -} - -void qemu_file_set_rate_limit(QEMUFile *f, int64_t limit) -{ - f->xfer_limit = limit; -} - -void qemu_file_reset_rate_limit(QEMUFile *f) -{ - f->bytes_xfer = 0; -} - -void qemu_put_be16(QEMUFile *f, unsigned int v) -{ - qemu_put_byte(f, v >> 8); - qemu_put_byte(f, v); -} - -void qemu_put_be32(QEMUFile *f, unsigned int v) -{ - qemu_put_byte(f, v >> 24); - qemu_put_byte(f, v >> 16); - qemu_put_byte(f, v >> 8); - qemu_put_byte(f, v); -} - -void qemu_put_be64(QEMUFile *f, uint64_t v) -{ - qemu_put_be32(f, v >> 32); - qemu_put_be32(f, v); -} - -unsigned int qemu_get_be16(QEMUFile *f) -{ - unsigned int v; - v = qemu_get_byte(f) << 8; - v |= qemu_get_byte(f); - return v; -} - -unsigned int qemu_get_be32(QEMUFile *f) -{ - unsigned int v; - v = (unsigned int)qemu_get_byte(f) << 24; - v |= qemu_get_byte(f) << 16; - v |= qemu_get_byte(f) << 8; - v |= qemu_get_byte(f); - return v; -} - -uint64_t qemu_get_be64(QEMUFile *f) -{ - uint64_t v; - v = (uint64_t)qemu_get_be32(f) << 32; - v |= qemu_get_be32(f); - return v; -} - -/* compress size bytes of data start at p with specific compression - * level and store the compressed data to the buffer of f. - */ - -ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size, - int level) -{ - ssize_t blen = IO_BUF_SIZE - f->buf_index - sizeof(int32_t); - - if (blen < compressBound(size)) { - return 0; - } - if (compress2(f->buf + f->buf_index + sizeof(int32_t), (uLongf *)&blen, - (Bytef *)p, size, level) != Z_OK) { - error_report("Compress Failed!"); - return 0; - } - qemu_put_be32(f, blen); - f->buf_index += blen; - return blen + sizeof(int32_t); -} - -/* Put the data in the buffer of f_src to the buffer of f_des, and - * then reset the buf_index of f_src to 0. - */ - -int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src) -{ - int len = 0; - - if (f_src->buf_index > 0) { - len = f_src->buf_index; - qemu_put_buffer(f_des, f_src->buf, f_src->buf_index); - f_src->buf_index = 0; - } - return len; -} - -/* - * Get a string whose length is determined by a single preceding byte - * A preallocated 256 byte buffer must be passed in. - * Returns: len on success and a 0 terminated string in the buffer - * else 0 - * (Note a 0 length string will return 0 either way) - */ -size_t qemu_get_counted_string(QEMUFile *f, char buf[256]) -{ - size_t len = qemu_get_byte(f); - size_t res = qemu_get_buffer(f, (uint8_t *)buf, len); - - buf[res] = 0; - - return res == len ? res : 0; -} - -/* - * Set the blocking state of the QEMUFile. 
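/*
 * qemu_get_counted_string() above implies the matching writer: one length
 * byte followed by the raw bytes, with no trailing NUL on the wire.  A
 * sketch of that encoder (the function name is illustrative):
 */
static void put_counted_string_example(QEMUFile *f, const char *str)
{
    size_t len = strlen(str);

    assert(len < 256);    /* must fit the single count byte */
    qemu_put_byte(f, len);
    qemu_put_buffer(f, (const uint8_t *)str, len);
}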
- * Note: On some transports the OS only keeps a single blocking state for - * both directions, and thus changing the blocking on the main - * QEMUFile can also affect the return path. - */ -void qemu_file_set_blocking(QEMUFile *f, bool block) -{ - if (block) { - qemu_set_block(qemu_get_fd(f)); - } else { - qemu_set_nonblock(qemu_get_fd(f)); - } -} diff --git a/qemu/migration/ram.c b/qemu/migration/ram.c deleted file mode 100644 index 3f057388c..000000000 --- a/qemu/migration/ram.c +++ /dev/null @@ -1,2561 +0,0 @@ -/* - * QEMU System Emulator - * - * Copyright (c) 2003-2008 Fabrice Bellard - * Copyright (c) 2011-2015 Red Hat Inc - * - * Authors: - * Juan Quintela <quintela@redhat.com> - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ -#include "qemu/osdep.h" -#include <zlib.h> -#include "qapi-event.h" -#include "qemu/cutils.h" -#include "qemu/bitops.h" -#include "qemu/bitmap.h" -#include "qemu/timer.h" -#include "qemu/main-loop.h" -#include "migration/migration.h" -#include "migration/postcopy-ram.h" -#include "exec/address-spaces.h" -#include "migration/page_cache.h" -#include "qemu/error-report.h" -#include "trace.h" -#include "exec/ram_addr.h" -#include "qemu/rcu_queue.h" - -#ifdef DEBUG_MIGRATION_RAM -#define DPRINTF(fmt, ...) \ - do { fprintf(stdout, "migration_ram: " fmt, ## __VA_ARGS__); } while (0) -#else -#define DPRINTF(fmt, ...) \ - do { } while (0) -#endif - -static int dirty_rate_high_cnt; - -static uint64_t bitmap_sync_count; - -/***********************************************************/ -/* ram save/restore */ - -#define RAM_SAVE_FLAG_FULL 0x01 /* Obsolete, not used anymore */ -#define RAM_SAVE_FLAG_COMPRESS 0x02 -#define RAM_SAVE_FLAG_MEM_SIZE 0x04 -#define RAM_SAVE_FLAG_PAGE 0x08 -#define RAM_SAVE_FLAG_EOS 0x10 -#define RAM_SAVE_FLAG_CONTINUE 0x20 -#define RAM_SAVE_FLAG_XBZRLE 0x40 -/* 0x80 is reserved in migration.h start with 0x100 next */ -#define RAM_SAVE_FLAG_COMPRESS_PAGE 0x100 - -static const uint8_t ZERO_TARGET_PAGE[TARGET_PAGE_SIZE]; - -static inline bool is_zero_range(uint8_t *p, uint64_t size) -{ - return buffer_find_nonzero_offset(p, size) == size; -} - -/* struct contains XBZRLE cache and a static page - used by the compression */ -static struct { - /* buffer used for XBZRLE encoding */ - uint8_t *encoded_buf; - /* buffer for storing page content */ - uint8_t *current_buf; - /* Cache for XBZRLE, Protected by lock. 
*/
-    PageCache *cache;
-    QemuMutex lock;
-} XBZRLE;
-
-/* buffer used for XBZRLE decoding */
-static uint8_t *xbzrle_decoded_buf;
-
-static void XBZRLE_cache_lock(void)
-{
-    if (migrate_use_xbzrle())
-        qemu_mutex_lock(&XBZRLE.lock);
-}
-
-static void XBZRLE_cache_unlock(void)
-{
-    if (migrate_use_xbzrle())
-        qemu_mutex_unlock(&XBZRLE.lock);
-}
-
-/*
- * called from qmp_migrate_set_cache_size in main thread, possibly while
- * a migration is in progress.
- * A running migration may be using the cache and might finish during this
- * call, hence changes to the cache are protected by XBZRLE.lock().
- */
-int64_t xbzrle_cache_resize(int64_t new_size)
-{
-    PageCache *new_cache;
-    int64_t ret;
-
-    if (new_size < TARGET_PAGE_SIZE) {
-        return -1;
-    }
-
-    XBZRLE_cache_lock();
-
-    if (XBZRLE.cache != NULL) {
-        if (pow2floor(new_size) == migrate_xbzrle_cache_size()) {
-            goto out_new_size;
-        }
-        new_cache = cache_init(new_size / TARGET_PAGE_SIZE,
-                               TARGET_PAGE_SIZE);
-        if (!new_cache) {
-            error_report("Error creating cache");
-            ret = -1;
-            goto out;
-        }
-
-        cache_fini(XBZRLE.cache);
-        XBZRLE.cache = new_cache;
-    }
-
-out_new_size:
-    ret = pow2floor(new_size);
-out:
-    XBZRLE_cache_unlock();
-    return ret;
-}
-
-/* accounting for migration statistics */
-typedef struct AccountingInfo {
-    uint64_t dup_pages;
-    uint64_t skipped_pages;
-    uint64_t norm_pages;
-    uint64_t iterations;
-    uint64_t xbzrle_bytes;
-    uint64_t xbzrle_pages;
-    uint64_t xbzrle_cache_miss;
-    double xbzrle_cache_miss_rate;
-    uint64_t xbzrle_overflows;
-} AccountingInfo;
-
-static AccountingInfo acct_info;
-
-static void acct_clear(void)
-{
-    memset(&acct_info, 0, sizeof(acct_info));
-}
-
-uint64_t dup_mig_bytes_transferred(void)
-{
-    return acct_info.dup_pages * TARGET_PAGE_SIZE;
-}
-
-uint64_t dup_mig_pages_transferred(void)
-{
-    return acct_info.dup_pages;
-}
-
-uint64_t skipped_mig_bytes_transferred(void)
-{
-    return acct_info.skipped_pages * TARGET_PAGE_SIZE;
-}
-
-uint64_t skipped_mig_pages_transferred(void)
-{
-    return acct_info.skipped_pages;
-}
-
-uint64_t norm_mig_bytes_transferred(void)
-{
-    return acct_info.norm_pages * TARGET_PAGE_SIZE;
-}
-
-uint64_t norm_mig_pages_transferred(void)
-{
-    return acct_info.norm_pages;
-}
-
-uint64_t xbzrle_mig_bytes_transferred(void)
-{
-    return acct_info.xbzrle_bytes;
-}
-
-uint64_t xbzrle_mig_pages_transferred(void)
-{
-    return acct_info.xbzrle_pages;
-}
-
-uint64_t xbzrle_mig_pages_cache_miss(void)
-{
-    return acct_info.xbzrle_cache_miss;
-}
-
-double xbzrle_mig_cache_miss_rate(void)
-{
-    return acct_info.xbzrle_cache_miss_rate;
-}
-
-uint64_t xbzrle_mig_pages_overflow(void)
-{
-    return acct_info.xbzrle_overflows;
-}
-
-/* This is the last block that we have visited searching for dirty pages
- */
-static RAMBlock *last_seen_block;
-/* This is the last block from where we have sent data */
-static RAMBlock *last_sent_block;
-static ram_addr_t last_offset;
-static QemuMutex migration_bitmap_mutex;
-static uint64_t migration_dirty_pages;
-static uint32_t last_version;
-static bool ram_bulk_stage;
-
-/* used by the search for pages to send */
-struct PageSearchStatus {
-    /* Current block being searched */
-    RAMBlock *block;
-    /* Current offset to search from */
-    ram_addr_t offset;
-    /* Set once we wrap around */
-    bool complete_round;
-};
-typedef struct PageSearchStatus PageSearchStatus;
-
-static struct BitmapRcu {
-    struct rcu_head rcu;
-    /* Main migration bitmap */
-    unsigned long *bmap;
-    /* bitmap of pages that haven't been sent even once
-     * only maintained and used in postcopy at the moment
-     * where it's used to send the dirtymap at the start
-     * of the postcopy phase
-     */
-    unsigned long *unsentmap;
-} *migration_bitmap_rcu;
-
-struct CompressParam {
-    bool start;
-    bool done;
-    QEMUFile *file;
-    QemuMutex mutex;
-    QemuCond cond;
-    RAMBlock *block;
-    ram_addr_t offset;
-};
-typedef struct CompressParam CompressParam;
-
-struct DecompressParam {
-    bool start;
-    QemuMutex mutex;
-    QemuCond cond;
-    void *des;
-    uint8_t *compbuf;
-    int len;
-};
-typedef struct DecompressParam DecompressParam;
-
-static CompressParam *comp_param;
-static QemuThread *compress_threads;
-/* comp_done_cond is used to wake up the migration thread when
- * one of the compression threads has finished the compression.
- * comp_done_lock is used to co-work with comp_done_cond.
- */
-static QemuMutex *comp_done_lock;
-static QemuCond *comp_done_cond;
-/* The empty QEMUFileOps will be used by file in CompressParam */
-static const QEMUFileOps empty_ops = { };
-
-static bool compression_switch;
-static bool quit_comp_thread;
-static bool quit_decomp_thread;
-static DecompressParam *decomp_param;
-static QemuThread *decompress_threads;
-
-static int do_compress_ram_page(CompressParam *param);
-
-static void *do_data_compress(void *opaque)
-{
-    CompressParam *param = opaque;
-
-    while (!quit_comp_thread) {
-        qemu_mutex_lock(&param->mutex);
-        /* Re-check quit_comp_thread in case
-         * terminate_compression_threads() was called just before
-         * qemu_mutex_lock(&param->mutex) and after
-         * while (!quit_comp_thread); re-checking it here makes sure
-         * the compression thread terminates as expected.
-         */
-        while (!param->start && !quit_comp_thread) {
-            qemu_cond_wait(&param->cond, &param->mutex);
-        }
-        if (!quit_comp_thread) {
-            do_compress_ram_page(param);
-        }
-        param->start = false;
-        qemu_mutex_unlock(&param->mutex);
-
-        qemu_mutex_lock(comp_done_lock);
-        param->done = true;
-        qemu_cond_signal(comp_done_cond);
-        qemu_mutex_unlock(comp_done_lock);
-    }
-
-    return NULL;
-}
-
-static inline void terminate_compression_threads(void)
-{
-    int idx, thread_count;
-
-    thread_count = migrate_compress_threads();
-    quit_comp_thread = true;
-    for (idx = 0; idx < thread_count; idx++) {
-        qemu_mutex_lock(&comp_param[idx].mutex);
-        qemu_cond_signal(&comp_param[idx].cond);
-        qemu_mutex_unlock(&comp_param[idx].mutex);
-    }
-}
-
-void migrate_compress_threads_join(void)
-{
-    int i, thread_count;
-
-    if (!migrate_use_compression()) {
-        return;
-    }
-    terminate_compression_threads();
-    thread_count = migrate_compress_threads();
-    for (i = 0; i < thread_count; i++) {
-        qemu_thread_join(compress_threads + i);
-        qemu_fclose(comp_param[i].file);
-        qemu_mutex_destroy(&comp_param[i].mutex);
-        qemu_cond_destroy(&comp_param[i].cond);
-    }
-    qemu_mutex_destroy(comp_done_lock);
-    qemu_cond_destroy(comp_done_cond);
-    g_free(compress_threads);
-    g_free(comp_param);
-    g_free(comp_done_cond);
-    g_free(comp_done_lock);
-    compress_threads = NULL;
-    comp_param = NULL;
-    comp_done_cond = NULL;
-    comp_done_lock = NULL;
-}
-
-void migrate_compress_threads_create(void)
-{
-    int i, thread_count;
-
-    if (!migrate_use_compression()) {
-        return;
-    }
-    quit_comp_thread = false;
-    compression_switch = true;
-    thread_count = migrate_compress_threads();
-    compress_threads = g_new0(QemuThread, thread_count);
-    comp_param = g_new0(CompressParam, thread_count);
-    comp_done_cond = g_new0(QemuCond, 1);
-    comp_done_lock = g_new0(QemuMutex, 1);
-    qemu_cond_init(comp_done_cond);
-    qemu_mutex_init(comp_done_lock);
-    for (i = 0; i < thread_count; i++) {
-        /* comp_param[i].file is just used as a dummy buffer to save data,
-         * set its ops to empty.
-         */
-        comp_param[i].file = qemu_fopen_ops(NULL, &empty_ops);
-        comp_param[i].done = true;
-        qemu_mutex_init(&comp_param[i].mutex);
-        qemu_cond_init(&comp_param[i].cond);
-        qemu_thread_create(compress_threads + i, "compress",
-                           do_data_compress, comp_param + i,
-                           QEMU_THREAD_JOINABLE);
-    }
-}
-
-/**
- * save_page_header: Write page header to wire
- *
- * If this is the 1st block, it also writes the block identification
- *
- * Returns: Number of bytes written
- *
- * @f: QEMUFile where to send the data
- * @block: block that contains the page we want to send
- * @offset: offset inside the block for the page
- *          in the lower bits, it contains flags
- */
-static size_t save_page_header(QEMUFile *f, RAMBlock *block, ram_addr_t offset)
-{
-    size_t size, len;
-
-    qemu_put_be64(f, offset);
-    size = 8;
-
-    if (!(offset & RAM_SAVE_FLAG_CONTINUE)) {
-        len = strlen(block->idstr);
-        qemu_put_byte(f, len);
-        qemu_put_buffer(f, (uint8_t *)block->idstr, len);
-        size += 1 + len;
-    }
-    return size;
-}
-
-/* Reduce amount of guest cpu execution to hopefully slow down memory writes.
- * If guest dirty memory rate is reduced below the rate at which we can
- * transfer pages to the destination then we should be able to complete
- * migration. Some workloads dirty memory way too fast and will not effectively
- * converge, even with auto-converge.
- */
-static void mig_throttle_guest_down(void)
-{
-    MigrationState *s = migrate_get_current();
-    uint64_t pct_initial =
-            s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL];
-    uint64_t pct_icrement =
-            s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT];
-
-    /* We have not started throttling yet. Let's start it. */
-    if (!cpu_throttle_active()) {
-        cpu_throttle_set(pct_initial);
-    } else {
-        /* Throttling already on, just increase the rate */
-        cpu_throttle_set(cpu_throttle_get_percentage() + pct_icrement);
-    }
-}
-
-/* Update the xbzrle cache to reflect a page that's been sent as all 0.
- * The important thing is that a stale (not-yet-0'd) page be replaced
- * by the new data.
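/*
 * Wire-format recap of save_page_header() above, as a load-side sketch:
 * each page record starts with a be64 word holding the page-aligned offset
 * with RAM_SAVE_FLAG_* packed into its low bits; only when
 * RAM_SAVE_FLAG_CONTINUE is absent does a counted block idstr follow.
 * The helper name is invented; qemu_get_be64() and
 * qemu_get_counted_string() are from qemu-file.c above.
 */
static uint64_t get_page_header_example(QEMUFile *f, char idstr[256])
{
    uint64_t addr = qemu_get_be64(f);

    if (!(addr & RAM_SAVE_FLAG_CONTINUE)) {
        if (!qemu_get_counted_string(f, idstr)) {
            return 0;    /* stream error (block names are never empty) */
        }
    }
    return addr;         /* page offset | flags */
}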
- * As a bonus, if the page wasn't in the cache it gets added so that - * when a small write is made into the 0'd page it gets XBZRLE sent - */ -static void xbzrle_cache_zero_page(ram_addr_t current_addr) -{ - if (ram_bulk_stage || !migrate_use_xbzrle()) { - return; - } - - /* We don't care if this fails to allocate a new cache page - * as long as it updated an old one */ - cache_insert(XBZRLE.cache, current_addr, ZERO_TARGET_PAGE, - bitmap_sync_count); -} - -#define ENCODING_FLAG_XBZRLE 0x1 - -/** - * save_xbzrle_page: compress and send current page - * - * Returns: 1 means that we wrote the page - * 0 means that page is identical to the one already sent - * -1 means that xbzrle would be longer than normal - * - * @f: QEMUFile where to send the data - * @current_data: - * @current_addr: - * @block: block that contains the page we want to send - * @offset: offset inside the block for the page - * @last_stage: if we are at the completion stage - * @bytes_transferred: increase it with the number of transferred bytes - */ -static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data, - ram_addr_t current_addr, RAMBlock *block, - ram_addr_t offset, bool last_stage, - uint64_t *bytes_transferred) -{ - int encoded_len = 0, bytes_xbzrle; - uint8_t *prev_cached_page; - - if (!cache_is_cached(XBZRLE.cache, current_addr, bitmap_sync_count)) { - acct_info.xbzrle_cache_miss++; - if (!last_stage) { - if (cache_insert(XBZRLE.cache, current_addr, *current_data, - bitmap_sync_count) == -1) { - return -1; - } else { - /* update *current_data when the page has been - inserted into cache */ - *current_data = get_cached_data(XBZRLE.cache, current_addr); - } - } - return -1; - } - - prev_cached_page = get_cached_data(XBZRLE.cache, current_addr); - - /* save current buffer into memory */ - memcpy(XBZRLE.current_buf, *current_data, TARGET_PAGE_SIZE); - - /* XBZRLE encoding (if there is no overflow) */ - encoded_len = xbzrle_encode_buffer(prev_cached_page, XBZRLE.current_buf, - TARGET_PAGE_SIZE, XBZRLE.encoded_buf, - TARGET_PAGE_SIZE); - if (encoded_len == 0) { - DPRINTF("Skipping unmodified page\n"); - return 0; - } else if (encoded_len == -1) { - DPRINTF("Overflow\n"); - acct_info.xbzrle_overflows++; - /* update data in the cache */ - if (!last_stage) { - memcpy(prev_cached_page, *current_data, TARGET_PAGE_SIZE); - *current_data = prev_cached_page; - } - return -1; - } - - /* we need to update the data in the cache, in order to get the same data */ - if (!last_stage) { - memcpy(prev_cached_page, XBZRLE.current_buf, TARGET_PAGE_SIZE); - } - - /* Send XBZRLE based compressed page */ - bytes_xbzrle = save_page_header(f, block, offset | RAM_SAVE_FLAG_XBZRLE); - qemu_put_byte(f, ENCODING_FLAG_XBZRLE); - qemu_put_be16(f, encoded_len); - qemu_put_buffer(f, XBZRLE.encoded_buf, encoded_len); - bytes_xbzrle += encoded_len + 1 + 2; - acct_info.xbzrle_pages++; - acct_info.xbzrle_bytes += bytes_xbzrle; - *bytes_transferred += bytes_xbzrle; - - return 1; -} - -/* Called with rcu_read_lock() to protect migration_bitmap - * rb: The RAMBlock to search for dirty pages in - * start: Start address (typically so we can continue from previous page) - * ram_addr_abs: Pointer into which to store the address of the dirty page - * within the global ram_addr space - * - * Returns: byte offset within memory region of the start of a dirty page - */ -static inline -ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb, - ram_addr_t start, - ram_addr_t *ram_addr_abs) -{ - unsigned long base = rb->offset >> TARGET_PAGE_BITS; - unsigned long 
nr = base + (start >> TARGET_PAGE_BITS);
-    uint64_t rb_size = rb->used_length;
-    unsigned long size = base + (rb_size >> TARGET_PAGE_BITS);
-    unsigned long *bitmap;
-
-    unsigned long next;
-
-    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
-    if (ram_bulk_stage && nr > base) {
-        next = nr + 1;
-    } else {
-        next = find_next_bit(bitmap, size, nr);
-    }
-
-    *ram_addr_abs = next << TARGET_PAGE_BITS;
-    return (next - base) << TARGET_PAGE_BITS;
-}
-
-static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
-{
-    bool ret;
-    int nr = addr >> TARGET_PAGE_BITS;
-    unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
-
-    ret = test_and_clear_bit(nr, bitmap);
-
-    if (ret) {
-        migration_dirty_pages--;
-    }
-    return ret;
-}
-
-static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
-{
-    unsigned long *bitmap;
-    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
-    migration_dirty_pages +=
-        cpu_physical_memory_sync_dirty_bitmap(bitmap, start, length);
-}
-
-/* Fix me: there are too many global variables used in migration process. */
-static int64_t start_time;
-static int64_t bytes_xfer_prev;
-static int64_t num_dirty_pages_period;
-static uint64_t xbzrle_cache_miss_prev;
-static uint64_t iterations_prev;
-
-static void migration_bitmap_sync_init(void)
-{
-    start_time = 0;
-    bytes_xfer_prev = 0;
-    num_dirty_pages_period = 0;
-    xbzrle_cache_miss_prev = 0;
-    iterations_prev = 0;
-}
-
-static void migration_bitmap_sync(void)
-{
-    RAMBlock *block;
-    uint64_t num_dirty_pages_init = migration_dirty_pages;
-    MigrationState *s = migrate_get_current();
-    int64_t end_time;
-    int64_t bytes_xfer_now;
-
-    bitmap_sync_count++;
-
-    if (!bytes_xfer_prev) {
-        bytes_xfer_prev = ram_bytes_transferred();
-    }
-
-    if (!start_time) {
-        start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-    }
-
-    trace_migration_bitmap_sync_start();
-    address_space_sync_dirty_bitmap(&address_space_memory);
-
-    qemu_mutex_lock(&migration_bitmap_mutex);
-    rcu_read_lock();
-    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
-        migration_bitmap_sync_range(block->offset, block->used_length);
-    }
-    rcu_read_unlock();
-    qemu_mutex_unlock(&migration_bitmap_mutex);
-
-    trace_migration_bitmap_sync_end(migration_dirty_pages
-                                    - num_dirty_pages_init);
-    num_dirty_pages_period += migration_dirty_pages - num_dirty_pages_init;
-    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-
-    /* more than 1 second = 1000 milliseconds */
-    if (end_time > start_time + 1000) {
-        if (migrate_auto_converge()) {
-            /* The following detection logic can be refined later. For now:
-               Check to see if the dirtied bytes are 50% more than the approx.
-               amount of bytes that just got transferred since the last time we
-               were in this routine.
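-               (A worked example: if 1 GiB was transferred during the last
-               second, the test below trips once more than 512 MiB -- that
-               is, 131072 4 KiB target pages -- were dirtied over the same
-               second.)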
If that happens twice, start or increase - throttling */ - bytes_xfer_now = ram_bytes_transferred(); - - if (s->dirty_pages_rate && - (num_dirty_pages_period * TARGET_PAGE_SIZE > - (bytes_xfer_now - bytes_xfer_prev)/2) && - (dirty_rate_high_cnt++ >= 2)) { - trace_migration_throttle(); - dirty_rate_high_cnt = 0; - mig_throttle_guest_down(); - } - bytes_xfer_prev = bytes_xfer_now; - } - - if (migrate_use_xbzrle()) { - if (iterations_prev != acct_info.iterations) { - acct_info.xbzrle_cache_miss_rate = - (double)(acct_info.xbzrle_cache_miss - - xbzrle_cache_miss_prev) / - (acct_info.iterations - iterations_prev); - } - iterations_prev = acct_info.iterations; - xbzrle_cache_miss_prev = acct_info.xbzrle_cache_miss; - } - s->dirty_pages_rate = num_dirty_pages_period * 1000 - / (end_time - start_time); - s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE; - start_time = end_time; - num_dirty_pages_period = 0; - } - s->dirty_sync_count = bitmap_sync_count; - if (migrate_use_events()) { - qapi_event_send_migration_pass(bitmap_sync_count, NULL); - } -} - -/** - * save_zero_page: Send the zero page to the stream - * - * Returns: Number of pages written. - * - * @f: QEMUFile where to send the data - * @block: block that contains the page we want to send - * @offset: offset inside the block for the page - * @p: pointer to the page - * @bytes_transferred: increase it with the number of transferred bytes - */ -static int save_zero_page(QEMUFile *f, RAMBlock *block, ram_addr_t offset, - uint8_t *p, uint64_t *bytes_transferred) -{ - int pages = -1; - - if (is_zero_range(p, TARGET_PAGE_SIZE)) { - acct_info.dup_pages++; - *bytes_transferred += save_page_header(f, block, - offset | RAM_SAVE_FLAG_COMPRESS); - qemu_put_byte(f, 0); - *bytes_transferred += 1; - pages = 1; - } - - return pages; -} - -/** - * ram_save_page: Send the given page to the stream - * - * Returns: Number of pages written. - * < 0 - error - * >=0 - Number of pages written - this might legally be 0 - * if xbzrle noticed the page was the same. 
- *
- * @f: QEMUFile where to send the data
- * @block: block that contains the page we want to send
- * @offset: offset inside the block for the page
- * @last_stage: if we are at the completion stage
- * @bytes_transferred: increase it with the number of transferred bytes
- */
-static int ram_save_page(QEMUFile *f, PageSearchStatus *pss,
-                         bool last_stage, uint64_t *bytes_transferred)
-{
-    int pages = -1;
-    uint64_t bytes_xmit;
-    ram_addr_t current_addr;
-    uint8_t *p;
-    int ret;
-    bool send_async = true;
-    RAMBlock *block = pss->block;
-    ram_addr_t offset = pss->offset;
-
-    p = block->host + offset;
-
-    /* When in doubt, send the page as normal */
-    bytes_xmit = 0;
-    ret = ram_control_save_page(f, block->offset,
-                                offset, TARGET_PAGE_SIZE, &bytes_xmit);
-    if (bytes_xmit) {
-        *bytes_transferred += bytes_xmit;
-        pages = 1;
-    }
-
-    XBZRLE_cache_lock();
-
-    current_addr = block->offset + offset;
-
-    if (block == last_sent_block) {
-        offset |= RAM_SAVE_FLAG_CONTINUE;
-    }
-    if (ret != RAM_SAVE_CONTROL_NOT_SUPP) {
-        if (ret != RAM_SAVE_CONTROL_DELAYED) {
-            if (bytes_xmit > 0) {
-                acct_info.norm_pages++;
-            } else if (bytes_xmit == 0) {
-                acct_info.dup_pages++;
-            }
-        }
-    } else {
-        pages = save_zero_page(f, block, offset, p, bytes_transferred);
-        if (pages > 0) {
-            /* Must let xbzrle know, otherwise a previous (now 0'd) cached
-             * page would be stale
-             */
-            xbzrle_cache_zero_page(current_addr);
-        } else if (!ram_bulk_stage && migrate_use_xbzrle()) {
-            pages = save_xbzrle_page(f, &p, current_addr, block,
-                                     offset, last_stage, bytes_transferred);
-            if (!last_stage) {
-                /* Can't send this cached data async, since the cache page
-                 * might get updated before it gets to the wire
-                 */
-                send_async = false;
-            }
-        }
-    }
-
-    /* XBZRLE overflow or normal page */
-    if (pages == -1) {
-        *bytes_transferred += save_page_header(f, block,
-                                               offset | RAM_SAVE_FLAG_PAGE);
-        if (send_async) {
-            qemu_put_buffer_async(f, p, TARGET_PAGE_SIZE);
-        } else {
-            qemu_put_buffer(f, p, TARGET_PAGE_SIZE);
-        }
-        *bytes_transferred += TARGET_PAGE_SIZE;
-        pages = 1;
-        acct_info.norm_pages++;
-    }
-
-    XBZRLE_cache_unlock();
-
-    return pages;
-}
-
-static int do_compress_ram_page(CompressParam *param)
-{
-    int bytes_sent, blen;
-    uint8_t *p;
-    RAMBlock *block = param->block;
-    ram_addr_t offset = param->offset;
-
-    p = block->host + (offset & TARGET_PAGE_MASK);
-
-    bytes_sent = save_page_header(param->file, block, offset |
-                                  RAM_SAVE_FLAG_COMPRESS_PAGE);
-    blen = qemu_put_compression_data(param->file, p, TARGET_PAGE_SIZE,
-                                     migrate_compress_level());
-    bytes_sent += blen;
-
-    return bytes_sent;
-}
-
-static inline void start_compression(CompressParam *param)
-{
-    param->done = false;
-    qemu_mutex_lock(&param->mutex);
-    param->start = true;
-    qemu_cond_signal(&param->cond);
-    qemu_mutex_unlock(&param->mutex);
-}
-
-static inline void start_decompression(DecompressParam *param)
-{
-    qemu_mutex_lock(&param->mutex);
-    param->start = true;
-    qemu_cond_signal(&param->cond);
-    qemu_mutex_unlock(&param->mutex);
-}
-
-static uint64_t bytes_transferred;
-
-static void flush_compressed_data(QEMUFile *f)
-{
-    int idx, len, thread_count;
-
-    if (!migrate_use_compression()) {
-        return;
-    }
-    thread_count = migrate_compress_threads();
-    for (idx = 0; idx < thread_count; idx++) {
-        if (!comp_param[idx].done) {
-            qemu_mutex_lock(comp_done_lock);
-            while (!comp_param[idx].done && !quit_comp_thread) {
-                qemu_cond_wait(comp_done_cond, comp_done_lock);
-            }
-            qemu_mutex_unlock(comp_done_lock);
-        }
-        if (!quit_comp_thread) {
-            len = qemu_put_qemu_file(f, comp_param[idx].file);
-
bytes_transferred += len; - } - } -} - -static inline void set_compress_params(CompressParam *param, RAMBlock *block, - ram_addr_t offset) -{ - param->block = block; - param->offset = offset; -} - -static int compress_page_with_multi_thread(QEMUFile *f, RAMBlock *block, - ram_addr_t offset, - uint64_t *bytes_transferred) -{ - int idx, thread_count, bytes_xmit = -1, pages = -1; - - thread_count = migrate_compress_threads(); - qemu_mutex_lock(comp_done_lock); - while (true) { - for (idx = 0; idx < thread_count; idx++) { - if (comp_param[idx].done) { - bytes_xmit = qemu_put_qemu_file(f, comp_param[idx].file); - set_compress_params(&comp_param[idx], block, offset); - start_compression(&comp_param[idx]); - pages = 1; - acct_info.norm_pages++; - *bytes_transferred += bytes_xmit; - break; - } - } - if (pages > 0) { - break; - } else { - qemu_cond_wait(comp_done_cond, comp_done_lock); - } - } - qemu_mutex_unlock(comp_done_lock); - - return pages; -} - -/** - * ram_save_compressed_page: compress the given page and send it to the stream - * - * Returns: Number of pages written. - * - * @f: QEMUFile where to send the data - * @block: block that contains the page we want to send - * @offset: offset inside the block for the page - * @last_stage: if we are at the completion stage - * @bytes_transferred: increase it with the number of transferred bytes - */ -static int ram_save_compressed_page(QEMUFile *f, PageSearchStatus *pss, - bool last_stage, - uint64_t *bytes_transferred) -{ - int pages = -1; - uint64_t bytes_xmit; - uint8_t *p; - int ret; - RAMBlock *block = pss->block; - ram_addr_t offset = pss->offset; - - p = block->host + offset; - - bytes_xmit = 0; - ret = ram_control_save_page(f, block->offset, - offset, TARGET_PAGE_SIZE, &bytes_xmit); - if (bytes_xmit) { - *bytes_transferred += bytes_xmit; - pages = 1; - } - if (block == last_sent_block) { - offset |= RAM_SAVE_FLAG_CONTINUE; - } - if (ret != RAM_SAVE_CONTROL_NOT_SUPP) { - if (ret != RAM_SAVE_CONTROL_DELAYED) { - if (bytes_xmit > 0) { - acct_info.norm_pages++; - } else if (bytes_xmit == 0) { - acct_info.dup_pages++; - } - } - } else { - /* When starting the process of a new block, the first page of - * the block should be sent out before other pages in the same - * block, and all the pages in last block should have been sent - * out, keeping this order is important, because the 'cont' flag - * is used to avoid resending the block name. - */ - if (block != last_sent_block) { - flush_compressed_data(f); - pages = save_zero_page(f, block, offset, p, bytes_transferred); - if (pages == -1) { - set_compress_params(&comp_param[0], block, offset); - /* Use the qemu thread to compress the data to make sure the - * first page is sent out before other pages - */ - bytes_xmit = do_compress_ram_page(&comp_param[0]); - acct_info.norm_pages++; - qemu_put_qemu_file(f, comp_param[0].file); - *bytes_transferred += bytes_xmit; - pages = 1; - } - } else { - pages = save_zero_page(f, block, offset, p, bytes_transferred); - if (pages == -1) { - pages = compress_page_with_multi_thread(f, block, offset, - bytes_transferred); - } - } - } - - return pages; -} - -/* - * Find the next dirty page and update any state associated with - * the search process. - * - * Returns: True if a page is found - * - * @f: Current migration stream. - * @pss: Data about the state of the current dirty page scan. 
- * @*again: Set to false if the search has scanned the whole of RAM - * *ram_addr_abs: Pointer into which to store the address of the dirty page - * within the global ram_addr space - */ -static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss, - bool *again, ram_addr_t *ram_addr_abs) -{ - pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset, - ram_addr_abs); - if (pss->complete_round && pss->block == last_seen_block && - pss->offset >= last_offset) { - /* - * We've been once around the RAM and haven't found anything. - * Give up. - */ - *again = false; - return false; - } - if (pss->offset >= pss->block->used_length) { - /* Didn't find anything in this RAM Block */ - pss->offset = 0; - pss->block = QLIST_NEXT_RCU(pss->block, next); - if (!pss->block) { - /* Hit the end of the list */ - pss->block = QLIST_FIRST_RCU(&ram_list.blocks); - /* Flag that we've looped */ - pss->complete_round = true; - ram_bulk_stage = false; - if (migrate_use_xbzrle()) { - /* If xbzrle is on, stop using the data compression at this - * point. In theory, xbzrle can do better than compression. - */ - flush_compressed_data(f); - compression_switch = false; - } - } - /* Didn't find anything this time, but try again on the new block */ - *again = true; - return false; - } else { - /* Can go around again, but... */ - *again = true; - /* We've found something so probably don't need to */ - return true; - } -} - -/* - * Helper for 'get_queued_page' - gets a page off the queue - * ms: MigrationState in - * *offset: Used to return the offset within the RAMBlock - * ram_addr_abs: global offset in the dirty/sent bitmaps - * - * Returns: block (or NULL if none available) - */ -static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset, - ram_addr_t *ram_addr_abs) -{ - RAMBlock *block = NULL; - - qemu_mutex_lock(&ms->src_page_req_mutex); - if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) { - struct MigrationSrcPageRequest *entry = - QSIMPLEQ_FIRST(&ms->src_page_requests); - block = entry->rb; - *offset = entry->offset; - *ram_addr_abs = (entry->offset + entry->rb->offset) & - TARGET_PAGE_MASK; - - if (entry->len > TARGET_PAGE_SIZE) { - entry->len -= TARGET_PAGE_SIZE; - entry->offset += TARGET_PAGE_SIZE; - } else { - memory_region_unref(block->mr); - QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req); - g_free(entry); - } - } - qemu_mutex_unlock(&ms->src_page_req_mutex); - - return block; -} - -/* - * Unqueue a page from the queue fed by postcopy page requests; skips pages - * that are already sent (!dirty) - * - * ms: MigrationState in - * pss: PageSearchStatus structure updated with found block/offset - * ram_addr_abs: global offset in the dirty/sent bitmaps - * - * Returns: true if a queued page is found - */ -static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss, - ram_addr_t *ram_addr_abs) -{ - RAMBlock *block; - ram_addr_t offset; - bool dirty; - - do { - block = unqueue_page(ms, &offset, ram_addr_abs); - /* - * We're sending this page, and since it's postcopy nothing else - * will dirty it, and we must make sure it doesn't get sent again - * even if this queue request was received after the background - * search already sent it. 
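- * (Put differently: the dirty bitmap, not the request queue, stays the
- * authoritative record of whether a page still needs sending.)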
- */ - if (block) { - unsigned long *bitmap; - bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; - dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap); - if (!dirty) { - trace_get_queued_page_not_dirty( - block->idstr, (uint64_t)offset, - (uint64_t)*ram_addr_abs, - test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, - atomic_rcu_read(&migration_bitmap_rcu)->unsentmap)); - } else { - trace_get_queued_page(block->idstr, - (uint64_t)offset, - (uint64_t)*ram_addr_abs); - } - } - - } while (block && !dirty); - - if (block) { - /* - * As soon as we start servicing pages out of order, then we have - * to kill the bulk stage, since the bulk stage assumes - * in (migration_bitmap_find_and_reset_dirty) that every page is - * dirty, that's no longer true. - */ - ram_bulk_stage = false; - - /* - * We want the background search to continue from the queued page - * since the guest is likely to want other pages near to the page - * it just requested. - */ - pss->block = block; - pss->offset = offset; - } - - return !!block; -} - -/** - * flush_page_queue: Flush any remaining pages in the ram request queue - * it should be empty at the end anyway, but in error cases there may be - * some left. - * - * ms: MigrationState - */ -void flush_page_queue(MigrationState *ms) -{ - struct MigrationSrcPageRequest *mspr, *next_mspr; - /* This queue generally should be empty - but in the case of a failed - * migration might have some droppings in. - */ - rcu_read_lock(); - QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) { - memory_region_unref(mspr->rb->mr); - QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req); - g_free(mspr); - } - rcu_read_unlock(); -} - -/** - * Queue the pages for transmission, e.g. a request from postcopy destination - * ms: MigrationStatus in which the queue is held - * rbname: The RAMBlock the request is for - may be NULL (to mean reuse last) - * start: Offset from the start of the RAMBlock - * len: Length (in bytes) to send - * Return: 0 on success - */ -int ram_save_queue_pages(MigrationState *ms, const char *rbname, - ram_addr_t start, ram_addr_t len) -{ - RAMBlock *ramblock; - - rcu_read_lock(); - if (!rbname) { - /* Reuse last RAMBlock */ - ramblock = ms->last_req_rb; - - if (!ramblock) { - /* - * Shouldn't happen, we can't reuse the last RAMBlock if - * it's the 1st request. 
- */ - error_report("ram_save_queue_pages no previous block"); - goto err; - } - } else { - ramblock = qemu_ram_block_by_name(rbname); - - if (!ramblock) { - /* We shouldn't be asked for a non-existent RAMBlock */ - error_report("ram_save_queue_pages no block '%s'", rbname); - goto err; - } - ms->last_req_rb = ramblock; - } - trace_ram_save_queue_pages(ramblock->idstr, start, len); - if (start+len > ramblock->used_length) { - error_report("%s request overrun start=" RAM_ADDR_FMT " len=" - RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT, - __func__, start, len, ramblock->used_length); - goto err; - } - - struct MigrationSrcPageRequest *new_entry = - g_malloc0(sizeof(struct MigrationSrcPageRequest)); - new_entry->rb = ramblock; - new_entry->offset = start; - new_entry->len = len; - - memory_region_ref(ramblock->mr); - qemu_mutex_lock(&ms->src_page_req_mutex); - QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req); - qemu_mutex_unlock(&ms->src_page_req_mutex); - rcu_read_unlock(); - - return 0; - -err: - rcu_read_unlock(); - return -1; -} - -/** - * ram_save_target_page: Save one target page - * - * - * @f: QEMUFile where to send the data - * @block: pointer to block that contains the page we want to send - * @offset: offset inside the block for the page; - * @last_stage: if we are at the completion stage - * @bytes_transferred: increase it with the number of transferred bytes - * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space - * - * Returns: Number of pages written. - */ -static int ram_save_target_page(MigrationState *ms, QEMUFile *f, - PageSearchStatus *pss, - bool last_stage, - uint64_t *bytes_transferred, - ram_addr_t dirty_ram_abs) -{ - int res = 0; - - /* Check the pages is dirty and if it is send it */ - if (migration_bitmap_clear_dirty(dirty_ram_abs)) { - unsigned long *unsentmap; - if (compression_switch && migrate_use_compression()) { - res = ram_save_compressed_page(f, pss, - last_stage, - bytes_transferred); - } else { - res = ram_save_page(f, pss, last_stage, - bytes_transferred); - } - - if (res < 0) { - return res; - } - unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; - if (unsentmap) { - clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap); - } - /* Only update last_sent_block if a block was actually sent; xbzrle - * might have decided the page was identical so didn't bother writing - * to the stream. - */ - if (res > 0) { - last_sent_block = pss->block; - } - } - - return res; -} - -/** - * ram_save_host_page: Starting at *offset send pages upto the end - * of the current host page. It's valid for the initial - * offset to point into the middle of a host page - * in which case the remainder of the hostpage is sent. - * Only dirty target pages are sent. - * - * Returns: Number of pages written. 
- * - * @f: QEMUFile where to send the data - * @block: pointer to block that contains the page we want to send - * @offset: offset inside the block for the page; updated to last target page - * sent - * @last_stage: if we are at the completion stage - * @bytes_transferred: increase it with the number of transferred bytes - * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space - */ -static int ram_save_host_page(MigrationState *ms, QEMUFile *f, - PageSearchStatus *pss, - bool last_stage, - uint64_t *bytes_transferred, - ram_addr_t dirty_ram_abs) -{ - int tmppages, pages = 0; - do { - tmppages = ram_save_target_page(ms, f, pss, last_stage, - bytes_transferred, dirty_ram_abs); - if (tmppages < 0) { - return tmppages; - } - - pages += tmppages; - pss->offset += TARGET_PAGE_SIZE; - dirty_ram_abs += TARGET_PAGE_SIZE; - } while (pss->offset & (qemu_host_page_size - 1)); - - /* The offset we leave with is the last one we looked at */ - pss->offset -= TARGET_PAGE_SIZE; - return pages; -} - -/** - * ram_find_and_save_block: Finds a dirty page and sends it to f - * - * Called within an RCU critical section. - * - * Returns: The number of pages written - * 0 means no dirty pages - * - * @f: QEMUFile where to send the data - * @last_stage: if we are at the completion stage - * @bytes_transferred: increase it with the number of transferred bytes - * - * On systems where host-page-size > target-page-size it will send all the - * pages in a host page that are dirty. - */ - -static int ram_find_and_save_block(QEMUFile *f, bool last_stage, - uint64_t *bytes_transferred) -{ - PageSearchStatus pss; - MigrationState *ms = migrate_get_current(); - int pages = 0; - bool again, found; - ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in - ram_addr_t space */ - - pss.block = last_seen_block; - pss.offset = last_offset; - pss.complete_round = false; - - if (!pss.block) { - pss.block = QLIST_FIRST_RCU(&ram_list.blocks); - } - - do { - again = true; - found = get_queued_page(ms, &pss, &dirty_ram_abs); - - if (!found) { - /* priority queue empty, so just search for something dirty */ - found = find_dirty_block(f, &pss, &again, &dirty_ram_abs); - } - - if (found) { - pages = ram_save_host_page(ms, f, &pss, - last_stage, bytes_transferred, - dirty_ram_abs); - } - } while (!pages && again); - - last_seen_block = pss.block; - last_offset = pss.offset; - - return pages; -} - -void acct_update_position(QEMUFile *f, size_t size, bool zero) -{ - uint64_t pages = size / TARGET_PAGE_SIZE; - if (zero) { - acct_info.dup_pages += pages; - } else { - acct_info.norm_pages += pages; - bytes_transferred += size; - qemu_update_position(f, size); - } -} - -static ram_addr_t ram_save_remaining(void) -{ - return migration_dirty_pages; -} - -uint64_t ram_bytes_remaining(void) -{ - return ram_save_remaining() * TARGET_PAGE_SIZE; -} - -uint64_t ram_bytes_transferred(void) -{ - return bytes_transferred; -} - -uint64_t ram_bytes_total(void) -{ - RAMBlock *block; - uint64_t total = 0; - - rcu_read_lock(); - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) - total += block->used_length; - rcu_read_unlock(); - return total; -} - -void free_xbzrle_decoded_buf(void) -{ - g_free(xbzrle_decoded_buf); - xbzrle_decoded_buf = NULL; -} - -static void migration_bitmap_free(struct BitmapRcu *bmap) -{ - g_free(bmap->bmap); - g_free(bmap->unsentmap); - g_free(bmap); -} - -static void ram_migration_cleanup(void *opaque) -{ - /* caller have hold iothread lock or is in a bh, so there is - * no writing race against 
this migration_bitmap - */ - struct BitmapRcu *bitmap = migration_bitmap_rcu; - atomic_rcu_set(&migration_bitmap_rcu, NULL); - if (bitmap) { - memory_global_dirty_log_stop(); - call_rcu(bitmap, migration_bitmap_free, rcu); - } - - XBZRLE_cache_lock(); - if (XBZRLE.cache) { - cache_fini(XBZRLE.cache); - g_free(XBZRLE.encoded_buf); - g_free(XBZRLE.current_buf); - XBZRLE.cache = NULL; - XBZRLE.encoded_buf = NULL; - XBZRLE.current_buf = NULL; - } - XBZRLE_cache_unlock(); -} - -static void reset_ram_globals(void) -{ - last_seen_block = NULL; - last_sent_block = NULL; - last_offset = 0; - last_version = ram_list.version; - ram_bulk_stage = true; -} - -#define MAX_WAIT 50 /* ms, half buffered_file limit */ - -void migration_bitmap_extend(ram_addr_t old, ram_addr_t new) -{ - /* called in qemu main thread, so there is - * no writing race against this migration_bitmap - */ - if (migration_bitmap_rcu) { - struct BitmapRcu *old_bitmap = migration_bitmap_rcu, *bitmap; - bitmap = g_new(struct BitmapRcu, 1); - bitmap->bmap = bitmap_new(new); - - /* prevent migration_bitmap content from being set bit - * by migration_bitmap_sync_range() at the same time. - * it is safe to migration if migration_bitmap is cleared bit - * at the same time. - */ - qemu_mutex_lock(&migration_bitmap_mutex); - bitmap_copy(bitmap->bmap, old_bitmap->bmap, old); - bitmap_set(bitmap->bmap, old, new - old); - - /* We don't have a way to safely extend the sentmap - * with RCU; so mark it as missing, entry to postcopy - * will fail. - */ - bitmap->unsentmap = NULL; - - atomic_rcu_set(&migration_bitmap_rcu, bitmap); - qemu_mutex_unlock(&migration_bitmap_mutex); - migration_dirty_pages += new - old; - call_rcu(old_bitmap, migration_bitmap_free, rcu); - } -} - -/* - * 'expected' is the value you expect the bitmap mostly to be full - * of; it won't bother printing lines that are all this value. - * If 'todump' is null the migration bitmap is dumped. - */ -void ram_debug_dump_bitmap(unsigned long *todump, bool expected) -{ - int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS; - - int64_t cur; - int64_t linelen = 128; - char linebuf[129]; - - if (!todump) { - todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap; - } - - for (cur = 0; cur < ram_pages; cur += linelen) { - int64_t curb; - bool found = false; - /* - * Last line; catch the case where the line length - * is longer than remaining ram - */ - if (cur + linelen > ram_pages) { - linelen = ram_pages - cur; - } - for (curb = 0; curb < linelen; curb++) { - bool thisbit = test_bit(cur + curb, todump); - linebuf[curb] = thisbit ? '1' : '.'; - found = found || (thisbit != expected); - } - if (found) { - linebuf[curb] = '\0'; - fprintf(stderr, "0x%08" PRIx64 " : %s\n", cur, linebuf); - } - } -} - -/* **** functions for postcopy ***** */ - -/* - * Callback from postcopy_each_ram_send_discard for each RAMBlock - * Note: At this point the 'unsentmap' is the processed bitmap combined - * with the dirtymap; so a '1' means it's either dirty or unsent. 
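- *
- * Worked example (hypothetical bitmap): given unsentmap bits 00111000
- * within [start, end), find_next_bit() locates the first '1' at bit 2,
- * find_next_zero_bit() locates the following '0' at bit 5, and one
- * discard covering bits 2..4 (length 3) is sent before the scan
- * resumes from bit 5.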
- * start,length: Indexes into the bitmap for the first bit - * representing the named block and length in target-pages - */ -static int postcopy_send_discard_bm_ram(MigrationState *ms, - PostcopyDiscardState *pds, - unsigned long start, - unsigned long length) -{ - unsigned long end = start + length; /* one after the end */ - unsigned long current; - unsigned long *unsentmap; - - unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; - for (current = start; current < end; ) { - unsigned long one = find_next_bit(unsentmap, end, current); - - if (one <= end) { - unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1); - unsigned long discard_length; - - if (zero >= end) { - discard_length = end - one; - } else { - discard_length = zero - one; - } - postcopy_discard_send_range(ms, pds, one, discard_length); - current = one + discard_length; - } else { - current = one; - } - } - - return 0; -} - -/* - * Utility for the outgoing postcopy code. - * Calls postcopy_send_discard_bm_ram for each RAMBlock - * passing it bitmap indexes and name. - * Returns: 0 on success - * (qemu_ram_foreach_block ends up passing unscaled lengths - * which would mean postcopy code would have to deal with target page) - */ -static int postcopy_each_ram_send_discard(MigrationState *ms) -{ - struct RAMBlock *block; - int ret; - - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { - unsigned long first = block->offset >> TARGET_PAGE_BITS; - PostcopyDiscardState *pds = postcopy_discard_send_init(ms, - first, - block->idstr); - - /* - * Postcopy sends chunks of bitmap over the wire, but it - * just needs indexes at this point, avoids it having - * target page specific code. - */ - ret = postcopy_send_discard_bm_ram(ms, pds, first, - block->used_length >> TARGET_PAGE_BITS); - postcopy_discard_send_finish(ms, pds); - if (ret) { - return ret; - } - } - - return 0; -} - -/* - * Helper for postcopy_chunk_hostpages; it's called twice to cleanup - * the two bitmaps, that are similar, but one is inverted. - * - * We search for runs of target-pages that don't start or end on a - * host page boundary; - * unsent_pass=true: Cleans up partially unsent host pages by searching - * the unsentmap - * unsent_pass=false: Cleans up partially dirty host pages by searching - * the main migration bitmap - * - */ -static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass, - RAMBlock *block, - PostcopyDiscardState *pds) -{ - unsigned long *bitmap; - unsigned long *unsentmap; - unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE; - unsigned long first = block->offset >> TARGET_PAGE_BITS; - unsigned long len = block->used_length >> TARGET_PAGE_BITS; - unsigned long last = first + (len - 1); - unsigned long run_start; - - bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; - unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; - - if (unsent_pass) { - /* Find a sent page */ - run_start = find_next_zero_bit(unsentmap, last + 1, first); - } else { - /* Find a dirty page */ - run_start = find_next_bit(bitmap, last + 1, first); - } - - while (run_start <= last) { - bool do_fixup = false; - unsigned long fixup_start_addr; - unsigned long host_offset; - - /* - * If the start of this run of pages is in the middle of a host - * page, then we need to fixup this host page. 
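-         *
-         * (Worked example, hypothetical sizes: with host_ratio = 16, a
-         * run starting at target page 35 gives host_offset = 35 % 16 = 3,
-         * so the fixup rewinds run_start to page 32 and the whole host
-         * page 32..47 is discarded and marked dirty/unsent again.)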
- */ - host_offset = run_start % host_ratio; - if (host_offset) { - do_fixup = true; - run_start -= host_offset; - fixup_start_addr = run_start; - /* For the next pass */ - run_start = run_start + host_ratio; - } else { - /* Find the end of this run */ - unsigned long run_end; - if (unsent_pass) { - run_end = find_next_bit(unsentmap, last + 1, run_start + 1); - } else { - run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1); - } - /* - * If the end isn't at the start of a host page, then the - * run doesn't finish at the end of a host page - * and we need to discard. - */ - host_offset = run_end % host_ratio; - if (host_offset) { - do_fixup = true; - fixup_start_addr = run_end - host_offset; - /* - * This host page has gone, the next loop iteration starts - * from after the fixup - */ - run_start = fixup_start_addr + host_ratio; - } else { - /* - * No discards on this iteration, next loop starts from - * next sent/dirty page - */ - run_start = run_end + 1; - } - } - - if (do_fixup) { - unsigned long page; - - /* Tell the destination to discard this page */ - if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) { - /* For the unsent_pass we: - * discard partially sent pages - * For the !unsent_pass (dirty) we: - * discard partially dirty pages that were sent - * (any partially sent pages were already discarded - * by the previous unsent_pass) - */ - postcopy_discard_send_range(ms, pds, fixup_start_addr, - host_ratio); - } - - /* Clean up the bitmap */ - for (page = fixup_start_addr; - page < fixup_start_addr + host_ratio; page++) { - /* All pages in this host page are now not sent */ - set_bit(page, unsentmap); - - /* - * Remark them as dirty, updating the count for any pages - * that weren't previously dirty. - */ - migration_dirty_pages += !test_and_set_bit(page, bitmap); - } - } - - if (unsent_pass) { - /* Find the next sent page for the next iteration */ - run_start = find_next_zero_bit(unsentmap, last + 1, - run_start); - } else { - /* Find the next dirty page for the next iteration */ - run_start = find_next_bit(bitmap, last + 1, run_start); - } - } -} - -/* - * Utility for the outgoing postcopy code. - * - * Discard any partially sent host-page size chunks, mark any partially - * dirty host-page size chunks as all dirty. - * - * Returns: 0 on success - */ -static int postcopy_chunk_hostpages(MigrationState *ms) -{ - struct RAMBlock *block; - - if (qemu_host_page_size == TARGET_PAGE_SIZE) { - /* Easy case - TPS==HPS - nothing to be done */ - return 0; - } - - /* Easiest way to make sure we don't resume in the middle of a host-page */ - last_seen_block = NULL; - last_sent_block = NULL; - last_offset = 0; - - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { - unsigned long first = block->offset >> TARGET_PAGE_BITS; - - PostcopyDiscardState *pds = - postcopy_discard_send_init(ms, first, block->idstr); - - /* First pass: Discard all partially sent host pages */ - postcopy_chunk_hostpages_pass(ms, true, block, pds); - /* - * Second pass: Ensure that all partially dirty host pages are made - * fully dirty. 
- */ - postcopy_chunk_hostpages_pass(ms, false, block, pds); - - postcopy_discard_send_finish(ms, pds); - } /* ram_list loop */ - - return 0; -} - -/* - * Transmit the set of pages to be discarded after precopy to the target - * these are pages that: - * a) Have been previously transmitted but are now dirty again - * b) Pages that have never been transmitted, this ensures that - * any pages on the destination that have been mapped by background - * tasks get discarded (transparent huge pages is the specific concern) - * Hopefully this is pretty sparse - */ -int ram_postcopy_send_discard_bitmap(MigrationState *ms) -{ - int ret; - unsigned long *bitmap, *unsentmap; - - rcu_read_lock(); - - /* This should be our last sync, the src is now paused */ - migration_bitmap_sync(); - - unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap; - if (!unsentmap) { - /* We don't have a safe way to resize the sentmap, so - * if the bitmap was resized it will be NULL at this - * point. - */ - error_report("migration ram resized during precopy phase"); - rcu_read_unlock(); - return -EINVAL; - } - - /* Deal with TPS != HPS */ - ret = postcopy_chunk_hostpages(ms); - if (ret) { - rcu_read_unlock(); - return ret; - } - - /* - * Update the unsentmap to be unsentmap = unsentmap | dirty - */ - bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap; - bitmap_or(unsentmap, unsentmap, bitmap, - last_ram_offset() >> TARGET_PAGE_BITS); - - - trace_ram_postcopy_send_discard_bitmap(); -#ifdef DEBUG_POSTCOPY - ram_debug_dump_bitmap(unsentmap, true); -#endif - - ret = postcopy_each_ram_send_discard(ms); - rcu_read_unlock(); - - return ret; -} - -/* - * At the start of the postcopy phase of migration, any now-dirty - * precopied pages are discarded. - * - * start, length describe a byte address range within the RAMBlock - * - * Returns 0 on success. - */ -int ram_discard_range(MigrationIncomingState *mis, - const char *block_name, - uint64_t start, size_t length) -{ - int ret = -1; - - rcu_read_lock(); - RAMBlock *rb = qemu_ram_block_by_name(block_name); - - if (!rb) { - error_report("ram_discard_range: Failed to find block '%s'", - block_name); - goto err; - } - - uint8_t *host_startaddr = rb->host + start; - - if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) { - error_report("ram_discard_range: Unaligned start address: %p", - host_startaddr); - goto err; - } - - if ((start + length) <= rb->used_length) { - uint8_t *host_endaddr = host_startaddr + length; - if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) { - error_report("ram_discard_range: Unaligned end address: %p", - host_endaddr); - goto err; - } - ret = postcopy_ram_discard_range(mis, host_startaddr, length); - } else { - error_report("ram_discard_range: Overrun block '%s' (%" PRIu64 - "/%zx/" RAM_ADDR_FMT")", - block_name, start, length, rb->used_length); - } - -err: - rcu_read_unlock(); - - return ret; -} - - -/* Each of ram_save_setup, ram_save_iterate and ram_save_complete has - * long-running RCU critical section. When rcu-reclaims in the code - * start to become numerous it will be necessary to reduce the - * granularity of these critical sections. 
- */ - -static int ram_save_setup(QEMUFile *f, void *opaque) -{ - RAMBlock *block; - int64_t ram_bitmap_pages; /* Size of bitmap in pages, including gaps */ - - dirty_rate_high_cnt = 0; - bitmap_sync_count = 0; - migration_bitmap_sync_init(); - qemu_mutex_init(&migration_bitmap_mutex); - - if (migrate_use_xbzrle()) { - XBZRLE_cache_lock(); - XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() / - TARGET_PAGE_SIZE, - TARGET_PAGE_SIZE); - if (!XBZRLE.cache) { - XBZRLE_cache_unlock(); - error_report("Error creating cache"); - return -1; - } - XBZRLE_cache_unlock(); - - /* We prefer not to abort if there is no memory */ - XBZRLE.encoded_buf = g_try_malloc0(TARGET_PAGE_SIZE); - if (!XBZRLE.encoded_buf) { - error_report("Error allocating encoded_buf"); - return -1; - } - - XBZRLE.current_buf = g_try_malloc(TARGET_PAGE_SIZE); - if (!XBZRLE.current_buf) { - error_report("Error allocating current_buf"); - g_free(XBZRLE.encoded_buf); - XBZRLE.encoded_buf = NULL; - return -1; - } - - acct_clear(); - } - - /* For memory_global_dirty_log_start below. */ - qemu_mutex_lock_iothread(); - - qemu_mutex_lock_ramlist(); - rcu_read_lock(); - bytes_transferred = 0; - reset_ram_globals(); - - ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS; - migration_bitmap_rcu = g_new0(struct BitmapRcu, 1); - migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages); - bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages); - - if (migrate_postcopy_ram()) { - migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages); - bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages); - } - - /* - * Count the total number of pages used by ram blocks not including any - * gaps due to alignment or unplugs. - */ - migration_dirty_pages = ram_bytes_total() >> TARGET_PAGE_BITS; - - memory_global_dirty_log_start(); - migration_bitmap_sync(); - qemu_mutex_unlock_ramlist(); - qemu_mutex_unlock_iothread(); - - qemu_put_be64(f, ram_bytes_total() | RAM_SAVE_FLAG_MEM_SIZE); - - QLIST_FOREACH_RCU(block, &ram_list.blocks, next) { - qemu_put_byte(f, strlen(block->idstr)); - qemu_put_buffer(f, (uint8_t *)block->idstr, strlen(block->idstr)); - qemu_put_be64(f, block->used_length); - } - - rcu_read_unlock(); - - ram_control_before_iterate(f, RAM_CONTROL_SETUP); - ram_control_after_iterate(f, RAM_CONTROL_SETUP); - - qemu_put_be64(f, RAM_SAVE_FLAG_EOS); - - return 0; -} - -static int ram_save_iterate(QEMUFile *f, void *opaque) -{ - int ret; - int i; - int64_t t0; - int pages_sent = 0; - - rcu_read_lock(); - if (ram_list.version != last_version) { - reset_ram_globals(); - } - - /* Read version before ram_list.blocks */ - smp_rmb(); - - ram_control_before_iterate(f, RAM_CONTROL_ROUND); - - t0 = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); - i = 0; - while ((ret = qemu_file_rate_limit(f)) == 0) { - int pages; - - pages = ram_find_and_save_block(f, false, &bytes_transferred); - /* no more pages to sent */ - if (pages == 0) { - break; - } - pages_sent += pages; - acct_info.iterations++; - - /* we want to check in the 1st loop, just in case it was the 1st time - and we had to sync the dirty bitmap. 
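-           (With MAX_WAIT at 50ms and the (i & 63) == 0 stride below, the
-           clock is sampled once every 64 pages rather than on every page.)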
- qemu_get_clock_ns() is a bit expensive, so we only check each some - iterations - */ - if ((i & 63) == 0) { - uint64_t t1 = (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - t0) / 1000000; - if (t1 > MAX_WAIT) { - DPRINTF("big wait: %" PRIu64 " milliseconds, %d iterations\n", - t1, i); - break; - } - } - i++; - } - flush_compressed_data(f); - rcu_read_unlock(); - - /* - * Must occur before EOS (or any QEMUFile operation) - * because of RDMA protocol. - */ - ram_control_after_iterate(f, RAM_CONTROL_ROUND); - - qemu_put_be64(f, RAM_SAVE_FLAG_EOS); - bytes_transferred += 8; - - ret = qemu_file_get_error(f); - if (ret < 0) { - return ret; - } - - return pages_sent; -} - -/* Called with iothread lock */ -static int ram_save_complete(QEMUFile *f, void *opaque) -{ - rcu_read_lock(); - - if (!migration_in_postcopy(migrate_get_current())) { - migration_bitmap_sync(); - } - - ram_control_before_iterate(f, RAM_CONTROL_FINISH); - - /* try transferring iterative blocks of memory */ - - /* flush all remaining blocks regardless of rate limiting */ - while (true) { - int pages; - - pages = ram_find_and_save_block(f, true, &bytes_transferred); - /* no more blocks to sent */ - if (pages == 0) { - break; - } - } - - flush_compressed_data(f); - ram_control_after_iterate(f, RAM_CONTROL_FINISH); - - rcu_read_unlock(); - - qemu_put_be64(f, RAM_SAVE_FLAG_EOS); - - return 0; -} - -static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size, - uint64_t *non_postcopiable_pending, - uint64_t *postcopiable_pending) -{ - uint64_t remaining_size; - - remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE; - - if (!migration_in_postcopy(migrate_get_current()) && - remaining_size < max_size) { - qemu_mutex_lock_iothread(); - rcu_read_lock(); - migration_bitmap_sync(); - rcu_read_unlock(); - qemu_mutex_unlock_iothread(); - remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE; - } - - /* We can do postcopy, and all the data is postcopiable */ - *postcopiable_pending += remaining_size; -} - -static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host) -{ - unsigned int xh_len; - int xh_flags; - uint8_t *loaded_data; - - if (!xbzrle_decoded_buf) { - xbzrle_decoded_buf = g_malloc(TARGET_PAGE_SIZE); - } - loaded_data = xbzrle_decoded_buf; - - /* extract RLE header */ - xh_flags = qemu_get_byte(f); - xh_len = qemu_get_be16(f); - - if (xh_flags != ENCODING_FLAG_XBZRLE) { - error_report("Failed to load XBZRLE page - wrong compression!"); - return -1; - } - - if (xh_len > TARGET_PAGE_SIZE) { - error_report("Failed to load XBZRLE page - len overflow!"); - return -1; - } - /* load data and decode */ - qemu_get_buffer_in_place(f, &loaded_data, xh_len); - - /* decode RLE */ - if (xbzrle_decode_buffer(loaded_data, xh_len, host, - TARGET_PAGE_SIZE) == -1) { - error_report("Failed to load XBZRLE page - decode error!"); - return -1; - } - - return 0; -} - -/* Must be called from within a rcu critical section. - * Returns a pointer from within the RCU-protected ram_list. - */ -/* - * Read a RAMBlock ID from the stream f. 
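- *
- * On the wire a block reference is a one-byte ID length followed by the
- * ID bytes; pages carrying RAM_SAVE_FLAG_CONTINUE omit this and reuse
- * the previously named block. A hypothetical stream fragment:
- *   [offset|flags][len=6]["pc.ram"][page data]
- *   [offset|flags|CONTINUE][page data] ...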
- * - * f: Stream to read from - * flags: Page flags (mostly to see if it's a continuation of previous block) - */ -static inline RAMBlock *ram_block_from_stream(QEMUFile *f, - int flags) -{ - static RAMBlock *block = NULL; - char id[256]; - uint8_t len; - - if (flags & RAM_SAVE_FLAG_CONTINUE) { - if (!block) { - error_report("Ack, bad migration stream!"); - return NULL; - } - return block; - } - - len = qemu_get_byte(f); - qemu_get_buffer(f, (uint8_t *)id, len); - id[len] = 0; - - block = qemu_ram_block_by_name(id); - if (!block) { - error_report("Can't find block %s", id); - return NULL; - } - - return block; -} - -static inline void *host_from_ram_block_offset(RAMBlock *block, - ram_addr_t offset) -{ - if (!offset_in_ramblock(block, offset)) { - return NULL; - } - - return block->host + offset; -} - -/* - * If a page (or a whole RDMA chunk) has been - * determined to be zero, then zap it. - */ -void ram_handle_compressed(void *host, uint8_t ch, uint64_t size) -{ - if (ch != 0 || !is_zero_range(host, size)) { - memset(host, ch, size); - } -} - -static void *do_data_decompress(void *opaque) -{ - DecompressParam *param = opaque; - unsigned long pagesize; - - while (!quit_decomp_thread) { - qemu_mutex_lock(¶m->mutex); - while (!param->start && !quit_decomp_thread) { - qemu_cond_wait(¶m->cond, ¶m->mutex); - pagesize = TARGET_PAGE_SIZE; - if (!quit_decomp_thread) { - /* uncompress() will return failed in some case, especially - * when the page is dirted when doing the compression, it's - * not a problem because the dirty page will be retransferred - * and uncompress() won't break the data in other pages. - */ - uncompress((Bytef *)param->des, &pagesize, - (const Bytef *)param->compbuf, param->len); - } - param->start = false; - } - qemu_mutex_unlock(¶m->mutex); - } - - return NULL; -} - -void migrate_decompress_threads_create(void) -{ - int i, thread_count; - - thread_count = migrate_decompress_threads(); - decompress_threads = g_new0(QemuThread, thread_count); - decomp_param = g_new0(DecompressParam, thread_count); - quit_decomp_thread = false; - for (i = 0; i < thread_count; i++) { - qemu_mutex_init(&decomp_param[i].mutex); - qemu_cond_init(&decomp_param[i].cond); - decomp_param[i].compbuf = g_malloc0(compressBound(TARGET_PAGE_SIZE)); - qemu_thread_create(decompress_threads + i, "decompress", - do_data_decompress, decomp_param + i, - QEMU_THREAD_JOINABLE); - } -} - -void migrate_decompress_threads_join(void) -{ - int i, thread_count; - - quit_decomp_thread = true; - thread_count = migrate_decompress_threads(); - for (i = 0; i < thread_count; i++) { - qemu_mutex_lock(&decomp_param[i].mutex); - qemu_cond_signal(&decomp_param[i].cond); - qemu_mutex_unlock(&decomp_param[i].mutex); - } - for (i = 0; i < thread_count; i++) { - qemu_thread_join(decompress_threads + i); - qemu_mutex_destroy(&decomp_param[i].mutex); - qemu_cond_destroy(&decomp_param[i].cond); - g_free(decomp_param[i].compbuf); - } - g_free(decompress_threads); - g_free(decomp_param); - decompress_threads = NULL; - decomp_param = NULL; -} - -static void decompress_data_with_multi_threads(QEMUFile *f, - void *host, int len) -{ - int idx, thread_count; - - thread_count = migrate_decompress_threads(); - while (true) { - for (idx = 0; idx < thread_count; idx++) { - if (!decomp_param[idx].start) { - qemu_get_buffer(f, decomp_param[idx].compbuf, len); - decomp_param[idx].des = host; - decomp_param[idx].len = len; - start_decompression(&decomp_param[idx]); - break; - } - } - if (idx < thread_count) { - break; - } - } -} - -/* - * Allocate 
data structures etc needed by incoming migration with postcopy-ram
- * postcopy-ram's similarly named postcopy_ram_incoming_init() does the work
- */
-int ram_postcopy_incoming_init(MigrationIncomingState *mis)
-{
-    size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
-
-    return postcopy_ram_incoming_init(mis, ram_pages);
-}
-
-/*
- * Called in postcopy mode by ram_load().
- * rcu_read_lock is taken prior to this being called.
- */
-static int ram_load_postcopy(QEMUFile *f)
-{
-    int flags = 0, ret = 0;
-    bool place_needed = false;
-    bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
-    MigrationIncomingState *mis = migration_incoming_get_current();
-    /* Temporary page that is later 'placed' */
-    void *postcopy_host_page = postcopy_get_tmp_page(mis);
-    void *last_host = NULL;
-    bool all_zero = false;
-
-    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
-        ram_addr_t addr;
-        void *host = NULL;
-        void *page_buffer = NULL;
-        void *place_source = NULL;
-        uint8_t ch;
-
-        addr = qemu_get_be64(f);
-        flags = addr & ~TARGET_PAGE_MASK;
-        addr &= TARGET_PAGE_MASK;
-
-        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
-        place_needed = false;
-        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
-            RAMBlock *block = ram_block_from_stream(f, flags);
-
-            host = host_from_ram_block_offset(block, addr);
-            if (!host) {
-                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
-                ret = -EINVAL;
-                break;
-            }
-            page_buffer = host;
-            /*
-             * Postcopy requires that we place whole host pages atomically.
-             * To make it atomic, the data is read into a temporary page
-             * that's moved into place later.
-             * The migration protocol uses, possibly smaller, target pages;
-             * however the source ensures it always sends all the components
-             * of a host page in order.
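-             *
-             * For example (hypothetical sizes): with 64KiB host pages and
-             * 4KiB target pages, the 16 target pages of one host page
-             * arrive in order, are staged into postcopy_host_page, and
-             * only the arrival of the 16th triggers the atomic placement
-             * below.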
- */ - page_buffer = postcopy_host_page + - ((uintptr_t)host & ~qemu_host_page_mask); - /* If all TP are zero then we can optimise the place */ - if (!((uintptr_t)host & ~qemu_host_page_mask)) { - all_zero = true; - } else { - /* not the 1st TP within the HP */ - if (host != (last_host + TARGET_PAGE_SIZE)) { - error_report("Non-sequential target page %p/%p", - host, last_host); - ret = -EINVAL; - break; - } - } - - - /* - * If it's the last part of a host page then we place the host - * page - */ - place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) & - ~qemu_host_page_mask) == 0; - place_source = postcopy_host_page; - } - last_host = host; - - switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { - case RAM_SAVE_FLAG_COMPRESS: - ch = qemu_get_byte(f); - memset(page_buffer, ch, TARGET_PAGE_SIZE); - if (ch) { - all_zero = false; - } - break; - - case RAM_SAVE_FLAG_PAGE: - all_zero = false; - if (!place_needed || !matching_page_sizes) { - qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE); - } else { - /* Avoids the qemu_file copy during postcopy, which is - * going to do a copy later; can only do it when we - * do this read in one go (matching page sizes) - */ - qemu_get_buffer_in_place(f, (uint8_t **)&place_source, - TARGET_PAGE_SIZE); - } - break; - case RAM_SAVE_FLAG_EOS: - /* normal exit */ - break; - default: - error_report("Unknown combination of migration flags: %#x" - " (postcopy mode)", flags); - ret = -EINVAL; - } - - if (place_needed) { - /* This gets called at the last target page in the host page */ - if (all_zero) { - ret = postcopy_place_page_zero(mis, - host + TARGET_PAGE_SIZE - - qemu_host_page_size); - } else { - ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE - - qemu_host_page_size, - place_source); - } - } - if (!ret) { - ret = qemu_file_get_error(f); - } - } - - return ret; -} - -static int ram_load(QEMUFile *f, void *opaque, int version_id) -{ - int flags = 0, ret = 0; - static uint64_t seq_iter; - int len = 0; - /* - * If system is running in postcopy mode, page inserts to host memory must - * be atomic - */ - bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING; - - seq_iter++; - - if (version_id != 4) { - ret = -EINVAL; - } - - /* This RCU critical section can be very long running. - * When RCU reclaims in the code start to become numerous, - * it will be necessary to reduce the granularity of this - * critical section. 
- */ - rcu_read_lock(); - - if (postcopy_running) { - ret = ram_load_postcopy(f); - } - - while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) { - ram_addr_t addr, total_ram_bytes; - void *host = NULL; - uint8_t ch; - - addr = qemu_get_be64(f); - flags = addr & ~TARGET_PAGE_MASK; - addr &= TARGET_PAGE_MASK; - - if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE | - RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) { - RAMBlock *block = ram_block_from_stream(f, flags); - - host = host_from_ram_block_offset(block, addr); - if (!host) { - error_report("Illegal RAM offset " RAM_ADDR_FMT, addr); - ret = -EINVAL; - break; - } - } - - switch (flags & ~RAM_SAVE_FLAG_CONTINUE) { - case RAM_SAVE_FLAG_MEM_SIZE: - /* Synchronize RAM block list */ - total_ram_bytes = addr; - while (!ret && total_ram_bytes) { - RAMBlock *block; - char id[256]; - ram_addr_t length; - - len = qemu_get_byte(f); - qemu_get_buffer(f, (uint8_t *)id, len); - id[len] = 0; - length = qemu_get_be64(f); - - block = qemu_ram_block_by_name(id); - if (block) { - if (length != block->used_length) { - Error *local_err = NULL; - - ret = qemu_ram_resize(block->offset, length, - &local_err); - if (local_err) { - error_report_err(local_err); - } - } - ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG, - block->idstr); - } else { - error_report("Unknown ramblock \"%s\", cannot " - "accept migration", id); - ret = -EINVAL; - } - - total_ram_bytes -= length; - } - break; - - case RAM_SAVE_FLAG_COMPRESS: - ch = qemu_get_byte(f); - ram_handle_compressed(host, ch, TARGET_PAGE_SIZE); - break; - - case RAM_SAVE_FLAG_PAGE: - qemu_get_buffer(f, host, TARGET_PAGE_SIZE); - break; - - case RAM_SAVE_FLAG_COMPRESS_PAGE: - len = qemu_get_be32(f); - if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) { - error_report("Invalid compressed data length: %d", len); - ret = -EINVAL; - break; - } - decompress_data_with_multi_threads(f, host, len); - break; - - case RAM_SAVE_FLAG_XBZRLE: - if (load_xbzrle(f, addr, host) < 0) { - error_report("Failed to decompress XBZRLE page at " - RAM_ADDR_FMT, addr); - ret = -EINVAL; - break; - } - break; - case RAM_SAVE_FLAG_EOS: - /* normal exit */ - break; - default: - if (flags & RAM_SAVE_FLAG_HOOK) { - ram_control_load_hook(f, RAM_CONTROL_HOOK, NULL); - } else { - error_report("Unknown combination of migration flags: %#x", - flags); - ret = -EINVAL; - } - } - if (!ret) { - ret = qemu_file_get_error(f); - } - } - - rcu_read_unlock(); - DPRINTF("Completed load of VM with exit code %d seq iteration " - "%" PRIu64 "\n", ret, seq_iter); - return ret; -} - -static SaveVMHandlers savevm_ram_handlers = { - .save_live_setup = ram_save_setup, - .save_live_iterate = ram_save_iterate, - .save_live_complete_postcopy = ram_save_complete, - .save_live_complete_precopy = ram_save_complete, - .save_live_pending = ram_save_pending, - .load_state = ram_load, - .cleanup = ram_migration_cleanup, -}; - -void ram_mig_init(void) -{ - qemu_mutex_init(&XBZRLE.lock); - register_savevm_live(NULL, "ram", 0, 4, &savevm_ram_handlers, NULL); -} diff --git a/qemu/migration/rdma.c b/qemu/migration/rdma.c deleted file mode 100644 index f6a9992b3..000000000 --- a/qemu/migration/rdma.c +++ /dev/null @@ -1,3516 +0,0 @@ -/* - * RDMA protocol and interfaces - * - * Copyright IBM, Corp. 2010-2013 - * - * Authors: - * Michael R. Hines <mrhines@us.ibm.com> - * Jiuxing Liu <jl@us.ibm.com> - * - * This work is licensed under the terms of the GNU GPL, version 2 or - * later. See the COPYING file in the top-level directory. 
- * - */ -#include "qemu/osdep.h" -#include "qapi/error.h" -#include "qemu-common.h" -#include "qemu/cutils.h" -#include "migration/migration.h" -#include "migration/qemu-file.h" -#include "exec/cpu-common.h" -#include "qemu/error-report.h" -#include "qemu/main-loop.h" -#include "qemu/sockets.h" -#include "qemu/bitmap.h" -#include "qemu/coroutine.h" -#include <sys/socket.h> -#include <netdb.h> -#include <arpa/inet.h> -#include <rdma/rdma_cma.h> -#include "trace.h" - -/* - * Print and error on both the Monitor and the Log file. - */ -#define ERROR(errp, fmt, ...) \ - do { \ - fprintf(stderr, "RDMA ERROR: " fmt "\n", ## __VA_ARGS__); \ - if (errp && (*(errp) == NULL)) { \ - error_setg(errp, "RDMA ERROR: " fmt, ## __VA_ARGS__); \ - } \ - } while (0) - -#define RDMA_RESOLVE_TIMEOUT_MS 10000 - -/* Do not merge data if larger than this. */ -#define RDMA_MERGE_MAX (2 * 1024 * 1024) -#define RDMA_SIGNALED_SEND_MAX (RDMA_MERGE_MAX / 4096) - -#define RDMA_REG_CHUNK_SHIFT 20 /* 1 MB */ - -/* - * This is only for non-live state being migrated. - * Instead of RDMA_WRITE messages, we use RDMA_SEND - * messages for that state, which requires a different - * delivery design than main memory. - */ -#define RDMA_SEND_INCREMENT 32768 - -/* - * Maximum size infiniband SEND message - */ -#define RDMA_CONTROL_MAX_BUFFER (512 * 1024) -#define RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE 4096 - -#define RDMA_CONTROL_VERSION_CURRENT 1 -/* - * Capabilities for negotiation. - */ -#define RDMA_CAPABILITY_PIN_ALL 0x01 - -/* - * Add the other flags above to this list of known capabilities - * as they are introduced. - */ -static uint32_t known_capabilities = RDMA_CAPABILITY_PIN_ALL; - -#define CHECK_ERROR_STATE() \ - do { \ - if (rdma->error_state) { \ - if (!rdma->error_reported) { \ - error_report("RDMA is in an error state waiting migration" \ - " to abort!"); \ - rdma->error_reported = 1; \ - } \ - return rdma->error_state; \ - } \ - } while (0); - -/* - * A work request ID is 64-bits and we split up these bits - * into 3 parts: - * - * bits 0-15 : type of control message, 2^16 - * bits 16-29: ram block index, 2^14 - * bits 30-63: ram block chunk number, 2^34 - * - * The last two bit ranges are only used for RDMA writes, - * in order to track their completion and potentially - * also track unregistration status of the message. - */ -#define RDMA_WRID_TYPE_SHIFT 0UL -#define RDMA_WRID_BLOCK_SHIFT 16UL -#define RDMA_WRID_CHUNK_SHIFT 30UL - -#define RDMA_WRID_TYPE_MASK \ - ((1UL << RDMA_WRID_BLOCK_SHIFT) - 1UL) - -#define RDMA_WRID_BLOCK_MASK \ - (~RDMA_WRID_TYPE_MASK & ((1UL << RDMA_WRID_CHUNK_SHIFT) - 1UL)) - -#define RDMA_WRID_CHUNK_MASK (~RDMA_WRID_BLOCK_MASK & ~RDMA_WRID_TYPE_MASK) - -/* - * RDMA migration protocol: - * 1. RDMA Writes (data messages, i.e. RAM) - * 2. IB Send/Recv (control channel messages) - */ -enum { - RDMA_WRID_NONE = 0, - RDMA_WRID_RDMA_WRITE = 1, - RDMA_WRID_SEND_CONTROL = 2000, - RDMA_WRID_RECV_CONTROL = 4000, -}; - -static const char *wrid_desc[] = { - [RDMA_WRID_NONE] = "NONE", - [RDMA_WRID_RDMA_WRITE] = "WRITE RDMA", - [RDMA_WRID_SEND_CONTROL] = "CONTROL SEND", - [RDMA_WRID_RECV_CONTROL] = "CONTROL RECV", -}; - -/* - * Work request IDs for IB SEND messages only (not RDMA writes). - * This is used by the migration protocol to transmit - * control messages (such as device state and registration commands) - * - * We could use more WRs, but we have enough for now. - */ -enum { - RDMA_WRID_READY = 0, - RDMA_WRID_DATA, - RDMA_WRID_CONTROL, - RDMA_WRID_MAX, -}; - -/* - * SEND/RECV IB Control Messages. 
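- *
- * (Aside, illustrating the WRID layout defined above with hypothetical
- * values: a write of chunk 5 in ram block 2 is tracked as
- *   (5ULL << RDMA_WRID_CHUNK_SHIFT) | (2ULL << RDMA_WRID_BLOCK_SHIFT) |
- *   RDMA_WRID_RDMA_WRITE
- * and decomposed again with the RDMA_WRID_*_MASK values on completion.)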
- */ -enum { - RDMA_CONTROL_NONE = 0, - RDMA_CONTROL_ERROR, - RDMA_CONTROL_READY, /* ready to receive */ - RDMA_CONTROL_QEMU_FILE, /* QEMUFile-transmitted bytes */ - RDMA_CONTROL_RAM_BLOCKS_REQUEST, /* RAMBlock synchronization */ - RDMA_CONTROL_RAM_BLOCKS_RESULT, /* RAMBlock synchronization */ - RDMA_CONTROL_COMPRESS, /* page contains repeat values */ - RDMA_CONTROL_REGISTER_REQUEST, /* dynamic page registration */ - RDMA_CONTROL_REGISTER_RESULT, /* key to use after registration */ - RDMA_CONTROL_REGISTER_FINISHED, /* current iteration finished */ - RDMA_CONTROL_UNREGISTER_REQUEST, /* dynamic UN-registration */ - RDMA_CONTROL_UNREGISTER_FINISHED, /* unpinning finished */ -}; - -static const char *control_desc[] = { - [RDMA_CONTROL_NONE] = "NONE", - [RDMA_CONTROL_ERROR] = "ERROR", - [RDMA_CONTROL_READY] = "READY", - [RDMA_CONTROL_QEMU_FILE] = "QEMU FILE", - [RDMA_CONTROL_RAM_BLOCKS_REQUEST] = "RAM BLOCKS REQUEST", - [RDMA_CONTROL_RAM_BLOCKS_RESULT] = "RAM BLOCKS RESULT", - [RDMA_CONTROL_COMPRESS] = "COMPRESS", - [RDMA_CONTROL_REGISTER_REQUEST] = "REGISTER REQUEST", - [RDMA_CONTROL_REGISTER_RESULT] = "REGISTER RESULT", - [RDMA_CONTROL_REGISTER_FINISHED] = "REGISTER FINISHED", - [RDMA_CONTROL_UNREGISTER_REQUEST] = "UNREGISTER REQUEST", - [RDMA_CONTROL_UNREGISTER_FINISHED] = "UNREGISTER FINISHED", -}; - -/* - * Memory and MR structures used to represent an IB Send/Recv work request. - * This is *not* used for RDMA writes, only IB Send/Recv. - */ -typedef struct { - uint8_t control[RDMA_CONTROL_MAX_BUFFER]; /* actual buffer to register */ - struct ibv_mr *control_mr; /* registration metadata */ - size_t control_len; /* length of the message */ - uint8_t *control_curr; /* start of unconsumed bytes */ -} RDMAWorkRequestData; - -/* - * Negotiate RDMA capabilities during connection-setup time. - */ -typedef struct { - uint32_t version; - uint32_t flags; -} RDMACapabilities; - -static void caps_to_network(RDMACapabilities *cap) -{ - cap->version = htonl(cap->version); - cap->flags = htonl(cap->flags); -} - -static void network_to_caps(RDMACapabilities *cap) -{ - cap->version = ntohl(cap->version); - cap->flags = ntohl(cap->flags); -} - -/* - * Representation of a RAMBlock from an RDMA perspective. - * This is not transmitted, only local. - * This and subsequent structures cannot be linked lists - * because we're using a single IB message to transmit - * the information. It's small anyway, so a list is overkill. - */ -typedef struct RDMALocalBlock { - char *block_name; - uint8_t *local_host_addr; /* local virtual address */ - uint64_t remote_host_addr; /* remote virtual address */ - uint64_t offset; - uint64_t length; - struct ibv_mr **pmr; /* MRs for chunk-level registration */ - struct ibv_mr *mr; /* MR for non-chunk-level registration */ - uint32_t *remote_keys; /* rkeys for chunk-level registration */ - uint32_t remote_rkey; /* rkeys for non-chunk-level registration */ - int index; /* which block are we */ - unsigned int src_index; /* (Only used on dest) */ - bool is_ram_block; - int nb_chunks; - unsigned long *transit_bitmap; - unsigned long *unregister_bitmap; -} RDMALocalBlock; - -/* - * Also represents a RAMblock, but only on the dest. - * This gets transmitted by the dest during connection-time - * to the source VM and then is used to populate the - * corresponding RDMALocalBlock with - * the information needed to perform the actual RDMA. 
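- *
- * (Each RDMADestBlock below travels as a fixed 32-byte record,
- * 3 x uint64_t + 2 x uint32_t, byte-swapped to network order by
- * dest_block_to_network() before transmission and back by
- * network_to_dest_block() on receipt.)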
- */ -typedef struct QEMU_PACKED RDMADestBlock { - uint64_t remote_host_addr; - uint64_t offset; - uint64_t length; - uint32_t remote_rkey; - uint32_t padding; -} RDMADestBlock; - -static uint64_t htonll(uint64_t v) -{ - union { uint32_t lv[2]; uint64_t llv; } u; - u.lv[0] = htonl(v >> 32); - u.lv[1] = htonl(v & 0xFFFFFFFFULL); - return u.llv; -} - -static uint64_t ntohll(uint64_t v) { - union { uint32_t lv[2]; uint64_t llv; } u; - u.llv = v; - return ((uint64_t)ntohl(u.lv[0]) << 32) | (uint64_t) ntohl(u.lv[1]); -} - -static void dest_block_to_network(RDMADestBlock *db) -{ - db->remote_host_addr = htonll(db->remote_host_addr); - db->offset = htonll(db->offset); - db->length = htonll(db->length); - db->remote_rkey = htonl(db->remote_rkey); -} - -static void network_to_dest_block(RDMADestBlock *db) -{ - db->remote_host_addr = ntohll(db->remote_host_addr); - db->offset = ntohll(db->offset); - db->length = ntohll(db->length); - db->remote_rkey = ntohl(db->remote_rkey); -} - -/* - * Virtual address of the above structures used for transmitting - * the RAMBlock descriptions at connection-time. - * This structure is *not* transmitted. - */ -typedef struct RDMALocalBlocks { - int nb_blocks; - bool init; /* main memory init complete */ - RDMALocalBlock *block; -} RDMALocalBlocks; - -/* - * Main data structure for RDMA state. - * While there is only one copy of this structure being allocated right now, - * this is the place where one would start if you wanted to consider - * having more than one RDMA connection open at the same time. - */ -typedef struct RDMAContext { - char *host; - int port; - - RDMAWorkRequestData wr_data[RDMA_WRID_MAX]; - - /* - * This is used by *_exchange_send() to figure out whether or not - * the initial "READY" message has already been received or not. - * This is because other functions may potentially poll() and detect - * the READY message before send() does, in which case we need to - * know if it completed. - */ - int control_ready_expected; - - /* number of outstanding writes */ - int nb_sent; - - /* store info about current buffer so that we can - merge it with future sends */ - uint64_t current_addr; - uint64_t current_length; - /* index of ram block the current buffer belongs to */ - int current_index; - /* index of the chunk in the current ram block */ - int current_chunk; - - bool pin_all; - - /* - * infiniband-specific variables for opening the device - * and maintaining connection state and so forth. - * - * cm_id also has ibv_context, rdma_event_channel, and ibv_qp in - * cm_id->verbs, cm_id->channel, and cm_id->qp. - */ - struct rdma_cm_id *cm_id; /* connection manager ID */ - struct rdma_cm_id *listen_id; - bool connected; - - struct ibv_context *verbs; - struct rdma_event_channel *channel; - struct ibv_qp *qp; /* queue pair */ - struct ibv_comp_channel *comp_channel; /* completion channel */ - struct ibv_pd *pd; /* protection domain */ - struct ibv_cq *cq; /* completion queue */ - - /* - * If a previous write failed (perhaps because of a failed - * memory registration, then do not attempt any future work - * and remember the error state. - */ - int error_state; - int error_reported; - - /* - * Description of ram blocks used throughout the code. - */ - RDMALocalBlocks local_ram_blocks; - RDMADestBlock *dest_blocks; - - /* Index of the next RAMBlock received during block registration */ - unsigned int next_src_index; - - /* - * Migration on *destination* started. - * Then use coroutine yield function. - * Source runs in a thread, so we don't care. 
- */ - int migration_started_on_destination; - - int total_registrations; - int total_writes; - - int unregister_current, unregister_next; - uint64_t unregistrations[RDMA_SIGNALED_SEND_MAX]; - - GHashTable *blockmap; -} RDMAContext; - -/* - * Interface to the rest of the migration call stack. - */ -typedef struct QEMUFileRDMA { - RDMAContext *rdma; - size_t len; - void *file; -} QEMUFileRDMA; - -/* - * Main structure for IB Send/Recv control messages. - * This gets prepended at the beginning of every Send/Recv. - */ -typedef struct QEMU_PACKED { - uint32_t len; /* Total length of data portion */ - uint32_t type; /* which control command to perform */ - uint32_t repeat; /* number of commands in data portion of same type */ - uint32_t padding; -} RDMAControlHeader; - -static void control_to_network(RDMAControlHeader *control) -{ - control->type = htonl(control->type); - control->len = htonl(control->len); - control->repeat = htonl(control->repeat); -} - -static void network_to_control(RDMAControlHeader *control) -{ - control->type = ntohl(control->type); - control->len = ntohl(control->len); - control->repeat = ntohl(control->repeat); -} - -/* - * Register a single Chunk. - * Information sent by the source VM to inform the dest - * to register an single chunk of memory before we can perform - * the actual RDMA operation. - */ -typedef struct QEMU_PACKED { - union QEMU_PACKED { - uint64_t current_addr; /* offset into the ram_addr_t space */ - uint64_t chunk; /* chunk to lookup if unregistering */ - } key; - uint32_t current_index; /* which ramblock the chunk belongs to */ - uint32_t padding; - uint64_t chunks; /* how many sequential chunks to register */ -} RDMARegister; - -static void register_to_network(RDMAContext *rdma, RDMARegister *reg) -{ - RDMALocalBlock *local_block; - local_block = &rdma->local_ram_blocks.block[reg->current_index]; - - if (local_block->is_ram_block) { - /* - * current_addr as passed in is an address in the local ram_addr_t - * space, we need to translate this for the destination - */ - reg->key.current_addr -= local_block->offset; - reg->key.current_addr += rdma->dest_blocks[reg->current_index].offset; - } - reg->key.current_addr = htonll(reg->key.current_addr); - reg->current_index = htonl(reg->current_index); - reg->chunks = htonll(reg->chunks); -} - -static void network_to_register(RDMARegister *reg) -{ - reg->key.current_addr = ntohll(reg->key.current_addr); - reg->current_index = ntohl(reg->current_index); - reg->chunks = ntohll(reg->chunks); -} - -typedef struct QEMU_PACKED { - uint32_t value; /* if zero, we will madvise() */ - uint32_t block_idx; /* which ram block index */ - uint64_t offset; /* Address in remote ram_addr_t space */ - uint64_t length; /* length of the chunk */ -} RDMACompress; - -static void compress_to_network(RDMAContext *rdma, RDMACompress *comp) -{ - comp->value = htonl(comp->value); - /* - * comp->offset as passed in is an address in the local ram_addr_t - * space, we need to translate this for the destination - */ - comp->offset -= rdma->local_ram_blocks.block[comp->block_idx].offset; - comp->offset += rdma->dest_blocks[comp->block_idx].offset; - comp->block_idx = htonl(comp->block_idx); - comp->offset = htonll(comp->offset); - comp->length = htonll(comp->length); -} - -static void network_to_compress(RDMACompress *comp) -{ - comp->value = ntohl(comp->value); - comp->block_idx = ntohl(comp->block_idx); - comp->offset = ntohll(comp->offset); - comp->length = ntohll(comp->length); -} - -/* - * The result of the dest's memory registration 
produces an "rkey" - * which the source VM must reference in order to perform - * the RDMA operation. - */ -typedef struct QEMU_PACKED { - uint32_t rkey; - uint32_t padding; - uint64_t host_addr; -} RDMARegisterResult; - -static void result_to_network(RDMARegisterResult *result) -{ - result->rkey = htonl(result->rkey); - result->host_addr = htonll(result->host_addr); -}; - -static void network_to_result(RDMARegisterResult *result) -{ - result->rkey = ntohl(result->rkey); - result->host_addr = ntohll(result->host_addr); -}; - -const char *print_wrid(int wrid); -static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head, - uint8_t *data, RDMAControlHeader *resp, - int *resp_idx, - int (*callback)(RDMAContext *rdma)); - -static inline uint64_t ram_chunk_index(const uint8_t *start, - const uint8_t *host) -{ - return ((uintptr_t) host - (uintptr_t) start) >> RDMA_REG_CHUNK_SHIFT; -} - -static inline uint8_t *ram_chunk_start(const RDMALocalBlock *rdma_ram_block, - uint64_t i) -{ - return (uint8_t *)(uintptr_t)(rdma_ram_block->local_host_addr + - (i << RDMA_REG_CHUNK_SHIFT)); -} - -static inline uint8_t *ram_chunk_end(const RDMALocalBlock *rdma_ram_block, - uint64_t i) -{ - uint8_t *result = ram_chunk_start(rdma_ram_block, i) + - (1UL << RDMA_REG_CHUNK_SHIFT); - - if (result > (rdma_ram_block->local_host_addr + rdma_ram_block->length)) { - result = rdma_ram_block->local_host_addr + rdma_ram_block->length; - } - - return result; -} - -static int rdma_add_block(RDMAContext *rdma, const char *block_name, - void *host_addr, - ram_addr_t block_offset, uint64_t length) -{ - RDMALocalBlocks *local = &rdma->local_ram_blocks; - RDMALocalBlock *block; - RDMALocalBlock *old = local->block; - - local->block = g_new0(RDMALocalBlock, local->nb_blocks + 1); - - if (local->nb_blocks) { - int x; - - if (rdma->blockmap) { - for (x = 0; x < local->nb_blocks; x++) { - g_hash_table_remove(rdma->blockmap, - (void *)(uintptr_t)old[x].offset); - g_hash_table_insert(rdma->blockmap, - (void *)(uintptr_t)old[x].offset, - &local->block[x]); - } - } - memcpy(local->block, old, sizeof(RDMALocalBlock) * local->nb_blocks); - g_free(old); - } - - block = &local->block[local->nb_blocks]; - - block->block_name = g_strdup(block_name); - block->local_host_addr = host_addr; - block->offset = block_offset; - block->length = length; - block->index = local->nb_blocks; - block->src_index = ~0U; /* Filled in by the receipt of the block list */ - block->nb_chunks = ram_chunk_index(host_addr, host_addr + length) + 1UL; - block->transit_bitmap = bitmap_new(block->nb_chunks); - bitmap_clear(block->transit_bitmap, 0, block->nb_chunks); - block->unregister_bitmap = bitmap_new(block->nb_chunks); - bitmap_clear(block->unregister_bitmap, 0, block->nb_chunks); - block->remote_keys = g_new0(uint32_t, block->nb_chunks); - - block->is_ram_block = local->init ? false : true; - - if (rdma->blockmap) { - g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block); - } - - trace_rdma_add_block(block_name, local->nb_blocks, - (uintptr_t) block->local_host_addr, - block->offset, block->length, - (uintptr_t) (block->local_host_addr + block->length), - BITS_TO_LONGS(block->nb_chunks) * - sizeof(unsigned long) * 8, - block->nb_chunks); - - local->nb_blocks++; - - return 0; -} - -/* - * Memory regions need to be registered with the device and queue pairs setup - * in advanced before the migration starts. This tells us where the RAM blocks - * are so that we can register them individually. 
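- *
- * (Worked example, hypothetical block: with RDMA_REG_CHUNK_SHIFT = 20
- * chunks are 1MiB, so a 4GiB RAM block is tracked as roughly 4096
- * chunks; rdma_add_block() sizes the transit and unregister bitmaps
- * from this chunk count.)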
- */ -static int qemu_rdma_init_one_block(const char *block_name, void *host_addr, - ram_addr_t block_offset, ram_addr_t length, void *opaque) -{ - return rdma_add_block(opaque, block_name, host_addr, block_offset, length); -} - -/* - * Identify the RAMBlocks and their quantity. They will be references to - * identify chunk boundaries inside each RAMBlock and also be referenced - * during dynamic page registration. - */ -static int qemu_rdma_init_ram_blocks(RDMAContext *rdma) -{ - RDMALocalBlocks *local = &rdma->local_ram_blocks; - - assert(rdma->blockmap == NULL); - memset(local, 0, sizeof *local); - qemu_ram_foreach_block(qemu_rdma_init_one_block, rdma); - trace_qemu_rdma_init_ram_blocks(local->nb_blocks); - rdma->dest_blocks = g_new0(RDMADestBlock, - rdma->local_ram_blocks.nb_blocks); - local->init = true; - return 0; -} - -/* - * Note: If used outside of cleanup, the caller must ensure that the destination - * block structures are also updated - */ -static int rdma_delete_block(RDMAContext *rdma, RDMALocalBlock *block) -{ - RDMALocalBlocks *local = &rdma->local_ram_blocks; - RDMALocalBlock *old = local->block; - int x; - - if (rdma->blockmap) { - g_hash_table_remove(rdma->blockmap, (void *)(uintptr_t)block->offset); - } - if (block->pmr) { - int j; - - for (j = 0; j < block->nb_chunks; j++) { - if (!block->pmr[j]) { - continue; - } - ibv_dereg_mr(block->pmr[j]); - rdma->total_registrations--; - } - g_free(block->pmr); - block->pmr = NULL; - } - - if (block->mr) { - ibv_dereg_mr(block->mr); - rdma->total_registrations--; - block->mr = NULL; - } - - g_free(block->transit_bitmap); - block->transit_bitmap = NULL; - - g_free(block->unregister_bitmap); - block->unregister_bitmap = NULL; - - g_free(block->remote_keys); - block->remote_keys = NULL; - - g_free(block->block_name); - block->block_name = NULL; - - if (rdma->blockmap) { - for (x = 0; x < local->nb_blocks; x++) { - g_hash_table_remove(rdma->blockmap, - (void *)(uintptr_t)old[x].offset); - } - } - - if (local->nb_blocks > 1) { - - local->block = g_new0(RDMALocalBlock, local->nb_blocks - 1); - - if (block->index) { - memcpy(local->block, old, sizeof(RDMALocalBlock) * block->index); - } - - if (block->index < (local->nb_blocks - 1)) { - memcpy(local->block + block->index, old + (block->index + 1), - sizeof(RDMALocalBlock) * - (local->nb_blocks - (block->index + 1))); - } - } else { - assert(block == local->block); - local->block = NULL; - } - - trace_rdma_delete_block(block, (uintptr_t)block->local_host_addr, - block->offset, block->length, - (uintptr_t)(block->local_host_addr + block->length), - BITS_TO_LONGS(block->nb_chunks) * - sizeof(unsigned long) * 8, block->nb_chunks); - - g_free(old); - - local->nb_blocks--; - - if (local->nb_blocks && rdma->blockmap) { - for (x = 0; x < local->nb_blocks; x++) { - g_hash_table_insert(rdma->blockmap, - (void *)(uintptr_t)local->block[x].offset, - &local->block[x]); - } - } - - return 0; -} - -/* - * Put in the log file which RDMA device was opened and the details - * associated with that device. 
- */
-static void qemu_rdma_dump_id(const char *who, struct ibv_context *verbs)
-{
-    struct ibv_port_attr port;
-
-    if (ibv_query_port(verbs, 1, &port)) {
-        error_report("Failed to query port information");
-        return;
-    }
-
-    printf("%s RDMA Device opened: kernel name %s "
-           "uverbs device name %s, "
-           "infiniband_verbs class device path %s, "
-           "infiniband class device path %s, "
-           "transport: (%d) %s\n",
-                who,
-                verbs->device->name,
-                verbs->device->dev_name,
-                verbs->device->dev_path,
-                verbs->device->ibdev_path,
-                port.link_layer,
-                (port.link_layer == IBV_LINK_LAYER_INFINIBAND) ? "Infiniband" :
-                 ((port.link_layer == IBV_LINK_LAYER_ETHERNET)
-                    ? "Ethernet" : "Unknown"));
-}
-
-/*
- * Put in the log file the RDMA gid addressing information,
- * useful for folks who have trouble understanding the
- * RDMA device hierarchy in the kernel.
- */
-static void qemu_rdma_dump_gid(const char *who, struct rdma_cm_id *id)
-{
-    char sgid[33];
-    char dgid[33];
-    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.sgid, sgid, sizeof sgid);
-    inet_ntop(AF_INET6, &id->route.addr.addr.ibaddr.dgid, dgid, sizeof dgid);
-    trace_qemu_rdma_dump_gid(who, sgid, dgid);
-}
-
-/*
- * As of now, IPv6 over RoCE / iWARP is not supported by linux.
- * We will try the next addrinfo struct, and fail if there are
- * no other valid addresses to bind against.
- *
- * If the user is listening on '[::]', then we will not have opened a device
- * yet and have no way of verifying if the device is RoCE or not.
- *
- * In this case, the source VM will throw an error for ALL types of
- * connections (both IPv4 and IPv6) if the destination machine does not have
- * a regular infiniband network available for use.
- *
- * The only way to guarantee that an error is thrown for broken kernels is
- * for the management software to choose a *specific* interface at bind time
- * and validate what type of hardware it is.
- *
- * Unfortunately, this puts the user in a fix:
- *
- * If the source VM connects with an IPv4 address without knowing that the
- * destination has bound to '[::]' the migration will unconditionally fail
- * unless the management software is explicitly listening on the IPv4
- * address while using a RoCE-based device.
- *
- * If the source VM connects with an IPv6 address, then we're OK because we can
- * throw an error on the source (and similarly on the destination).
- *
- * But in mixed environments, this will be broken for a while until it is fixed
- * inside linux.
- *
- * We do provide a *tiny* bit of help in this function: We can list all of the
- * devices in the system and check to see if all the devices are RoCE or
- * Infiniband.
- *
- * If we detect that we have a *pure* RoCE environment, then we can safely
- * throw an error even if the management software has specified '[::]' as the
- * bind address.
- *
- * However, if there are multiple heterogeneous devices, then we cannot make
- * this assumption and the user just has to be sure they know what they are
- * doing.
- *
- * Patches are being reviewed on linux-rdma.
- */
-static int qemu_rdma_broken_ipv6_kernel(Error **errp, struct ibv_context *verbs)
-{
-    struct ibv_port_attr port_attr;
-
-    /* This bug only exists in linux, to our knowledge. */
-#ifdef CONFIG_LINUX
-
-    /*
-     * Verbs are only NULL if management has bound to '[::]'.
-     *
-     * Let's iterate through all the devices and see if there are any pure IB
-     * devices (non-ethernet).
-     *
-     * If not, then we can safely proceed with the migration.
-     * Otherwise, there are no guarantees until the bug is fixed in linux.
-     */
-    if (!verbs) {
-        int num_devices, x;
-        struct ibv_device ** dev_list = ibv_get_device_list(&num_devices);
-        bool roce_found = false;
-        bool ib_found = false;
-
-        for (x = 0; x < num_devices; x++) {
-            verbs = ibv_open_device(dev_list[x]);
-            if (!verbs) {
-                if (errno == EPERM) {
-                    continue;
-                } else {
-                    return -EINVAL;
-                }
-            }
-
-            if (ibv_query_port(verbs, 1, &port_attr)) {
-                ibv_close_device(verbs);
-                ERROR(errp, "Could not query initial IB port");
-                return -EINVAL;
-            }
-
-            if (port_attr.link_layer == IBV_LINK_LAYER_INFINIBAND) {
-                ib_found = true;
-            } else if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
-                roce_found = true;
-            }
-
-            ibv_close_device(verbs);
-
-        }
-
-        if (roce_found) {
-            if (ib_found) {
-                fprintf(stderr, "WARN: migrations may fail:"
-                                " IPv6 over RoCE / iWARP in linux"
-                                " is broken. But since you appear to have a"
-                                " mixed RoCE / IB environment, be sure to only"
-                                " migrate over the IB fabric until the kernel"
-                                " fixes the bug.\n");
-            } else {
-                ERROR(errp, "You only have RoCE / iWARP devices in your system"
-                            " and your management software has specified '[::]'"
-                            ", but IPv6 over RoCE / iWARP is not supported in Linux.");
-                return -ENONET;
-            }
-        }
-
-        return 0;
-    }
-
-    /*
-     * If we have a verbs context, that means that something other than
-     * '[::]' was used by the management software for binding, in which
-     * case we can actually warn the user about a potentially broken kernel.
-     */
-
-    /* IB ports start with 1, not 0 */
-    if (ibv_query_port(verbs, 1, &port_attr)) {
-        ERROR(errp, "Could not query initial IB port");
-        return -EINVAL;
-    }
-
-    if (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET) {
-        ERROR(errp, "Linux kernel's RoCE / iWARP does not support IPv6 "
-                    "(but patches on linux-rdma in progress)");
-        return -ENONET;
-    }
-
-#endif
-
-    return 0;
-}
-
-/*
- * Figure out which RDMA device corresponds to the requested IP hostname.
- * Also create the initial connection manager identifiers for opening
- * the connection.
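- *
- * The call sequence below is the standard librdmacm client handshake;
- * condensed to a sketch (hostname and port are placeholders, and the
- * event-channel waits are spelled out in the code that follows):
- *
- *     struct rdma_event_channel *ch = rdma_create_event_channel();
- *     struct rdma_cm_id *id;
- *     struct rdma_addrinfo *res;
- *
- *     rdma_create_id(ch, &id, NULL, RDMA_PS_TCP);
- *     rdma_getaddrinfo("dest-host", "4444", NULL, &res);
- *     rdma_resolve_addr(id, NULL, res->ai_dst_addr, RDMA_RESOLVE_TIMEOUT_MS);
- *     ...wait on ch for RDMA_CM_EVENT_ADDR_RESOLVED...
- *     rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT_MS);
- *     ...wait on ch for RDMA_CM_EVENT_ROUTE_RESOLVED...
- *
- * after which id->verbs refers to the device that routes to that host.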
- */ -static int qemu_rdma_resolve_host(RDMAContext *rdma, Error **errp) -{ - int ret; - struct rdma_addrinfo *res; - char port_str[16]; - struct rdma_cm_event *cm_event; - char ip[40] = "unknown"; - struct rdma_addrinfo *e; - - if (rdma->host == NULL || !strcmp(rdma->host, "")) { - ERROR(errp, "RDMA hostname has not been set"); - return -EINVAL; - } - - /* create CM channel */ - rdma->channel = rdma_create_event_channel(); - if (!rdma->channel) { - ERROR(errp, "could not create CM channel"); - return -EINVAL; - } - - /* create CM id */ - ret = rdma_create_id(rdma->channel, &rdma->cm_id, NULL, RDMA_PS_TCP); - if (ret) { - ERROR(errp, "could not create channel id"); - goto err_resolve_create_id; - } - - snprintf(port_str, 16, "%d", rdma->port); - port_str[15] = '\0'; - - ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res); - if (ret < 0) { - ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host); - goto err_resolve_get_addr; - } - - for (e = res; e != NULL; e = e->ai_next) { - inet_ntop(e->ai_family, - &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip); - trace_qemu_rdma_resolve_host_trying(rdma->host, ip); - - ret = rdma_resolve_addr(rdma->cm_id, NULL, e->ai_dst_addr, - RDMA_RESOLVE_TIMEOUT_MS); - if (!ret) { - if (e->ai_family == AF_INET6) { - ret = qemu_rdma_broken_ipv6_kernel(errp, rdma->cm_id->verbs); - if (ret) { - continue; - } - } - goto route; - } - } - - ERROR(errp, "could not resolve address %s", rdma->host); - goto err_resolve_get_addr; - -route: - qemu_rdma_dump_gid("source_resolve_addr", rdma->cm_id); - - ret = rdma_get_cm_event(rdma->channel, &cm_event); - if (ret) { - ERROR(errp, "could not perform event_addr_resolved"); - goto err_resolve_get_addr; - } - - if (cm_event->event != RDMA_CM_EVENT_ADDR_RESOLVED) { - ERROR(errp, "result not equal to event_addr_resolved %s", - rdma_event_str(cm_event->event)); - perror("rdma_resolve_addr"); - rdma_ack_cm_event(cm_event); - ret = -EINVAL; - goto err_resolve_get_addr; - } - rdma_ack_cm_event(cm_event); - - /* resolve route */ - ret = rdma_resolve_route(rdma->cm_id, RDMA_RESOLVE_TIMEOUT_MS); - if (ret) { - ERROR(errp, "could not resolve rdma route"); - goto err_resolve_get_addr; - } - - ret = rdma_get_cm_event(rdma->channel, &cm_event); - if (ret) { - ERROR(errp, "could not perform event_route_resolved"); - goto err_resolve_get_addr; - } - if (cm_event->event != RDMA_CM_EVENT_ROUTE_RESOLVED) { - ERROR(errp, "result not equal to event_route_resolved: %s", - rdma_event_str(cm_event->event)); - rdma_ack_cm_event(cm_event); - ret = -EINVAL; - goto err_resolve_get_addr; - } - rdma_ack_cm_event(cm_event); - rdma->verbs = rdma->cm_id->verbs; - qemu_rdma_dump_id("source_resolve_host", rdma->cm_id->verbs); - qemu_rdma_dump_gid("source_resolve_host", rdma->cm_id); - return 0; - -err_resolve_get_addr: - rdma_destroy_id(rdma->cm_id); - rdma->cm_id = NULL; -err_resolve_create_id: - rdma_destroy_event_channel(rdma->channel); - rdma->channel = NULL; - return ret; -} - -/* - * Create protection domain and completion queues - */ -static int qemu_rdma_alloc_pd_cq(RDMAContext *rdma) -{ - /* allocate pd */ - rdma->pd = ibv_alloc_pd(rdma->verbs); - if (!rdma->pd) { - error_report("failed to allocate protection domain"); - return -1; - } - - /* create completion channel */ - rdma->comp_channel = ibv_create_comp_channel(rdma->verbs); - if (!rdma->comp_channel) { - error_report("failed to allocate completion channel"); - goto err_alloc_pd_cq; - } - - /* - * Completion queue can be filled by both read and write work requests, - 
* so must reflect the sum of both possible queue sizes. - */ - rdma->cq = ibv_create_cq(rdma->verbs, (RDMA_SIGNALED_SEND_MAX * 3), - NULL, rdma->comp_channel, 0); - if (!rdma->cq) { - error_report("failed to allocate completion queue"); - goto err_alloc_pd_cq; - } - - return 0; - -err_alloc_pd_cq: - if (rdma->pd) { - ibv_dealloc_pd(rdma->pd); - } - if (rdma->comp_channel) { - ibv_destroy_comp_channel(rdma->comp_channel); - } - rdma->pd = NULL; - rdma->comp_channel = NULL; - return -1; - -} - -/* - * Create queue pairs. - */ -static int qemu_rdma_alloc_qp(RDMAContext *rdma) -{ - struct ibv_qp_init_attr attr = { 0 }; - int ret; - - attr.cap.max_send_wr = RDMA_SIGNALED_SEND_MAX; - attr.cap.max_recv_wr = 3; - attr.cap.max_send_sge = 1; - attr.cap.max_recv_sge = 1; - attr.send_cq = rdma->cq; - attr.recv_cq = rdma->cq; - attr.qp_type = IBV_QPT_RC; - - ret = rdma_create_qp(rdma->cm_id, rdma->pd, &attr); - if (ret) { - return -1; - } - - rdma->qp = rdma->cm_id->qp; - return 0; -} - -static int qemu_rdma_reg_whole_ram_blocks(RDMAContext *rdma) -{ - int i; - RDMALocalBlocks *local = &rdma->local_ram_blocks; - - for (i = 0; i < local->nb_blocks; i++) { - local->block[i].mr = - ibv_reg_mr(rdma->pd, - local->block[i].local_host_addr, - local->block[i].length, - IBV_ACCESS_LOCAL_WRITE | - IBV_ACCESS_REMOTE_WRITE - ); - if (!local->block[i].mr) { - perror("Failed to register local dest ram block!\n"); - break; - } - rdma->total_registrations++; - } - - if (i >= local->nb_blocks) { - return 0; - } - - for (i--; i >= 0; i--) { - ibv_dereg_mr(local->block[i].mr); - rdma->total_registrations--; - } - - return -1; - -} - -/* - * Find the ram block that corresponds to the page requested to be - * transmitted by QEMU. - * - * Once the block is found, also identify which 'chunk' within that - * block that the page belongs to. - * - * This search cannot fail or the migration will fail. - */ -static int qemu_rdma_search_ram_block(RDMAContext *rdma, - uintptr_t block_offset, - uint64_t offset, - uint64_t length, - uint64_t *block_index, - uint64_t *chunk_index) -{ - uint64_t current_addr = block_offset + offset; - RDMALocalBlock *block = g_hash_table_lookup(rdma->blockmap, - (void *) block_offset); - assert(block); - assert(current_addr >= block->offset); - assert((current_addr + length) <= (block->offset + block->length)); - - *block_index = block->index; - *chunk_index = ram_chunk_index(block->local_host_addr, - block->local_host_addr + (current_addr - block->offset)); - - return 0; -} - -/* - * Register a chunk with IB. If the chunk was already registered - * previously, then skip. - * - * Also return the keys associated with the registration needed - * to perform the actual RDMA operation. - */ -static int qemu_rdma_register_and_get_keys(RDMAContext *rdma, - RDMALocalBlock *block, uintptr_t host_addr, - uint32_t *lkey, uint32_t *rkey, int chunk, - uint8_t *chunk_start, uint8_t *chunk_end) -{ - if (block->mr) { - if (lkey) { - *lkey = block->mr->lkey; - } - if (rkey) { - *rkey = block->mr->rkey; - } - return 0; - } - - /* allocate memory to store chunk MRs */ - if (!block->pmr) { - block->pmr = g_new0(struct ibv_mr *, block->nb_chunks); - } - - /* - * If 'rkey', then we're the destination, so grant access to the source. - * - * If 'lkey', then we're the source VM, so grant access only to ourselves. - */ - if (!block->pmr[chunk]) { - uint64_t len = chunk_end - chunk_start; - - trace_qemu_rdma_register_and_get_keys(len, chunk_start); - - block->pmr[chunk] = ibv_reg_mr(rdma->pd, - chunk_start, len, - (rkey ? 
(IBV_ACCESS_LOCAL_WRITE |
-                                         IBV_ACCESS_REMOTE_WRITE) : 0));
-
-        if (!block->pmr[chunk]) {
-            perror("Failed to register chunk!");
-            fprintf(stderr, "Chunk details: block: %d chunk index %d"
-                            " start %" PRIuPTR " end %" PRIuPTR
-                            " host %" PRIuPTR
-                            " local %" PRIuPTR " registrations: %d\n",
-                            block->index, chunk, (uintptr_t)chunk_start,
-                            (uintptr_t)chunk_end, host_addr,
-                            (uintptr_t)block->local_host_addr,
-                            rdma->total_registrations);
-            return -1;
-        }
-        rdma->total_registrations++;
-    }
-
-    if (lkey) {
-        *lkey = block->pmr[chunk]->lkey;
-    }
-    if (rkey) {
-        *rkey = block->pmr[chunk]->rkey;
-    }
-    return 0;
-}
-
-/*
- * Register (at connection time) the memory used for control
- * channel messages.
- */
-static int qemu_rdma_reg_control(RDMAContext *rdma, int idx)
-{
-    rdma->wr_data[idx].control_mr = ibv_reg_mr(rdma->pd,
-            rdma->wr_data[idx].control, RDMA_CONTROL_MAX_BUFFER,
-            IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE);
-    if (rdma->wr_data[idx].control_mr) {
-        rdma->total_registrations++;
-        return 0;
-    }
-    error_report("qemu_rdma_reg_control failed");
-    return -1;
-}
-
-const char *print_wrid(int wrid)
-{
-    if (wrid >= RDMA_WRID_RECV_CONTROL) {
-        return wrid_desc[RDMA_WRID_RECV_CONTROL];
-    }
-    return wrid_desc[wrid];
-}
-
-/*
- * RDMA requires memory registration (mlock/pinning), but this is not good for
- * overcommitment.
- *
- * In preparation for the future where LRU information or workload-specific
- * writable working set memory access behavior is available to QEMU,
- * it would be nice to have in place the ability to UN-register/UN-pin
- * particular memory regions from the RDMA hardware when it is determined that
- * those regions of memory will likely not be accessed again in the near future.
- *
- * While we do not yet have such information right now, the following
- * compile-time option allows us to perform a non-optimized version of this
- * behavior.
- *
- * By uncommenting this option, you will cause *all* RDMA transfers to be
- * unregistered immediately after the transfer completes on both sides of the
- * connection. This has no effect in 'rdma-pin-all' mode, only regular mode.
- *
- * This will have a terrible impact on migration performance, so until future
- * workload information or LRU information is available, do not attempt to use
- * this feature except for basic testing.
- */
-//#define RDMA_UNREGISTRATION_EXAMPLE
-
-/*
- * Perform a non-optimized memory unregistration after every transfer
- * for demonstration purposes, only if pin-all is not requested.
- *
- * Potential optimizations:
- * 1. Start a new thread to run this function continuously
-        - for bit clearing
-        - and for receipt of unregister messages
- * 2. Use an LRU.
- * 3. Use workload hints.
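- *
- * As a sketch of what optimization 2 could look like (entirely
- * hypothetical; none of these names exist in this file), an LRU of
- * registered chunks could be maintained and the coldest entry unpinned
- * whenever total_registrations exceeds some budget:
- *
- *     typedef struct LRUChunk {
- *         uint64_t block_index;
- *         uint64_t chunk;
- *         QTAILQ_ENTRY(LRUChunk) node;
- *     } LRUChunk;
- *
- *     static QTAILQ_HEAD(, LRUChunk) lru = QTAILQ_HEAD_INITIALIZER(lru);
- *
- *     static void lru_touch(LRUChunk *c)
- *     {
- *         QTAILQ_REMOVE(&lru, c, node);
- *         QTAILQ_INSERT_HEAD(&lru, c, node);
- *     }
- *
- * where lru_touch() runs on every write to the chunk, and the eviction
- * path feeds the tail entry's (block_index, chunk) pair to
- * qemu_rdma_signal_unregister(), exactly as the completion path does
- * under the #ifdef below.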
- */
-static int qemu_rdma_unregister_waiting(RDMAContext *rdma)
-{
-    while (rdma->unregistrations[rdma->unregister_current]) {
-        int ret;
-        uint64_t wr_id = rdma->unregistrations[rdma->unregister_current];
-        uint64_t chunk =
-            (wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT;
-        uint64_t index =
-            (wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT;
-        RDMALocalBlock *block =
-            &(rdma->local_ram_blocks.block[index]);
-        RDMARegister reg = { .current_index = index };
-        RDMAControlHeader resp = { .type = RDMA_CONTROL_UNREGISTER_FINISHED,
-                                 };
-        RDMAControlHeader head = { .len = sizeof(RDMARegister),
-                                   .type = RDMA_CONTROL_UNREGISTER_REQUEST,
-                                   .repeat = 1,
-                                 };
-
-        trace_qemu_rdma_unregister_waiting_proc(chunk,
-                                                rdma->unregister_current);
-
-        rdma->unregistrations[rdma->unregister_current] = 0;
-        rdma->unregister_current++;
-
-        if (rdma->unregister_current == RDMA_SIGNALED_SEND_MAX) {
-            rdma->unregister_current = 0;
-        }
-
-
-        /*
-         * Unregistration is speculative (because migration is single-threaded
-         * and we cannot break the protocol's InfiniBand message ordering).
-         * Thus, if the memory is currently being used for transmission,
-         * then abort the attempt to unregister and try again
-         * later the next time a completion is received for this memory.
-         */
-        clear_bit(chunk, block->unregister_bitmap);
-
-        if (test_bit(chunk, block->transit_bitmap)) {
-            trace_qemu_rdma_unregister_waiting_inflight(chunk);
-            continue;
-        }
-
-        trace_qemu_rdma_unregister_waiting_send(chunk);
-
-        ret = ibv_dereg_mr(block->pmr[chunk]);
-        block->pmr[chunk] = NULL;
-        block->remote_keys[chunk] = 0;
-
-        if (ret != 0) {
-            perror("unregistration chunk failed");
-            return -ret;
-        }
-        rdma->total_registrations--;
-
-        reg.key.chunk = chunk;
-        register_to_network(rdma, &reg);
-        ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
-                                &resp, NULL, NULL);
-        if (ret < 0) {
-            return ret;
-        }
-
-        trace_qemu_rdma_unregister_waiting_complete(chunk);
-    }
-
-    return 0;
-}
-
-static uint64_t qemu_rdma_make_wrid(uint64_t wr_id, uint64_t index,
-                                    uint64_t chunk)
-{
-    uint64_t result = wr_id & RDMA_WRID_TYPE_MASK;
-
-    result |= (index << RDMA_WRID_BLOCK_SHIFT);
-    result |= (chunk << RDMA_WRID_CHUNK_SHIFT);
-
-    return result;
-}
-
-/*
- * Set bit for unregistration in the next iteration.
- * We cannot transmit right here, but will unpin later.
- */
-static void qemu_rdma_signal_unregister(RDMAContext *rdma, uint64_t index,
-                                        uint64_t chunk, uint64_t wr_id)
-{
-    if (rdma->unregistrations[rdma->unregister_next] != 0) {
-        error_report("rdma migration: queue is full");
-    } else {
-        RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]);
-
-        if (!test_and_set_bit(chunk, block->unregister_bitmap)) {
-            trace_qemu_rdma_signal_unregister_append(chunk,
-                                                     rdma->unregister_next);
-
-            rdma->unregistrations[rdma->unregister_next++] =
-                    qemu_rdma_make_wrid(wr_id, index, chunk);
-
-            if (rdma->unregister_next == RDMA_SIGNALED_SEND_MAX) {
-                rdma->unregister_next = 0;
-            }
-        } else {
-            trace_qemu_rdma_signal_unregister_already(chunk);
-        }
-    }
-}
-
-/*
- * Consult the connection manager to see whether a work request
- * (of any kind) has completed.
- * Return the work request ID that completed.
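- *
- * The underlying idiom is a single non-blocking ibv_poll_cq() call,
- * roughly (error handling elided):
- *
- *     struct ibv_wc wc;
- *     int n = ibv_poll_cq(rdma->cq, 1, &wc);
- *
- * where n == 0 means nothing has completed yet, and on success
- * wc.wr_id names the finished request and wc.byte_len its size.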
- */ -static uint64_t qemu_rdma_poll(RDMAContext *rdma, uint64_t *wr_id_out, - uint32_t *byte_len) -{ - int ret; - struct ibv_wc wc; - uint64_t wr_id; - - ret = ibv_poll_cq(rdma->cq, 1, &wc); - - if (!ret) { - *wr_id_out = RDMA_WRID_NONE; - return 0; - } - - if (ret < 0) { - error_report("ibv_poll_cq return %d", ret); - return ret; - } - - wr_id = wc.wr_id & RDMA_WRID_TYPE_MASK; - - if (wc.status != IBV_WC_SUCCESS) { - fprintf(stderr, "ibv_poll_cq wc.status=%d %s!\n", - wc.status, ibv_wc_status_str(wc.status)); - fprintf(stderr, "ibv_poll_cq wrid=%s!\n", wrid_desc[wr_id]); - - return -1; - } - - if (rdma->control_ready_expected && - (wr_id >= RDMA_WRID_RECV_CONTROL)) { - trace_qemu_rdma_poll_recv(wrid_desc[RDMA_WRID_RECV_CONTROL], - wr_id - RDMA_WRID_RECV_CONTROL, wr_id, rdma->nb_sent); - rdma->control_ready_expected = 0; - } - - if (wr_id == RDMA_WRID_RDMA_WRITE) { - uint64_t chunk = - (wc.wr_id & RDMA_WRID_CHUNK_MASK) >> RDMA_WRID_CHUNK_SHIFT; - uint64_t index = - (wc.wr_id & RDMA_WRID_BLOCK_MASK) >> RDMA_WRID_BLOCK_SHIFT; - RDMALocalBlock *block = &(rdma->local_ram_blocks.block[index]); - - trace_qemu_rdma_poll_write(print_wrid(wr_id), wr_id, rdma->nb_sent, - index, chunk, block->local_host_addr, - (void *)(uintptr_t)block->remote_host_addr); - - clear_bit(chunk, block->transit_bitmap); - - if (rdma->nb_sent > 0) { - rdma->nb_sent--; - } - - if (!rdma->pin_all) { - /* - * FYI: If one wanted to signal a specific chunk to be unregistered - * using LRU or workload-specific information, this is the function - * you would call to do so. That chunk would then get asynchronously - * unregistered later. - */ -#ifdef RDMA_UNREGISTRATION_EXAMPLE - qemu_rdma_signal_unregister(rdma, index, chunk, wc.wr_id); -#endif - } - } else { - trace_qemu_rdma_poll_other(print_wrid(wr_id), wr_id, rdma->nb_sent); - } - - *wr_id_out = wc.wr_id; - if (byte_len) { - *byte_len = wc.byte_len; - } - - return 0; -} - -/* - * Block until the next work request has completed. - * - * First poll to see if a work request has already completed, - * otherwise block. - * - * If we encounter completed work requests for IDs other than - * the one we're interested in, then that's generally an error. - * - * The only exception is actual RDMA Write completions. These - * completions only need to be recorded, but do not actually - * need further processing. - */ -static int qemu_rdma_block_for_wrid(RDMAContext *rdma, int wrid_requested, - uint32_t *byte_len) -{ - int num_cq_events = 0, ret = 0; - struct ibv_cq *cq; - void *cq_ctx; - uint64_t wr_id = RDMA_WRID_NONE, wr_id_in; - - if (ibv_req_notify_cq(rdma->cq, 0)) { - return -1; - } - /* poll cq first */ - while (wr_id != wrid_requested) { - ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len); - if (ret < 0) { - return ret; - } - - wr_id = wr_id_in & RDMA_WRID_TYPE_MASK; - - if (wr_id == RDMA_WRID_NONE) { - break; - } - if (wr_id != wrid_requested) { - trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested), - wrid_requested, print_wrid(wr_id), wr_id); - } - } - - if (wr_id == wrid_requested) { - return 0; - } - - while (1) { - /* - * Coroutine doesn't start until process_incoming_migration() - * so don't yield unless we know we're running inside of a coroutine. 
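-         *
-         * (Only the destination runs this path inside a coroutine; the
-         * source calls in from the migration thread, which may
-         * legitimately block in ibv_get_cq_event() below, so no yield
-         * is needed there.)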
- */ - if (rdma->migration_started_on_destination) { - yield_until_fd_readable(rdma->comp_channel->fd); - } - - if (ibv_get_cq_event(rdma->comp_channel, &cq, &cq_ctx)) { - perror("ibv_get_cq_event"); - goto err_block_for_wrid; - } - - num_cq_events++; - - if (ibv_req_notify_cq(cq, 0)) { - goto err_block_for_wrid; - } - - while (wr_id != wrid_requested) { - ret = qemu_rdma_poll(rdma, &wr_id_in, byte_len); - if (ret < 0) { - goto err_block_for_wrid; - } - - wr_id = wr_id_in & RDMA_WRID_TYPE_MASK; - - if (wr_id == RDMA_WRID_NONE) { - break; - } - if (wr_id != wrid_requested) { - trace_qemu_rdma_block_for_wrid_miss(print_wrid(wrid_requested), - wrid_requested, print_wrid(wr_id), wr_id); - } - } - - if (wr_id == wrid_requested) { - goto success_block_for_wrid; - } - } - -success_block_for_wrid: - if (num_cq_events) { - ibv_ack_cq_events(cq, num_cq_events); - } - return 0; - -err_block_for_wrid: - if (num_cq_events) { - ibv_ack_cq_events(cq, num_cq_events); - } - return ret; -} - -/* - * Post a SEND message work request for the control channel - * containing some data and block until the post completes. - */ -static int qemu_rdma_post_send_control(RDMAContext *rdma, uint8_t *buf, - RDMAControlHeader *head) -{ - int ret = 0; - RDMAWorkRequestData *wr = &rdma->wr_data[RDMA_WRID_CONTROL]; - struct ibv_send_wr *bad_wr; - struct ibv_sge sge = { - .addr = (uintptr_t)(wr->control), - .length = head->len + sizeof(RDMAControlHeader), - .lkey = wr->control_mr->lkey, - }; - struct ibv_send_wr send_wr = { - .wr_id = RDMA_WRID_SEND_CONTROL, - .opcode = IBV_WR_SEND, - .send_flags = IBV_SEND_SIGNALED, - .sg_list = &sge, - .num_sge = 1, - }; - - trace_qemu_rdma_post_send_control(control_desc[head->type]); - - /* - * We don't actually need to do a memcpy() in here if we used - * the "sge" properly, but since we're only sending control messages - * (not RAM in a performance-critical path), then its OK for now. - * - * The copy makes the RDMAControlHeader simpler to manipulate - * for the time being. - */ - assert(head->len <= RDMA_CONTROL_MAX_BUFFER - sizeof(*head)); - memcpy(wr->control, head, sizeof(RDMAControlHeader)); - control_to_network((void *) wr->control); - - if (buf) { - memcpy(wr->control + sizeof(RDMAControlHeader), buf, head->len); - } - - - ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr); - - if (ret > 0) { - error_report("Failed to use post IB SEND for control"); - return -ret; - } - - ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_SEND_CONTROL, NULL); - if (ret < 0) { - error_report("rdma migration: send polling control error"); - } - - return ret; -} - -/* - * Post a RECV work request in anticipation of some future receipt - * of data on the control channel. - */ -static int qemu_rdma_post_recv_control(RDMAContext *rdma, int idx) -{ - struct ibv_recv_wr *bad_wr; - struct ibv_sge sge = { - .addr = (uintptr_t)(rdma->wr_data[idx].control), - .length = RDMA_CONTROL_MAX_BUFFER, - .lkey = rdma->wr_data[idx].control_mr->lkey, - }; - - struct ibv_recv_wr recv_wr = { - .wr_id = RDMA_WRID_RECV_CONTROL + idx, - .sg_list = &sge, - .num_sge = 1, - }; - - - if (ibv_post_recv(rdma->qp, &recv_wr, &bad_wr)) { - return -1; - } - - return 0; -} - -/* - * Block and wait for a RECV control channel message to arrive. 
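- *
- * Typical use, as in qemu_rdma_exchange_send() below (sketch only):
- *
- *     RDMAControlHeader head;
- *
- *     if (qemu_rdma_exchange_get_response(rdma, &head,
- *                                         RDMA_CONTROL_READY,
- *                                         RDMA_WRID_READY) < 0) {
- *         return -1;
- *     }
- *
- * after which head.type and head.len have been validated against the
- * wire data.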
- */
-static int qemu_rdma_exchange_get_response(RDMAContext *rdma,
-                RDMAControlHeader *head, int expecting, int idx)
-{
-    uint32_t byte_len;
-    int ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RECV_CONTROL + idx,
-                                       &byte_len);
-
-    if (ret < 0) {
-        error_report("rdma migration: recv polling control error!");
-        return ret;
-    }
-
-    network_to_control((void *) rdma->wr_data[idx].control);
-    memcpy(head, rdma->wr_data[idx].control, sizeof(RDMAControlHeader));
-
-    trace_qemu_rdma_exchange_get_response_start(control_desc[expecting]);
-
-    if (expecting == RDMA_CONTROL_NONE) {
-        trace_qemu_rdma_exchange_get_response_none(control_desc[head->type],
-                                                   head->type);
-    } else if (head->type != expecting || head->type == RDMA_CONTROL_ERROR) {
-        error_report("Was expecting a %s (%d) control message"
-                ", but got: %s (%d), length: %d",
-                control_desc[expecting], expecting,
-                control_desc[head->type], head->type, head->len);
-        return -EIO;
-    }
-    if (head->len > RDMA_CONTROL_MAX_BUFFER - sizeof(*head)) {
-        error_report("length too long: %d", head->len);
-        return -EINVAL;
-    }
-    if (sizeof(*head) + head->len != byte_len) {
-        error_report("Malformed length: %d byte_len %d", head->len, byte_len);
-        return -EINVAL;
-    }
-
-    return 0;
-}
-
-/*
- * When a RECV work request has completed, the work request's
- * buffer is pointed at the header.
- *
- * This will advance the pointer to the data portion
- * of the control message of the work request's buffer that
- * was populated after the work request finished.
- */
-static void qemu_rdma_move_header(RDMAContext *rdma, int idx,
-                                  RDMAControlHeader *head)
-{
-    rdma->wr_data[idx].control_len = head->len;
-    rdma->wr_data[idx].control_curr =
-        rdma->wr_data[idx].control + sizeof(RDMAControlHeader);
-}
-
-/*
- * This is an 'atomic' high-level operation to deliver a single, unified
- * control-channel message.
- *
- * Additionally, if the user is expecting some kind of reply to this message,
- * they can request a 'resp' response message be filled in by posting an
- * additional work request on behalf of the user and waiting for an additional
- * completion.
- *
- * The extra (optional) response is used during registration to save us from
- * having to perform an *additional* exchange of messages just to provide a
- * response, by instead piggy-backing on the acknowledgement.
- */
-static int qemu_rdma_exchange_send(RDMAContext *rdma, RDMAControlHeader *head,
-                                   uint8_t *data, RDMAControlHeader *resp,
-                                   int *resp_idx,
-                                   int (*callback)(RDMAContext *rdma))
-{
-    int ret = 0;
-
-    /*
-     * Wait until the dest is ready before attempting to deliver the message
-     * by waiting for a READY message.
-     */
-    if (rdma->control_ready_expected) {
-        RDMAControlHeader resp;
-        ret = qemu_rdma_exchange_get_response(rdma,
-                                    &resp, RDMA_CONTROL_READY, RDMA_WRID_READY);
-        if (ret < 0) {
-            return ret;
-        }
-    }
-
-    /*
-     * If the user is expecting a response, post a WR in anticipation of it.
-     */
-    if (resp) {
-        ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_DATA);
-        if (ret) {
-            error_report("rdma migration: error posting"
-                    " extra control recv for anticipated result!");
-            return ret;
-        }
-    }
-
-    /*
-     * Post a WR to replace the one we just consumed for the READY message.
-     */
-    ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY);
-    if (ret) {
-        error_report("rdma migration: error posting first control recv!");
-        return ret;
-    }
-
-    /*
-     * Deliver the control message that was requested.
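-     *
-     * By this point the peer's READY message has been consumed and a
-     * replacement RECV is outstanding, so the SEND below cannot land on
-     * a peer with no receive buffer posted.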
- */ - ret = qemu_rdma_post_send_control(rdma, data, head); - - if (ret < 0) { - error_report("Failed to send control buffer!"); - return ret; - } - - /* - * If we're expecting a response, block and wait for it. - */ - if (resp) { - if (callback) { - trace_qemu_rdma_exchange_send_issue_callback(); - ret = callback(rdma); - if (ret < 0) { - return ret; - } - } - - trace_qemu_rdma_exchange_send_waiting(control_desc[resp->type]); - ret = qemu_rdma_exchange_get_response(rdma, resp, - resp->type, RDMA_WRID_DATA); - - if (ret < 0) { - return ret; - } - - qemu_rdma_move_header(rdma, RDMA_WRID_DATA, resp); - if (resp_idx) { - *resp_idx = RDMA_WRID_DATA; - } - trace_qemu_rdma_exchange_send_received(control_desc[resp->type]); - } - - rdma->control_ready_expected = 1; - - return 0; -} - -/* - * This is an 'atomic' high-level operation to receive a single, unified - * control-channel message. - */ -static int qemu_rdma_exchange_recv(RDMAContext *rdma, RDMAControlHeader *head, - int expecting) -{ - RDMAControlHeader ready = { - .len = 0, - .type = RDMA_CONTROL_READY, - .repeat = 1, - }; - int ret; - - /* - * Inform the source that we're ready to receive a message. - */ - ret = qemu_rdma_post_send_control(rdma, NULL, &ready); - - if (ret < 0) { - error_report("Failed to send control buffer!"); - return ret; - } - - /* - * Block and wait for the message. - */ - ret = qemu_rdma_exchange_get_response(rdma, head, - expecting, RDMA_WRID_READY); - - if (ret < 0) { - return ret; - } - - qemu_rdma_move_header(rdma, RDMA_WRID_READY, head); - - /* - * Post a new RECV work request to replace the one we just consumed. - */ - ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY); - if (ret) { - error_report("rdma migration: error posting second control recv!"); - return ret; - } - - return 0; -} - -/* - * Write an actual chunk of memory using RDMA. - * - * If we're using dynamic registration on the dest-side, we have to - * send a registration command first. 
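- *
- * Chunk indexing is plain address arithmetic on the block's host
- * mapping; what ram_chunk_index() computes is roughly:
- *
- *     chunk = (host_addr - block->local_host_addr) >> RDMA_REG_CHUNK_SHIFT;
- *
- * so the contiguous range [current_addr, current_addr + length) maps to
- * the chunk range [chunk, chunk + chunks] registered below.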
- */
-static int qemu_rdma_write_one(QEMUFile *f, RDMAContext *rdma,
-                               int current_index, uint64_t current_addr,
-                               uint64_t length)
-{
-    struct ibv_sge sge;
-    struct ibv_send_wr send_wr = { 0 };
-    struct ibv_send_wr *bad_wr;
-    int reg_result_idx, ret, count = 0;
-    uint64_t chunk, chunks;
-    uint8_t *chunk_start, *chunk_end;
-    RDMALocalBlock *block = &(rdma->local_ram_blocks.block[current_index]);
-    RDMARegister reg;
-    RDMARegisterResult *reg_result;
-    RDMAControlHeader resp = { .type = RDMA_CONTROL_REGISTER_RESULT };
-    RDMAControlHeader head = { .len = sizeof(RDMARegister),
-                               .type = RDMA_CONTROL_REGISTER_REQUEST,
-                               .repeat = 1,
-                             };
-
-retry:
-    sge.addr = (uintptr_t)(block->local_host_addr +
-                            (current_addr - block->offset));
-    sge.length = length;
-
-    chunk = ram_chunk_index(block->local_host_addr,
-                            (uint8_t *)(uintptr_t)sge.addr);
-    chunk_start = ram_chunk_start(block, chunk);
-
-    if (block->is_ram_block) {
-        chunks = length / (1UL << RDMA_REG_CHUNK_SHIFT);
-
-        if (chunks && ((length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
-            chunks--;
-        }
-    } else {
-        chunks = block->length / (1UL << RDMA_REG_CHUNK_SHIFT);
-
-        if (chunks && ((block->length % (1UL << RDMA_REG_CHUNK_SHIFT)) == 0)) {
-            chunks--;
-        }
-    }
-
-    trace_qemu_rdma_write_one_top(chunks + 1,
-                                  (chunks + 1) *
-                                  (1UL << RDMA_REG_CHUNK_SHIFT) / 1024 / 1024);
-
-    chunk_end = ram_chunk_end(block, chunk + chunks);
-
-    if (!rdma->pin_all) {
-#ifdef RDMA_UNREGISTRATION_EXAMPLE
-        qemu_rdma_unregister_waiting(rdma);
-#endif
-    }
-
-    while (test_bit(chunk, block->transit_bitmap)) {
-        (void)count;
-        trace_qemu_rdma_write_one_block(count++, current_index, chunk,
-                sge.addr, length, rdma->nb_sent, block->nb_chunks);
-
-        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
-
-        if (ret < 0) {
-            error_report("Failed to wait for previous write to complete "
-                    "block %d chunk %" PRIu64
-                    " current %" PRIu64 " len %" PRIu64 " %d",
-                    current_index, chunk, sge.addr, length, rdma->nb_sent);
-            return ret;
-        }
-    }
-
-    if (!rdma->pin_all || !block->is_ram_block) {
-        if (!block->remote_keys[chunk]) {
-            /*
-             * This chunk has not yet been registered, so first check to see
-             * if the entire chunk is zero. If so, tell the other side to
-             * memset() + madvise() the entire chunk without RDMA.
-             */
-
-            if (can_use_buffer_find_nonzero_offset((void *)(uintptr_t)sge.addr,
-                                                   length)
-                   && buffer_find_nonzero_offset((void *)(uintptr_t)sge.addr,
-                                                    length) == length) {
-                RDMACompress comp = {
-                                        .offset = current_addr,
-                                        .value = 0,
-                                        .block_idx = current_index,
-                                        .length = length,
-                                    };
-
-                head.len = sizeof(comp);
-                head.type = RDMA_CONTROL_COMPRESS;
-
-                trace_qemu_rdma_write_one_zero(chunk, sge.length,
-                                               current_index, current_addr);
-
-                compress_to_network(rdma, &comp);
-                ret = qemu_rdma_exchange_send(rdma, &head,
-                                (uint8_t *) &comp, NULL, NULL, NULL);
-
-                if (ret < 0) {
-                    return -EIO;
-                }
-
-                acct_update_position(f, sge.length, true);
-
-                return 1;
-            }
-
-            /*
-             * Otherwise, tell other side to register.
-             */
-            reg.current_index = current_index;
-            if (block->is_ram_block) {
-                reg.key.current_addr = current_addr;
-            } else {
-                reg.key.chunk = chunk;
-            }
-            reg.chunks = chunks;
-
-            trace_qemu_rdma_write_one_sendreg(chunk, sge.length, current_index,
-                                              current_addr);
-
-            register_to_network(rdma, &reg);
-            ret = qemu_rdma_exchange_send(rdma, &head, (uint8_t *) &reg,
-                                    &resp, &reg_result_idx, NULL);
-            if (ret < 0) {
-                return ret;
-            }
-
-            /* try to overlap this single registration with the one we sent.
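-             *
-             * (The local lkey registration below is independent of the
-             * remote rkey we just requested, so it is simply done
-             * back-to-back here, before the REGISTER_RESULT payload is
-             * unpacked.)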
*/ - if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr, - &sge.lkey, NULL, chunk, - chunk_start, chunk_end)) { - error_report("cannot get lkey"); - return -EINVAL; - } - - reg_result = (RDMARegisterResult *) - rdma->wr_data[reg_result_idx].control_curr; - - network_to_result(reg_result); - - trace_qemu_rdma_write_one_recvregres(block->remote_keys[chunk], - reg_result->rkey, chunk); - - block->remote_keys[chunk] = reg_result->rkey; - block->remote_host_addr = reg_result->host_addr; - } else { - /* already registered before */ - if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr, - &sge.lkey, NULL, chunk, - chunk_start, chunk_end)) { - error_report("cannot get lkey!"); - return -EINVAL; - } - } - - send_wr.wr.rdma.rkey = block->remote_keys[chunk]; - } else { - send_wr.wr.rdma.rkey = block->remote_rkey; - - if (qemu_rdma_register_and_get_keys(rdma, block, sge.addr, - &sge.lkey, NULL, chunk, - chunk_start, chunk_end)) { - error_report("cannot get lkey!"); - return -EINVAL; - } - } - - /* - * Encode the ram block index and chunk within this wrid. - * We will use this information at the time of completion - * to figure out which bitmap to check against and then which - * chunk in the bitmap to look for. - */ - send_wr.wr_id = qemu_rdma_make_wrid(RDMA_WRID_RDMA_WRITE, - current_index, chunk); - - send_wr.opcode = IBV_WR_RDMA_WRITE; - send_wr.send_flags = IBV_SEND_SIGNALED; - send_wr.sg_list = &sge; - send_wr.num_sge = 1; - send_wr.wr.rdma.remote_addr = block->remote_host_addr + - (current_addr - block->offset); - - trace_qemu_rdma_write_one_post(chunk, sge.addr, send_wr.wr.rdma.remote_addr, - sge.length); - - /* - * ibv_post_send() does not return negative error numbers, - * per the specification they are positive - no idea why. - */ - ret = ibv_post_send(rdma->qp, &send_wr, &bad_wr); - - if (ret == ENOMEM) { - trace_qemu_rdma_write_one_queue_full(); - ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL); - if (ret < 0) { - error_report("rdma migration: failed to make " - "room in full send queue! %d", ret); - return ret; - } - - goto retry; - - } else if (ret > 0) { - perror("rdma migration: post rdma write failed"); - return -ret; - } - - set_bit(chunk, block->transit_bitmap); - acct_update_position(f, sge.length, false); - rdma->total_writes++; - - return 0; -} - -/* - * Push out any unwritten RDMA operations. - * - * We support sending out multiple chunks at the same time. - * Not all of them need to get signaled in the completion queue. - */ -static int qemu_rdma_write_flush(QEMUFile *f, RDMAContext *rdma) -{ - int ret; - - if (!rdma->current_length) { - return 0; - } - - ret = qemu_rdma_write_one(f, rdma, - rdma->current_index, rdma->current_addr, rdma->current_length); - - if (ret < 0) { - return ret; - } - - if (ret == 0) { - rdma->nb_sent++; - trace_qemu_rdma_write_flush(rdma->nb_sent); - } - - rdma->current_length = 0; - rdma->current_addr = 0; - - return 0; -} - -static inline int qemu_rdma_buffer_mergable(RDMAContext *rdma, - uint64_t offset, uint64_t len) -{ - RDMALocalBlock *block; - uint8_t *host_addr; - uint8_t *chunk_end; - - if (rdma->current_index < 0) { - return 0; - } - - if (rdma->current_chunk < 0) { - return 0; - } - - block = &(rdma->local_ram_blocks.block[rdma->current_index]); - host_addr = block->local_host_addr + (offset - block->offset); - chunk_end = ram_chunk_end(block, rdma->current_chunk); - - if (rdma->current_length == 0) { - return 0; - } - - /* - * Only merge into chunk sequentially. 
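-     *
-     * e.g. with current_addr == 0x1000 and current_length == 0x200, only
-     * a new write starting at exactly 0x1200 can extend the pending
-     * buffer; anything else fails the check below and forces a flush in
-     * qemu_rdma_write().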
-     */
-    if (offset != (rdma->current_addr + rdma->current_length)) {
-        return 0;
-    }
-
-    if (offset < block->offset) {
-        return 0;
-    }
-
-    if ((offset + len) > (block->offset + block->length)) {
-        return 0;
-    }
-
-    if ((host_addr + len) > chunk_end) {
-        return 0;
-    }
-
-    return 1;
-}
-
-/*
- * We're not actually writing here, but doing three things:
- *
- * 1. Identify the chunk the buffer belongs to.
- * 2. If the chunk is full or the buffer doesn't belong to the current
- *    chunk, then start a new chunk and flush() the old chunk.
- * 3. To keep the hardware busy, we also group chunks into batches
- *    and only require that a batch gets acknowledged in the completion
- *    queue instead of each individual chunk.
- */
-static int qemu_rdma_write(QEMUFile *f, RDMAContext *rdma,
-                           uint64_t block_offset, uint64_t offset,
-                           uint64_t len)
-{
-    uint64_t current_addr = block_offset + offset;
-    uint64_t index = rdma->current_index;
-    uint64_t chunk = rdma->current_chunk;
-    int ret;
-
-    /* If we cannot merge it, we flush the current buffer first. */
-    if (!qemu_rdma_buffer_mergable(rdma, current_addr, len)) {
-        ret = qemu_rdma_write_flush(f, rdma);
-        if (ret) {
-            return ret;
-        }
-        rdma->current_length = 0;
-        rdma->current_addr = current_addr;
-
-        ret = qemu_rdma_search_ram_block(rdma, block_offset,
-                                         offset, len, &index, &chunk);
-        if (ret) {
-            error_report("ram block search failed");
-            return ret;
-        }
-        rdma->current_index = index;
-        rdma->current_chunk = chunk;
-    }
-
-    /* merge it */
-    rdma->current_length += len;
-
-    /* flush it if buffer is too large */
-    if (rdma->current_length >= RDMA_MERGE_MAX) {
-        return qemu_rdma_write_flush(f, rdma);
-    }
-
-    return 0;
-}
-
-static void qemu_rdma_cleanup(RDMAContext *rdma)
-{
-    struct rdma_cm_event *cm_event;
-    int ret, idx;
-
-    if (rdma->cm_id && rdma->connected) {
-        if (rdma->error_state) {
-            RDMAControlHeader head = { .len = 0,
-                                       .type = RDMA_CONTROL_ERROR,
-                                       .repeat = 1,
-                                     };
-            error_report("Early error. 
Sending error."); - qemu_rdma_post_send_control(rdma, NULL, &head); - } - - ret = rdma_disconnect(rdma->cm_id); - if (!ret) { - trace_qemu_rdma_cleanup_waiting_for_disconnect(); - ret = rdma_get_cm_event(rdma->channel, &cm_event); - if (!ret) { - rdma_ack_cm_event(cm_event); - } - } - trace_qemu_rdma_cleanup_disconnect(); - rdma->connected = false; - } - - g_free(rdma->dest_blocks); - rdma->dest_blocks = NULL; - - for (idx = 0; idx < RDMA_WRID_MAX; idx++) { - if (rdma->wr_data[idx].control_mr) { - rdma->total_registrations--; - ibv_dereg_mr(rdma->wr_data[idx].control_mr); - } - rdma->wr_data[idx].control_mr = NULL; - } - - if (rdma->local_ram_blocks.block) { - while (rdma->local_ram_blocks.nb_blocks) { - rdma_delete_block(rdma, &rdma->local_ram_blocks.block[0]); - } - } - - if (rdma->qp) { - rdma_destroy_qp(rdma->cm_id); - rdma->qp = NULL; - } - if (rdma->cq) { - ibv_destroy_cq(rdma->cq); - rdma->cq = NULL; - } - if (rdma->comp_channel) { - ibv_destroy_comp_channel(rdma->comp_channel); - rdma->comp_channel = NULL; - } - if (rdma->pd) { - ibv_dealloc_pd(rdma->pd); - rdma->pd = NULL; - } - if (rdma->cm_id) { - rdma_destroy_id(rdma->cm_id); - rdma->cm_id = NULL; - } - if (rdma->listen_id) { - rdma_destroy_id(rdma->listen_id); - rdma->listen_id = NULL; - } - if (rdma->channel) { - rdma_destroy_event_channel(rdma->channel); - rdma->channel = NULL; - } - g_free(rdma->host); - rdma->host = NULL; -} - - -static int qemu_rdma_source_init(RDMAContext *rdma, Error **errp, bool pin_all) -{ - int ret, idx; - Error *local_err = NULL, **temp = &local_err; - - /* - * Will be validated against destination's actual capabilities - * after the connect() completes. - */ - rdma->pin_all = pin_all; - - ret = qemu_rdma_resolve_host(rdma, temp); - if (ret) { - goto err_rdma_source_init; - } - - ret = qemu_rdma_alloc_pd_cq(rdma); - if (ret) { - ERROR(temp, "rdma migration: error allocating pd and cq! Your mlock()" - " limits may be too low. Please check $ ulimit -a # and " - "search for 'ulimit -l' in the output"); - goto err_rdma_source_init; - } - - ret = qemu_rdma_alloc_qp(rdma); - if (ret) { - ERROR(temp, "rdma migration: error allocating qp!"); - goto err_rdma_source_init; - } - - ret = qemu_rdma_init_ram_blocks(rdma); - if (ret) { - ERROR(temp, "rdma migration: error initializing ram blocks!"); - goto err_rdma_source_init; - } - - /* Build the hash that maps from offset to RAMBlock */ - rdma->blockmap = g_hash_table_new(g_direct_hash, g_direct_equal); - for (idx = 0; idx < rdma->local_ram_blocks.nb_blocks; idx++) { - g_hash_table_insert(rdma->blockmap, - (void *)(uintptr_t)rdma->local_ram_blocks.block[idx].offset, - &rdma->local_ram_blocks.block[idx]); - } - - for (idx = 0; idx < RDMA_WRID_MAX; idx++) { - ret = qemu_rdma_reg_control(rdma, idx); - if (ret) { - ERROR(temp, "rdma migration: error registering %d control!", - idx); - goto err_rdma_source_init; - } - } - - return 0; - -err_rdma_source_init: - error_propagate(errp, local_err); - qemu_rdma_cleanup(rdma); - return -1; -} - -static int qemu_rdma_connect(RDMAContext *rdma, Error **errp) -{ - RDMACapabilities cap = { - .version = RDMA_CONTROL_VERSION_CURRENT, - .flags = 0, - }; - struct rdma_conn_param conn_param = { .initiator_depth = 2, - .retry_count = 5, - .private_data = &cap, - .private_data_len = sizeof(cap), - }; - struct rdma_cm_event *cm_event; - int ret; - - /* - * Only negotiate the capability with destination if the user - * on the source first requested the capability. 
- */ - if (rdma->pin_all) { - trace_qemu_rdma_connect_pin_all_requested(); - cap.flags |= RDMA_CAPABILITY_PIN_ALL; - } - - caps_to_network(&cap); - - ret = rdma_connect(rdma->cm_id, &conn_param); - if (ret) { - perror("rdma_connect"); - ERROR(errp, "connecting to destination!"); - goto err_rdma_source_connect; - } - - ret = rdma_get_cm_event(rdma->channel, &cm_event); - if (ret) { - perror("rdma_get_cm_event after rdma_connect"); - ERROR(errp, "connecting to destination!"); - rdma_ack_cm_event(cm_event); - goto err_rdma_source_connect; - } - - if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) { - perror("rdma_get_cm_event != EVENT_ESTABLISHED after rdma_connect"); - ERROR(errp, "connecting to destination!"); - rdma_ack_cm_event(cm_event); - goto err_rdma_source_connect; - } - rdma->connected = true; - - memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap)); - network_to_caps(&cap); - - /* - * Verify that the *requested* capabilities are supported by the destination - * and disable them otherwise. - */ - if (rdma->pin_all && !(cap.flags & RDMA_CAPABILITY_PIN_ALL)) { - ERROR(errp, "Server cannot support pinning all memory. " - "Will register memory dynamically."); - rdma->pin_all = false; - } - - trace_qemu_rdma_connect_pin_all_outcome(rdma->pin_all); - - rdma_ack_cm_event(cm_event); - - ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY); - if (ret) { - ERROR(errp, "posting second control recv!"); - goto err_rdma_source_connect; - } - - rdma->control_ready_expected = 1; - rdma->nb_sent = 0; - return 0; - -err_rdma_source_connect: - qemu_rdma_cleanup(rdma); - return -1; -} - -static int qemu_rdma_dest_init(RDMAContext *rdma, Error **errp) -{ - int ret, idx; - struct rdma_cm_id *listen_id; - char ip[40] = "unknown"; - struct rdma_addrinfo *res, *e; - char port_str[16]; - - for (idx = 0; idx < RDMA_WRID_MAX; idx++) { - rdma->wr_data[idx].control_len = 0; - rdma->wr_data[idx].control_curr = NULL; - } - - if (!rdma->host || !rdma->host[0]) { - ERROR(errp, "RDMA host is not set!"); - rdma->error_state = -EINVAL; - return -1; - } - /* create CM channel */ - rdma->channel = rdma_create_event_channel(); - if (!rdma->channel) { - ERROR(errp, "could not create rdma event channel"); - rdma->error_state = -EINVAL; - return -1; - } - - /* create CM id */ - ret = rdma_create_id(rdma->channel, &listen_id, NULL, RDMA_PS_TCP); - if (ret) { - ERROR(errp, "could not create cm_id!"); - goto err_dest_init_create_listen_id; - } - - snprintf(port_str, 16, "%d", rdma->port); - port_str[15] = '\0'; - - ret = rdma_getaddrinfo(rdma->host, port_str, NULL, &res); - if (ret < 0) { - ERROR(errp, "could not rdma_getaddrinfo address %s", rdma->host); - goto err_dest_init_bind_addr; - } - - for (e = res; e != NULL; e = e->ai_next) { - inet_ntop(e->ai_family, - &((struct sockaddr_in *) e->ai_dst_addr)->sin_addr, ip, sizeof ip); - trace_qemu_rdma_dest_init_trying(rdma->host, ip); - ret = rdma_bind_addr(listen_id, e->ai_dst_addr); - if (ret) { - continue; - } - if (e->ai_family == AF_INET6) { - ret = qemu_rdma_broken_ipv6_kernel(errp, listen_id->verbs); - if (ret) { - continue; - } - } - break; - } - - if (!e) { - ERROR(errp, "Error: could not rdma_bind_addr!"); - goto err_dest_init_bind_addr; - } - - rdma->listen_id = listen_id; - qemu_rdma_dump_gid("dest_init", listen_id); - return 0; - -err_dest_init_bind_addr: - rdma_destroy_id(listen_id); -err_dest_init_create_listen_id: - rdma_destroy_event_channel(rdma->channel); - rdma->channel = NULL; - rdma->error_state = ret; - return ret; - -} - -static void 
*qemu_rdma_data_init(const char *host_port, Error **errp) -{ - RDMAContext *rdma = NULL; - InetSocketAddress *addr; - - if (host_port) { - rdma = g_new0(RDMAContext, 1); - rdma->current_index = -1; - rdma->current_chunk = -1; - - addr = inet_parse(host_port, NULL); - if (addr != NULL) { - rdma->port = atoi(addr->port); - rdma->host = g_strdup(addr->host); - } else { - ERROR(errp, "bad RDMA migration address '%s'", host_port); - g_free(rdma); - rdma = NULL; - } - - qapi_free_InetSocketAddress(addr); - } - - return rdma; -} - -/* - * QEMUFile interface to the control channel. - * SEND messages for control only. - * VM's ram is handled with regular RDMA messages. - */ -static ssize_t qemu_rdma_put_buffer(void *opaque, const uint8_t *buf, - int64_t pos, size_t size) -{ - QEMUFileRDMA *r = opaque; - QEMUFile *f = r->file; - RDMAContext *rdma = r->rdma; - size_t remaining = size; - uint8_t * data = (void *) buf; - int ret; - - CHECK_ERROR_STATE(); - - /* - * Push out any writes that - * we're queued up for VM's ram. - */ - ret = qemu_rdma_write_flush(f, rdma); - if (ret < 0) { - rdma->error_state = ret; - return ret; - } - - while (remaining) { - RDMAControlHeader head; - - r->len = MIN(remaining, RDMA_SEND_INCREMENT); - remaining -= r->len; - - /* Guaranteed to fit due to RDMA_SEND_INCREMENT MIN above */ - head.len = (uint32_t)r->len; - head.type = RDMA_CONTROL_QEMU_FILE; - - ret = qemu_rdma_exchange_send(rdma, &head, data, NULL, NULL, NULL); - - if (ret < 0) { - rdma->error_state = ret; - return ret; - } - - data += r->len; - } - - return size; -} - -static size_t qemu_rdma_fill(RDMAContext *rdma, uint8_t *buf, - size_t size, int idx) -{ - size_t len = 0; - - if (rdma->wr_data[idx].control_len) { - trace_qemu_rdma_fill(rdma->wr_data[idx].control_len, size); - - len = MIN(size, rdma->wr_data[idx].control_len); - memcpy(buf, rdma->wr_data[idx].control_curr, len); - rdma->wr_data[idx].control_curr += len; - rdma->wr_data[idx].control_len -= len; - } - - return len; -} - -/* - * QEMUFile interface to the control channel. - * RDMA links don't use bytestreams, so we have to - * return bytes to QEMUFile opportunistically. - */ -static ssize_t qemu_rdma_get_buffer(void *opaque, uint8_t *buf, - int64_t pos, size_t size) -{ - QEMUFileRDMA *r = opaque; - RDMAContext *rdma = r->rdma; - RDMAControlHeader head; - int ret = 0; - - CHECK_ERROR_STATE(); - - /* - * First, we hold on to the last SEND message we - * were given and dish out the bytes until we run - * out of bytes. - */ - r->len = qemu_rdma_fill(r->rdma, buf, size, 0); - if (r->len) { - return r->len; - } - - /* - * Once we run out, we block and wait for another - * SEND message to arrive. - */ - ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_QEMU_FILE); - - if (ret < 0) { - rdma->error_state = ret; - return ret; - } - - /* - * SEND was received with new bytes, now try again. - */ - return qemu_rdma_fill(r->rdma, buf, size, 0); -} - -/* - * Block until all the outstanding chunks have been delivered by the hardware. 
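- *
- * "Outstanding" here simply means rdma->nb_sent, which is incremented
- * by qemu_rdma_write_flush() for every signaled write and decremented
- * by qemu_rdma_poll() when the matching completion is reaped.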
- */
-static int qemu_rdma_drain_cq(QEMUFile *f, RDMAContext *rdma)
-{
-    int ret;
-
-    if (qemu_rdma_write_flush(f, rdma) < 0) {
-        return -EIO;
-    }
-
-    while (rdma->nb_sent) {
-        ret = qemu_rdma_block_for_wrid(rdma, RDMA_WRID_RDMA_WRITE, NULL);
-        if (ret < 0) {
-            error_report("rdma migration: complete polling error!");
-            return -EIO;
-        }
-    }
-
-    qemu_rdma_unregister_waiting(rdma);
-
-    return 0;
-}
-
-static int qemu_rdma_close(void *opaque)
-{
-    QEMUFileRDMA *r = opaque;
-
-    trace_qemu_rdma_close();
-    if (r->rdma) {
-        qemu_rdma_cleanup(r->rdma);
-        g_free(r->rdma);
-    }
-    g_free(r);
-    return 0;
-}
-
-/*
- * Parameters:
- *    @offset == 0 :
- *        This means that 'block_offset' is a full virtual address that does not
- *        belong to a RAMBlock of the virtual machine and instead
- *        represents a private malloc'd memory area that the caller wishes to
- *        transfer.
- *
- *    @offset != 0 :
- *        Offset is an offset to be added to block_offset and used
- *        to also lookup the corresponding RAMBlock.
- *
- *    @size > 0 :
- *        Initiate a transfer of this size.
- *
- *    @size == 0 :
- *        A 'hint' or 'advice' that means that we wish to speculatively
- *        and asynchronously unregister this memory. In this case, there is no
- *        guarantee that the unregister will actually happen, for example,
- *        if the memory is being actively transmitted. Additionally, the memory
- *        may be re-registered at any future time if a write within the same
- *        chunk was requested again, even if you attempted to unregister it
- *        here.
- *
- *    @size < 0 : TODO, not yet supported
- *        Unregister the memory NOW. This means that the caller does not
- *        expect there to be any future RDMA transfers and we just want to clean
- *        things up. This is used in case the upper layer owns the memory and
- *        cannot wait for qemu_fclose() to occur.
- *
- *    @bytes_sent : User-specified pointer to indicate how many bytes were
- *                  sent. Usually, this will not be more than a few bytes of
- *                  the protocol because most transfers are sent asynchronously.
- */
-static size_t qemu_rdma_save_page(QEMUFile *f, void *opaque,
-                                  ram_addr_t block_offset, ram_addr_t offset,
-                                  size_t size, uint64_t *bytes_sent)
-{
-    QEMUFileRDMA *rfile = opaque;
-    RDMAContext *rdma = rfile->rdma;
-    int ret;
-
-    CHECK_ERROR_STATE();
-
-    qemu_fflush(f);
-
-    if (size > 0) {
-        /*
-         * Add this page to the current 'chunk'. If the chunk
-         * is full, or the page doesn't belong to the current chunk,
-         * an actual RDMA write will occur and a new chunk will be formed.
-         */
-        ret = qemu_rdma_write(f, rdma, block_offset, offset, size);
-        if (ret < 0) {
-            error_report("rdma migration: write error! %d", ret);
-            goto err;
-        }
-
-        /*
-         * We always return 1 byte because the RDMA
-         * protocol is completely asynchronous. We do not yet know
-         * whether an identified chunk is zero or not because we're
-         * waiting for other pages to potentially be merged with
-         * the current chunk. So, we have to call qemu_update_position()
-         * later on when the actual write occurs.
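-         *
-         * (The real byte counts are credited by acct_update_position()
-         * inside qemu_rdma_write_one(), once a chunk is actually posted
-         * or found to be entirely zero.)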
- */ - if (bytes_sent) { - *bytes_sent = 1; - } - } else { - uint64_t index, chunk; - - /* TODO: Change QEMUFileOps prototype to be signed: size_t => long - if (size < 0) { - ret = qemu_rdma_drain_cq(f, rdma); - if (ret < 0) { - fprintf(stderr, "rdma: failed to synchronously drain" - " completion queue before unregistration.\n"); - goto err; - } - } - */ - - ret = qemu_rdma_search_ram_block(rdma, block_offset, - offset, size, &index, &chunk); - - if (ret) { - error_report("ram block search failed"); - goto err; - } - - qemu_rdma_signal_unregister(rdma, index, chunk, 0); - - /* - * TODO: Synchronous, guaranteed unregistration (should not occur during - * fast-path). Otherwise, unregisters will process on the next call to - * qemu_rdma_drain_cq() - if (size < 0) { - qemu_rdma_unregister_waiting(rdma); - } - */ - } - - /* - * Drain the Completion Queue if possible, but do not block, - * just poll. - * - * If nothing to poll, the end of the iteration will do this - * again to make sure we don't overflow the request queue. - */ - while (1) { - uint64_t wr_id, wr_id_in; - int ret = qemu_rdma_poll(rdma, &wr_id_in, NULL); - if (ret < 0) { - error_report("rdma migration: polling error! %d", ret); - goto err; - } - - wr_id = wr_id_in & RDMA_WRID_TYPE_MASK; - - if (wr_id == RDMA_WRID_NONE) { - break; - } - } - - return RAM_SAVE_CONTROL_DELAYED; -err: - rdma->error_state = ret; - return ret; -} - -static int qemu_rdma_accept(RDMAContext *rdma) -{ - RDMACapabilities cap; - struct rdma_conn_param conn_param = { - .responder_resources = 2, - .private_data = &cap, - .private_data_len = sizeof(cap), - }; - struct rdma_cm_event *cm_event; - struct ibv_context *verbs; - int ret = -EINVAL; - int idx; - - ret = rdma_get_cm_event(rdma->channel, &cm_event); - if (ret) { - goto err_rdma_dest_wait; - } - - if (cm_event->event != RDMA_CM_EVENT_CONNECT_REQUEST) { - rdma_ack_cm_event(cm_event); - goto err_rdma_dest_wait; - } - - memcpy(&cap, cm_event->param.conn.private_data, sizeof(cap)); - - network_to_caps(&cap); - - if (cap.version < 1 || cap.version > RDMA_CONTROL_VERSION_CURRENT) { - error_report("Unknown source RDMA version: %d, bailing...", - cap.version); - rdma_ack_cm_event(cm_event); - goto err_rdma_dest_wait; - } - - /* - * Respond with only the capabilities this version of QEMU knows about. - */ - cap.flags &= known_capabilities; - - /* - * Enable the ones that we do know about. - * Add other checks here as new ones are introduced. 
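-     *
-     * A future capability would follow the same shape, e.g. (with
-     * RDMA_CAPABILITY_FOO as a made-up placeholder):
-     *
-     *     if (cap.flags & RDMA_CAPABILITY_FOO) {
-     *         rdma->foo = true;
-     *     }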
- */ - if (cap.flags & RDMA_CAPABILITY_PIN_ALL) { - rdma->pin_all = true; - } - - rdma->cm_id = cm_event->id; - verbs = cm_event->id->verbs; - - rdma_ack_cm_event(cm_event); - - trace_qemu_rdma_accept_pin_state(rdma->pin_all); - - caps_to_network(&cap); - - trace_qemu_rdma_accept_pin_verbsc(verbs); - - if (!rdma->verbs) { - rdma->verbs = verbs; - } else if (rdma->verbs != verbs) { - error_report("ibv context not matching %p, %p!", rdma->verbs, - verbs); - goto err_rdma_dest_wait; - } - - qemu_rdma_dump_id("dest_init", verbs); - - ret = qemu_rdma_alloc_pd_cq(rdma); - if (ret) { - error_report("rdma migration: error allocating pd and cq!"); - goto err_rdma_dest_wait; - } - - ret = qemu_rdma_alloc_qp(rdma); - if (ret) { - error_report("rdma migration: error allocating qp!"); - goto err_rdma_dest_wait; - } - - ret = qemu_rdma_init_ram_blocks(rdma); - if (ret) { - error_report("rdma migration: error initializing ram blocks!"); - goto err_rdma_dest_wait; - } - - for (idx = 0; idx < RDMA_WRID_MAX; idx++) { - ret = qemu_rdma_reg_control(rdma, idx); - if (ret) { - error_report("rdma: error registering %d control", idx); - goto err_rdma_dest_wait; - } - } - - qemu_set_fd_handler(rdma->channel->fd, NULL, NULL, NULL); - - ret = rdma_accept(rdma->cm_id, &conn_param); - if (ret) { - error_report("rdma_accept returns %d", ret); - goto err_rdma_dest_wait; - } - - ret = rdma_get_cm_event(rdma->channel, &cm_event); - if (ret) { - error_report("rdma_accept get_cm_event failed %d", ret); - goto err_rdma_dest_wait; - } - - if (cm_event->event != RDMA_CM_EVENT_ESTABLISHED) { - error_report("rdma_accept not event established"); - rdma_ack_cm_event(cm_event); - goto err_rdma_dest_wait; - } - - rdma_ack_cm_event(cm_event); - rdma->connected = true; - - ret = qemu_rdma_post_recv_control(rdma, RDMA_WRID_READY); - if (ret) { - error_report("rdma migration: error posting second control recv"); - goto err_rdma_dest_wait; - } - - qemu_rdma_dump_gid("dest_connect", rdma->cm_id); - - return 0; - -err_rdma_dest_wait: - rdma->error_state = ret; - qemu_rdma_cleanup(rdma); - return ret; -} - -static int dest_ram_sort_func(const void *a, const void *b) -{ - unsigned int a_index = ((const RDMALocalBlock *)a)->src_index; - unsigned int b_index = ((const RDMALocalBlock *)b)->src_index; - - return (a_index < b_index) ? -1 : (a_index != b_index); -} - -/* - * During each iteration of the migration, we listen for instructions - * by the source VM to perform dynamic page registrations before they - * can perform RDMA operations. - * - * We respond with the 'rkey'. - * - * Keep doing this until the source tells us to stop. 
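- *
- * In other words, the loop below is a small message dispatcher:
- *
- *     RDMA_CONTROL_COMPRESS           -> memset() the pages locally, no reply
- *     RDMA_CONTROL_RAM_BLOCKS_REQUEST -> reply with RAM_BLOCKS_RESULT
- *     RDMA_CONTROL_REGISTER_REQUEST   -> ibv_reg_mr(), reply REGISTER_RESULT
- *     RDMA_CONTROL_UNREGISTER_REQUEST -> ibv_dereg_mr(), reply
- *                                        UNREGISTER_FINISHED
- *     RDMA_CONTROL_REGISTER_FINISHED  -> stop and return to the caller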
- */ -static int qemu_rdma_registration_handle(QEMUFile *f, void *opaque) -{ - RDMAControlHeader reg_resp = { .len = sizeof(RDMARegisterResult), - .type = RDMA_CONTROL_REGISTER_RESULT, - .repeat = 0, - }; - RDMAControlHeader unreg_resp = { .len = 0, - .type = RDMA_CONTROL_UNREGISTER_FINISHED, - .repeat = 0, - }; - RDMAControlHeader blocks = { .type = RDMA_CONTROL_RAM_BLOCKS_RESULT, - .repeat = 1 }; - QEMUFileRDMA *rfile = opaque; - RDMAContext *rdma = rfile->rdma; - RDMALocalBlocks *local = &rdma->local_ram_blocks; - RDMAControlHeader head; - RDMARegister *reg, *registers; - RDMACompress *comp; - RDMARegisterResult *reg_result; - static RDMARegisterResult results[RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE]; - RDMALocalBlock *block; - void *host_addr; - int ret = 0; - int idx = 0; - int count = 0; - int i = 0; - - CHECK_ERROR_STATE(); - - do { - trace_qemu_rdma_registration_handle_wait(); - - ret = qemu_rdma_exchange_recv(rdma, &head, RDMA_CONTROL_NONE); - - if (ret < 0) { - break; - } - - if (head.repeat > RDMA_CONTROL_MAX_COMMANDS_PER_MESSAGE) { - error_report("rdma: Too many requests in this message (%d)." - "Bailing.", head.repeat); - ret = -EIO; - break; - } - - switch (head.type) { - case RDMA_CONTROL_COMPRESS: - comp = (RDMACompress *) rdma->wr_data[idx].control_curr; - network_to_compress(comp); - - trace_qemu_rdma_registration_handle_compress(comp->length, - comp->block_idx, - comp->offset); - if (comp->block_idx >= rdma->local_ram_blocks.nb_blocks) { - error_report("rdma: 'compress' bad block index %u (vs %d)", - (unsigned int)comp->block_idx, - rdma->local_ram_blocks.nb_blocks); - ret = -EIO; - goto out; - } - block = &(rdma->local_ram_blocks.block[comp->block_idx]); - - host_addr = block->local_host_addr + - (comp->offset - block->offset); - - ram_handle_compressed(host_addr, comp->value, comp->length); - break; - - case RDMA_CONTROL_REGISTER_FINISHED: - trace_qemu_rdma_registration_handle_finished(); - goto out; - - case RDMA_CONTROL_RAM_BLOCKS_REQUEST: - trace_qemu_rdma_registration_handle_ram_blocks(); - - /* Sort our local RAM Block list so it's the same as the source, - * we can do this since we've filled in a src_index in the list - * as we received the RAMBlock list earlier. - */ - qsort(rdma->local_ram_blocks.block, - rdma->local_ram_blocks.nb_blocks, - sizeof(RDMALocalBlock), dest_ram_sort_func); - if (rdma->pin_all) { - ret = qemu_rdma_reg_whole_ram_blocks(rdma); - if (ret) { - error_report("rdma migration: error dest " - "registering ram blocks"); - goto out; - } - } - - /* - * Dest uses this to prepare to transmit the RAMBlock descriptions - * to the source VM after connection setup. - * Both sides use the "remote" structure to communicate and update - * their "local" descriptions with what was sent. 
-            */
-            for (i = 0; i < local->nb_blocks; i++) {
-                rdma->dest_blocks[i].remote_host_addr =
-                    (uintptr_t)(local->block[i].local_host_addr);
-
-                if (rdma->pin_all) {
-                    rdma->dest_blocks[i].remote_rkey = local->block[i].mr->rkey;
-                }
-
-                rdma->dest_blocks[i].offset = local->block[i].offset;
-                rdma->dest_blocks[i].length = local->block[i].length;
-
-                dest_block_to_network(&rdma->dest_blocks[i]);
-                trace_qemu_rdma_registration_handle_ram_blocks_loop(
-                    local->block[i].block_name,
-                    local->block[i].offset,
-                    local->block[i].length,
-                    local->block[i].local_host_addr,
-                    local->block[i].src_index);
-            }
-
-            blocks.len = rdma->local_ram_blocks.nb_blocks
-                                                * sizeof(RDMADestBlock);
-
-
-            ret = qemu_rdma_post_send_control(rdma,
-                                        (uint8_t *) rdma->dest_blocks, &blocks);
-
-            if (ret < 0) {
-                error_report("rdma migration: error sending remote info");
-                goto out;
-            }
-
-            break;
-        case RDMA_CONTROL_REGISTER_REQUEST:
-            trace_qemu_rdma_registration_handle_register(head.repeat);
-
-            reg_resp.repeat = head.repeat;
-            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
-
-            for (count = 0; count < head.repeat; count++) {
-                uint64_t chunk;
-                uint8_t *chunk_start, *chunk_end;
-
-                reg = &registers[count];
-                network_to_register(reg);
-
-                reg_result = &results[count];
-
-                trace_qemu_rdma_registration_handle_register_loop(count,
-                         reg->current_index, reg->key.current_addr, reg->chunks);
-
-                if (reg->current_index >= rdma->local_ram_blocks.nb_blocks) {
-                    error_report("rdma: 'register' bad block index %u (vs %d)",
-                                 (unsigned int)reg->current_index,
-                                 rdma->local_ram_blocks.nb_blocks);
-                    ret = -ENOENT;
-                    goto out;
-                }
-                block = &(rdma->local_ram_blocks.block[reg->current_index]);
-                if (block->is_ram_block) {
-                    if (block->offset > reg->key.current_addr) {
-                        error_report("rdma: bad register address for block %s"
-                            " offset: %" PRIx64 " current_addr: %" PRIx64,
-                            block->block_name, block->offset,
-                            reg->key.current_addr);
-                        ret = -ERANGE;
-                        goto out;
-                    }
-                    host_addr = (block->local_host_addr +
-                                (reg->key.current_addr - block->offset));
-                    chunk = ram_chunk_index(block->local_host_addr,
-                                            (uint8_t *) host_addr);
-                } else {
-                    chunk = reg->key.chunk;
-                    host_addr = block->local_host_addr +
-                        (reg->key.chunk * (1UL << RDMA_REG_CHUNK_SHIFT));
-                    /* Check for particularly bad chunk value */
-                    if (host_addr < (void *)block->local_host_addr) {
-                        error_report("rdma: bad chunk for block %s"
-                                     " chunk: %" PRIx64,
-                                     block->block_name, reg->key.chunk);
-                        ret = -ERANGE;
-                        goto out;
-                    }
-                }
-                chunk_start = ram_chunk_start(block, chunk);
-                chunk_end = ram_chunk_end(block, chunk + reg->chunks);
-                if (qemu_rdma_register_and_get_keys(rdma, block,
-                            (uintptr_t)host_addr, NULL, &reg_result->rkey,
-                            chunk, chunk_start, chunk_end)) {
-                    error_report("cannot get rkey");
-                    ret = -EINVAL;
-                    goto out;
-                }
-
-                reg_result->host_addr = (uintptr_t)block->local_host_addr;
-
-                trace_qemu_rdma_registration_handle_register_rkey(
-                                                           reg_result->rkey);
-
-                result_to_network(reg_result);
-            }
-
-            ret = qemu_rdma_post_send_control(rdma,
-                            (uint8_t *) results, &reg_resp);
-
-            if (ret < 0) {
-                error_report("Failed to send control buffer");
-                goto out;
-            }
-            break;
-        case RDMA_CONTROL_UNREGISTER_REQUEST:
-            trace_qemu_rdma_registration_handle_unregister(head.repeat);
-            unreg_resp.repeat = head.repeat;
-            registers = (RDMARegister *) rdma->wr_data[idx].control_curr;
-
-            for (count = 0; count < head.repeat; count++) {
-                reg = &registers[count];
-                network_to_register(reg);
-
-                trace_qemu_rdma_registration_handle_unregister_loop(count,
-                           reg->current_index, reg->key.chunk);
-
block = &(rdma->local_ram_blocks.block[reg->current_index]); - - ret = ibv_dereg_mr(block->pmr[reg->key.chunk]); - block->pmr[reg->key.chunk] = NULL; - - if (ret != 0) { - perror("rdma unregistration chunk failed"); - ret = -ret; - goto out; - } - - rdma->total_registrations--; - - trace_qemu_rdma_registration_handle_unregister_success( - reg->key.chunk); - } - - ret = qemu_rdma_post_send_control(rdma, NULL, &unreg_resp); - - if (ret < 0) { - error_report("Failed to send control buffer"); - goto out; - } - break; - case RDMA_CONTROL_REGISTER_RESULT: - error_report("Invalid RESULT message at dest."); - ret = -EIO; - goto out; - default: - error_report("Unknown control message %s", control_desc[head.type]); - ret = -EIO; - goto out; - } - } while (1); -out: - if (ret < 0) { - rdma->error_state = ret; - } - return ret; -} - -/* Destination: - * Called via a ram_control_load_hook during the initial RAM load section which - * lists the RAMBlocks by name. This lets us know the order of the RAMBlocks - * on the source. - * We've already built our local RAMBlock list, but not yet sent the list to - * the source. - */ -static int rdma_block_notification_handle(QEMUFileRDMA *rfile, const char *name) -{ - RDMAContext *rdma = rfile->rdma; - int curr; - int found = -1; - - /* Find the matching RAMBlock in our local list */ - for (curr = 0; curr < rdma->local_ram_blocks.nb_blocks; curr++) { - if (!strcmp(rdma->local_ram_blocks.block[curr].block_name, name)) { - found = curr; - break; - } - } - - if (found == -1) { - error_report("RAMBlock '%s' not found on destination", name); - return -ENOENT; - } - - rdma->local_ram_blocks.block[curr].src_index = rdma->next_src_index; - trace_rdma_block_notification_handle(name, rdma->next_src_index); - rdma->next_src_index++; - - return 0; -} - -static int rdma_load_hook(QEMUFile *f, void *opaque, uint64_t flags, void *data) -{ - switch (flags) { - case RAM_CONTROL_BLOCK_REG: - return rdma_block_notification_handle(opaque, data); - - case RAM_CONTROL_HOOK: - return qemu_rdma_registration_handle(f, opaque); - - default: - /* Shouldn't be called with any other values */ - abort(); - } -} - -static int qemu_rdma_registration_start(QEMUFile *f, void *opaque, - uint64_t flags, void *data) -{ - QEMUFileRDMA *rfile = opaque; - RDMAContext *rdma = rfile->rdma; - - CHECK_ERROR_STATE(); - - trace_qemu_rdma_registration_start(flags); - qemu_put_be64(f, RAM_SAVE_FLAG_HOOK); - qemu_fflush(f); - - return 0; -} - -/* - * Inform dest that dynamic registrations are done for now. - * First, flush writes, if any. - */ -static int qemu_rdma_registration_stop(QEMUFile *f, void *opaque, - uint64_t flags, void *data) -{ - Error *local_err = NULL, **errp = &local_err; - QEMUFileRDMA *rfile = opaque; - RDMAContext *rdma = rfile->rdma; - RDMAControlHeader head = { .len = 0, .repeat = 1 }; - int ret = 0; - - CHECK_ERROR_STATE(); - - qemu_fflush(f); - ret = qemu_rdma_drain_cq(f, rdma); - - if (ret < 0) { - goto err; - } - - if (flags == RAM_CONTROL_SETUP) { - RDMAControlHeader resp = {.type = RDMA_CONTROL_RAM_BLOCKS_RESULT }; - RDMALocalBlocks *local = &rdma->local_ram_blocks; - int reg_result_idx, i, nb_dest_blocks; - - head.type = RDMA_CONTROL_RAM_BLOCKS_REQUEST; - trace_qemu_rdma_registration_stop_ram(); - - /* - * Make sure that we parallelize the pinning on both sides. 
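*/
/*
 * Below, the destination's reply is copied into rdma->dest_blocks and
 * byte-swapped with network_to_dest_block(). The RDMADestBlock wire record
 * is defined near the top of rdma.c, outside this hunk; a plausible layout,
 * inferred from the fields the code reads back and from resp.len being a
 * whole multiple of sizeof(RDMADestBlock) (the padding field is an
 * assumption), is:
 *
 *     typedef struct QEMU_PACKED {
 *         uint64_t remote_host_addr;  // destination VA of the RAMBlock
 *         uint64_t offset;            // must match the source's offset
 *         uint64_t length;            // checked against the source's length
 *         uint32_t remote_rkey;       // only meaningful when rdma->pin_all
 *         uint32_t padding;
 *     } RDMADestBlock;
 */
/*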
- * For very large guests, doing this serially takes a really - * long time, so we have to 'interleave' the pinning locally - * with the control messages by performing the pinning on this - * side before we receive the control response from the other - * side that the pinning has completed. - */ - ret = qemu_rdma_exchange_send(rdma, &head, NULL, &resp, - &reg_result_idx, rdma->pin_all ? - qemu_rdma_reg_whole_ram_blocks : NULL); - if (ret < 0) { - ERROR(errp, "receiving remote info!"); - return ret; - } - - nb_dest_blocks = resp.len / sizeof(RDMADestBlock); - - /* - * The protocol uses two different sets of rkeys (mutually exclusive): - * 1. One key to represent the virtual address of the entire ram block. - * (dynamic chunk registration disabled - pin everything with one rkey.) - * 2. One to represent individual chunks within a ram block. - * (dynamic chunk registration enabled - pin individual chunks.) - * - * Once the capability is successfully negotiated, the destination transmits - * the keys to use (or sends them later) including the virtual addresses - * and then propagates the remote ram block descriptions to its local copy. - */ - - if (local->nb_blocks != nb_dest_blocks) { - ERROR(errp, "ram blocks mismatch (Number of blocks %d vs %d) " - "Your QEMU command line parameters are probably " - "not identical on both the source and destination.", - local->nb_blocks, nb_dest_blocks); - rdma->error_state = -EINVAL; - return -EINVAL; - } - - qemu_rdma_move_header(rdma, reg_result_idx, &resp); - memcpy(rdma->dest_blocks, - rdma->wr_data[reg_result_idx].control_curr, resp.len); - for (i = 0; i < nb_dest_blocks; i++) { - network_to_dest_block(&rdma->dest_blocks[i]); - - /* We require that the blocks are in the same order */ - if (rdma->dest_blocks[i].length != local->block[i].length) { - ERROR(errp, "Block %s/%d has a different length %" PRIu64 - "vs %" PRIu64, local->block[i].block_name, i, - local->block[i].length, - rdma->dest_blocks[i].length); - rdma->error_state = -EINVAL; - return -EINVAL; - } - local->block[i].remote_host_addr = - rdma->dest_blocks[i].remote_host_addr; - local->block[i].remote_rkey = rdma->dest_blocks[i].remote_rkey; - } - } - - trace_qemu_rdma_registration_stop(flags); - - head.type = RDMA_CONTROL_REGISTER_FINISHED; - ret = qemu_rdma_exchange_send(rdma, &head, NULL, NULL, NULL, NULL); - - if (ret < 0) { - goto err; - } - - return 0; -err: - rdma->error_state = ret; - return ret; -} - -static int qemu_rdma_get_fd(void *opaque) -{ - QEMUFileRDMA *rfile = opaque; - RDMAContext *rdma = rfile->rdma; - - return rdma->comp_channel->fd; -} - -static const QEMUFileOps rdma_read_ops = { - .get_buffer = qemu_rdma_get_buffer, - .get_fd = qemu_rdma_get_fd, - .close = qemu_rdma_close, - .hook_ram_load = rdma_load_hook, -}; - -static const QEMUFileOps rdma_write_ops = { - .put_buffer = qemu_rdma_put_buffer, - .close = qemu_rdma_close, - .before_ram_iterate = qemu_rdma_registration_start, - .after_ram_iterate = qemu_rdma_registration_stop, - .save_page = qemu_rdma_save_page, -}; - -static void *qemu_fopen_rdma(RDMAContext *rdma, const char *mode) -{ - QEMUFileRDMA *r; - - if (qemu_file_mode_is_not_valid(mode)) { - return NULL; - } - - r = g_new0(QEMUFileRDMA, 1); - r->rdma = rdma; - - if (mode[0] == 'w') { - r->file = qemu_fopen_ops(r, &rdma_write_ops); - } else { - r->file = qemu_fopen_ops(r, &rdma_read_ops); - } - - return r->file; -} - -static void rdma_accept_incoming_migration(void *opaque) -{ - RDMAContext *rdma = opaque; - int ret; - QEMUFile *f; - Error *local_err = NULL,
**errp = &local_err; - - trace_qemu_rdma_accept_incoming_migration(); - ret = qemu_rdma_accept(rdma); - - if (ret) { - ERROR(errp, "RDMA Migration initialization failed!"); - return; - } - - trace_qemu_rdma_accept_incoming_migration_accepted(); - - f = qemu_fopen_rdma(rdma, "rb"); - if (f == NULL) { - ERROR(errp, "could not qemu_fopen_rdma!"); - qemu_rdma_cleanup(rdma); - return; - } - - rdma->migration_started_on_destination = 1; - process_incoming_migration(f); -} - -void rdma_start_incoming_migration(const char *host_port, Error **errp) -{ - int ret; - RDMAContext *rdma; - Error *local_err = NULL; - - trace_rdma_start_incoming_migration(); - rdma = qemu_rdma_data_init(host_port, &local_err); - - if (rdma == NULL) { - goto err; - } - - ret = qemu_rdma_dest_init(rdma, &local_err); - - if (ret) { - goto err; - } - - trace_rdma_start_incoming_migration_after_dest_init(); - - ret = rdma_listen(rdma->listen_id, 5); - - if (ret) { - ERROR(errp, "listening on socket!"); - goto err; - } - - trace_rdma_start_incoming_migration_after_rdma_listen(); - - qemu_set_fd_handler(rdma->channel->fd, rdma_accept_incoming_migration, - NULL, (void *)(intptr_t)rdma); - return; -err: - error_propagate(errp, local_err); - g_free(rdma); -} - -void rdma_start_outgoing_migration(void *opaque, - const char *host_port, Error **errp) -{ - MigrationState *s = opaque; - Error *local_err = NULL, **temp = &local_err; - RDMAContext *rdma = qemu_rdma_data_init(host_port, &local_err); - int ret = 0; - - if (rdma == NULL) { - ERROR(temp, "Failed to initialize RDMA data structures! %d", ret); - goto err; - } - - ret = qemu_rdma_source_init(rdma, &local_err, - s->enabled_capabilities[MIGRATION_CAPABILITY_RDMA_PIN_ALL]); - - if (ret) { - goto err; - } - - trace_rdma_start_outgoing_migration_after_rdma_source_init(); - ret = qemu_rdma_connect(rdma, &local_err); - - if (ret) { - goto err; - } - - trace_rdma_start_outgoing_migration_after_rdma_connect(); - - s->to_dst_file = qemu_fopen_rdma(rdma, "wb"); - migrate_fd_connect(s); - return; -err: - error_propagate(errp, local_err); - g_free(rdma); - migrate_fd_error(s); -} diff --git a/qemu/migration/savevm.c b/qemu/migration/savevm.c deleted file mode 100644 index 16ba44379..000000000 --- a/qemu/migration/savevm.c +++ /dev/null @@ -1,2243 +0,0 @@ -/* - * QEMU System Emulator - * - * Copyright (c) 2003-2008 Fabrice Bellard - * Copyright (c) 2009-2015 Red Hat Inc - * - * Authors: - * Juan Quintela <quintela@redhat.com> - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - */ - -#include "qemu/osdep.h" -#include "hw/boards.h" -#include "hw/hw.h" -#include "hw/qdev.h" -#include "net/net.h" -#include "monitor/monitor.h" -#include "sysemu/sysemu.h" -#include "qemu/timer.h" -#include "audio/audio.h" -#include "migration/migration.h" -#include "migration/postcopy-ram.h" -#include "qapi/qmp/qerror.h" -#include "qemu/error-report.h" -#include "qemu/sockets.h" -#include "qemu/queue.h" -#include "sysemu/cpus.h" -#include "exec/memory.h" -#include "qmp-commands.h" -#include "trace.h" -#include "qemu/bitops.h" -#include "qemu/iov.h" -#include "block/snapshot.h" -#include "block/qapi.h" -#include "qemu/cutils.h" - -#ifndef ETH_P_RARP -#define ETH_P_RARP 0x8035 -#endif -#define ARP_HTYPE_ETH 0x0001 -#define ARP_PTYPE_IP 0x0800 -#define ARP_OP_REQUEST_REV 0x3 - -const unsigned int postcopy_ram_discard_version = 0; - -static bool skip_section_footers; - -static struct mig_cmd_args { - ssize_t len; /* -1 = variable */ - const char *name; -} mig_cmd_args[] = { - [MIG_CMD_INVALID] = { .len = -1, .name = "INVALID" }, - [MIG_CMD_OPEN_RETURN_PATH] = { .len = 0, .name = "OPEN_RETURN_PATH" }, - [MIG_CMD_PING] = { .len = sizeof(uint32_t), .name = "PING" }, - [MIG_CMD_POSTCOPY_ADVISE] = { .len = 16, .name = "POSTCOPY_ADVISE" }, - [MIG_CMD_POSTCOPY_LISTEN] = { .len = 0, .name = "POSTCOPY_LISTEN" }, - [MIG_CMD_POSTCOPY_RUN] = { .len = 0, .name = "POSTCOPY_RUN" }, - [MIG_CMD_POSTCOPY_RAM_DISCARD] = { - .len = -1, .name = "POSTCOPY_RAM_DISCARD" }, - [MIG_CMD_PACKAGED] = { .len = 4, .name = "PACKAGED" }, - [MIG_CMD_MAX] = { .len = -1, .name = "MAX" }, -}; - -static int announce_self_create(uint8_t *buf, - uint8_t *mac_addr) -{ - /* Ethernet header. */ - memset(buf, 0xff, 6); /* destination MAC addr */ - memcpy(buf + 6, mac_addr, 6); /* source MAC addr */ - *(uint16_t *)(buf + 12) = htons(ETH_P_RARP); /* ethertype */ - - /* RARP header. */ - *(uint16_t *)(buf + 14) = htons(ARP_HTYPE_ETH); /* hardware addr space */ - *(uint16_t *)(buf + 16) = htons(ARP_PTYPE_IP); /* protocol addr space */ - *(buf + 18) = 6; /* hardware addr length (ethernet) */ - *(buf + 19) = 4; /* protocol addr length (IPv4) */ - *(uint16_t *)(buf + 20) = htons(ARP_OP_REQUEST_REV); /* opcode */ - memcpy(buf + 22, mac_addr, 6); /* source hw addr */ - memset(buf + 28, 0x00, 4); /* source protocol addr */ - memcpy(buf + 32, mac_addr, 6); /* target hw addr */ - memset(buf + 38, 0x00, 4); /* target protocol addr */ - - /* Padding to get up to 60 bytes (ethernet min packet size, minus FCS). */ - memset(buf + 42, 0x00, 18); - - return 60; /* len (FCS will be added by hardware) */ -} - -static void qemu_announce_self_iter(NICState *nic, void *opaque) -{ - uint8_t buf[60]; - int len; - - trace_qemu_announce_self_iter(qemu_ether_ntoa(&nic->conf->macaddr)); - len = announce_self_create(buf, nic->conf->macaddr.a); - - qemu_send_packet_raw(qemu_get_queue(nic), buf, len); -} - - -static void qemu_announce_self_once(void *opaque) -{ - static int count = SELF_ANNOUNCE_ROUNDS; - QEMUTimer *timer = *(QEMUTimer **)opaque; - - qemu_foreach_nic(qemu_announce_self_iter, NULL); - - if (--count) { - /* delay 50ms, 150ms, 250ms, ... 
*/ - timer_mod(timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + - self_announce_delay(count)); - } else { - timer_del(timer); - timer_free(timer); - } -} - -void qemu_announce_self(void) -{ - static QEMUTimer *timer; - timer = timer_new_ms(QEMU_CLOCK_REALTIME, qemu_announce_self_once, &timer); - qemu_announce_self_once(&timer); -} - -/***********************************************************/ -/* savevm/loadvm support */ - -static ssize_t block_writev_buffer(void *opaque, struct iovec *iov, int iovcnt, - int64_t pos) -{ - int ret; - QEMUIOVector qiov; - - qemu_iovec_init_external(&qiov, iov, iovcnt); - ret = bdrv_writev_vmstate(opaque, &qiov, pos); - if (ret < 0) { - return ret; - } - - return qiov.size; -} - -static ssize_t block_put_buffer(void *opaque, const uint8_t *buf, - int64_t pos, size_t size) -{ - bdrv_save_vmstate(opaque, buf, pos, size); - return size; -} - -static ssize_t block_get_buffer(void *opaque, uint8_t *buf, int64_t pos, - size_t size) -{ - return bdrv_load_vmstate(opaque, buf, pos, size); -} - -static int bdrv_fclose(void *opaque) -{ - return bdrv_flush(opaque); -} - -static const QEMUFileOps bdrv_read_ops = { - .get_buffer = block_get_buffer, - .close = bdrv_fclose -}; - -static const QEMUFileOps bdrv_write_ops = { - .put_buffer = block_put_buffer, - .writev_buffer = block_writev_buffer, - .close = bdrv_fclose -}; - -static QEMUFile *qemu_fopen_bdrv(BlockDriverState *bs, int is_writable) -{ - if (is_writable) { - return qemu_fopen_ops(bs, &bdrv_write_ops); - } - return qemu_fopen_ops(bs, &bdrv_read_ops); -} - - -/* QEMUFile timer support. - * Not in qemu-file.c to not add qemu-timer.c as dependency to qemu-file.c - */ - -void timer_put(QEMUFile *f, QEMUTimer *ts) -{ - uint64_t expire_time; - - expire_time = timer_expire_time_ns(ts); - qemu_put_be64(f, expire_time); -} - -void timer_get(QEMUFile *f, QEMUTimer *ts) -{ - uint64_t expire_time; - - expire_time = qemu_get_be64(f); - if (expire_time != -1) { - timer_mod_ns(ts, expire_time); - } else { - timer_del(ts); - } -} - - -/* VMState timer support. 
- * Not in vmstate.c to not add qemu-timer.c as dependency to vmstate.c - */ - -static int get_timer(QEMUFile *f, void *pv, size_t size) -{ - QEMUTimer *v = pv; - timer_get(f, v); - return 0; -} - -static void put_timer(QEMUFile *f, void *pv, size_t size) -{ - QEMUTimer *v = pv; - timer_put(f, v); -} - -const VMStateInfo vmstate_info_timer = { - .name = "timer", - .get = get_timer, - .put = put_timer, -}; - - -typedef struct CompatEntry { - char idstr[256]; - int instance_id; -} CompatEntry; - -typedef struct SaveStateEntry { - QTAILQ_ENTRY(SaveStateEntry) entry; - char idstr[256]; - int instance_id; - int alias_id; - int version_id; - int section_id; - SaveVMHandlers *ops; - const VMStateDescription *vmsd; - void *opaque; - CompatEntry *compat; - int is_ram; -} SaveStateEntry; - -typedef struct SaveState { - QTAILQ_HEAD(, SaveStateEntry) handlers; - int global_section_id; - bool skip_configuration; - uint32_t len; - const char *name; -} SaveState; - -static SaveState savevm_state = { - .handlers = QTAILQ_HEAD_INITIALIZER(savevm_state.handlers), - .global_section_id = 0, - .skip_configuration = false, -}; - -void savevm_skip_configuration(void) -{ - savevm_state.skip_configuration = true; -} - - -static void configuration_pre_save(void *opaque) -{ - SaveState *state = opaque; - const char *current_name = MACHINE_GET_CLASS(current_machine)->name; - - state->len = strlen(current_name); - state->name = current_name; -} - -static int configuration_post_load(void *opaque, int version_id) -{ - SaveState *state = opaque; - const char *current_name = MACHINE_GET_CLASS(current_machine)->name; - - if (strncmp(state->name, current_name, state->len) != 0) { - error_report("Machine type received is '%.*s' and local is '%s'", - (int) state->len, state->name, current_name); - return -EINVAL; - } - return 0; -} - -static const VMStateDescription vmstate_configuration = { - .name = "configuration", - .version_id = 1, - .post_load = configuration_post_load, - .pre_save = configuration_pre_save, - .fields = (VMStateField[]) { - VMSTATE_UINT32(len, SaveState), - VMSTATE_VBUFFER_ALLOC_UINT32(name, SaveState, 0, NULL, 0, len), - VMSTATE_END_OF_LIST() - }, -}; - -static void dump_vmstate_vmsd(FILE *out_file, - const VMStateDescription *vmsd, int indent, - bool is_subsection); - -static void dump_vmstate_vmsf(FILE *out_file, const VMStateField *field, - int indent) -{ - fprintf(out_file, "%*s{\n", indent, ""); - indent += 2; - fprintf(out_file, "%*s\"field\": \"%s\",\n", indent, "", field->name); - fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "", - field->version_id); - fprintf(out_file, "%*s\"field_exists\": %s,\n", indent, "", - field->field_exists ? 
"true" : "false"); - fprintf(out_file, "%*s\"size\": %zu", indent, "", field->size); - if (field->vmsd != NULL) { - fprintf(out_file, ",\n"); - dump_vmstate_vmsd(out_file, field->vmsd, indent, false); - } - fprintf(out_file, "\n%*s}", indent - 2, ""); -} - -static void dump_vmstate_vmss(FILE *out_file, - const VMStateDescription **subsection, - int indent) -{ - if (*subsection != NULL) { - dump_vmstate_vmsd(out_file, *subsection, indent, true); - } -} - -static void dump_vmstate_vmsd(FILE *out_file, - const VMStateDescription *vmsd, int indent, - bool is_subsection) -{ - if (is_subsection) { - fprintf(out_file, "%*s{\n", indent, ""); - } else { - fprintf(out_file, "%*s\"%s\": {\n", indent, "", "Description"); - } - indent += 2; - fprintf(out_file, "%*s\"name\": \"%s\",\n", indent, "", vmsd->name); - fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "", - vmsd->version_id); - fprintf(out_file, "%*s\"minimum_version_id\": %d", indent, "", - vmsd->minimum_version_id); - if (vmsd->fields != NULL) { - const VMStateField *field = vmsd->fields; - bool first; - - fprintf(out_file, ",\n%*s\"Fields\": [\n", indent, ""); - first = true; - while (field->name != NULL) { - if (field->flags & VMS_MUST_EXIST) { - /* Ignore VMSTATE_VALIDATE bits; these don't get migrated */ - field++; - continue; - } - if (!first) { - fprintf(out_file, ",\n"); - } - dump_vmstate_vmsf(out_file, field, indent + 2); - field++; - first = false; - } - fprintf(out_file, "\n%*s]", indent, ""); - } - if (vmsd->subsections != NULL) { - const VMStateDescription **subsection = vmsd->subsections; - bool first; - - fprintf(out_file, ",\n%*s\"Subsections\": [\n", indent, ""); - first = true; - while (*subsection != NULL) { - if (!first) { - fprintf(out_file, ",\n"); - } - dump_vmstate_vmss(out_file, subsection, indent + 2); - subsection++; - first = false; - } - fprintf(out_file, "\n%*s]", indent, ""); - } - fprintf(out_file, "\n%*s}", indent - 2, ""); -} - -static void dump_machine_type(FILE *out_file) -{ - MachineClass *mc; - - mc = MACHINE_GET_CLASS(current_machine); - - fprintf(out_file, " \"vmschkmachine\": {\n"); - fprintf(out_file, " \"Name\": \"%s\"\n", mc->name); - fprintf(out_file, " },\n"); -} - -void dump_vmstate_json_to_file(FILE *out_file) -{ - GSList *list, *elt; - bool first; - - fprintf(out_file, "{\n"); - dump_machine_type(out_file); - - first = true; - list = object_class_get_list(TYPE_DEVICE, true); - for (elt = list; elt; elt = elt->next) { - DeviceClass *dc = OBJECT_CLASS_CHECK(DeviceClass, elt->data, - TYPE_DEVICE); - const char *name; - int indent = 2; - - if (!dc->vmsd) { - continue; - } - - if (!first) { - fprintf(out_file, ",\n"); - } - name = object_class_get_name(OBJECT_CLASS(dc)); - fprintf(out_file, "%*s\"%s\": {\n", indent, "", name); - indent += 2; - fprintf(out_file, "%*s\"Name\": \"%s\",\n", indent, "", name); - fprintf(out_file, "%*s\"version_id\": %d,\n", indent, "", - dc->vmsd->version_id); - fprintf(out_file, "%*s\"minimum_version_id\": %d,\n", indent, "", - dc->vmsd->minimum_version_id); - - dump_vmstate_vmsd(out_file, dc->vmsd, indent, false); - - fprintf(out_file, "\n%*s}", indent - 2, ""); - first = false; - } - fprintf(out_file, "\n}\n"); - fclose(out_file); -} - -static int calculate_new_instance_id(const char *idstr) -{ - SaveStateEntry *se; - int instance_id = 0; - - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (strcmp(idstr, se->idstr) == 0 - && instance_id <= se->instance_id) { - instance_id = se->instance_id + 1; - } - } - return instance_id; -} - -static int 
calculate_compat_instance_id(const char *idstr) -{ - SaveStateEntry *se; - int instance_id = 0; - - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (!se->compat) { - continue; - } - - if (strcmp(idstr, se->compat->idstr) == 0 - && instance_id <= se->compat->instance_id) { - instance_id = se->compat->instance_id + 1; - } - } - return instance_id; -} - -/* TODO: Individual devices generally have very little idea about the rest - of the system, so instance_id should be removed/replaced. - Meanwhile pass -1 as instance_id if you do not already have a clearly - distinguishing id for all instances of your device class. */ -int register_savevm_live(DeviceState *dev, - const char *idstr, - int instance_id, - int version_id, - SaveVMHandlers *ops, - void *opaque) -{ - SaveStateEntry *se; - - se = g_new0(SaveStateEntry, 1); - se->version_id = version_id; - se->section_id = savevm_state.global_section_id++; - se->ops = ops; - se->opaque = opaque; - se->vmsd = NULL; - /* if this is a live_savevm then set is_ram */ - if (ops->save_live_setup != NULL) { - se->is_ram = 1; - } - - if (dev) { - char *id = qdev_get_dev_path(dev); - if (id) { - pstrcpy(se->idstr, sizeof(se->idstr), id); - pstrcat(se->idstr, sizeof(se->idstr), "/"); - g_free(id); - - se->compat = g_new0(CompatEntry, 1); - pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), idstr); - se->compat->instance_id = instance_id == -1 ? - calculate_compat_instance_id(idstr) : instance_id; - instance_id = -1; - } - } - pstrcat(se->idstr, sizeof(se->idstr), idstr); - - if (instance_id == -1) { - se->instance_id = calculate_new_instance_id(se->idstr); - } else { - se->instance_id = instance_id; - } - assert(!se->compat || se->instance_id == 0); - /* add at the end of list */ - QTAILQ_INSERT_TAIL(&savevm_state.handlers, se, entry); - return 0; -} - -int register_savevm(DeviceState *dev, - const char *idstr, - int instance_id, - int version_id, - SaveStateHandler *save_state, - LoadStateHandler *load_state, - void *opaque) -{ - SaveVMHandlers *ops = g_new0(SaveVMHandlers, 1); - ops->save_state = save_state; - ops->load_state = load_state; - return register_savevm_live(dev, idstr, instance_id, version_id, - ops, opaque); -} - -void unregister_savevm(DeviceState *dev, const char *idstr, void *opaque) -{ - SaveStateEntry *se, *new_se; - char id[256] = ""; - - if (dev) { - char *path = qdev_get_dev_path(dev); - if (path) { - pstrcpy(id, sizeof(id), path); - pstrcat(id, sizeof(id), "/"); - g_free(path); - } - } - pstrcat(id, sizeof(id), idstr); - - QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) { - if (strcmp(se->idstr, id) == 0 && se->opaque == opaque) { - QTAILQ_REMOVE(&savevm_state.handlers, se, entry); - g_free(se->compat); - g_free(se->ops); - g_free(se); - } - } -} - -int vmstate_register_with_alias_id(DeviceState *dev, int instance_id, - const VMStateDescription *vmsd, - void *opaque, int alias_id, - int required_for_version) -{ - SaveStateEntry *se; - - /* If this triggers, alias support can be dropped for the vmsd.
*/ - assert(alias_id == -1 || required_for_version >= vmsd->minimum_version_id); - - se = g_new0(SaveStateEntry, 1); - se->version_id = vmsd->version_id; - se->section_id = savevm_state.global_section_id++; - se->opaque = opaque; - se->vmsd = vmsd; - se->alias_id = alias_id; - - if (dev) { - char *id = qdev_get_dev_path(dev); - if (id) { - pstrcpy(se->idstr, sizeof(se->idstr), id); - pstrcat(se->idstr, sizeof(se->idstr), "/"); - g_free(id); - - se->compat = g_new0(CompatEntry, 1); - pstrcpy(se->compat->idstr, sizeof(se->compat->idstr), vmsd->name); - se->compat->instance_id = instance_id == -1 ? - calculate_compat_instance_id(vmsd->name) : instance_id; - instance_id = -1; - } - } - pstrcat(se->idstr, sizeof(se->idstr), vmsd->name); - - if (instance_id == -1) { - se->instance_id = calculate_new_instance_id(se->idstr); - } else { - se->instance_id = instance_id; - } - assert(!se->compat || se->instance_id == 0); - /* add at the end of list */ - QTAILQ_INSERT_TAIL(&savevm_state.handlers, se, entry); - return 0; -} - -void vmstate_unregister(DeviceState *dev, const VMStateDescription *vmsd, - void *opaque) -{ - SaveStateEntry *se, *new_se; - - QTAILQ_FOREACH_SAFE(se, &savevm_state.handlers, entry, new_se) { - if (se->vmsd == vmsd && se->opaque == opaque) { - QTAILQ_REMOVE(&savevm_state.handlers, se, entry); - g_free(se->compat); - g_free(se); - } - } -} - -static int vmstate_load(QEMUFile *f, SaveStateEntry *se, int version_id) -{ - trace_vmstate_load(se->idstr, se->vmsd ? se->vmsd->name : "(old)"); - if (!se->vmsd) { /* Old style */ - return se->ops->load_state(f, se->opaque, version_id); - } - return vmstate_load_state(f, se->vmsd, se->opaque, version_id); -} - -static void vmstate_save_old_style(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc) -{ - int64_t old_offset, size; - - old_offset = qemu_ftell_fast(f); - se->ops->save_state(f, se->opaque); - size = qemu_ftell_fast(f) - old_offset; - - if (vmdesc) { - json_prop_int(vmdesc, "size", size); - json_start_array(vmdesc, "fields"); - json_start_object(vmdesc, NULL); - json_prop_str(vmdesc, "name", "data"); - json_prop_int(vmdesc, "size", size); - json_prop_str(vmdesc, "type", "buffer"); - json_end_object(vmdesc); - json_end_array(vmdesc); - } -} - -static void vmstate_save(QEMUFile *f, SaveStateEntry *se, QJSON *vmdesc) -{ - trace_vmstate_save(se->idstr, se->vmsd ? se->vmsd->name : "(old)"); - if (!se->vmsd) { - vmstate_save_old_style(f, se, vmdesc); - return; - } - vmstate_save_state(f, se->vmsd, se->opaque, vmdesc); -} - -void savevm_skip_section_footers(void) -{ - skip_section_footers = true; -} - -/* - * Write the header for device section (QEMU_VM_SECTION START/END/PART/FULL) - */ -static void save_section_header(QEMUFile *f, SaveStateEntry *se, - uint8_t section_type) -{ - qemu_put_byte(f, section_type); - qemu_put_be32(f, se->section_id); - - if (section_type == QEMU_VM_SECTION_FULL || - section_type == QEMU_VM_SECTION_START) { - /* ID string */ - size_t len = strlen(se->idstr); - qemu_put_byte(f, len); - qemu_put_buffer(f, (uint8_t *)se->idstr, len); - - qemu_put_be32(f, se->instance_id); - qemu_put_be32(f, se->version_id); - } -} - -/* - * Write a footer onto device sections that catches cases of misformatted device - * sections. - */ -static void save_section_footer(QEMUFile *f, SaveStateEntry *se) -{ - if (!skip_section_footers) { - qemu_put_byte(f, QEMU_VM_SECTION_FOOTER); - qemu_put_be32(f, se->section_id); - } -}
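/*
 * Taken together, the two helpers above frame every device section. For a
 * START or FULL section the stream carries one section-type byte, a be32
 * section id, a one-byte idstr length plus the idstr bytes, then be32
 * instance_id and be32 version_id; PART and END sections carry only the
 * type byte and section id. The optional footer is a QEMU_VM_SECTION_FOOTER
 * byte followed by the same be32 section id. A minimal reader for the
 * START/FULL header, mirroring qemu_loadvm_section_start_full() later in
 * this file (this exact helper is illustrative, not part of savevm.c;
 * error handling elided):
 */
static int load_section_header_sketch(QEMUFile *f, char idstr[256],
                                      uint32_t *section_id,
                                      uint32_t *instance_id,
                                      uint32_t *version_id)
{
    size_t len;

    *section_id = qemu_get_be32(f);   /* the type byte was read by the caller */
    len = qemu_get_byte(f);           /* idstr length is a single byte */
    qemu_get_buffer(f, (uint8_t *)idstr, len);
    idstr[len] = 0;                   /* the stream stores no trailing NUL */
    *instance_id = qemu_get_be32(f);
    *version_id = qemu_get_be32(f);
    return qemu_file_get_error(f);
}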
- -/** - * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the - * command and associated data. - * - * @f: File to send command on - * @command: Command type to send - * @len: Length of associated data - * @data: Data associated with command. - */ -void qemu_savevm_command_send(QEMUFile *f, - enum qemu_vm_cmd command, - uint16_t len, - uint8_t *data) -{ - trace_savevm_command_send(command, len); - qemu_put_byte(f, QEMU_VM_COMMAND); - qemu_put_be16(f, (uint16_t)command); - qemu_put_be16(f, len); - qemu_put_buffer(f, data, len); - qemu_fflush(f); -} - -void qemu_savevm_send_ping(QEMUFile *f, uint32_t value) -{ - uint32_t buf; - - trace_savevm_send_ping(value); - buf = cpu_to_be32(value); - qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf); -} - -void qemu_savevm_send_open_return_path(QEMUFile *f) -{ - trace_savevm_send_open_return_path(); - qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL); -} - -/* We have a buffer of data to send; we don't want that all to be loaded - * by the command itself, so the command contains just the length of the - * extra buffer that we then send straight after it. - * TODO: Must be a better way to organise that - * - * Returns: - * 0 on success - * -ve on error - */ -int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb) -{ - size_t cur_iov; - size_t len = qsb_get_length(qsb); - uint32_t tmp; - - if (len > MAX_VM_CMD_PACKAGED_SIZE) { - error_report("%s: Unreasonably large packaged state: %zu", - __func__, len); - return -1; - } - - tmp = cpu_to_be32(len); - - trace_qemu_savevm_send_packaged(); - qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp); - - /* all the data follows (concatenating the iovs) */ - for (cur_iov = 0; cur_iov < qsb->n_iov; cur_iov++) { - /* The iov entries are partially filled */ - size_t towrite = MIN(qsb->iov[cur_iov].iov_len, len); - len -= towrite; - - if (!towrite) { - break; - } - - qemu_put_buffer(f, qsb->iov[cur_iov].iov_base, towrite); - } - - return 0; -} - -/* Send prior to any postcopy transfer */ -void qemu_savevm_send_postcopy_advise(QEMUFile *f) -{ - uint64_t tmp[2]; - tmp[0] = cpu_to_be64(getpagesize()); - tmp[1] = cpu_to_be64(1ul << qemu_target_page_bits()); - - trace_qemu_savevm_send_postcopy_advise(); - qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 16, (uint8_t *)tmp); -}
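/*
 * The ADVISE payload sent above is just two be64 values: the source's host
 * page size and its target page size. The destination reads them back in
 * loadvm_postcopy_handle_advise() (later in this file) and refuses postcopy
 * unless both match its own. A stripped-down sketch of that check
 * (illustrative, not the actual handler):
 */
static int postcopy_advise_check_sketch(QEMUFile *f)
{
    uint64_t remote_hps = qemu_get_be64(f);   /* source host page size */
    uint64_t remote_tps = qemu_get_be64(f);   /* source target page size */

    if (remote_hps != getpagesize() ||
        remote_tps != (1ul << qemu_target_page_bits())) {
        return -1;   /* mismatched page sizes: postcopy cannot proceed */
    }
    return 0;
}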
- -/* Sent prior to starting the destination running in postcopy, discard pages - * that have already been sent but redirtied on the source. - * CMD_POSTCOPY_RAM_DISCARD consists of: - * byte version (0) - * byte Length of name field (not including 0) - * n x byte RAM block name - * byte 0 terminator (just for safety) - * n x Byte ranges within the named RAMBlock - * be64 Start of the range - * be64 Length - * - * name: RAMBlock name that these entries are part of - * len: Number of page entries - * start_list: 'len' addresses - * length_list: 'len' addresses - * - */ -void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name, - uint16_t len, - uint64_t *start_list, - uint64_t *length_list) -{ - uint8_t *buf; - uint16_t tmplen; - uint16_t t; - size_t name_len = strlen(name); - - trace_qemu_savevm_send_postcopy_ram_discard(name, len); - assert(name_len < 256); - buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len); - buf[0] = postcopy_ram_discard_version; - buf[1] = name_len; - memcpy(buf + 2, name, name_len); - tmplen = 2 + name_len; - buf[tmplen++] = '\0'; - - for (t = 0; t < len; t++) { - cpu_to_be64w((uint64_t *)(buf + tmplen), start_list[t]); - tmplen += 8; - cpu_to_be64w((uint64_t *)(buf + tmplen), length_list[t]); - tmplen += 8; - } - qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf); - g_free(buf); -} - -/* Get the destination into a state where it can receive postcopy data. */ -void qemu_savevm_send_postcopy_listen(QEMUFile *f) -{ - trace_savevm_send_postcopy_listen(); - qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL); -} - -/* Kick the destination into running */ -void qemu_savevm_send_postcopy_run(QEMUFile *f) -{ - trace_savevm_send_postcopy_run(); - qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL); -} - -bool qemu_savevm_state_blocked(Error **errp) -{ - SaveStateEntry *se; - - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (se->vmsd && se->vmsd->unmigratable) { - error_setg(errp, "State blocked by non-migratable device '%s'", - se->idstr); - return true; - } - } - return false; -} - -static bool enforce_config_section(void) -{ - MachineState *machine = MACHINE(qdev_get_machine()); - return machine->enforce_config_section; -} - -void qemu_savevm_state_header(QEMUFile *f) -{ - trace_savevm_state_header(); - qemu_put_be32(f, QEMU_VM_FILE_MAGIC); - qemu_put_be32(f, QEMU_VM_FILE_VERSION); - - if (!savevm_state.skip_configuration || enforce_config_section()) { - qemu_put_byte(f, QEMU_VM_CONFIGURATION); - vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0); - } - -} - -void qemu_savevm_state_begin(QEMUFile *f, - const MigrationParams *params) -{ - SaveStateEntry *se; - int ret; - - trace_savevm_state_begin(); - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (!se->ops || !se->ops->set_params) { - continue; - } - se->ops->set_params(params, se->opaque); - } - - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (!se->ops || !se->ops->save_live_setup) { - continue; - } - if (se->ops && se->ops->is_active) { - if (!se->ops->is_active(se->opaque)) { - continue; - } - } - save_section_header(f, se, QEMU_VM_SECTION_START); - - ret = se->ops->save_live_setup(f, se->opaque); - save_section_footer(f, se); - if (ret < 0) { - qemu_file_set_error(f, ret); - break; - } - } -} - -/* - * this function has three return values: - * negative: there was one error, and we have -errno.
- * 0 : We haven't finished, the caller has to go again - * 1 : We have finished, we can go to complete phase - */ -int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy) -{ - SaveStateEntry *se; - int ret = 1; - - trace_savevm_state_iterate(); - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (!se->ops || !se->ops->save_live_iterate) { - continue; - } - if (se->ops && se->ops->is_active) { - if (!se->ops->is_active(se->opaque)) { - continue; - } - } - /* - * In the postcopy phase, any device that doesn't know how to - * do postcopy should have saved its state in the _complete - * call that's already run; it might get confused if we call - * iterate afterwards. - */ - if (postcopy && !se->ops->save_live_complete_postcopy) { - continue; - } - if (qemu_file_rate_limit(f)) { - return 0; - } - trace_savevm_section_start(se->idstr, se->section_id); - - save_section_header(f, se, QEMU_VM_SECTION_PART); - - ret = se->ops->save_live_iterate(f, se->opaque); - trace_savevm_section_end(se->idstr, se->section_id, ret); - save_section_footer(f, se); - - if (ret < 0) { - qemu_file_set_error(f, ret); - } - if (ret <= 0) { - /* Do not proceed to the next vmstate before this one reported - completion of the current stage. This serializes the migration - and reduces the probability that a faster changing state is - synchronized over and over again. */ - break; - } - } - return ret; -} - -static bool should_send_vmdesc(void) -{ - MachineState *machine = MACHINE(qdev_get_machine()); - bool in_postcopy = migration_in_postcopy(migrate_get_current()); - return !machine->suppress_vmdesc && !in_postcopy; -} - -/* - * Calls the save_live_complete_postcopy methods - * causing the last few pages to be sent immediately and doing any associated - * cleanup. - * Note postcopy also calls qemu_savevm_state_complete_precopy to complete - * all the other devices, but that happens at the point we switch to postcopy.
- */ -void qemu_savevm_state_complete_postcopy(QEMUFile *f) -{ - SaveStateEntry *se; - int ret; - - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (!se->ops || !se->ops->save_live_complete_postcopy) { - continue; - } - if (se->ops && se->ops->is_active) { - if (!se->ops->is_active(se->opaque)) { - continue; - } - } - trace_savevm_section_start(se->idstr, se->section_id); - /* Section type */ - qemu_put_byte(f, QEMU_VM_SECTION_END); - qemu_put_be32(f, se->section_id); - - ret = se->ops->save_live_complete_postcopy(f, se->opaque); - trace_savevm_section_end(se->idstr, se->section_id, ret); - save_section_footer(f, se); - if (ret < 0) { - qemu_file_set_error(f, ret); - return; - } - } - - qemu_put_byte(f, QEMU_VM_EOF); - qemu_fflush(f); -} - -void qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only) -{ - QJSON *vmdesc; - int vmdesc_len; - SaveStateEntry *se; - int ret; - bool in_postcopy = migration_in_postcopy(migrate_get_current()); - - trace_savevm_state_complete_precopy(); - - cpu_synchronize_all_states(); - - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (!se->ops || - (in_postcopy && se->ops->save_live_complete_postcopy) || - (in_postcopy && !iterable_only) || - !se->ops->save_live_complete_precopy) { - continue; - } - - if (se->ops && se->ops->is_active) { - if (!se->ops->is_active(se->opaque)) { - continue; - } - } - trace_savevm_section_start(se->idstr, se->section_id); - - save_section_header(f, se, QEMU_VM_SECTION_END); - - ret = se->ops->save_live_complete_precopy(f, se->opaque); - trace_savevm_section_end(se->idstr, se->section_id, ret); - save_section_footer(f, se); - if (ret < 0) { - qemu_file_set_error(f, ret); - return; - } - } - - if (iterable_only) { - return; - } - - vmdesc = qjson_new(); - json_prop_int(vmdesc, "page_size", TARGET_PAGE_SIZE); - json_start_array(vmdesc, "devices"); - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - - if ((!se->ops || !se->ops->save_state) && !se->vmsd) { - continue; - } - if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) { - trace_savevm_section_skip(se->idstr, se->section_id); - continue; - } - - trace_savevm_section_start(se->idstr, se->section_id); - - json_start_object(vmdesc, NULL); - json_prop_str(vmdesc, "name", se->idstr); - json_prop_int(vmdesc, "instance_id", se->instance_id); - - save_section_header(f, se, QEMU_VM_SECTION_FULL); - vmstate_save(f, se, vmdesc); - trace_savevm_section_end(se->idstr, se->section_id, 0); - save_section_footer(f, se); - - json_end_object(vmdesc); - } - - if (!in_postcopy) { - /* Postcopy stream will still be going */ - qemu_put_byte(f, QEMU_VM_EOF); - } - - json_end_array(vmdesc); - qjson_finish(vmdesc); - vmdesc_len = strlen(qjson_get_str(vmdesc)); - - if (should_send_vmdesc()) { - qemu_put_byte(f, QEMU_VM_VMDESCRIPTION); - qemu_put_be32(f, vmdesc_len); - qemu_put_buffer(f, (uint8_t *)qjson_get_str(vmdesc), vmdesc_len); - } - object_unref(OBJECT(vmdesc)); - - qemu_fflush(f); -} - -/* Give an estimate of the amount left to be transferred, - * the result is split into the amount for units that can and - * for units that can't do postcopy. 
- */ -void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size, - uint64_t *res_non_postcopiable, - uint64_t *res_postcopiable) -{ - SaveStateEntry *se; - - *res_non_postcopiable = 0; - *res_postcopiable = 0; - - - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (!se->ops || !se->ops->save_live_pending) { - continue; - } - if (se->ops && se->ops->is_active) { - if (!se->ops->is_active(se->opaque)) { - continue; - } - } - se->ops->save_live_pending(f, se->opaque, max_size, - res_non_postcopiable, res_postcopiable); - } -} - -void qemu_savevm_state_cleanup(void) -{ - SaveStateEntry *se; - - trace_savevm_state_cleanup(); - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (se->ops && se->ops->cleanup) { - se->ops->cleanup(se->opaque); - } - } -} - -static int qemu_savevm_state(QEMUFile *f, Error **errp) -{ - int ret; - MigrationParams params = { - .blk = 0, - .shared = 0 - }; - MigrationState *ms = migrate_init(&params); - ms->to_dst_file = f; - - if (qemu_savevm_state_blocked(errp)) { - return -EINVAL; - } - - qemu_mutex_unlock_iothread(); - qemu_savevm_state_header(f); - qemu_savevm_state_begin(f, &params); - qemu_mutex_lock_iothread(); - - while (qemu_file_get_error(f) == 0) { - if (qemu_savevm_state_iterate(f, false) > 0) { - break; - } - } - - ret = qemu_file_get_error(f); - if (ret == 0) { - qemu_savevm_state_complete_precopy(f, false); - ret = qemu_file_get_error(f); - } - qemu_savevm_state_cleanup(); - if (ret != 0) { - error_setg_errno(errp, -ret, "Error while writing VM state"); - } - return ret; -} - -static int qemu_save_device_state(QEMUFile *f) -{ - SaveStateEntry *se; - - qemu_put_be32(f, QEMU_VM_FILE_MAGIC); - qemu_put_be32(f, QEMU_VM_FILE_VERSION); - - cpu_synchronize_all_states(); - - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (se->is_ram) { - continue; - } - if ((!se->ops || !se->ops->save_state) && !se->vmsd) { - continue; - } - if (se->vmsd && !vmstate_save_needed(se->vmsd, se->opaque)) { - continue; - } - - save_section_header(f, se, QEMU_VM_SECTION_FULL); - - vmstate_save(f, se, NULL); - - save_section_footer(f, se); - } - - qemu_put_byte(f, QEMU_VM_EOF); - - return qemu_file_get_error(f); -} - -static SaveStateEntry *find_se(const char *idstr, int instance_id) -{ - SaveStateEntry *se; - - QTAILQ_FOREACH(se, &savevm_state.handlers, entry) { - if (!strcmp(se->idstr, idstr) && - (instance_id == se->instance_id || - instance_id == se->alias_id)) - return se; - /* Migrating from an older version? */ - if (strstr(se->idstr, idstr) && se->compat) { - if (!strcmp(se->compat->idstr, idstr) && - (instance_id == se->compat->instance_id || - instance_id == se->alias_id)) - return se; - } - } - return NULL; -} - -enum LoadVMExitCodes { - /* Allow a command to quit all layers of nested loadvm loops */ - LOADVM_QUIT = 1, -}; - -static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis); - -/* ------ incoming postcopy messages ------ */ -/* 'advise' arrives before any transfers just to tell us that a postcopy - * *might* happen - it might be skipped if precopy transferred everything - * quickly. - */
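/*
 * The four handlers below enforce a strict order on the incoming command
 * stream; each one checks the PostcopyState left behind by its predecessor.
 * Collected in one place, as an illustrative predicate equivalent to the
 * per-handler checks that follow (this function does not exist in savevm.c):
 */
static bool postcopy_cmd_in_order_sketch(uint16_t cmd, PostcopyState ps)
{
    switch (cmd) {
    case MIG_CMD_POSTCOPY_ADVISE:       /* first, before any transfers */
        return ps == POSTCOPY_INCOMING_NONE;
    case MIG_CMD_POSTCOPY_RAM_DISCARD:  /* 0..many repeats, after advise */
        return ps == POSTCOPY_INCOMING_ADVISE ||
               ps == POSTCOPY_INCOMING_DISCARD;
    case MIG_CMD_POSTCOPY_LISTEN:       /* before any postcopy pages arrive */
        return ps == POSTCOPY_INCOMING_ADVISE ||
               ps == POSTCOPY_INCOMING_DISCARD;
    case MIG_CMD_POSTCOPY_RUN:          /* finally, start the CPUs */
        return ps == POSTCOPY_INCOMING_LISTENING;
    default:
        return true;                    /* other commands are not state-gated */
    }
}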
-static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis) -{ - PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE); - uint64_t remote_hps, remote_tps; - - trace_loadvm_postcopy_handle_advise(); - if (ps != POSTCOPY_INCOMING_NONE) { - error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps); - return -1; - } - - if (!postcopy_ram_supported_by_host()) { - return -1; - } - - remote_hps = qemu_get_be64(mis->from_src_file); - if (remote_hps != getpagesize()) { - /* - * Some combinations of mismatch are probably possible but it gets - * a bit more complicated. In particular we need to place whole - * host pages on the dest at once, and we need to ensure that we - * handle dirtying to make sure we never end up sending part of - * a hostpage on its own. - */ - error_report("Postcopy needs matching host page sizes (s=%d d=%d)", - (int)remote_hps, getpagesize()); - return -1; - } - - remote_tps = qemu_get_be64(mis->from_src_file); - if (remote_tps != (1ul << qemu_target_page_bits())) { - /* - * Again, some differences could be dealt with, but for now keep it - * simple. - */ - error_report("Postcopy needs matching target page sizes (s=%d d=%d)", - (int)remote_tps, 1 << qemu_target_page_bits()); - return -1; - } - - if (ram_postcopy_incoming_init(mis)) { - return -1; - } - - postcopy_state_set(POSTCOPY_INCOMING_ADVISE); - - return 0; -} - -/* After postcopy we will be told to throw some pages away since they're - * dirty and will have to be demand fetched. Must happen before CPU is - * started. - * There can be 0..many of these messages, each encoding multiple pages. - */ -static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis, - uint16_t len) -{ - int tmp; - char ramid[256]; - PostcopyState ps = postcopy_state_get(); - - trace_loadvm_postcopy_ram_handle_discard(); - - switch (ps) { - case POSTCOPY_INCOMING_ADVISE: - /* 1st discard */ - tmp = postcopy_ram_prepare_discard(mis); - if (tmp) { - return tmp; - } - break; - - case POSTCOPY_INCOMING_DISCARD: - /* Expected state */ - break; - - default: - error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)", - ps); - return -1; - } - /* We're expecting a - * Version (0) - * a RAM ID string (length byte, name, 0 term) - * then at least 1 16 byte chunk - */ - if (len < (1 + 1 + 1 + 1 + 2 * 8)) { - error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len); - return -1; - } - - tmp = qemu_get_byte(mis->from_src_file); - if (tmp != postcopy_ram_discard_version) { - error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp); - return -1; - } - - if (!qemu_get_counted_string(mis->from_src_file, ramid)) { - error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID"); - return -1; - } - tmp = qemu_get_byte(mis->from_src_file); - if (tmp != 0) { - error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp); - return -1; - } - - len -= 3 + strlen(ramid); - if (len % 16) { - error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len); - return -1; - } - trace_loadvm_postcopy_ram_handle_discard_header(ramid, len); - while (len) { - uint64_t start_addr, block_length; - start_addr = qemu_get_be64(mis->from_src_file); - block_length = qemu_get_be64(mis->from_src_file); - - len -= 16; - int ret = ram_discard_range(mis, ramid, start_addr, - block_length); - if (ret) { - return ret; - } - } - trace_loadvm_postcopy_ram_handle_discard_end(); - - return 0; -} - -/* - * Triggered by a postcopy_listen command; this thread takes over reading - * the input stream, leaving 
the main thread free to carry on loading the rest - * of the device state (from RAM). - * (TODO:This could do with being in a postcopy file - but there again it's - * just another input loop, not that postcopy specific) - */ -static void *postcopy_ram_listen_thread(void *opaque) -{ - QEMUFile *f = opaque; - MigrationIncomingState *mis = migration_incoming_get_current(); - int load_res; - - migrate_set_state(&mis->state, MIGRATION_STATUS_ACTIVE, - MIGRATION_STATUS_POSTCOPY_ACTIVE); - qemu_sem_post(&mis->listen_thread_sem); - trace_postcopy_ram_listen_thread_start(); - - /* - * Because we're a thread and not a coroutine we can't yield - * in qemu_file, and thus we must be blocking now. - */ - qemu_file_set_blocking(f, true); - load_res = qemu_loadvm_state_main(f, mis); - /* And non-blocking again so we don't block in any cleanup */ - qemu_file_set_blocking(f, false); - - trace_postcopy_ram_listen_thread_exit(); - if (load_res < 0) { - error_report("%s: loadvm failed: %d", __func__, load_res); - qemu_file_set_error(f, load_res); - migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, - MIGRATION_STATUS_FAILED); - } else { - /* - * This looks good, but it's possible that the device loading in the - * main thread hasn't finished yet, and so we might not be in 'RUN' - * state yet; wait for the end of the main thread. - */ - qemu_event_wait(&mis->main_thread_load_event); - } - postcopy_ram_incoming_cleanup(mis); - - if (load_res < 0) { - /* - * If something went wrong then we have a bad state so exit; - * depending how far we got it might be possible at this point - * to leave the guest running and fire MCEs for pages that never - * arrived as a desperate recovery step. - */ - exit(EXIT_FAILURE); - } - - migrate_set_state(&mis->state, MIGRATION_STATUS_POSTCOPY_ACTIVE, - MIGRATION_STATUS_COMPLETED); - /* - * If everything has worked fine, then the main thread has waited - * for us to start, and we're the last use of the mis. - * (If something broke then qemu will have to exit anyway since it's - * got a bad migration state). - */ - migration_incoming_state_destroy(); - - - return NULL; -} - -/* After this message we must be able to immediately receive postcopy data */ -static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis) -{ - PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING); - trace_loadvm_postcopy_handle_listen(); - if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) { - error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps); - return -1; - } - if (ps == POSTCOPY_INCOMING_ADVISE) { - /* - * A rare case, we entered listen without having to do any discards, - * so do the setup that's normally done at the time of the 1st discard. 
- */ - postcopy_ram_prepare_discard(mis); - } - - /* - * Sensitise RAM - can now generate requests for blocks that don't exist - * However, at this point the CPU shouldn't be running, and the IO - * shouldn't be doing anything yet so don't actually expect requests - */ - if (postcopy_ram_enable_notify(mis)) { - return -1; - } - - if (mis->have_listen_thread) { - error_report("CMD_POSTCOPY_RAM_LISTEN already has a listen thread"); - return -1; - } - - mis->have_listen_thread = true; - /* Start up the listening thread and wait for it to signal ready */ - qemu_sem_init(&mis->listen_thread_sem, 0); - qemu_thread_create(&mis->listen_thread, "postcopy/listen", - postcopy_ram_listen_thread, mis->from_src_file, - QEMU_THREAD_DETACHED); - qemu_sem_wait(&mis->listen_thread_sem); - qemu_sem_destroy(&mis->listen_thread_sem); - - return 0; -} - - -typedef struct { - QEMUBH *bh; -} HandleRunBhData; - -static void loadvm_postcopy_handle_run_bh(void *opaque) -{ - Error *local_err = NULL; - HandleRunBhData *data = opaque; - - /* TODO we should move all of this lot into postcopy_ram.c or a shared code - * in migration.c - */ - cpu_synchronize_all_post_init(); - - qemu_announce_self(); - - /* Make sure all file formats flush their mutable metadata */ - bdrv_invalidate_cache_all(&local_err); - if (local_err) { - error_report_err(local_err); - } - - trace_loadvm_postcopy_handle_run_cpu_sync(); - cpu_synchronize_all_post_init(); - - trace_loadvm_postcopy_handle_run_vmstart(); - - if (autostart) { - /* Hold onto your hats, starting the CPU */ - vm_start(); - } else { - /* leave it paused and let management decide when to start the CPU */ - runstate_set(RUN_STATE_PAUSED); - } - - qemu_bh_delete(data->bh); - g_free(data); -} - -/* After all discards we can start running and asking for pages */ -static int loadvm_postcopy_handle_run(MigrationIncomingState *mis) -{ - PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_RUNNING); - HandleRunBhData *data; - - trace_loadvm_postcopy_handle_run(); - if (ps != POSTCOPY_INCOMING_LISTENING) { - error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps); - return -1; - } - - data = g_new(HandleRunBhData, 1); - data->bh = qemu_bh_new(loadvm_postcopy_handle_run_bh, data); - qemu_bh_schedule(data->bh); - - /* We need to finish reading the stream from the package - * and also stop reading anything more from the stream that loaded the - * package (since it's now being read by the listener thread). - * LOADVM_QUIT will quit all the layers of nested loadvm loops. - */ - return LOADVM_QUIT; -} - -/** - * Immediately following this command is a blob of data containing an embedded - * chunk of migration stream; read it and load it. - * - * @mis: Incoming state - * @length: Length of packaged data to read - * - * Returns: Negative values on error - * - */ -static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis) -{ - int ret; - uint8_t *buffer; - uint32_t length; - QEMUSizedBuffer *qsb; - - length = qemu_get_be32(mis->from_src_file); - trace_loadvm_handle_cmd_packaged(length); - - if (length > MAX_VM_CMD_PACKAGED_SIZE) { - error_report("Unreasonably large packaged state: %u", length); - return -1; - } - buffer = g_malloc0(length); - ret = qemu_get_buffer(mis->from_src_file, buffer, (int)length); - if (ret != length) { - g_free(buffer); - error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%d", - ret, length); - return (ret < 0) ? 
ret : -EAGAIN; - } - trace_loadvm_handle_cmd_packaged_received(ret); - - /* Setup a dummy QEMUFile that actually reads from the buffer */ - qsb = qsb_create(buffer, length); - g_free(buffer); /* Because qsb_create copies */ - if (!qsb) { - error_report("Unable to create qsb"); - } - QEMUFile *packf = qemu_bufopen("r", qsb); - - ret = qemu_loadvm_state_main(packf, mis); - trace_loadvm_handle_cmd_packaged_main(ret); - qemu_fclose(packf); - qsb_free(qsb); - - return ret; -} - -/* - * Process an incoming 'QEMU_VM_COMMAND' - * 0 just a normal return - * LOADVM_QUIT All good, but exit the loop - * <0 Error - */ -static int loadvm_process_command(QEMUFile *f) -{ - MigrationIncomingState *mis = migration_incoming_get_current(); - uint16_t cmd; - uint16_t len; - uint32_t tmp32; - - cmd = qemu_get_be16(f); - len = qemu_get_be16(f); - - trace_loadvm_process_command(cmd, len); - if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) { - error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len); - return -EINVAL; - } - - if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) { - error_report("%s received with bad length - expecting %zu, got %d", - mig_cmd_args[cmd].name, - (size_t)mig_cmd_args[cmd].len, len); - return -ERANGE; - } - - switch (cmd) { - case MIG_CMD_OPEN_RETURN_PATH: - if (mis->to_src_file) { - error_report("CMD_OPEN_RETURN_PATH called when RP already open"); - /* Not really a problem, so don't give up */ - return 0; - } - mis->to_src_file = qemu_file_get_return_path(f); - if (!mis->to_src_file) { - error_report("CMD_OPEN_RETURN_PATH failed"); - return -1; - } - break; - - case MIG_CMD_PING: - tmp32 = qemu_get_be32(f); - trace_loadvm_process_command_ping(tmp32); - if (!mis->to_src_file) { - error_report("CMD_PING (0x%x) received with no return path", - tmp32); - return -1; - } - migrate_send_rp_pong(mis, tmp32); - break; - - case MIG_CMD_PACKAGED: - return loadvm_handle_cmd_packaged(mis); - - case MIG_CMD_POSTCOPY_ADVISE: - return loadvm_postcopy_handle_advise(mis); - - case MIG_CMD_POSTCOPY_LISTEN: - return loadvm_postcopy_handle_listen(mis); - - case MIG_CMD_POSTCOPY_RUN: - return loadvm_postcopy_handle_run(mis); - - case MIG_CMD_POSTCOPY_RAM_DISCARD: - return loadvm_postcopy_ram_handle_discard(mis, len); - } - - return 0; -} - -struct LoadStateEntry { - QLIST_ENTRY(LoadStateEntry) entry; - SaveStateEntry *se; - int section_id; - int version_id; -}; - -/* - * Read a footer off the wire and check that it matches the expected section - * - * Returns: true if the footer was good - * false if there is a problem (and calls error_report to say why) - */ -static bool check_section_footer(QEMUFile *f, LoadStateEntry *le) -{ - uint8_t read_mark; - uint32_t read_section_id; - - if (skip_section_footers) { - /* No footer to check */ - return true; - } - - read_mark = qemu_get_byte(f); - - if (read_mark != QEMU_VM_SECTION_FOOTER) { - error_report("Missing section footer for %s", le->se->idstr); - return false; - } - - read_section_id = qemu_get_be32(f); - if (read_section_id != le->section_id) { - error_report("Mismatched section id in footer for %s -" - " read 0x%x expected 0x%x", - le->se->idstr, read_section_id, le->section_id); - return false; - } - - /* All good */ - return true; -} - -void loadvm_free_handlers(MigrationIncomingState *mis) -{ - LoadStateEntry *le, *new_le; - - QLIST_FOREACH_SAFE(le, &mis->loadvm_handlers, entry, new_le) { - QLIST_REMOVE(le, entry); - g_free(le); - } -} - -static int -qemu_loadvm_section_start_full(QEMUFile *f, MigrationIncomingState *mis) -{ - uint32_t 
instance_id, version_id, section_id; - SaveStateEntry *se; - LoadStateEntry *le; - char idstr[256]; - int ret; - - /* Read section start */ - section_id = qemu_get_be32(f); - if (!qemu_get_counted_string(f, idstr)) { - error_report("Unable to read ID string for section %u", - section_id); - return -EINVAL; - } - instance_id = qemu_get_be32(f); - version_id = qemu_get_be32(f); - - trace_qemu_loadvm_state_section_startfull(section_id, idstr, - instance_id, version_id); - /* Find savevm section */ - se = find_se(idstr, instance_id); - if (se == NULL) { - error_report("Unknown savevm section or instance '%s' %d", - idstr, instance_id); - return -EINVAL; - } - - /* Validate version */ - if (version_id > se->version_id) { - error_report("savevm: unsupported version %d for '%s' v%d", - version_id, idstr, se->version_id); - return -EINVAL; - } - - /* Add entry */ - le = g_malloc0(sizeof(*le)); - - le->se = se; - le->section_id = section_id; - le->version_id = version_id; - QLIST_INSERT_HEAD(&mis->loadvm_handlers, le, entry); - - ret = vmstate_load(f, le->se, le->version_id); - if (ret < 0) { - error_report("error while loading state for instance 0x%x of" - " device '%s'", instance_id, idstr); - return ret; - } - if (!check_section_footer(f, le)) { - return -EINVAL; - } - - return 0; -} - -static int -qemu_loadvm_section_part_end(QEMUFile *f, MigrationIncomingState *mis) -{ - uint32_t section_id; - LoadStateEntry *le; - int ret; - - section_id = qemu_get_be32(f); - - trace_qemu_loadvm_state_section_partend(section_id); - QLIST_FOREACH(le, &mis->loadvm_handlers, entry) { - if (le->section_id == section_id) { - break; - } - } - if (le == NULL) { - error_report("Unknown savevm section %d", section_id); - return -EINVAL; - } - - ret = vmstate_load(f, le->se, le->version_id); - if (ret < 0) { - error_report("error while loading state section id %d(%s)", - section_id, le->se->idstr); - return ret; - } - if (!check_section_footer(f, le)) { - return -EINVAL; - } - - return 0; -} - -static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis) -{ - uint8_t section_type; - int ret; - - while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) { - - trace_qemu_loadvm_state_section(section_type); - switch (section_type) { - case QEMU_VM_SECTION_START: - case QEMU_VM_SECTION_FULL: - ret = qemu_loadvm_section_start_full(f, mis); - if (ret < 0) { - return ret; - } - break; - case QEMU_VM_SECTION_PART: - case QEMU_VM_SECTION_END: - ret = qemu_loadvm_section_part_end(f, mis); - if (ret < 0) { - return ret; - } - break; - case QEMU_VM_COMMAND: - ret = loadvm_process_command(f); - trace_qemu_loadvm_state_section_command(ret); - if ((ret < 0) || (ret & LOADVM_QUIT)) { - return ret; - } - break; - default: - error_report("Unknown savevm section type %d", section_type); - return -EINVAL; - } - } - - return 0; -} - -int qemu_loadvm_state(QEMUFile *f) -{ - MigrationIncomingState *mis = migration_incoming_get_current(); - Error *local_err = NULL; - unsigned int v; - int ret; - - if (qemu_savevm_state_blocked(&local_err)) { - error_report_err(local_err); - return -EINVAL; - } - - v = qemu_get_be32(f); - if (v != QEMU_VM_FILE_MAGIC) { - error_report("Not a migration stream"); - return -EINVAL; - } - - v = qemu_get_be32(f); - if (v == QEMU_VM_FILE_VERSION_COMPAT) { - error_report("SaveVM v2 format is obsolete and doesn't work anymore"); - return -ENOTSUP; - } - if (v != QEMU_VM_FILE_VERSION) { - error_report("Unsupported migration stream version"); - return -ENOTSUP; - } - - if (!savevm_state.skip_configuration || 
enforce_config_section()) { - if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) { - error_report("Configuration section missing"); - return -EINVAL; - } - ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0); - - if (ret) { - return ret; - } - } - - ret = qemu_loadvm_state_main(f, mis); - qemu_event_set(&mis->main_thread_load_event); - - trace_qemu_loadvm_state_post_main(ret); - - if (mis->have_listen_thread) { - /* Listen thread still going, can't clean up yet */ - return ret; - } - - if (ret == 0) { - ret = qemu_file_get_error(f); - } - - /* - * Try to read in the VMDESC section as well, so that dumping tools that - * intercept our migration stream have the chance to see it. - */ - - /* We've got to be careful; if we don't read the data and just shut the fd - * then the sender can error if we close while it's still sending. - * We also mustn't read data that isn't there; some transports (RDMA) - * will stall waiting for that data when the source has already closed. - */ - if (ret == 0 && should_send_vmdesc()) { - uint8_t *buf; - uint32_t size; - uint8_t section_type = qemu_get_byte(f); - - if (section_type != QEMU_VM_VMDESCRIPTION) { - error_report("Expected vmdescription section, but got %d", - section_type); - /* - * It doesn't seem worth failing at this point since - * we apparently have an otherwise valid VM state - */ - } else { - buf = g_malloc(0x1000); - size = qemu_get_be32(f); - - while (size > 0) { - uint32_t read_chunk = MIN(size, 0x1000); - qemu_get_buffer(f, buf, read_chunk); - size -= read_chunk; - } - g_free(buf); - } - } - - cpu_synchronize_all_post_init(); - - return ret; -} - -void hmp_savevm(Monitor *mon, const QDict *qdict) -{ - BlockDriverState *bs, *bs1; - QEMUSnapshotInfo sn1, *sn = &sn1, old_sn1, *old_sn = &old_sn1; - int ret; - QEMUFile *f; - int saved_vm_running; - uint64_t vm_state_size; - qemu_timeval tv; - struct tm tm; - const char *name = qdict_get_try_str(qdict, "name"); - Error *local_err = NULL; - AioContext *aio_context; - - if (!bdrv_all_can_snapshot(&bs)) { - monitor_printf(mon, "Device '%s' is writable but does not " - "support snapshots.\n", bdrv_get_device_name(bs)); - return; - } - - /* Delete old snapshots of the same name */ - if (name && bdrv_all_delete_snapshot(name, &bs1, &local_err) < 0) { - error_reportf_err(local_err, - "Error while deleting snapshot on device '%s': ", - bdrv_get_device_name(bs1)); - return; - } - - bs = bdrv_all_find_vmstate_bs(); - if (bs == NULL) { - monitor_printf(mon, "No block device can accept snapshots\n"); - return; - } - aio_context = bdrv_get_aio_context(bs); - - saved_vm_running = runstate_is_running(); - - ret = global_state_store(); - if (ret) { - monitor_printf(mon, "Error saving global state\n"); - return; - } - vm_stop(RUN_STATE_SAVE_VM); - - aio_context_acquire(aio_context); - - memset(sn, 0, sizeof(*sn)); - - /* fill auxiliary fields */ - qemu_gettimeofday(&tv); - sn->date_sec = tv.tv_sec; - sn->date_nsec = tv.tv_usec * 1000; - sn->vm_clock_nsec = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL); - - if (name) { - ret = bdrv_snapshot_find(bs, old_sn, name); - if (ret >= 0) { - pstrcpy(sn->name, sizeof(sn->name), old_sn->name); - pstrcpy(sn->id_str, sizeof(sn->id_str), old_sn->id_str); - } else { - pstrcpy(sn->name, sizeof(sn->name), name); - } - } else { - /* cast below needed for OpenBSD where tv_sec is still 'long' */ - localtime_r((const time_t *)&tv.tv_sec, &tm); - strftime(sn->name, sizeof(sn->name), "vm-%Y%m%d%H%M%S", &tm); - } - - /* save the VM state */ - f = qemu_fopen_bdrv(bs, 1); - if (!f) 
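
/*
 * A standalone rendering of the default snapshot naming in hmp_savevm()
 * above: when no name is given, the wall-clock time is formatted as
 * "vm-YYYYMMDDHHMMSS" via localtime_r()/strftime().  Buffer size and
 * output here are illustrative.
 */
#include <stdio.h>
#include <time.h>

int main(void)
{
    char name[32];
    time_t now = time(NULL);
    struct tm tm;

    localtime_r(&now, &tm);
    strftime(name, sizeof(name), "vm-%Y%m%d%H%M%S", &tm);
    printf("default snapshot name: %s\n", name);
    return 0;
}
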
{ - monitor_printf(mon, "Could not open VM state file\n"); - goto the_end; - } - ret = qemu_savevm_state(f, &local_err); - vm_state_size = qemu_ftell(f); - qemu_fclose(f); - if (ret < 0) { - error_report_err(local_err); - goto the_end; - } - - ret = bdrv_all_create_snapshot(sn, bs, vm_state_size, &bs); - if (ret < 0) { - monitor_printf(mon, "Error while creating snapshot on '%s'\n", - bdrv_get_device_name(bs)); - } - - the_end: - aio_context_release(aio_context); - if (saved_vm_running) { - vm_start(); - } -} - -void qmp_xen_save_devices_state(const char *filename, Error **errp) -{ - QEMUFile *f; - int saved_vm_running; - int ret; - - saved_vm_running = runstate_is_running(); - vm_stop(RUN_STATE_SAVE_VM); - global_state_store_running(); - - f = qemu_fopen(filename, "wb"); - if (!f) { - error_setg_file_open(errp, errno, filename); - goto the_end; - } - ret = qemu_save_device_state(f); - qemu_fclose(f); - if (ret < 0) { - error_setg(errp, QERR_IO_ERROR); - } - - the_end: - if (saved_vm_running) { - vm_start(); - } -} - -int load_vmstate(const char *name) -{ - BlockDriverState *bs, *bs_vm_state; - QEMUSnapshotInfo sn; - QEMUFile *f; - int ret; - AioContext *aio_context; - - if (!bdrv_all_can_snapshot(&bs)) { - error_report("Device '%s' is writable but does not support snapshots.", - bdrv_get_device_name(bs)); - return -ENOTSUP; - } - ret = bdrv_all_find_snapshot(name, &bs); - if (ret < 0) { - error_report("Device '%s' does not have the requested snapshot '%s'", - bdrv_get_device_name(bs), name); - return ret; - } - - bs_vm_state = bdrv_all_find_vmstate_bs(); - if (!bs_vm_state) { - error_report("No block device supports snapshots"); - return -ENOTSUP; - } - aio_context = bdrv_get_aio_context(bs_vm_state); - - /* Don't even try to load empty VM states */ - aio_context_acquire(aio_context); - ret = bdrv_snapshot_find(bs_vm_state, &sn, name); - aio_context_release(aio_context); - if (ret < 0) { - return ret; - } else if (sn.vm_state_size == 0) { - error_report("This is a disk-only snapshot. Revert to it offline " - "using qemu-img."); - return -EINVAL; - } - - /* Flush all IO requests so they don't interfere with the new state. 
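
/*
 * The save paths above share one pattern: remember whether the guest was
 * running, stop it so the captured state is consistent, and resume only
 * if it was running before.  A minimal standalone sketch of that bracket;
 * the vm_* helpers here are toy stand-ins, not the QEMU API.
 */
#include <stdbool.h>
#include <stdio.h>

static bool vm_running = true;              /* toy run-state flag */

static bool runstate_is_running(void) { return vm_running; }
static void vm_stop(void)  { vm_running = false; puts("vm stopped"); }
static void vm_start(void) { vm_running = true;  puts("vm started"); }

static void do_save(void)
{
    bool saved_vm_running = runstate_is_running();

    vm_stop();
    puts("...writing device and RAM state...");
    if (saved_vm_running) {
        vm_start();                         /* resume only if needed */
    }
}

int main(void)
{
    do_save();
    return 0;
}
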
*/ - bdrv_drain_all(); - - ret = bdrv_all_goto_snapshot(name, &bs); - if (ret < 0) { - error_report("Error %d while activating snapshot '%s' on '%s'", - ret, name, bdrv_get_device_name(bs)); - return ret; - } - - /* restore the VM state */ - f = qemu_fopen_bdrv(bs_vm_state, 0); - if (!f) { - error_report("Could not open VM state file"); - return -EINVAL; - } - - qemu_system_reset(VMRESET_SILENT); - migration_incoming_state_new(f); - - aio_context_acquire(aio_context); - ret = qemu_loadvm_state(f); - qemu_fclose(f); - aio_context_release(aio_context); - - migration_incoming_state_destroy(); - if (ret < 0) { - error_report("Error %d while loading VM state", ret); - return ret; - } - - return 0; -} - -void hmp_delvm(Monitor *mon, const QDict *qdict) -{ - BlockDriverState *bs; - Error *err; - const char *name = qdict_get_str(qdict, "name"); - - if (bdrv_all_delete_snapshot(name, &bs, &err) < 0) { - error_reportf_err(err, - "Error while deleting snapshot on device '%s': ", - bdrv_get_device_name(bs)); - } -} - -void hmp_info_snapshots(Monitor *mon, const QDict *qdict) -{ - BlockDriverState *bs, *bs1; - QEMUSnapshotInfo *sn_tab, *sn; - int nb_sns, i; - int total; - int *available_snapshots; - AioContext *aio_context; - - bs = bdrv_all_find_vmstate_bs(); - if (!bs) { - monitor_printf(mon, "No available block device supports snapshots\n"); - return; - } - aio_context = bdrv_get_aio_context(bs); - - aio_context_acquire(aio_context); - nb_sns = bdrv_snapshot_list(bs, &sn_tab); - aio_context_release(aio_context); - - if (nb_sns < 0) { - monitor_printf(mon, "bdrv_snapshot_list: error %d\n", nb_sns); - return; - } - - if (nb_sns == 0) { - monitor_printf(mon, "There is no snapshot available.\n"); - return; - } - - available_snapshots = g_new0(int, nb_sns); - total = 0; - for (i = 0; i < nb_sns; i++) { - if (bdrv_all_find_snapshot(sn_tab[i].id_str, &bs1) == 0) { - available_snapshots[total] = i; - total++; - } - } - - if (total > 0) { - bdrv_snapshot_dump((fprintf_function)monitor_printf, mon, NULL); - monitor_printf(mon, "\n"); - for (i = 0; i < total; i++) { - sn = &sn_tab[available_snapshots[i]]; - bdrv_snapshot_dump((fprintf_function)monitor_printf, mon, sn); - monitor_printf(mon, "\n"); - } - } else { - monitor_printf(mon, "There is no suitable snapshot available\n"); - } - - g_free(sn_tab); - g_free(available_snapshots); - -} - -void vmstate_register_ram(MemoryRegion *mr, DeviceState *dev) -{ - qemu_ram_set_idstr(memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK, - memory_region_name(mr), dev); -} - -void vmstate_unregister_ram(MemoryRegion *mr, DeviceState *dev) -{ - qemu_ram_unset_idstr(memory_region_get_ram_addr(mr) & TARGET_PAGE_MASK); -} - -void vmstate_register_ram_global(MemoryRegion *mr) -{ - vmstate_register_ram(mr, NULL); -} diff --git a/qemu/migration/tcp.c b/qemu/migration/tcp.c deleted file mode 100644 index e1fa7f8f1..000000000 --- a/qemu/migration/tcp.c +++ /dev/null @@ -1,102 +0,0 @@ -/* - * QEMU live migration - * - * Copyright IBM, Corp. 2008 - * - * Authors: - * Anthony Liguori <aliguori@us.ibm.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. 
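
/*
 * hmp_info_snapshots() above lists only snapshots that exist on every
 * device.  A standalone sketch of that intersection filter; the data
 * layout is invented for this example.
 */
#include <stdio.h>
#include <string.h>

#define NB_DEVS 2

static const char *dev_snaps[NB_DEVS][4] = {
    { "snap1", "snap2", "snap3", NULL },    /* device 0 (vmstate device) */
    { "snap2", "snap3", NULL,    NULL },    /* device 1 */
};

static int dev_has_snapshot(int dev, const char *id)
{
    for (int i = 0; dev_snaps[dev][i]; i++) {
        if (!strcmp(dev_snaps[dev][i], id)) {
            return 1;
        }
    }
    return 0;
}

int main(void)
{
    /* candidates come from the vmstate device, as in the code above */
    for (int i = 0; dev_snaps[0][i]; i++) {
        int on_all = 1;

        for (int d = 1; d < NB_DEVS; d++) {
            on_all &= dev_has_snapshot(d, dev_snaps[0][i]);
        }
        if (on_all) {
            printf("available on all devices: %s\n", dev_snaps[0][i]);
        }
    }
    return 0;
}
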
- */ - -#include "qemu/osdep.h" - -#include "qemu-common.h" -#include "qemu/error-report.h" -#include "qemu/sockets.h" -#include "migration/migration.h" -#include "migration/qemu-file.h" -#include "block/block.h" -#include "qemu/main-loop.h" - -//#define DEBUG_MIGRATION_TCP - -#ifdef DEBUG_MIGRATION_TCP -#define DPRINTF(fmt, ...) \ - do { printf("migration-tcp: " fmt, ## __VA_ARGS__); } while (0) -#else -#define DPRINTF(fmt, ...) \ - do { } while (0) -#endif - -static void tcp_wait_for_connect(int fd, Error *err, void *opaque) -{ - MigrationState *s = opaque; - - if (fd < 0) { - DPRINTF("migrate connect error: %s\n", error_get_pretty(err)); - s->to_dst_file = NULL; - migrate_fd_error(s); - } else { - DPRINTF("migrate connect success\n"); - s->to_dst_file = qemu_fopen_socket(fd, "wb"); - migrate_fd_connect(s); - } -} - -void tcp_start_outgoing_migration(MigrationState *s, const char *host_port, Error **errp) -{ - inet_nonblocking_connect(host_port, tcp_wait_for_connect, s, errp); -} - -static void tcp_accept_incoming_migration(void *opaque) -{ - struct sockaddr_in addr; - socklen_t addrlen = sizeof(addr); - int s = (intptr_t)opaque; - QEMUFile *f; - int c; - - do { - c = qemu_accept(s, (struct sockaddr *)&addr, &addrlen); - } while (c < 0 && errno == EINTR); - qemu_set_fd_handler(s, NULL, NULL, NULL); - closesocket(s); - - DPRINTF("accepted migration\n"); - - if (c < 0) { - error_report("could not accept migration connection (%s)", - strerror(errno)); - return; - } - - f = qemu_fopen_socket(c, "rb"); - if (f == NULL) { - error_report("could not qemu_fopen socket"); - goto out; - } - - process_incoming_migration(f); - return; - -out: - closesocket(c); -} - -void tcp_start_incoming_migration(const char *host_port, Error **errp) -{ - int s; - - s = inet_listen(host_port, NULL, 256, SOCK_STREAM, 0, errp); - if (s < 0) { - return; - } - - qemu_set_fd_handler(s, tcp_accept_incoming_migration, NULL, - (void *)(intptr_t)s); -} diff --git a/qemu/migration/unix.c b/qemu/migration/unix.c deleted file mode 100644 index d9aac36b9..000000000 --- a/qemu/migration/unix.c +++ /dev/null @@ -1,103 +0,0 @@ -/* - * QEMU live migration via Unix Domain Sockets - * - * Copyright Red Hat, Inc. 2009 - * - * Authors: - * Chris Lalancette <clalance@redhat.com> - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - * Contributions after 2012-01-13 are licensed under the terms of the - * GNU GPL, version 2 or (at your option) any later version. - */ - -#include "qemu/osdep.h" - -#include "qemu-common.h" -#include "qemu/error-report.h" -#include "qemu/sockets.h" -#include "qemu/main-loop.h" -#include "migration/migration.h" -#include "migration/qemu-file.h" -#include "block/block.h" - -//#define DEBUG_MIGRATION_UNIX - -#ifdef DEBUG_MIGRATION_UNIX -#define DPRINTF(fmt, ...) \ - do { printf("migration-unix: " fmt, ## __VA_ARGS__); } while (0) -#else -#define DPRINTF(fmt, ...) 
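
/*
 * tcp_accept_incoming_migration() above retries accept() while errno is
 * EINTR, so a signal delivered mid-call does not abort the incoming
 * connection.  The same idiom as a standalone helper; error handling
 * beyond the retry is left out.
 */
#include <errno.h>
#include <sys/socket.h>

/* Accept one connection, retrying on EINTR; returns the fd or -1. */
static int accept_retry_eintr(int listen_fd,
                              struct sockaddr *addr, socklen_t *len)
{
    int c;

    do {
        c = accept(listen_fd, addr, len);
    } while (c < 0 && errno == EINTR);

    return c;
}

int main(void)
{
    (void)accept_retry_eintr;   /* compile-only sketch; needs a listener */
    return 0;
}
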
\ - do { } while (0) -#endif - -static void unix_wait_for_connect(int fd, Error *err, void *opaque) -{ - MigrationState *s = opaque; - - if (fd < 0) { - DPRINTF("migrate connect error: %s\n", error_get_pretty(err)); - s->to_dst_file = NULL; - migrate_fd_error(s); - } else { - DPRINTF("migrate connect success\n"); - s->to_dst_file = qemu_fopen_socket(fd, "wb"); - migrate_fd_connect(s); - } -} - -void unix_start_outgoing_migration(MigrationState *s, const char *path, Error **errp) -{ - unix_nonblocking_connect(path, unix_wait_for_connect, s, errp); -} - -static void unix_accept_incoming_migration(void *opaque) -{ - struct sockaddr_un addr; - socklen_t addrlen = sizeof(addr); - int s = (intptr_t)opaque; - QEMUFile *f; - int c, err; - - do { - c = qemu_accept(s, (struct sockaddr *)&addr, &addrlen); - err = errno; - } while (c < 0 && err == EINTR); - qemu_set_fd_handler(s, NULL, NULL, NULL); - close(s); - - DPRINTF("accepted migration\n"); - - if (c < 0) { - error_report("could not accept migration connection (%s)", - strerror(err)); - return; - } - - f = qemu_fopen_socket(c, "rb"); - if (f == NULL) { - error_report("could not qemu_fopen socket"); - goto out; - } - - process_incoming_migration(f); - return; - -out: - close(c); -} - -void unix_start_incoming_migration(const char *path, Error **errp) -{ - int s; - - s = unix_listen(path, NULL, 0, errp); - if (s < 0) { - return; - } - - qemu_set_fd_handler(s, unix_accept_incoming_migration, NULL, - (void *)(intptr_t)s); -} diff --git a/qemu/migration/vmstate.c b/qemu/migration/vmstate.c deleted file mode 100644 index bf3d5db30..000000000 --- a/qemu/migration/vmstate.c +++ /dev/null @@ -1,918 +0,0 @@ -#include "qemu/osdep.h" -#include "qemu-common.h" -#include "migration/migration.h" -#include "migration/qemu-file.h" -#include "migration/vmstate.h" -#include "qemu/bitops.h" -#include "qemu/error-report.h" -#include "trace.h" -#include "qjson.h" - -static void vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd, - void *opaque, QJSON *vmdesc); -static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd, - void *opaque); - -static int vmstate_n_elems(void *opaque, VMStateField *field) -{ - int n_elems = 1; - - if (field->flags & VMS_ARRAY) { - n_elems = field->num; - } else if (field->flags & VMS_VARRAY_INT32) { - n_elems = *(int32_t *)(opaque+field->num_offset); - } else if (field->flags & VMS_VARRAY_UINT32) { - n_elems = *(uint32_t *)(opaque+field->num_offset); - } else if (field->flags & VMS_VARRAY_UINT16) { - n_elems = *(uint16_t *)(opaque+field->num_offset); - } else if (field->flags & VMS_VARRAY_UINT8) { - n_elems = *(uint8_t *)(opaque+field->num_offset); - } - - if (field->flags & VMS_MULTIPLY_ELEMENTS) { - n_elems *= field->num; - } - - return n_elems; -} - -static int vmstate_size(void *opaque, VMStateField *field) -{ - int size = field->size; - - if (field->flags & VMS_VBUFFER) { - size = *(int32_t *)(opaque+field->size_offset); - if (field->flags & VMS_MULTIPLY) { - size *= field->size; - } - } - - return size; -} - -static void *vmstate_base_addr(void *opaque, VMStateField *field, bool alloc) -{ - void *base_addr = opaque + field->offset; - - if (field->flags & VMS_POINTER) { - if (alloc && (field->flags & VMS_ALLOC)) { - gsize size = 0; - if (field->flags & VMS_VBUFFER) { - size = vmstate_size(opaque, field); - } else { - int n_elems = vmstate_n_elems(opaque, field); - if (n_elems) { - size = n_elems * field->size; - } - } - if (size) { - *((void **)base_addr + field->start) = g_malloc(size); - } - } - 
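
/*
 * vmstate_n_elems() above resolves a variable-length array's element
 * count by reading an integer at a byte offset inside the device struct
 * (field->num_offset).  The same offset trick standalone, using
 * offsetof(); the Device type and field entry are invented here.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct Device {
    uint32_t nb_items;      /* runtime count, as for VMS_VARRAY_UINT32 */
    uint32_t items[8];
} Device;

typedef struct Field {
    size_t num_offset;      /* where the count lives inside the struct */
} Field;

static uint32_t n_elems_varray_u32(void *opaque, const Field *field)
{
    /* mirrors: *(uint32_t *)(opaque + field->num_offset) */
    return *(uint32_t *)((char *)opaque + field->num_offset);
}

int main(void)
{
    Device d = { .nb_items = 3 };
    Field f = { .num_offset = offsetof(Device, nb_items) };

    printf("n_elems = %u\n", n_elems_varray_u32(&d, &f));
    return 0;
}
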
base_addr = *(void **)base_addr + field->start; - } - - return base_addr; -} - -int vmstate_load_state(QEMUFile *f, const VMStateDescription *vmsd, - void *opaque, int version_id) -{ - VMStateField *field = vmsd->fields; - int ret = 0; - - trace_vmstate_load_state(vmsd->name, version_id); - if (version_id > vmsd->version_id) { - trace_vmstate_load_state_end(vmsd->name, "too new", -EINVAL); - return -EINVAL; - } - if (version_id < vmsd->minimum_version_id) { - if (vmsd->load_state_old && - version_id >= vmsd->minimum_version_id_old) { - ret = vmsd->load_state_old(f, opaque, version_id); - trace_vmstate_load_state_end(vmsd->name, "old path", ret); - return ret; - } - trace_vmstate_load_state_end(vmsd->name, "too old", -EINVAL); - return -EINVAL; - } - if (vmsd->pre_load) { - int ret = vmsd->pre_load(opaque); - if (ret) { - return ret; - } - } - while (field->name) { - trace_vmstate_load_state_field(vmsd->name, field->name); - if ((field->field_exists && - field->field_exists(opaque, version_id)) || - (!field->field_exists && - field->version_id <= version_id)) { - void *base_addr = vmstate_base_addr(opaque, field, true); - int i, n_elems = vmstate_n_elems(opaque, field); - int size = vmstate_size(opaque, field); - - for (i = 0; i < n_elems; i++) { - void *addr = base_addr + size * i; - - if (field->flags & VMS_ARRAY_OF_POINTER) { - addr = *(void **)addr; - } - if (field->flags & VMS_STRUCT) { - ret = vmstate_load_state(f, field->vmsd, addr, - field->vmsd->version_id); - } else { - ret = field->info->get(f, addr, size); - - } - if (ret >= 0) { - ret = qemu_file_get_error(f); - } - if (ret < 0) { - qemu_file_set_error(f, ret); - trace_vmstate_load_field_error(field->name, ret); - return ret; - } - } - } else if (field->flags & VMS_MUST_EXIST) { - error_report("Input validation failed: %s/%s", - vmsd->name, field->name); - return -1; - } - field++; - } - ret = vmstate_subsection_load(f, vmsd, opaque); - if (ret != 0) { - return ret; - } - if (vmsd->post_load) { - ret = vmsd->post_load(opaque, version_id); - } - trace_vmstate_load_state_end(vmsd->name, "end", ret); - return ret; -} - -static int vmfield_name_num(VMStateField *start, VMStateField *search) -{ - VMStateField *field; - int found = 0; - - for (field = start; field->name; field++) { - if (!strcmp(field->name, search->name)) { - if (field == search) { - return found; - } - found++; - } - } - - return -1; -} - -static bool vmfield_name_is_unique(VMStateField *start, VMStateField *search) -{ - VMStateField *field; - int found = 0; - - for (field = start; field->name; field++) { - if (!strcmp(field->name, search->name)) { - found++; - /* name found more than once, so it's not unique */ - if (found > 1) { - return false; - } - } - } - - return true; -} - -static const char *vmfield_get_type_name(VMStateField *field) -{ - const char *type = "unknown"; - - if (field->flags & VMS_STRUCT) { - type = "struct"; - } else if (field->info->name) { - type = field->info->name; - } - - return type; -} - -static bool vmsd_can_compress(VMStateField *field) -{ - if (field->field_exists) { - /* Dynamically existing fields mess up compression */ - return false; - } - - if (field->flags & VMS_STRUCT) { - VMStateField *sfield = field->vmsd->fields; - while (sfield->name) { - if (!vmsd_can_compress(sfield)) { - /* Child elements can't compress, so neither can we */ - return false; - } - sfield++; - } - - if (field->vmsd->subsections) { - /* Subsections may come and go, better not to compress */ - return false; - } - } - - return true; -} - -static void
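
/*
 * vmfield_name_num()/vmfield_name_is_unique() above let a duplicated
 * field name be emitted to the JSON description as "name[n]".  The same
 * counting logic over a plain string array, standalone:
 */
#include <stdio.h>
#include <string.h>

/* Index of entry 'search' among entries sharing its name, or -1. */
static int name_num(const char **names, int n, int search)
{
    int found = 0;

    for (int i = 0; i < n; i++) {
        if (!strcmp(names[i], names[search])) {
            if (i == search) {
                return found;
            }
            found++;
        }
    }
    return -1;
}

int main(void)
{
    const char *names[] = { "timer", "irq", "timer" };

    /* the second "timer" would be described as "timer[1]" */
    printf("occurrence: %d\n", name_num(names, 3, 2));
    return 0;
}
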
vmsd_desc_field_start(const VMStateDescription *vmsd, QJSON *vmdesc, - VMStateField *field, int i, int max) -{ - char *name, *old_name; - bool is_array = max > 1; - bool can_compress = vmsd_can_compress(field); - - if (!vmdesc) { - return; - } - - name = g_strdup(field->name); - - /* Field name is not unique, need to make it unique */ - if (!vmfield_name_is_unique(vmsd->fields, field)) { - int num = vmfield_name_num(vmsd->fields, field); - old_name = name; - name = g_strdup_printf("%s[%d]", name, num); - g_free(old_name); - } - - json_start_object(vmdesc, NULL); - json_prop_str(vmdesc, "name", name); - if (is_array) { - if (can_compress) { - json_prop_int(vmdesc, "array_len", max); - } else { - json_prop_int(vmdesc, "index", i); - } - } - json_prop_str(vmdesc, "type", vmfield_get_type_name(field)); - - if (field->flags & VMS_STRUCT) { - json_start_object(vmdesc, "struct"); - } - - g_free(name); -} - -static void vmsd_desc_field_end(const VMStateDescription *vmsd, QJSON *vmdesc, - VMStateField *field, size_t size, int i) -{ - if (!vmdesc) { - return; - } - - if (field->flags & VMS_STRUCT) { - /* We printed a struct in between, close its child object */ - json_end_object(vmdesc); - } - - json_prop_int(vmdesc, "size", size); - json_end_object(vmdesc); -} - - -bool vmstate_save_needed(const VMStateDescription *vmsd, void *opaque) -{ - if (vmsd->needed && !vmsd->needed(opaque)) { - /* optional section not needed */ - return false; - } - return true; -} - - -void vmstate_save_state(QEMUFile *f, const VMStateDescription *vmsd, - void *opaque, QJSON *vmdesc) -{ - VMStateField *field = vmsd->fields; - - if (vmsd->pre_save) { - vmsd->pre_save(opaque); - } - - if (vmdesc) { - json_prop_str(vmdesc, "vmsd_name", vmsd->name); - json_prop_int(vmdesc, "version", vmsd->version_id); - json_start_array(vmdesc, "fields"); - } - - while (field->name) { - if (!field->field_exists || - field->field_exists(opaque, vmsd->version_id)) { - void *base_addr = vmstate_base_addr(opaque, field, false); - int i, n_elems = vmstate_n_elems(opaque, field); - int size = vmstate_size(opaque, field); - int64_t old_offset, written_bytes; - QJSON *vmdesc_loop = vmdesc; - - for (i = 0; i < n_elems; i++) { - void *addr = base_addr + size * i; - - vmsd_desc_field_start(vmsd, vmdesc_loop, field, i, n_elems); - old_offset = qemu_ftell_fast(f); - - if (field->flags & VMS_ARRAY_OF_POINTER) { - addr = *(void **)addr; - } - if (field->flags & VMS_STRUCT) { - vmstate_save_state(f, field->vmsd, addr, vmdesc_loop); - } else { - field->info->put(f, addr, size); - } - - written_bytes = qemu_ftell_fast(f) - old_offset; - vmsd_desc_field_end(vmsd, vmdesc_loop, field, written_bytes, i); - - /* Compressed arrays only care about the first element */ - if (vmdesc_loop && vmsd_can_compress(field)) { - vmdesc_loop = NULL; - } - } - } else { - if (field->flags & VMS_MUST_EXIST) { - error_report("Output state validation failed: %s/%s", - vmsd->name, field->name); - assert(!(field->flags & VMS_MUST_EXIST)); - } - } - field++; - } - - if (vmdesc) { - json_end_array(vmdesc); - } - - vmstate_subsection_save(f, vmsd, opaque, vmdesc); -} - -static const VMStateDescription * -vmstate_get_subsection(const VMStateDescription **sub, char *idstr) -{ - while (sub && *sub && (*sub)->needed) { - if (strcmp(idstr, (*sub)->name) == 0) { - return *sub; - } - sub++; - } - return NULL; -} - -static int vmstate_subsection_load(QEMUFile *f, const VMStateDescription *vmsd, - void *opaque) -{ - trace_vmstate_subsection_load(vmsd->name); - - while (qemu_peek_byte(f, 0) == 
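
/*
 * On the wire a subsection is: one QEMU_VM_SUBSECTION tag byte, a
 * one-byte name length, the name bytes, then a big-endian 32-bit
 * version, exactly what vmstate_subsection_save() below emits.  A
 * standalone encoder for just that header; the tag value used here is
 * an assumption for the example, not taken from this file.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define VM_SUBSECTION 0x05      /* assumed tag value */

static size_t put_subsection_hdr(uint8_t *dst, const char *name,
                                 uint32_t version)
{
    size_t len = strlen(name);
    size_t d = 0;

    dst[d++] = VM_SUBSECTION;
    dst[d++] = (uint8_t)len;
    memcpy(dst + d, name, len);
    d += len;
    dst[d++] = version >> 24;   /* big-endian version */
    dst[d++] = version >> 16;
    dst[d++] = version >> 8;
    dst[d++] = version;
    return d;
}

int main(void)
{
    uint8_t buf[64];
    size_t n = put_subsection_hdr(buf, "timer/tsc", 1);

    printf("subsection header: %zu bytes\n", n);
    return 0;
}
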
QEMU_VM_SUBSECTION) { - char idstr[256], *idstr_ret; - int ret; - uint8_t version_id, len, size; - const VMStateDescription *sub_vmsd; - - len = qemu_peek_byte(f, 1); - if (len < strlen(vmsd->name) + 1) { - /* subsection name has to be "section_name/a" */ - trace_vmstate_subsection_load_bad(vmsd->name, "(short)"); - return 0; - } - size = qemu_peek_buffer(f, (uint8_t **)&idstr_ret, len, 2); - if (size != len) { - trace_vmstate_subsection_load_bad(vmsd->name, "(peek fail)"); - return 0; - } - memcpy(idstr, idstr_ret, size); - idstr[size] = 0; - - if (strncmp(vmsd->name, idstr, strlen(vmsd->name)) != 0) { - trace_vmstate_subsection_load_bad(vmsd->name, idstr); - /* it doesn't have a valid subsection name */ - return 0; - } - sub_vmsd = vmstate_get_subsection(vmsd->subsections, idstr); - if (sub_vmsd == NULL) { - trace_vmstate_subsection_load_bad(vmsd->name, "(lookup)"); - return -ENOENT; - } - qemu_file_skip(f, 1); /* subsection */ - qemu_file_skip(f, 1); /* len */ - qemu_file_skip(f, len); /* idstr */ - version_id = qemu_get_be32(f); - - ret = vmstate_load_state(f, sub_vmsd, opaque, version_id); - if (ret) { - trace_vmstate_subsection_load_bad(vmsd->name, "(child)"); - return ret; - } - } - - trace_vmstate_subsection_load_good(vmsd->name); - return 0; -} - -static void vmstate_subsection_save(QEMUFile *f, const VMStateDescription *vmsd, - void *opaque, QJSON *vmdesc) -{ - const VMStateDescription **sub = vmsd->subsections; - bool subsection_found = false; - - while (sub && *sub && (*sub)->needed) { - if ((*sub)->needed(opaque)) { - const VMStateDescription *vmsd = *sub; - uint8_t len; - - if (vmdesc) { - /* Only create subsection array when we have any */ - if (!subsection_found) { - json_start_array(vmdesc, "subsections"); - subsection_found = true; - } - - json_start_object(vmdesc, NULL); - } - - qemu_put_byte(f, QEMU_VM_SUBSECTION); - len = strlen(vmsd->name); - qemu_put_byte(f, len); - qemu_put_buffer(f, (uint8_t *)vmsd->name, len); - qemu_put_be32(f, vmsd->version_id); - vmstate_save_state(f, vmsd, opaque, vmdesc); - - if (vmdesc) { - json_end_object(vmdesc); - } - } - sub++; - } - - if (vmdesc && subsection_found) { - json_end_array(vmdesc); - } -} - -/* bool */ - -static int get_bool(QEMUFile *f, void *pv, size_t size) -{ - bool *v = pv; - *v = qemu_get_byte(f); - return 0; -} - -static void put_bool(QEMUFile *f, void *pv, size_t size) -{ - bool *v = pv; - qemu_put_byte(f, *v); -} - -const VMStateInfo vmstate_info_bool = { - .name = "bool", - .get = get_bool, - .put = put_bool, -}; - -/* 8 bit int */ - -static int get_int8(QEMUFile *f, void *pv, size_t size) -{ - int8_t *v = pv; - qemu_get_s8s(f, v); - return 0; -} - -static void put_int8(QEMUFile *f, void *pv, size_t size) -{ - int8_t *v = pv; - qemu_put_s8s(f, v); -} - -const VMStateInfo vmstate_info_int8 = { - .name = "int8", - .get = get_int8, - .put = put_int8, -}; - -/* 16 bit int */ - -static int get_int16(QEMUFile *f, void *pv, size_t size) -{ - int16_t *v = pv; - qemu_get_sbe16s(f, v); - return 0; -} - -static void put_int16(QEMUFile *f, void *pv, size_t size) -{ - int16_t *v = pv; - qemu_put_sbe16s(f, v); -} - -const VMStateInfo vmstate_info_int16 = { - .name = "int16", - .get = get_int16, - .put = put_int16, -}; - -/* 32 bit int */ - -static int get_int32(QEMUFile *f, void *pv, size_t size) -{ - int32_t *v = pv; - qemu_get_sbe32s(f, v); - return 0; -} - -static void put_int32(QEMUFile *f, void *pv, size_t size) -{ - int32_t *v = pv; - qemu_put_sbe32s(f, v); -} - -const VMStateInfo vmstate_info_int32 = { - .name = "int32", - .get = get_int32, - .put = put_int32, -};
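
/*
 * Every primitive above is described by the same (get, put) pair
 * packaged in a VMStateInfo.  A standalone imitation of that dispatch
 * shape for a 16-bit big-endian value; the Stream and Info types are
 * simplified stand-ins for QEMUFile and VMStateInfo.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

typedef struct Stream { uint8_t buf[64]; size_t pos; } Stream;

typedef struct Info {
    const char *name;
    int  (*get)(Stream *s, void *pv);
    void (*put)(Stream *s, const void *pv);
} Info;

static int get_u16(Stream *s, void *pv)
{
    uint16_t *v = pv;

    *v = (uint16_t)((s->buf[s->pos] << 8) | s->buf[s->pos + 1]);
    s->pos += 2;
    return 0;
}

static void put_u16(Stream *s, const void *pv)
{
    const uint16_t *v = pv;

    s->buf[s->pos++] = *v >> 8;
    s->buf[s->pos++] = *v & 0xff;
}

static const Info info_u16 = { "uint16", get_u16, put_u16 };

int main(void)
{
    Stream s = { .pos = 0 };
    uint16_t out = 0xbeef, in = 0;

    info_u16.put(&s, &out);
    s.pos = 0;
    info_u16.get(&s, &in);
    printf("%s round-trip: 0x%04x\n", info_u16.name, in);
    return 0;
}
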
- -/* 32 bit int. See that the received value is the same as the one - in the field */ - -static int get_int32_equal(QEMUFile *f, void *pv, size_t size) -{ - int32_t *v = pv; - int32_t v2; - qemu_get_sbe32s(f, &v2); - - if (*v == v2) { - return 0; - } - return -EINVAL; -} - -const VMStateInfo vmstate_info_int32_equal = { - .name = "int32 equal", - .get = get_int32_equal, - .put = put_int32, -}; - -/* 32 bit int. Check that the received value is non-negative - * and less than or equal to the one in the field. - */ - -static int get_int32_le(QEMUFile *f, void *pv, size_t size) -{ - int32_t *cur = pv; - int32_t loaded; - qemu_get_sbe32s(f, &loaded); - - if (loaded >= 0 && loaded <= *cur) { - *cur = loaded; - return 0; - } - return -EINVAL; -} - -const VMStateInfo vmstate_info_int32_le = { - .name = "int32 le", - .get = get_int32_le, - .put = put_int32, -}; - -/* 64 bit int */ - -static int get_int64(QEMUFile *f, void *pv, size_t size) -{ - int64_t *v = pv; - qemu_get_sbe64s(f, v); - return 0; -} - -static void put_int64(QEMUFile *f, void *pv, size_t size) -{ - int64_t *v = pv; - qemu_put_sbe64s(f, v); -} - -const VMStateInfo vmstate_info_int64 = { - .name = "int64", - .get = get_int64, - .put = put_int64, -}; - -/* 8 bit unsigned int */ - -static int get_uint8(QEMUFile *f, void *pv, size_t size) -{ - uint8_t *v = pv; - qemu_get_8s(f, v); - return 0; -} - -static void put_uint8(QEMUFile *f, void *pv, size_t size) -{ - uint8_t *v = pv; - qemu_put_8s(f, v); -} - -const VMStateInfo vmstate_info_uint8 = { - .name = "uint8", - .get = get_uint8, - .put = put_uint8, -}; - -/* 16 bit unsigned int */ - -static int get_uint16(QEMUFile *f, void *pv, size_t size) -{ - uint16_t *v = pv; - qemu_get_be16s(f, v); - return 0; -} - -static void put_uint16(QEMUFile *f, void *pv, size_t size) -{ - uint16_t *v = pv; - qemu_put_be16s(f, v); -} - -const VMStateInfo vmstate_info_uint16 = { - .name = "uint16", - .get = get_uint16, - .put = put_uint16, -}; - -/* 32 bit unsigned int */ - -static int get_uint32(QEMUFile *f, void *pv, size_t size) -{ - uint32_t *v = pv; - qemu_get_be32s(f, v); - return 0; -} - -static void put_uint32(QEMUFile *f, void *pv, size_t size) -{ - uint32_t *v = pv; - qemu_put_be32s(f, v); -} - -const VMStateInfo vmstate_info_uint32 = { - .name = "uint32", - .get = get_uint32, - .put = put_uint32, -}; - -/* 32 bit uint. See that the received value is the same as the one - in the field */ - -static int get_uint32_equal(QEMUFile *f, void *pv, size_t size) -{ - uint32_t *v = pv; - uint32_t v2; - qemu_get_be32s(f, &v2); - - if (*v == v2) { - return 0; - } - return -EINVAL; -} - -const VMStateInfo vmstate_info_uint32_equal = { - .name = "uint32 equal", - .get = get_uint32_equal, - .put = put_uint32, -}; - -/* 64 bit unsigned int */ - -static int get_uint64(QEMUFile *f, void *pv, size_t size) -{ - uint64_t *v = pv; - qemu_get_be64s(f, v); - return 0; -} - -static void put_uint64(QEMUFile *f, void *pv, size_t size) -{ - uint64_t *v = pv; - qemu_put_be64s(f, v); -} - -const VMStateInfo vmstate_info_uint64 = { - .name = "uint64", - .get = get_uint64, - .put = put_uint64, -};
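
/*
 * The *_equal variants above and below are pure validators: they read
 * the incoming value and fail the load unless it matches what the
 * destination already has in the field.  The core of that check,
 * standalone:
 */
#include <errno.h>
#include <stdint.h>
#include <stdio.h>

/* 0 if the incoming value matches the local one, -EINVAL otherwise. */
static int check_equal_u32(uint32_t local, uint32_t incoming)
{
    if (local == incoming) {
        return 0;
    }
    return -EINVAL;
}

int main(void)
{
    /* e.g. destination device has 4 queues and the source sent 4 */
    printf("match:    %d\n", check_equal_u32(4, 4));
    printf("mismatch: %d\n", check_equal_u32(4, 2));
    return 0;
}
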
- -/* 64 bit unsigned int. See that the received value is the same as the one - in the field */ - -static int get_uint64_equal(QEMUFile *f, void *pv, size_t size) -{ - uint64_t *v = pv; - uint64_t v2; - qemu_get_be64s(f, &v2); - - if (*v == v2) { - return 0; - } - return -EINVAL; -} - -const VMStateInfo vmstate_info_uint64_equal = { - .name = "int64 equal", - .get = get_uint64_equal, - .put = put_uint64, -}; - -/* 8 bit unsigned int. See that the received value is the same as the one - in the field */ - -static int get_uint8_equal(QEMUFile *f, void *pv, size_t size) -{ - uint8_t *v = pv; - uint8_t v2; - qemu_get_8s(f, &v2); - - if (*v == v2) { - return 0; - } - return -EINVAL; -} - -const VMStateInfo vmstate_info_uint8_equal = { - .name = "uint8 equal", - .get = get_uint8_equal, - .put = put_uint8, -}; - -/* 16 bit unsigned int. See that the received value is the same as the one - in the field */ - -static int get_uint16_equal(QEMUFile *f, void *pv, size_t size) -{ - uint16_t *v = pv; - uint16_t v2; - qemu_get_be16s(f, &v2); - - if (*v == v2) { - return 0; - } - return -EINVAL; -} - -const VMStateInfo vmstate_info_uint16_equal = { - .name = "uint16 equal", - .get = get_uint16_equal, - .put = put_uint16, -}; - -/* floating point */ - -static int get_float64(QEMUFile *f, void *pv, size_t size) -{ - float64 *v = pv; - - *v = make_float64(qemu_get_be64(f)); - return 0; -} - -static void put_float64(QEMUFile *f, void *pv, size_t size) -{ - uint64_t *v = pv; - - qemu_put_be64(f, float64_val(*v)); -} - -const VMStateInfo vmstate_info_float64 = { - .name = "float64", - .get = get_float64, - .put = put_float64, -}; - -/* CPU_DoubleU type */ - -static int get_cpudouble(QEMUFile *f, void *pv, size_t size) -{ - CPU_DoubleU *v = pv; - qemu_get_be32s(f, &v->l.upper); - qemu_get_be32s(f, &v->l.lower); - return 0; -} - -static void put_cpudouble(QEMUFile *f, void *pv, size_t size) -{ - CPU_DoubleU *v = pv; - qemu_put_be32s(f, &v->l.upper); - qemu_put_be32s(f, &v->l.lower); -} - -const VMStateInfo vmstate_info_cpudouble = { - .name = "CPU_Double_U", - .get = get_cpudouble, - .put = put_cpudouble, -}; - -/* uint8_t buffers */ - -static int get_buffer(QEMUFile *f, void *pv, size_t size) -{ - uint8_t *v = pv; - qemu_get_buffer(f, v, size); - return 0; -} - -static void put_buffer(QEMUFile *f, void *pv, size_t size) -{ - uint8_t *v = pv; - qemu_put_buffer(f, v, size); -} - -const VMStateInfo vmstate_info_buffer = { - .name = "buffer", - .get = get_buffer, - .put = put_buffer, -}; - -/* unused buffers: space that was used for some fields that are - not useful anymore */ - -static int get_unused_buffer(QEMUFile *f, void *pv, size_t size) -{ - uint8_t buf[1024]; - int block_len; - - while (size > 0) { - block_len = MIN(sizeof(buf), size); - size -= block_len; - qemu_get_buffer(f, buf, block_len); - } - return 0; -} - -static void put_unused_buffer(QEMUFile *f, void *pv, size_t size) -{ - static const uint8_t buf[1024]; - int block_len; - - while (size > 0) { - block_len = MIN(sizeof(buf), size); - size -= block_len; - qemu_put_buffer(f, buf, block_len); - } -} - -const VMStateInfo vmstate_info_unused_buffer = { - .name = "unused_buffer", - .get = get_unused_buffer, - .put = put_unused_buffer, -}; - -/* bitmaps (as defined by bitmap.h). Note that size here is the size - * of the bitmap in bits. The on-the-wire format of a bitmap is 64 - * bit words with the bits in big endian order. The in-memory format - * is an array of 'unsigned long', which may be either 32 or 64 bits.
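
/*
 * A standalone sketch of the word handling the comment above describes:
 * on a 32-bit host each 64-bit wire word carries two 'unsigned long's,
 * low half first, matching get_bitmap()/put_bitmap() below.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t longs[2] = { 0x000000ffu, 0xff000000u };  /* in-memory bitmap */
    /* put_bitmap: first long goes into the low half of the wire word */
    uint64_t wire = ((uint64_t)longs[1] << 32) | longs[0];
    /* get_bitmap: split the wire word back into two longs */
    uint32_t lo = (uint32_t)wire;
    uint32_t hi = (uint32_t)(wire >> 32);

    printf("wire=%016llx lo=%08x hi=%08x\n",
           (unsigned long long)wire, lo, hi);
    return 0;
}
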
- */ -/* This is the number of 64 bit words sent over the wire */ -#define BITS_TO_U64S(nr) DIV_ROUND_UP(nr, 64) -static int get_bitmap(QEMUFile *f, void *pv, size_t size) -{ - unsigned long *bmp = pv; - int i, idx = 0; - for (i = 0; i < BITS_TO_U64S(size); i++) { - uint64_t w = qemu_get_be64(f); - bmp[idx++] = w; - if (sizeof(unsigned long) == 4 && idx < BITS_TO_LONGS(size)) { - bmp[idx++] = w >> 32; - } - } - return 0; -} - -static void put_bitmap(QEMUFile *f, void *pv, size_t size) -{ - unsigned long *bmp = pv; - int i, idx = 0; - for (i = 0; i < BITS_TO_U64S(size); i++) { - uint64_t w = bmp[idx++]; - if (sizeof(unsigned long) == 4 && idx < BITS_TO_LONGS(size)) { - w |= ((uint64_t)bmp[idx++]) << 32; - } - qemu_put_be64(f, w); - } -} - -const VMStateInfo vmstate_info_bitmap = { - .name = "bitmap", - .get = get_bitmap, - .put = put_bitmap, -}; diff --git a/qemu/migration/xbzrle.c b/qemu/migration/xbzrle.c deleted file mode 100644 index c85833925..000000000 --- a/qemu/migration/xbzrle.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Xor Based Zero Run Length Encoding - * - * Copyright 2013 Red Hat, Inc. and/or its affiliates - * - * Authors: - * Orit Wasserman <owasserm@redhat.com> - * - * This work is licensed under the terms of the GNU GPL, version 2 or later. - * See the COPYING file in the top-level directory. - * - */ -#include "qemu/osdep.h" -#include "qemu/cutils.h" -#include "include/migration/migration.h" - -/* - page = zrun nzrun - | zrun nzrun page - - zrun = length - - nzrun = length byte... - - length = uleb128 encoded integer - */ -int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen, - uint8_t *dst, int dlen) -{ - uint32_t zrun_len = 0, nzrun_len = 0; - int d = 0, i = 0; - long res; - uint8_t *nzrun_start = NULL; - - g_assert(!(((uintptr_t)old_buf | (uintptr_t)new_buf | slen) % - sizeof(long))); - - while (i < slen) { - /* overflow */ - if (d + 2 > dlen) { - return -1; - } - - /* not aligned to sizeof(long) */ - res = (slen - i) % sizeof(long); - while (res && old_buf[i] == new_buf[i]) { - zrun_len++; - i++; - res--; - } - - /* word at a time for speed */ - if (!res) { - while (i < slen && - (*(long *)(old_buf + i)) == (*(long *)(new_buf + i))) { - i += sizeof(long); - zrun_len += sizeof(long); - } - - /* go over the rest */ - while (i < slen && old_buf[i] == new_buf[i]) { - zrun_len++; - i++; - } - } - - /* buffer unchanged */ - if (zrun_len == slen) { - return 0; - } - - /* skip last zero run */ - if (i == slen) { - return d; - } - - d += uleb128_encode_small(dst + d, zrun_len); - - zrun_len = 0; - nzrun_start = new_buf + i; - - /* overflow */ - if (d + 2 > dlen) { - return -1; - } - /* not aligned to sizeof(long) */ - res = (slen - i) % sizeof(long); - while (res && old_buf[i] != new_buf[i]) { - i++; - nzrun_len++; - res--; - } - - /* word at a time for speed, use of 32-bit long okay */ - if (!res) { - /* truncation to 32-bit long okay */ - unsigned long mask = (unsigned long)0x0101010101010101ULL; - while (i < slen) { - unsigned long xor; - xor = *(unsigned long *)(old_buf + i) - ^ *(unsigned long *)(new_buf + i); - if ((xor - mask) & ~xor & (mask << 7)) { - /* found the end of an nzrun within the current long */ - while (old_buf[i] != new_buf[i]) { - nzrun_len++; - i++; - } - break; - } else { - i += sizeof(long); - nzrun_len += sizeof(long); - } - } - } - - d += uleb128_encode_small(dst + d, nzrun_len); - /* overflow */ - if (d + nzrun_len > dlen) { - return -1; - } - memcpy(dst + d, nzrun_start, nzrun_len); - d += nzrun_len; - nzrun_len = 0; - } - - return 
d; -} - -int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen) -{ - int i = 0, d = 0; - int ret; - uint32_t count = 0; - - while (i < slen) { - - /* zrun */ - if ((slen - i) < 2) { - return -1; - } - - ret = uleb128_decode_small(src + i, &count); - if (ret < 0 || (i && !count)) { - return -1; - } - i += ret; - d += count; - - /* overflow */ - if (d > dlen) { - return -1; - } - - /* nzrun */ - if ((slen - i) < 2) { - return -1; - } - - ret = uleb128_decode_small(src + i, &count); - if (ret < 0 || !count) { - return -1; - } - i += ret; - - /* overflow */ - if (d + count > dlen || i + count > slen) { - return -1; - } - - memcpy(dst + d, src + i, count); - d += count; - i += count; - } - - return d; -}
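
/*
 * Usage sketch for the XBZRLE pair above: encode the delta between two
 * page-sized buffers, then apply it to a copy of the old page and check
 * the result.  Assumes xbzrle_encode_buffer() and xbzrle_decode_buffer()
 * from this file (and QEMU's uleb128 helpers) are linked in; the page
 * size is illustrative.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
                         uint8_t *dst, int dlen);
int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);

#define PAGE 4096

/* buffers must be sizeof(long)-aligned for the encoder's assertion */
static _Alignas(long) uint8_t old_page[PAGE], new_page[PAGE];
static uint8_t delta[PAGE], out[PAGE];

int main(void)
{
    int dlen;

    memcpy(new_page, old_page, PAGE);
    new_page[100] = 0xab;               /* dirty a few bytes */
    new_page[2000] = 0xcd;

    dlen = xbzrle_encode_buffer(old_page, new_page, PAGE, delta, PAGE);
    if (dlen < 0) {
        puts("delta would not fit; send the raw page instead");
        return 1;
    }

    memcpy(out, old_page, PAGE);        /* destination's stale copy */
    xbzrle_decode_buffer(delta, dlen, out, PAGE);
    printf("encoded %d bytes, pages %s\n", dlen,
           memcmp(out, new_page, PAGE) ? "differ" : "match");
    return 0;
}
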