diff options
Diffstat (limited to 'kernel/arch/powerpc/kvm')
21 files changed, 1038 insertions, 242 deletions
diff --git a/kernel/arch/powerpc/kvm/Kconfig b/kernel/arch/powerpc/kvm/Kconfig index d4c48506e..2303788da 100644 --- a/kernel/arch/powerpc/kvm/Kconfig +++ b/kernel/arch/powerpc/kvm/Kconfig @@ -74,14 +74,14 @@ config KVM_BOOK3S_64 If unsure, say N. config KVM_BOOK3S_64_HV - tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host" + tristate "KVM for POWER7 and later using hypervisor mode in host" depends on KVM_BOOK3S_64 && PPC_POWERNV select KVM_BOOK3S_HV_POSSIBLE select MMU_NOTIFIER select CMA ---help--- Support running unmodified book3s_64 guest kernels in - virtual machines on POWER7 and PPC970 processors that have + virtual machines on POWER7 and newer processors that have hypervisor mode available to the host. If you say Y here, KVM will use the hardware virtualization @@ -89,8 +89,8 @@ config KVM_BOOK3S_64_HV guest operating systems will run at full hardware speed using supervisor and user modes. However, this also means that KVM is not usable under PowerVM (pHyp), is only usable - on POWER7 (or later) processors and PPC970-family processors, - and cannot emulate a different processor from the host processor. + on POWER7 or later processors, and cannot emulate a + different processor from the host processor. If unsure, say N. diff --git a/kernel/arch/powerpc/kvm/book3s.c b/kernel/arch/powerpc/kvm/book3s.c index 453a8a47a..099c79d8c 100644 --- a/kernel/arch/powerpc/kvm/book3s.c +++ b/kernel/arch/powerpc/kvm/book3s.c @@ -53,6 +53,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "ext_intr", VCPU_STAT(ext_intr_exits) }, { "queue_intr", VCPU_STAT(queue_intr) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll), }, + { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "pf_storage", VCPU_STAT(pf_storage) }, { "sp_storage", VCPU_STAT(sp_storage) }, @@ -240,7 +241,8 @@ void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags) kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE); } -int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) +static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, + unsigned int priority) { int deliver = 1; int vec = 0; @@ -757,16 +759,17 @@ void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region *mem) { return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem); } void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old) + const struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new) { - kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old); + kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new); } int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) @@ -826,12 +829,15 @@ int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu) unsigned long size = kvmppc_get_gpr(vcpu, 4); unsigned long addr = kvmppc_get_gpr(vcpu, 5); u64 buf; + int srcu_idx; int ret; if (!is_power_of_2(size) || (size > sizeof(buf))) return H_TOO_HARD; + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, size, &buf); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); if (ret != 0) return H_TOO_HARD; @@ -866,6 +872,7 @@ int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu) unsigned long addr = kvmppc_get_gpr(vcpu, 5); unsigned long val = kvmppc_get_gpr(vcpu, 6); u64 buf; + int srcu_idx; int ret; switch (size) { @@ -889,7 +896,9 @@ int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu) return H_TOO_HARD; } + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, size, &buf); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); if (ret != 0) return H_TOO_HARD; @@ -901,7 +910,7 @@ int kvmppc_core_check_processor_compat(void) { /* * We always return 0 for book3s. We check - * for compatability while loading the HV + * for compatibility while loading the HV * or PR module */ return 0; diff --git a/kernel/arch/powerpc/kvm/book3s_32_mmu_host.c b/kernel/arch/powerpc/kvm/book3s_32_mmu_host.c index 2035d16a9..d5c9bfeb0 100644 --- a/kernel/arch/powerpc/kvm/book3s_32_mmu_host.c +++ b/kernel/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -26,6 +26,7 @@ #include <asm/machdep.h> #include <asm/mmu_context.h> #include <asm/hw_irq.h> +#include "book3s.h" /* #define DEBUG_MMU */ /* #define DEBUG_SR */ diff --git a/kernel/arch/powerpc/kvm/book3s_64_mmu_host.c b/kernel/arch/powerpc/kvm/book3s_64_mmu_host.c index b982d925c..79ad35abd 100644 --- a/kernel/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/kernel/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -28,6 +28,7 @@ #include <asm/mmu_context.h> #include <asm/hw_irq.h> #include "trace_pr.h" +#include "book3s.h" #define PTE_SIZE 12 diff --git a/kernel/arch/powerpc/kvm/book3s_64_mmu_hv.c b/kernel/arch/powerpc/kvm/book3s_64_mmu_hv.c index 1a4acf8bf..fb37290a5 100644 --- a/kernel/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/kernel/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -70,7 +70,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) } /* Lastly try successively smaller sizes from the page allocator */ - while (!hpt && order > PPC_MIN_HPT_ORDER) { + /* Only do this if userspace didn't specify a size via ioctl */ + while (!hpt && order > PPC_MIN_HPT_ORDER && !htab_orderp) { hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| __GFP_NOWARN, order - PAGE_SHIFT); if (!hpt) @@ -543,7 +544,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, */ local_irq_save(flags); ptep = find_linux_pte_or_hugepte(current->mm->pgd, - hva, NULL); + hva, NULL, NULL); if (ptep) { pte = kvmppc_read_update_linux_pte(ptep, 1); if (pte_write(pte)) @@ -650,7 +651,7 @@ static void kvmppc_rmap_reset(struct kvm *kvm) int srcu_idx; srcu_idx = srcu_read_lock(&kvm->srcu); - slots = kvm->memslots; + slots = kvm_memslots(kvm); kvm_for_each_memslot(memslot, slots) { /* * This assumes it is acceptable to lose reference and @@ -761,6 +762,8 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, /* Harvest R and C */ rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; + if (rcbits & HPTE_R_C) + kvmppc_update_rmap_change(rmapp, psize); if (rcbits & ~rev[i].guest_rpte) { rev[i].guest_rpte = ptel | rcbits; note_hpte_modification(kvm, &rev[i]); @@ -927,8 +930,12 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) retry: lock_rmap(rmapp); if (*rmapp & KVMPPC_RMAP_CHANGED) { - *rmapp &= ~KVMPPC_RMAP_CHANGED; + long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER) + >> KVMPPC_RMAP_CHG_SHIFT; + *rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER); npages_dirty = 1; + if (change_order > PAGE_SHIFT) + npages_dirty = 1ul << (change_order - PAGE_SHIFT); } if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { unlock_rmap(rmapp); diff --git a/kernel/arch/powerpc/kvm/book3s_emulate.c b/kernel/arch/powerpc/kvm/book3s_emulate.c index 5a2bc4b0d..2afdb9c09 100644 --- a/kernel/arch/powerpc/kvm/book3s_emulate.c +++ b/kernel/arch/powerpc/kvm/book3s_emulate.c @@ -23,6 +23,7 @@ #include <asm/reg.h> #include <asm/switch_to.h> #include <asm/time.h> +#include "book3s.h" #define OP_19_XOP_RFID 18 #define OP_19_XOP_RFI 50 diff --git a/kernel/arch/powerpc/kvm/book3s_hv.c b/kernel/arch/powerpc/kvm/book3s_hv.c index 3e9087f45..df34a6432 100644 --- a/kernel/arch/powerpc/kvm/book3s_hv.c +++ b/kernel/arch/powerpc/kvm/book3s_hv.c @@ -36,7 +36,6 @@ #include <asm/reg.h> #include <asm/cputable.h> -#include <asm/cache.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -75,12 +74,12 @@ static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1); -#if defined(CONFIG_PPC_64K_PAGES) -#define MPP_BUFFER_ORDER 0 -#elif defined(CONFIG_PPC_4K_PAGES) -#define MPP_BUFFER_ORDER 3 -#endif - +static int dynamic_mt_modes = 6; +module_param(dynamic_mt_modes, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dynamic_mt_modes, "Set of allowed dynamic micro-threading modes: 0 (= none), 2, 4, or 6 (= 2 or 4)"); +static int target_smt_mode; +module_param(target_smt_mode, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)"); static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -114,19 +113,20 @@ static bool kvmppc_ipi_thread(int cpu) static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) { - int cpu = vcpu->cpu; - struct swait_head *wqp; + int cpu; + struct swait_queue_head *wqp; wqp = kvm_arch_vcpu_wq(vcpu); - if (swaitqueue_active(wqp)) { - swait_wake_interruptible(wqp); + if (swait_active(wqp)) { + swake_up(wqp); ++vcpu->stat.halt_wakeup; } - if (kvmppc_ipi_thread(cpu + vcpu->arch.ptid)) + if (kvmppc_ipi_thread(vcpu->arch.thread_cpu)) return; /* CPU points to the first thread of the core */ + cpu = vcpu->cpu; if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu)) smp_send_reschedule(cpu); } @@ -164,6 +164,27 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu) * they should never fail.) */ +static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc) +{ + unsigned long flags; + + spin_lock_irqsave(&vc->stoltb_lock, flags); + vc->preempt_tb = mftb(); + spin_unlock_irqrestore(&vc->stoltb_lock, flags); +} + +static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc) +{ + unsigned long flags; + + spin_lock_irqsave(&vc->stoltb_lock, flags); + if (vc->preempt_tb != TB_NIL) { + vc->stolen_tb += mftb() - vc->preempt_tb; + vc->preempt_tb = TB_NIL; + } + spin_unlock_irqrestore(&vc->stoltb_lock, flags); +} + static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu) { struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -175,14 +196,9 @@ static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu) * vcpu, and once it is set to this vcpu, only this task * ever sets it to NULL. */ - if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) { - spin_lock_irqsave(&vc->stoltb_lock, flags); - if (vc->preempt_tb != TB_NIL) { - vc->stolen_tb += mftb() - vc->preempt_tb; - vc->preempt_tb = TB_NIL; - } - spin_unlock_irqrestore(&vc->stoltb_lock, flags); - } + if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING) + kvmppc_core_end_stolen(vc); + spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST && vcpu->arch.busy_preempt != TB_NIL) { @@ -197,11 +213,9 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) struct kvmppc_vcore *vc = vcpu->arch.vcore; unsigned long flags; - if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) { - spin_lock_irqsave(&vc->stoltb_lock, flags); - vc->preempt_tb = mftb(); - spin_unlock_irqrestore(&vc->stoltb_lock, flags); - } + if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING) + kvmppc_core_start_stolen(vc); + spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST) vcpu->arch.busy_preempt = mftb(); @@ -210,16 +224,22 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) { + /* + * Check for illegal transactional state bit combination + * and if we find it, force the TS field to a safe state. + */ + if ((msr & MSR_TS_MASK) == MSR_TS_MASK) + msr &= ~MSR_TS_MASK; vcpu->arch.shregs.msr = msr; kvmppc_end_cede(vcpu); } -void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr) +static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr) { vcpu->arch.pvr = pvr; } -int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) +static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) { unsigned long pcr = 0; struct kvmppc_vcore *vc = vcpu->arch.vcore; @@ -259,7 +279,7 @@ int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) return 0; } -void kvmppc_dump_regs(struct kvm_vcpu *vcpu) +static void kvmppc_dump_regs(struct kvm_vcpu *vcpu) { int r; @@ -292,7 +312,7 @@ void kvmppc_dump_regs(struct kvm_vcpu *vcpu) vcpu->arch.last_inst); } -struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id) +static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id) { int r; struct kvm_vcpu *v, *ret = NULL; @@ -641,7 +661,8 @@ static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target) spin_lock(&vcore->lock); if (target->arch.state == KVMPPC_VCPU_RUNNABLE && - vcore->vcore_state != VCORE_INACTIVE) + vcore->vcore_state != VCORE_INACTIVE && + vcore->runner) target = vcore->runner; spin_unlock(&vcore->lock); @@ -686,8 +707,8 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) tvcpu->arch.prodded = 1; smp_mb(); if (vcpu->arch.ceded) { - if (swaitqueue_active(&vcpu->wq)) { - swait_wake_interruptible(&vcpu->wq); + if (swait_active(&vcpu->wq)) { + swake_up(&vcpu->wq); vcpu->stat.halt_wakeup++; } } @@ -1426,18 +1447,12 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) INIT_LIST_HEAD(&vcore->runnable_threads); spin_lock_init(&vcore->lock); spin_lock_init(&vcore->stoltb_lock); - init_swait_head(&vcore->wq); + init_swait_queue_head(&vcore->wq); vcore->preempt_tb = TB_NIL; vcore->lpcr = kvm->arch.lpcr; vcore->first_vcpuid = core * threads_per_subcore; vcore->kvm = kvm; - - vcore->mpp_buffer_is_valid = false; - - if (cpu_has_feature(CPU_FTR_ARCH_207S)) - vcore->mpp_buffer = (void *)__get_free_pages( - GFP_KERNEL|__GFP_ZERO, - MPP_BUFFER_ORDER); + INIT_LIST_HEAD(&vcore->preempt_list); return vcore; } @@ -1655,6 +1670,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, spin_unlock(&vcore->lock); vcpu->arch.vcore = vcore; vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid; + vcpu->arch.thread_cpu = -1; vcpu->arch.cpu_type = KVM_CPU_3S_64; kvmppc_sanity_check(vcpu); @@ -1749,6 +1765,7 @@ static int kvmppc_grab_hwthread(int cpu) /* Ensure the thread won't go into the kernel if it wakes */ tpaca->kvm_hstate.kvm_vcpu = NULL; + tpaca->kvm_hstate.kvm_vcore = NULL; tpaca->kvm_hstate.napping = 0; smp_wmb(); tpaca->kvm_hstate.hwthread_req = 1; @@ -1780,26 +1797,32 @@ static void kvmppc_release_hwthread(int cpu) tpaca = &paca[cpu]; tpaca->kvm_hstate.hwthread_req = 0; tpaca->kvm_hstate.kvm_vcpu = NULL; + tpaca->kvm_hstate.kvm_vcore = NULL; + tpaca->kvm_hstate.kvm_split_mode = NULL; } -static void kvmppc_start_thread(struct kvm_vcpu *vcpu) +static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) { int cpu; struct paca_struct *tpaca; - struct kvmppc_vcore *vc = vcpu->arch.vcore; + struct kvmppc_vcore *mvc = vc->master_vcore; - if (vcpu->arch.timer_running) { - hrtimer_try_to_cancel(&vcpu->arch.dec_timer); - vcpu->arch.timer_running = 0; + cpu = vc->pcpu; + if (vcpu) { + if (vcpu->arch.timer_running) { + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + vcpu->arch.timer_running = 0; + } + cpu += vcpu->arch.ptid; + vcpu->cpu = mvc->pcpu; + vcpu->arch.thread_cpu = cpu; } - cpu = vc->pcpu + vcpu->arch.ptid; tpaca = &paca[cpu]; - tpaca->kvm_hstate.kvm_vcore = vc; - tpaca->kvm_hstate.ptid = vcpu->arch.ptid; - vcpu->cpu = vc->pcpu; - /* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */ - smp_wmb(); tpaca->kvm_hstate.kvm_vcpu = vcpu; + tpaca->kvm_hstate.ptid = cpu - mvc->pcpu; + /* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */ + smp_wmb(); + tpaca->kvm_hstate.kvm_vcore = mvc; if (cpu != smp_processor_id()) kvmppc_ipi_thread(cpu); } @@ -1812,12 +1835,12 @@ static void kvmppc_wait_for_nap(void) for (loops = 0; loops < 1000000; ++loops) { /* * Check if all threads are finished. - * We set the vcpu pointer when starting a thread + * We set the vcore pointer when starting a thread * and the thread clears it when finished, so we look - * for any threads that still have a non-NULL vcpu ptr. + * for any threads that still have a non-NULL vcore ptr. */ for (i = 1; i < threads_per_subcore; ++i) - if (paca[cpu + i].kvm_hstate.kvm_vcpu) + if (paca[cpu + i].kvm_hstate.kvm_vcore) break; if (i == threads_per_subcore) { HMT_medium(); @@ -1827,7 +1850,7 @@ static void kvmppc_wait_for_nap(void) } HMT_medium(); for (i = 1; i < threads_per_subcore; ++i) - if (paca[cpu + i].kvm_hstate.kvm_vcpu) + if (paca[cpu + i].kvm_hstate.kvm_vcore) pr_err("KVM: CPU %d seems to be stuck\n", cpu + i); } @@ -1863,31 +1886,276 @@ static int on_primary_thread(void) return 1; } -static void kvmppc_start_saving_l2_cache(struct kvmppc_vcore *vc) +/* + * A list of virtual cores for each physical CPU. + * These are vcores that could run but their runner VCPU tasks are + * (or may be) preempted. + */ +struct preempted_vcore_list { + struct list_head list; + spinlock_t lock; +}; + +static DEFINE_PER_CPU(struct preempted_vcore_list, preempted_vcores); + +static void init_vcore_lists(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct preempted_vcore_list *lp = &per_cpu(preempted_vcores, cpu); + spin_lock_init(&lp->lock); + INIT_LIST_HEAD(&lp->list); + } +} + +static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc) +{ + struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores); + + vc->vcore_state = VCORE_PREEMPT; + vc->pcpu = smp_processor_id(); + if (vc->num_threads < threads_per_subcore) { + spin_lock(&lp->lock); + list_add_tail(&vc->preempt_list, &lp->list); + spin_unlock(&lp->lock); + } + + /* Start accumulating stolen time */ + kvmppc_core_start_stolen(vc); +} + +static void kvmppc_vcore_end_preempt(struct kvmppc_vcore *vc) +{ + struct preempted_vcore_list *lp; + + kvmppc_core_end_stolen(vc); + if (!list_empty(&vc->preempt_list)) { + lp = &per_cpu(preempted_vcores, vc->pcpu); + spin_lock(&lp->lock); + list_del_init(&vc->preempt_list); + spin_unlock(&lp->lock); + } + vc->vcore_state = VCORE_INACTIVE; +} + +/* + * This stores information about the virtual cores currently + * assigned to a physical core. + */ +struct core_info { + int n_subcores; + int max_subcore_threads; + int total_threads; + int subcore_threads[MAX_SUBCORES]; + struct kvm *subcore_vm[MAX_SUBCORES]; + struct list_head vcs[MAX_SUBCORES]; +}; + +/* + * This mapping means subcores 0 and 1 can use threads 0-3 and 4-7 + * respectively in 2-way micro-threading (split-core) mode. + */ +static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 }; + +static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc) +{ + int sub; + + memset(cip, 0, sizeof(*cip)); + cip->n_subcores = 1; + cip->max_subcore_threads = vc->num_threads; + cip->total_threads = vc->num_threads; + cip->subcore_threads[0] = vc->num_threads; + cip->subcore_vm[0] = vc->kvm; + for (sub = 0; sub < MAX_SUBCORES; ++sub) + INIT_LIST_HEAD(&cip->vcs[sub]); + list_add_tail(&vc->preempt_list, &cip->vcs[0]); +} + +static bool subcore_config_ok(int n_subcores, int n_threads) +{ + /* Can only dynamically split if unsplit to begin with */ + if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS) + return false; + if (n_subcores > MAX_SUBCORES) + return false; + if (n_subcores > 1) { + if (!(dynamic_mt_modes & 2)) + n_subcores = 4; + if (n_subcores > 2 && !(dynamic_mt_modes & 4)) + return false; + } + + return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS; +} + +static void init_master_vcore(struct kvmppc_vcore *vc) +{ + vc->master_vcore = vc; + vc->entry_exit_map = 0; + vc->in_guest = 0; + vc->napping_threads = 0; + vc->conferring_threads = 0; +} + +/* + * See if the existing subcores can be split into 3 (or fewer) subcores + * of at most two threads each, so we can fit in another vcore. This + * assumes there are at most two subcores and at most 6 threads in total. + */ +static bool can_split_piggybacked_subcores(struct core_info *cip) +{ + int sub, new_sub; + int large_sub = -1; + int thr; + int n_subcores = cip->n_subcores; + struct kvmppc_vcore *vc, *vcnext; + struct kvmppc_vcore *master_vc = NULL; + + for (sub = 0; sub < cip->n_subcores; ++sub) { + if (cip->subcore_threads[sub] <= 2) + continue; + if (large_sub >= 0) + return false; + large_sub = sub; + vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore, + preempt_list); + if (vc->num_threads > 2) + return false; + n_subcores += (cip->subcore_threads[sub] - 1) >> 1; + } + if (large_sub < 0 || !subcore_config_ok(n_subcores + 1, 2)) + return false; + + /* + * Seems feasible, so go through and move vcores to new subcores. + * Note that when we have two or more vcores in one subcore, + * all those vcores must have only one thread each. + */ + new_sub = cip->n_subcores; + thr = 0; + sub = large_sub; + list_for_each_entry_safe(vc, vcnext, &cip->vcs[sub], preempt_list) { + if (thr >= 2) { + list_del(&vc->preempt_list); + list_add_tail(&vc->preempt_list, &cip->vcs[new_sub]); + /* vc->num_threads must be 1 */ + if (++cip->subcore_threads[new_sub] == 1) { + cip->subcore_vm[new_sub] = vc->kvm; + init_master_vcore(vc); + master_vc = vc; + ++cip->n_subcores; + } else { + vc->master_vcore = master_vc; + ++new_sub; + } + } + thr += vc->num_threads; + } + cip->subcore_threads[large_sub] = 2; + cip->max_subcore_threads = 2; + + return true; +} + +static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) +{ + int n_threads = vc->num_threads; + int sub; + + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) + return false; + + if (n_threads < cip->max_subcore_threads) + n_threads = cip->max_subcore_threads; + if (subcore_config_ok(cip->n_subcores + 1, n_threads)) { + cip->max_subcore_threads = n_threads; + } else if (cip->n_subcores <= 2 && cip->total_threads <= 6 && + vc->num_threads <= 2) { + /* + * We may be able to fit another subcore in by + * splitting an existing subcore with 3 or 4 + * threads into two 2-thread subcores, or one + * with 5 or 6 threads into three subcores. + * We can only do this if those subcores have + * piggybacked virtual cores. + */ + if (!can_split_piggybacked_subcores(cip)) + return false; + } else { + return false; + } + + sub = cip->n_subcores; + ++cip->n_subcores; + cip->total_threads += vc->num_threads; + cip->subcore_threads[sub] = vc->num_threads; + cip->subcore_vm[sub] = vc->kvm; + init_master_vcore(vc); + list_del(&vc->preempt_list); + list_add_tail(&vc->preempt_list, &cip->vcs[sub]); + + return true; +} + +static bool can_piggyback_subcore(struct kvmppc_vcore *pvc, + struct core_info *cip, int sub) { - phys_addr_t phy_addr, mpp_addr; + struct kvmppc_vcore *vc; + int n_thr; - phy_addr = (phys_addr_t)virt_to_phys(vc->mpp_buffer); - mpp_addr = phy_addr & PPC_MPPE_ADDRESS_MASK; + vc = list_first_entry(&cip->vcs[sub], struct kvmppc_vcore, + preempt_list); - mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_ABORT); - logmpp(mpp_addr | PPC_LOGMPP_LOG_L2); + /* require same VM and same per-core reg values */ + if (pvc->kvm != vc->kvm || + pvc->tb_offset != vc->tb_offset || + pvc->pcr != vc->pcr || + pvc->lpcr != vc->lpcr) + return false; - vc->mpp_buffer_is_valid = true; + /* P8 guest with > 1 thread per core would see wrong TIR value */ + if (cpu_has_feature(CPU_FTR_ARCH_207S) && + (vc->num_threads > 1 || pvc->num_threads > 1)) + return false; + + n_thr = cip->subcore_threads[sub] + pvc->num_threads; + if (n_thr > cip->max_subcore_threads) { + if (!subcore_config_ok(cip->n_subcores, n_thr)) + return false; + cip->max_subcore_threads = n_thr; + } + + cip->total_threads += pvc->num_threads; + cip->subcore_threads[sub] = n_thr; + pvc->master_vcore = vc; + list_del(&pvc->preempt_list); + list_add_tail(&pvc->preempt_list, &cip->vcs[sub]); + + return true; } -static void kvmppc_start_restoring_l2_cache(const struct kvmppc_vcore *vc) +/* + * Work out whether it is possible to piggyback the execution of + * vcore *pvc onto the execution of the other vcores described in *cip. + */ +static bool can_piggyback(struct kvmppc_vcore *pvc, struct core_info *cip, + int target_threads) { - phys_addr_t phy_addr, mpp_addr; + int sub; - phy_addr = virt_to_phys(vc->mpp_buffer); - mpp_addr = phy_addr & PPC_MPPE_ADDRESS_MASK; + if (cip->total_threads + pvc->num_threads > target_threads) + return false; + for (sub = 0; sub < cip->n_subcores; ++sub) + if (cip->subcore_threads[sub] && + can_piggyback_subcore(pvc, cip, sub)) + return true; + + if (can_dynamic_split(pvc, cip)) + return true; - /* We must abort any in-progress save operations to ensure - * the table is valid so that prefetch engine knows when to - * stop prefetching. */ - logmpp(mpp_addr | PPC_LOGMPP_LOG_ABORT); - mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE); + return false; } static void prepare_threads(struct kvmppc_vcore *vc) @@ -1909,12 +2177,45 @@ static void prepare_threads(struct kvmppc_vcore *vc) } } -static void post_guest_process(struct kvmppc_vcore *vc) +static void collect_piggybacks(struct core_info *cip, int target_threads) +{ + struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores); + struct kvmppc_vcore *pvc, *vcnext; + + spin_lock(&lp->lock); + list_for_each_entry_safe(pvc, vcnext, &lp->list, preempt_list) { + if (!spin_trylock(&pvc->lock)) + continue; + prepare_threads(pvc); + if (!pvc->n_runnable) { + list_del_init(&pvc->preempt_list); + if (pvc->runner == NULL) { + pvc->vcore_state = VCORE_INACTIVE; + kvmppc_core_end_stolen(pvc); + } + spin_unlock(&pvc->lock); + continue; + } + if (!can_piggyback(pvc, cip, target_threads)) { + spin_unlock(&pvc->lock); + continue; + } + kvmppc_core_end_stolen(pvc); + pvc->vcore_state = VCORE_PIGGYBACK; + if (cip->total_threads >= target_threads) + break; + } + spin_unlock(&lp->lock); +} + +static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) { + int still_running = 0; u64 now; long ret; struct kvm_vcpu *vcpu, *vnext; + spin_lock(&vc->lock); now = get_tb(); list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, arch.run_list) { @@ -1933,17 +2234,36 @@ static void post_guest_process(struct kvmppc_vcore *vc) vcpu->arch.ret = ret; vcpu->arch.trap = 0; - if (vcpu->arch.ceded) { - if (!is_kvmppc_resume_guest(ret)) - kvmppc_end_cede(vcpu); - else + if (is_kvmppc_resume_guest(vcpu->arch.ret)) { + if (vcpu->arch.pending_exceptions) + kvmppc_core_prepare_to_enter(vcpu); + if (vcpu->arch.ceded) kvmppc_set_timer(vcpu); - } - if (!is_kvmppc_resume_guest(vcpu->arch.ret)) { + else + ++still_running; + } else { kvmppc_remove_runnable(vc, vcpu); wake_up(&vcpu->arch.cpu_run); } } + list_del_init(&vc->preempt_list); + if (!is_master) { + if (still_running > 0) { + kvmppc_vcore_preempt(vc); + } else if (vc->runner) { + vc->vcore_state = VCORE_PREEMPT; + kvmppc_core_start_stolen(vc); + } else { + vc->vcore_state = VCORE_INACTIVE; + } + if (vc->n_runnable > 0 && vc->runner == NULL) { + /* make sure there's a candidate runner awake */ + vcpu = list_first_entry(&vc->runnable_threads, + struct kvm_vcpu, arch.run_list); + wake_up(&vcpu->arch.cpu_run); + } + } + spin_unlock(&vc->lock); } /* @@ -1955,6 +2275,15 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) struct kvm_vcpu *vcpu, *vnext; int i; int srcu_idx; + struct core_info core_info; + struct kvmppc_vcore *pvc, *vcnext; + struct kvm_split_mode split_info, *sip; + int split, subcore_size, active; + int sub; + bool thr0_done; + unsigned long cmd_bit, stat_bit; + int pcpu, thr; + int target_threads; /* * Remove from the list any threads that have a signal pending @@ -1969,11 +2298,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) /* * Initialize *vc. */ - vc->entry_exit_map = 0; + init_master_vcore(vc); vc->preempt_tb = TB_NIL; - vc->in_guest = 0; - vc->napping_threads = 0; - vc->conferring_threads = 0; /* * Make sure we are running on primary threads, and that secondary @@ -1991,60 +2317,176 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) goto out; } + /* + * See if we could run any other vcores on the physical core + * along with this one. + */ + init_core_info(&core_info, vc); + pcpu = smp_processor_id(); + target_threads = threads_per_subcore; + if (target_smt_mode && target_smt_mode < target_threads) + target_threads = target_smt_mode; + if (vc->num_threads < target_threads) + collect_piggybacks(&core_info, target_threads); + + /* Decide on micro-threading (split-core) mode */ + subcore_size = threads_per_subcore; + cmd_bit = stat_bit = 0; + split = core_info.n_subcores; + sip = NULL; + if (split > 1) { + /* threads_per_subcore must be MAX_SMT_THREADS (8) here */ + if (split == 2 && (dynamic_mt_modes & 2)) { + cmd_bit = HID0_POWER8_1TO2LPAR; + stat_bit = HID0_POWER8_2LPARMODE; + } else { + split = 4; + cmd_bit = HID0_POWER8_1TO4LPAR; + stat_bit = HID0_POWER8_4LPARMODE; + } + subcore_size = MAX_SMT_THREADS / split; + sip = &split_info; + memset(&split_info, 0, sizeof(split_info)); + split_info.rpr = mfspr(SPRN_RPR); + split_info.pmmar = mfspr(SPRN_PMMAR); + split_info.ldbar = mfspr(SPRN_LDBAR); + split_info.subcore_size = subcore_size; + for (sub = 0; sub < core_info.n_subcores; ++sub) + split_info.master_vcs[sub] = + list_first_entry(&core_info.vcs[sub], + struct kvmppc_vcore, preempt_list); + /* order writes to split_info before kvm_split_mode pointer */ + smp_wmb(); + } + pcpu = smp_processor_id(); + for (thr = 0; thr < threads_per_subcore; ++thr) + paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip; + + /* Initiate micro-threading (split-core) if required */ + if (cmd_bit) { + unsigned long hid0 = mfspr(SPRN_HID0); + + hid0 |= cmd_bit | HID0_POWER8_DYNLPARDIS; + mb(); + mtspr(SPRN_HID0, hid0); + isync(); + for (;;) { + hid0 = mfspr(SPRN_HID0); + if (hid0 & stat_bit) + break; + cpu_relax(); + } + } - vc->pcpu = smp_processor_id(); - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { - kvmppc_start_thread(vcpu); - kvmppc_create_dtl_entry(vcpu, vc); - trace_kvm_guest_enter(vcpu); + /* Start all the threads */ + active = 0; + for (sub = 0; sub < core_info.n_subcores; ++sub) { + thr = subcore_thread_map[sub]; + thr0_done = false; + active |= 1 << thr; + list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) { + pvc->pcpu = pcpu + thr; + list_for_each_entry(vcpu, &pvc->runnable_threads, + arch.run_list) { + kvmppc_start_thread(vcpu, pvc); + kvmppc_create_dtl_entry(vcpu, pvc); + trace_kvm_guest_enter(vcpu); + if (!vcpu->arch.ptid) + thr0_done = true; + active |= 1 << (thr + vcpu->arch.ptid); + } + /* + * We need to start the first thread of each subcore + * even if it doesn't have a vcpu. + */ + if (pvc->master_vcore == pvc && !thr0_done) + kvmppc_start_thread(NULL, pvc); + thr += pvc->num_threads; + } } - /* Set this explicitly in case thread 0 doesn't have a vcpu */ - get_paca()->kvm_hstate.kvm_vcore = vc; - get_paca()->kvm_hstate.ptid = 0; + /* + * Ensure that split_info.do_nap is set after setting + * the vcore pointer in the PACA of the secondaries. + */ + smp_mb(); + if (cmd_bit) + split_info.do_nap = 1; /* ask secondaries to nap when done */ + + /* + * When doing micro-threading, poke the inactive threads as well. + * This gets them to the nap instruction after kvm_do_nap, + * which reduces the time taken to unsplit later. + */ + if (split > 1) + for (thr = 1; thr < threads_per_subcore; ++thr) + if (!(active & (1 << thr))) + kvmppc_ipi_thread(pcpu + thr); vc->vcore_state = VCORE_RUNNING; preempt_disable(); trace_kvmppc_run_core(vc, 0); - spin_unlock(&vc->lock); + for (sub = 0; sub < core_info.n_subcores; ++sub) + list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) + spin_unlock(&pvc->lock); kvm_guest_enter(); srcu_idx = srcu_read_lock(&vc->kvm->srcu); - if (vc->mpp_buffer_is_valid) - kvmppc_start_restoring_l2_cache(vc); - __kvmppc_vcore_entry(); - spin_lock(&vc->lock); + srcu_read_unlock(&vc->kvm->srcu, srcu_idx); - if (vc->mpp_buffer) - kvmppc_start_saving_l2_cache(vc); + spin_lock(&vc->lock); + /* prevent other vcpu threads from doing kvmppc_start_thread() now */ + vc->vcore_state = VCORE_EXITING; - /* disable sending of IPIs on virtual external irqs */ - list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) - vcpu->cpu = -1; /* wait for secondary threads to finish writing their state to memory */ kvmppc_wait_for_nap(); - for (i = 0; i < threads_per_subcore; ++i) - kvmppc_release_hwthread(vc->pcpu + i); - /* prevent other vcpu threads from doing kvmppc_start_thread() now */ - vc->vcore_state = VCORE_EXITING; - spin_unlock(&vc->lock); - srcu_read_unlock(&vc->kvm->srcu, srcu_idx); + /* Return to whole-core mode if we split the core earlier */ + if (split > 1) { + unsigned long hid0 = mfspr(SPRN_HID0); + unsigned long loops = 0; + + hid0 &= ~HID0_POWER8_DYNLPARDIS; + stat_bit = HID0_POWER8_2LPARMODE | HID0_POWER8_4LPARMODE; + mb(); + mtspr(SPRN_HID0, hid0); + isync(); + for (;;) { + hid0 = mfspr(SPRN_HID0); + if (!(hid0 & stat_bit)) + break; + cpu_relax(); + ++loops; + } + split_info.do_nap = 0; + } + + /* Let secondaries go back to the offline loop */ + for (i = 0; i < threads_per_subcore; ++i) { + kvmppc_release_hwthread(pcpu + i); + if (sip && sip->napped[i]) + kvmppc_ipi_thread(pcpu + i); + } + + spin_unlock(&vc->lock); /* make sure updates to secondary vcpu structs are visible now */ smp_mb(); kvm_guest_exit(); - preempt_enable(); + for (sub = 0; sub < core_info.n_subcores; ++sub) + list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub], + preempt_list) + post_guest_process(pvc, pvc == vc); spin_lock(&vc->lock); - post_guest_process(vc); + preempt_enable(); out: vc->vcore_state = VCORE_INACTIVE; @@ -2055,13 +2497,17 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * Wait for some other vcpu thread to execute us, and * wake us up when we need to handle something in the host. */ -static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state) +static void kvmppc_wait_for_exec(struct kvmppc_vcore *vc, + struct kvm_vcpu *vcpu, int wait_state) { DEFINE_WAIT(wait); prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state); - if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) + if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { + spin_unlock(&vc->lock); schedule(); + spin_lock(&vc->lock); + } finish_wait(&vcpu->arch.cpu_run, &wait); } @@ -2073,9 +2519,9 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) { struct kvm_vcpu *vcpu; int do_sleep = 1; - DEFINE_SWAITER(wait); + DECLARE_SWAITQUEUE(wait); - swait_prepare(&vc->wq, &wait, TASK_INTERRUPTIBLE); + prepare_to_swait(&vc->wq, &wait, TASK_INTERRUPTIBLE); /* * Check one last time for pending exceptions and ceded state after @@ -2089,7 +2535,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) } if (!do_sleep) { - swait_finish(&vc->wq, &wait); + finish_swait(&vc->wq, &wait); return; } @@ -2097,7 +2543,7 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) trace_kvmppc_vcore_blocked(vc, 0); spin_unlock(&vc->lock); schedule(); - swait_finish(&vc->wq, &wait); + finish_swait(&vc->wq, &wait); spin_lock(&vc->lock); vc->vcore_state = VCORE_INACTIVE; trace_kvmppc_vcore_blocked(vc, 1); @@ -2136,22 +2582,35 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) * this thread straight away and have it join in. */ if (!signal_pending(current)) { - if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) { + if (vc->vcore_state == VCORE_PIGGYBACK) { + struct kvmppc_vcore *mvc = vc->master_vcore; + if (spin_trylock(&mvc->lock)) { + if (mvc->vcore_state == VCORE_RUNNING && + !VCORE_IS_EXITING(mvc)) { + kvmppc_create_dtl_entry(vcpu, vc); + kvmppc_start_thread(vcpu, vc); + trace_kvm_guest_enter(vcpu); + } + spin_unlock(&mvc->lock); + } + } else if (vc->vcore_state == VCORE_RUNNING && + !VCORE_IS_EXITING(vc)) { kvmppc_create_dtl_entry(vcpu, vc); - kvmppc_start_thread(vcpu); + kvmppc_start_thread(vcpu, vc); trace_kvm_guest_enter(vcpu); } else if (vc->vcore_state == VCORE_SLEEPING) { - swait_wake(&vc->wq); + swake_up(&vc->wq); } } while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && !signal_pending(current)) { + if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL) + kvmppc_vcore_end_preempt(vc); + if (vc->vcore_state != VCORE_INACTIVE) { - spin_unlock(&vc->lock); - kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE); - spin_lock(&vc->lock); + kvmppc_wait_for_exec(vc, vcpu, TASK_INTERRUPTIBLE); continue; } list_for_each_entry_safe(v, vn, &vc->runnable_threads, @@ -2177,11 +2636,12 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) vc->runner = vcpu; if (n_ceded == vc->n_runnable) { kvmppc_vcore_blocked(vc); - } else if (should_resched()) { - vc->vcore_state = VCORE_PREEMPT; + } else if (need_resched()) { + kvmppc_vcore_preempt(vc); /* Let something else run */ cond_resched_lock(&vc->lock); - vc->vcore_state = VCORE_INACTIVE; + if (vc->vcore_state == VCORE_PREEMPT) + kvmppc_vcore_end_preempt(vc); } else { kvmppc_run_core(vc); } @@ -2190,11 +2650,12 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && (vc->vcore_state == VCORE_RUNNING || - vc->vcore_state == VCORE_EXITING)) { - spin_unlock(&vc->lock); - kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); - spin_lock(&vc->lock); - } + vc->vcore_state == VCORE_EXITING || + vc->vcore_state == VCORE_PIGGYBACK)) + kvmppc_wait_for_exec(vc, vcpu, TASK_UNINTERRUPTIBLE); + + if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL) + kvmppc_vcore_end_preempt(vc); if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { kvmppc_remove_runnable(vc, vcpu); @@ -2320,6 +2781,7 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, struct kvm_dirty_log *log) { + struct kvm_memslots *slots; struct kvm_memory_slot *memslot; int r; unsigned long n; @@ -2330,7 +2792,8 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, if (log->slot >= KVM_USER_MEM_SLOTS) goto out; - memslot = id_to_memslot(kvm->memslots, log->slot); + slots = kvm_memslots(kvm); + memslot = id_to_memslot(slots, log->slot); r = -ENOENT; if (!memslot->dirty_bitmap) goto out; @@ -2373,16 +2836,18 @@ static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region *mem) { return 0; } static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old) + const struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new) { unsigned long npages = mem->memory_size >> PAGE_SHIFT; + struct kvm_memslots *slots; struct kvm_memory_slot *memslot; if (npages && old->npages) { @@ -2392,7 +2857,8 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, * since the rmap array starts out as all zeroes, * i.e. no pages are dirty. */ - memslot = id_to_memslot(kvm->memslots, mem->slot); + slots = kvm_memslots(kvm); + memslot = id_to_memslot(slots, mem->slot); kvmppc_hv_get_dirty_log(kvm, memslot, NULL); } } @@ -2565,14 +3031,8 @@ static void kvmppc_free_vcores(struct kvm *kvm) { long int i; - for (i = 0; i < KVM_MAX_VCORES; ++i) { - if (kvm->arch.vcores[i] && kvm->arch.vcores[i]->mpp_buffer) { - struct kvmppc_vcore *vc = kvm->arch.vcores[i]; - free_pages((unsigned long)vc->mpp_buffer, - MPP_BUFFER_ORDER); - } + for (i = 0; i < KVM_MAX_VCORES; ++i) kfree(kvm->arch.vcores[i]); - } kvm->arch.online_vcores = 0; } @@ -2749,6 +3209,8 @@ static int kvmppc_book3s_init_hv(void) init_default_hcalls(); + init_vcore_lists(); + r = kvmppc_mmu_hv_init(); return r; } diff --git a/kernel/arch/powerpc/kvm/book3s_hv_builtin.c b/kernel/arch/powerpc/kvm/book3s_hv_builtin.c index ed2589d45..fd7006bf6 100644 --- a/kernel/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/kernel/arch/powerpc/kvm/book3s_hv_builtin.c @@ -110,14 +110,15 @@ void __init kvm_cma_reserve(void) long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target, unsigned int yield_count) { - struct kvmppc_vcore *vc = vcpu->arch.vcore; + struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore; + int ptid = local_paca->kvm_hstate.ptid; int threads_running; int threads_ceded; int threads_conferring; u64 stop = get_tb() + 10 * tb_ticks_per_usec; int rv = H_SUCCESS; /* => don't yield */ - set_bit(vcpu->arch.ptid, &vc->conferring_threads); + set_bit(ptid, &vc->conferring_threads); while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) { threads_running = VCORE_ENTRY_MAP(vc); threads_ceded = vc->napping_threads; @@ -127,7 +128,7 @@ long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target, break; } } - clear_bit(vcpu->arch.ptid, &vc->conferring_threads); + clear_bit(ptid, &vc->conferring_threads); return rv; } @@ -238,7 +239,8 @@ void kvmhv_commence_exit(int trap) { struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore; int ptid = local_paca->kvm_hstate.ptid; - int me, ee; + struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode; + int me, ee, i; /* Set our bit in the threads-exiting-guest map in the 0xff00 bits of vcore->entry_exit_map */ @@ -258,4 +260,26 @@ void kvmhv_commence_exit(int trap) */ if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER) kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid)); + + /* + * If we are doing dynamic micro-threading, interrupt the other + * subcores to pull them out of their guests too. + */ + if (!sip) + return; + + for (i = 0; i < MAX_SUBCORES; ++i) { + vc = sip->master_vcs[i]; + if (!vc) + break; + do { + ee = vc->entry_exit_map; + /* Already asked to exit? */ + if ((ee >> 8) != 0) + break; + } while (cmpxchg(&vc->entry_exit_map, ee, + ee | VCORE_EXIT_REQ) != ee); + if ((ee >> 8) == 0) + kvmhv_interrupt_vcore(vc, ee); + } } diff --git a/kernel/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/kernel/arch/powerpc/kvm/book3s_hv_rm_mmu.c index c6d601cc9..91700518b 100644 --- a/kernel/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/kernel/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -12,6 +12,7 @@ #include <linux/kvm_host.h> #include <linux/hugetlb.h> #include <linux/module.h> +#include <linux/log2.h> #include <asm/tlbflush.h> #include <asm/kvm_ppc.h> @@ -31,7 +32,7 @@ static void *real_vmalloc_addr(void *x) * So don't worry about THP collapse/split. Called * Only in realmode, hence won't need irq_save/restore. */ - p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL); + p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL, NULL); if (!p || !pte_present(*p)) return NULL; addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK); @@ -97,25 +98,52 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, } EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); +/* Update the changed page order field of an rmap entry */ +void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize) +{ + unsigned long order; + + if (!psize) + return; + order = ilog2(psize); + order <<= KVMPPC_RMAP_CHG_SHIFT; + if (order > (*rmap & KVMPPC_RMAP_CHG_ORDER)) + *rmap = (*rmap & ~KVMPPC_RMAP_CHG_ORDER) | order; +} +EXPORT_SYMBOL_GPL(kvmppc_update_rmap_change); + +/* Returns a pointer to the revmap entry for the page mapped by a HPTE */ +static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v, + unsigned long hpte_gr) +{ + struct kvm_memory_slot *memslot; + unsigned long *rmap; + unsigned long gfn; + + gfn = hpte_rpn(hpte_gr, hpte_page_size(hpte_v, hpte_gr)); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); + if (!memslot) + return NULL; + + rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]); + return rmap; +} + /* Remove this HPTE from the chain for a real page */ static void remove_revmap_chain(struct kvm *kvm, long pte_index, struct revmap_entry *rev, unsigned long hpte_v, unsigned long hpte_r) { struct revmap_entry *next, *prev; - unsigned long gfn, ptel, head; - struct kvm_memory_slot *memslot; + unsigned long ptel, head; unsigned long *rmap; unsigned long rcbits; rcbits = hpte_r & (HPTE_R_R | HPTE_R_C); ptel = rev->guest_rpte |= rcbits; - gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); - memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); - if (!memslot) + rmap = revmap_for_hpte(kvm, hpte_v, ptel); + if (!rmap) return; - - rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]); lock_rmap(rmap); head = *rmap & KVMPPC_RMAP_INDEX; @@ -131,6 +159,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index, *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head; } *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT; + if (rcbits & HPTE_R_C) + kvmppc_update_rmap_change(rmap, hpte_page_size(hpte_v, hpte_r)); unlock_rmap(rmap); } @@ -191,10 +221,12 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, * retry via mmu_notifier_retry. */ if (realmode) - ptep = __find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift); + ptep = __find_linux_pte_or_hugepte(pgdir, hva, NULL, + &hpage_shift); else { local_irq_save(irq_flags); - ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift); + ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL, + &hpage_shift); } if (ptep) { pte_t pte; @@ -440,6 +472,8 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, note_hpte_modification(kvm, rev); unlock_hpte(hpte, 0); + if (v & HPTE_V_ABSENT) + v = (v & ~HPTE_V_ABSENT) | HPTE_V_VALID; hpret[0] = v; hpret[1] = r; return H_SUCCESS; @@ -661,6 +695,105 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, return H_SUCCESS; } +long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index) +{ + struct kvm *kvm = vcpu->kvm; + __be64 *hpte; + unsigned long v, r, gr; + struct revmap_entry *rev; + unsigned long *rmap; + long ret = H_NOT_FOUND; + + if (pte_index >= kvm->arch.hpt_npte) + return H_PARAMETER; + + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + v = be64_to_cpu(hpte[0]); + r = be64_to_cpu(hpte[1]); + if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT))) + goto out; + + gr = rev->guest_rpte; + if (rev->guest_rpte & HPTE_R_R) { + rev->guest_rpte &= ~HPTE_R_R; + note_hpte_modification(kvm, rev); + } + if (v & HPTE_V_VALID) { + gr |= r & (HPTE_R_R | HPTE_R_C); + if (r & HPTE_R_R) { + kvmppc_clear_ref_hpte(kvm, hpte, pte_index); + rmap = revmap_for_hpte(kvm, v, gr); + if (rmap) { + lock_rmap(rmap); + *rmap |= KVMPPC_RMAP_REFERENCED; + unlock_rmap(rmap); + } + } + } + vcpu->arch.gpr[4] = gr; + ret = H_SUCCESS; + out: + unlock_hpte(hpte, v & ~HPTE_V_HVLOCK); + return ret; +} + +long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index) +{ + struct kvm *kvm = vcpu->kvm; + __be64 *hpte; + unsigned long v, r, gr; + struct revmap_entry *rev; + unsigned long *rmap; + long ret = H_NOT_FOUND; + + if (pte_index >= kvm->arch.hpt_npte) + return H_PARAMETER; + + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + v = be64_to_cpu(hpte[0]); + r = be64_to_cpu(hpte[1]); + if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT))) + goto out; + + gr = rev->guest_rpte; + if (gr & HPTE_R_C) { + rev->guest_rpte &= ~HPTE_R_C; + note_hpte_modification(kvm, rev); + } + if (v & HPTE_V_VALID) { + /* need to make it temporarily absent so C is stable */ + hpte[0] |= cpu_to_be64(HPTE_V_ABSENT); + kvmppc_invalidate_hpte(kvm, hpte, pte_index); + r = be64_to_cpu(hpte[1]); + gr |= r & (HPTE_R_R | HPTE_R_C); + if (r & HPTE_R_C) { + unsigned long psize = hpte_page_size(v, r); + hpte[1] = cpu_to_be64(r & ~HPTE_R_C); + eieio(); + rmap = revmap_for_hpte(kvm, v, gr); + if (rmap) { + lock_rmap(rmap); + *rmap |= KVMPPC_RMAP_CHANGED; + kvmppc_update_rmap_change(rmap, psize); + unlock_rmap(rmap); + } + } + } + vcpu->arch.gpr[4] = gr; + ret = H_SUCCESS; + out: + unlock_hpte(hpte, v & ~HPTE_V_HVLOCK); + return ret; +} + void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, unsigned long pte_index) { diff --git a/kernel/arch/powerpc/kvm/book3s_hv_rm_xics.c b/kernel/arch/powerpc/kvm/book3s_hv_rm_xics.c index 00e45b6d4..24f58076d 100644 --- a/kernel/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/kernel/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -67,14 +67,12 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, } /* Check if the core is loaded, if not, too hard */ - cpu = vcpu->cpu; + cpu = vcpu->arch.thread_cpu; if (cpu < 0 || cpu >= nr_cpu_ids) { this_icp->rm_action |= XICS_RM_KICK_VCPU; this_icp->rm_kick_target = vcpu; return; } - /* In SMT cpu will always point to thread 0, we adjust it */ - cpu += vcpu->arch.ptid; smp_mb(); kvmhv_rm_send_ipi(cpu); diff --git a/kernel/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/kernel/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 3b2d2c5b6..463af88c9 100644 --- a/kernel/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/kernel/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -128,6 +128,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) subf r4, r4, r3 mtspr SPRN_DEC, r4 + /* hwthread_req may have got set by cede or no vcpu, so clear it */ + li r0, 0 + stb r0, HSTATE_HWTHREAD_REQ(r13) + /* * For external and machine check interrupts, we need * to call the Linux handler to process the interrupt. @@ -146,6 +150,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL beq 11f + cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL + beq 15f /* Invoke the H_DOORBELL handler */ cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI beq cr2, 14f /* HMI check */ @@ -170,6 +176,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_HSRR1, r7 b hmi_exception_after_realmode +15: mtspr SPRN_HSRR0, r8 + mtspr SPRN_HSRR1, r7 + ba 0xe80 + kvmppc_primary_no_guest: /* We handle this much like a ceded vcpu */ /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ @@ -215,7 +225,6 @@ kvm_novcpu_wakeup: ld r5, HSTATE_KVM_VCORE(r13) li r0, 0 stb r0, HSTATE_NAPPING(r13) - stb r0, HSTATE_HWTHREAD_REQ(r13) /* check the wake reason */ bl kvmppc_check_wake_reason @@ -315,33 +324,54 @@ kvm_start_guest: cmpdi r3, 0 bge kvm_no_guest - /* get vcpu pointer, NULL if we have no vcpu to run */ - ld r4,HSTATE_KVM_VCPU(r13) - cmpdi r4,0 - /* if we have no vcpu to run, go back to sleep */ + /* get vcore pointer, NULL if we have nothing to run */ + ld r5,HSTATE_KVM_VCORE(r13) + cmpdi r5,0 + /* if we have no vcore to run, go back to sleep */ beq kvm_no_guest kvm_secondary_got_guest: /* Set HSTATE_DSCR(r13) to something sensible */ - ld r6, PACA_DSCR(r13) + ld r6, PACA_DSCR_DEFAULT(r13) std r6, HSTATE_DSCR(r13) - /* Order load of vcore, ptid etc. after load of vcpu */ + /* On thread 0 of a subcore, set HDEC to max */ + lbz r4, HSTATE_PTID(r13) + cmpwi r4, 0 + bne 63f + lis r6, 0x7fff + ori r6, r6, 0xffff + mtspr SPRN_HDEC, r6 + /* and set per-LPAR registers, if doing dynamic micro-threading */ + ld r6, HSTATE_SPLIT_MODE(r13) + cmpdi r6, 0 + beq 63f + ld r0, KVM_SPLIT_RPR(r6) + mtspr SPRN_RPR, r0 + ld r0, KVM_SPLIT_PMMAR(r6) + mtspr SPRN_PMMAR, r0 + ld r0, KVM_SPLIT_LDBAR(r6) + mtspr SPRN_LDBAR, r0 + isync +63: + /* Order load of vcpu after load of vcore */ lwsync + ld r4, HSTATE_KVM_VCPU(r13) bl kvmppc_hv_entry /* Back from the guest, go back to nap */ - /* Clear our vcpu pointer so we don't come back in early */ + /* Clear our vcpu and vcore pointers so we don't come back in early */ li r0, 0 + std r0, HSTATE_KVM_VCPU(r13) /* - * Once we clear HSTATE_KVM_VCPU(r13), the code in + * Once we clear HSTATE_KVM_VCORE(r13), the code in * kvmppc_run_core() is going to assume that all our vcpu * state is visible in memory. This lwsync makes sure * that that is true. */ lwsync - std r0, HSTATE_KVM_VCPU(r13) + std r0, HSTATE_KVM_VCORE(r13) /* * At this point we have finished executing in the guest. @@ -374,16 +404,71 @@ kvm_no_guest: b power7_wakeup_loss 53: HMT_LOW - ld r4, HSTATE_KVM_VCPU(r13) - cmpdi r4, 0 + ld r5, HSTATE_KVM_VCORE(r13) + cmpdi r5, 0 + bne 60f + ld r3, HSTATE_SPLIT_MODE(r13) + cmpdi r3, 0 + beq kvm_no_guest + lbz r0, KVM_SPLIT_DO_NAP(r3) + cmpwi r0, 0 beq kvm_no_guest HMT_MEDIUM + b kvm_unsplit_nap +60: HMT_MEDIUM b kvm_secondary_got_guest 54: li r0, KVM_HWTHREAD_IN_KVM stb r0, HSTATE_HWTHREAD_STATE(r13) b kvm_no_guest +/* + * Here the primary thread is trying to return the core to + * whole-core mode, so we need to nap. + */ +kvm_unsplit_nap: + /* + * Ensure that secondary doesn't nap when it has + * its vcore pointer set. + */ + sync /* matches smp_mb() before setting split_info.do_nap */ + ld r0, HSTATE_KVM_VCORE(r13) + cmpdi r0, 0 + bne kvm_no_guest + /* clear any pending message */ +BEGIN_FTR_SECTION + lis r6, (PPC_DBELL_SERVER << (63-36))@h + PPC_MSGCLR(6) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + /* Set kvm_split_mode.napped[tid] = 1 */ + ld r3, HSTATE_SPLIT_MODE(r13) + li r0, 1 + lhz r4, PACAPACAINDEX(r13) + clrldi r4, r4, 61 /* micro-threading => P8 => 8 threads/core */ + addi r4, r4, KVM_SPLIT_NAPPED + stbx r0, r3, r4 + /* Check the do_nap flag again after setting napped[] */ + sync + lbz r0, KVM_SPLIT_DO_NAP(r3) + cmpwi r0, 0 + beq 57f + li r3, (LPCR_PECEDH | LPCR_PECE0) >> 4 + mfspr r4, SPRN_LPCR + rlwimi r4, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1) + mtspr SPRN_LPCR, r4 + isync + std r0, HSTATE_SCRATCH0(r13) + ptesync + ld r0, HSTATE_SCRATCH0(r13) +1: cmpd r0, r0 + bne 1b + nap + b . + +57: li r0, 0 + stbx r0, r3, r4 + b kvm_no_guest + /****************************************************************************** * * * Entry code * @@ -854,7 +939,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) cmpwi r0, 0 bne 21f HMT_LOW -20: lbz r0, VCORE_IN_GUEST(r5) +20: lwz r3, VCORE_ENTRY_EXIT(r5) + cmpwi r3, 0x100 + bge no_switch_exit + lbz r0, VCORE_IN_GUEST(r5) cmpwi r0, 0 beq 20b HMT_MEDIUM @@ -870,7 +958,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) blt hdec_soon ld r6, VCPU_CTR(r4) - lwz r7, VCPU_XER(r4) + ld r7, VCPU_XER(r4) mtctr r6 mtxer r7 @@ -985,9 +1073,13 @@ secondary_too_late: #endif 11: b kvmhv_switch_to_host +no_switch_exit: + HMT_MEDIUM + li r12, 0 + b 12f hdec_soon: li r12, BOOK3S_INTERRUPT_HV_DECREMENTER - stw r12, VCPU_TRAP(r4) +12: stw r12, VCPU_TRAP(r4) mr r9, r4 #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING addi r3, r4, VCPU_TB_RMEXIT @@ -1103,7 +1195,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) mfctr r3 mfxer r4 std r3, VCPU_CTR(r9) - stw r4, VCPU_XER(r9) + std r4, VCPU_XER(r9) /* If this is a page table miss then see if it's theirs or ours */ cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE @@ -1171,12 +1263,18 @@ mc_cont: bl kvmhv_accumulate_time #endif + mr r3, r12 /* Increment exit count, poke other threads to exit */ bl kvmhv_commence_exit nop ld r9, HSTATE_KVM_VCPU(r13) lwz r12, VCPU_TRAP(r9) + /* Stop others sending VCPU interrupts to this physical CPU */ + li r0, -1 + stw r0, VCPU_CPU(r9) + stw r0, VCPU_THREAD_CPU(r9) + /* Save guest CTRL register, set runlatch to 1 */ mfspr r6,SPRN_CTRLF stw r6,VCPU_CTRL(r9) @@ -1272,6 +1370,20 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) std r6, VCPU_ACOP(r9) stw r7, VCPU_GUEST_PID(r9) std r8, VCPU_WORT(r9) + /* + * Restore various registers to 0, where non-zero values + * set by the guest could disrupt the host. + */ + li r0, 0 + mtspr SPRN_IAMR, r0 + mtspr SPRN_CIABR, r0 + mtspr SPRN_DAWRX, r0 + mtspr SPRN_TCSCR, r0 + mtspr SPRN_WORT, r0 + /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */ + li r0, 1 + sldi r0, r0, 31 + mtspr SPRN_MMCRS, r0 8: /* Save and reset AMR and UAMOR before turning on the MMU */ @@ -1541,12 +1653,17 @@ kvmhv_switch_to_host: /* Primary thread waits for all the secondaries to exit guest */ 15: lwz r3,VCORE_ENTRY_EXIT(r5) - srwi r0,r3,8 + rlwinm r0,r3,32-8,0xff clrldi r3,r3,56 cmpw r3,r0 bne 15b isync + /* Did we actually switch to the guest at all? */ + lbz r6, VCORE_IN_GUEST(r5) + cmpwi r6, 0 + beq 19f + /* Primary thread switches back to host partition */ ld r6,KVM_HOST_SDR1(r4) lwz r7,KVM_HOST_LPID(r4) @@ -1590,7 +1707,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 18: /* Signal secondary CPUs to continue */ stb r0,VCORE_IN_GUEST(r5) - lis r8,0x7fff /* MAX_INT@h */ +19: lis r8,0x7fff /* MAX_INT@h */ mtspr SPRN_HDEC,r8 16: ld r8,KVM_HOST_LPCR(r4) @@ -1646,7 +1763,8 @@ kvmppc_hdsi: beq 3f clrrdi r0, r4, 28 PPC_SLBFEE_DOT(R5, R0) /* if so, look up SLB */ - bne 1f /* if no SLB entry found */ + li r0, BOOK3S_INTERRUPT_DATA_SEGMENT + bne 7f /* if no SLB entry found */ 4: std r4, VCPU_FAULT_DAR(r9) stw r6, VCPU_FAULT_DSISR(r9) @@ -1665,18 +1783,19 @@ kvmppc_hdsi: cmpdi r3, -2 /* MMIO emulation; need instr word */ beq 2f - /* Synthesize a DSI for the guest */ + /* Synthesize a DSI (or DSegI) for the guest */ ld r4, VCPU_FAULT_DAR(r9) mr r6, r3 -1: mtspr SPRN_DAR, r4 +1: li r0, BOOK3S_INTERRUPT_DATA_STORAGE mtspr SPRN_DSISR, r6 +7: mtspr SPRN_DAR, r4 mtspr SPRN_SRR0, r10 mtspr SPRN_SRR1, r11 - li r10, BOOK3S_INTERRUPT_DATA_STORAGE + mr r10, r0 bl kvmppc_msr_interrupt fast_interrupt_c_return: 6: ld r7, VCPU_CTR(r9) - lwz r8, VCPU_XER(r9) + ld r8, VCPU_XER(r9) mtctr r7 mtxer r8 mr r4, r9 @@ -1720,7 +1839,8 @@ kvmppc_hisi: beq 3f clrrdi r0, r10, 28 PPC_SLBFEE_DOT(R5, R0) /* if so, look up SLB */ - bne 1f /* if no SLB entry found */ + li r0, BOOK3S_INTERRUPT_INST_SEGMENT + bne 7f /* if no SLB entry found */ 4: /* Search the hash table. */ mr r3, r9 /* vcpu pointer */ @@ -1737,11 +1857,12 @@ kvmppc_hisi: cmpdi r3, -1 /* handle in kernel mode */ beq guest_exit_cont - /* Synthesize an ISI for the guest */ + /* Synthesize an ISI (or ISegI) for the guest */ mr r11, r3 -1: mtspr SPRN_SRR0, r10 +1: li r0, BOOK3S_INTERRUPT_INST_STORAGE +7: mtspr SPRN_SRR0, r10 mtspr SPRN_SRR1, r11 - li r10, BOOK3S_INTERRUPT_INST_STORAGE + mr r10, r0 bl kvmppc_msr_interrupt b fast_interrupt_c_return @@ -1817,8 +1938,8 @@ hcall_real_table: .long DOTSYM(kvmppc_h_remove) - hcall_real_table .long DOTSYM(kvmppc_h_enter) - hcall_real_table .long DOTSYM(kvmppc_h_read) - hcall_real_table - .long 0 /* 0x10 - H_CLEAR_MOD */ - .long 0 /* 0x14 - H_CLEAR_REF */ + .long DOTSYM(kvmppc_h_clear_mod) - hcall_real_table + .long DOTSYM(kvmppc_h_clear_ref) - hcall_real_table .long DOTSYM(kvmppc_h_protect) - hcall_real_table .long DOTSYM(kvmppc_h_get_tce) - hcall_real_table .long DOTSYM(kvmppc_h_put_tce) - hcall_real_table @@ -2046,7 +2167,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */ 2: rlwimi r5, r4, 5, DAWRX_DR | DAWRX_DW - rlwimi r5, r4, 1, DAWRX_WT + rlwimi r5, r4, 2, DAWRX_WT clrrdi r4, r4, 3 std r4, VCPU_DAWR(r3) std r5, VCPU_DAWRX(r3) @@ -2280,7 +2401,6 @@ machine_check_realmode: mr r3, r9 /* get vcpu pointer */ bl kvmppc_realmode_machine_check nop - cmpdi r3, 0 /* Did we handle MCE ? */ ld r9, HSTATE_KVM_VCPU(r13) li r12, BOOK3S_INTERRUPT_MACHINE_CHECK /* @@ -2293,13 +2413,18 @@ machine_check_realmode: * The old code used to return to host for unhandled errors which * was causing guest to hang with soft lockups inside guest and * makes it difficult to recover guest instance. + * + * if we receive machine check with MSR(RI=0) then deliver it to + * guest as machine check causing guest to crash. */ - ld r10, VCPU_PC(r9) ld r11, VCPU_MSR(r9) + andi. r10, r11, MSR_RI /* check for unrecoverable exception */ + beq 1f /* Deliver a machine check to guest */ + ld r10, VCPU_PC(r9) + cmpdi r3, 0 /* Did we handle MCE ? */ bne 2f /* Continue guest execution. */ /* If not, deliver a machine check. SRR0/1 are already set */ - li r10, BOOK3S_INTERRUPT_MACHINE_CHECK - ld r11, VCPU_MSR(r9) +1: li r10, BOOK3S_INTERRUPT_MACHINE_CHECK bl kvmppc_msr_interrupt 2: b fast_interrupt_c_return @@ -2339,14 +2464,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* hypervisor doorbell */ 3: li r12, BOOK3S_INTERRUPT_H_DOORBELL + + /* + * Clear the doorbell as we will invoke the handler + * explicitly in the guest exit path. + */ + lis r6, (PPC_DBELL_SERVER << (63-36))@h + PPC_MSGCLR(6) /* see if it's a host IPI */ li r3, 1 lbz r0, HSTATE_HOST_IPI(r13) cmpwi r0, 0 bnelr - /* if not, clear it and return -1 */ - lis r6, (PPC_DBELL_SERVER << (63-36))@h - PPC_MSGCLR(6) + /* if not, return -1 */ li r3, -1 blr diff --git a/kernel/arch/powerpc/kvm/book3s_paired_singles.c b/kernel/arch/powerpc/kvm/book3s_paired_singles.c index bd6ab1672..a759d9adb 100644 --- a/kernel/arch/powerpc/kvm/book3s_paired_singles.c +++ b/kernel/arch/powerpc/kvm/book3s_paired_singles.c @@ -352,7 +352,7 @@ static inline u32 inst_get_field(u32 inst, int msb, int lsb) return kvmppc_get_field(inst, msb + 32, lsb + 32); } -bool kvmppc_inst_is_paired_single(struct kvm_vcpu *vcpu, u32 inst) +static bool kvmppc_inst_is_paired_single(struct kvm_vcpu *vcpu, u32 inst) { if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)) return false; diff --git a/kernel/arch/powerpc/kvm/book3s_pr.c b/kernel/arch/powerpc/kvm/book3s_pr.c index f57383941..64891b081 100644 --- a/kernel/arch/powerpc/kvm/book3s_pr.c +++ b/kernel/arch/powerpc/kvm/book3s_pr.c @@ -1530,6 +1530,7 @@ out: static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm, struct kvm_dirty_log *log) { + struct kvm_memslots *slots; struct kvm_memory_slot *memslot; struct kvm_vcpu *vcpu; ulong ga, ga_end; @@ -1545,7 +1546,8 @@ static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm, /* If nothing is dirty, don't bother messing with page tables. */ if (is_dirty) { - memslot = id_to_memslot(kvm->memslots, log->slot); + slots = kvm_memslots(kvm); + memslot = id_to_memslot(slots, log->slot); ga = memslot->base_gfn << PAGE_SHIFT; ga_end = ga + (memslot->npages << PAGE_SHIFT); @@ -1571,14 +1573,15 @@ static void kvmppc_core_flush_memslot_pr(struct kvm *kvm, static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region *mem) { return 0; } static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old) + const struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new) { return; } diff --git a/kernel/arch/powerpc/kvm/book3s_segment.S b/kernel/arch/powerpc/kvm/book3s_segment.S index acee37cde..ca8f17428 100644 --- a/kernel/arch/powerpc/kvm/book3s_segment.S +++ b/kernel/arch/powerpc/kvm/book3s_segment.S @@ -123,7 +123,7 @@ no_dcbz32_on: PPC_LL r8, SVCPU_CTR(r3) PPC_LL r9, SVCPU_LR(r3) lwz r10, SVCPU_CR(r3) - lwz r11, SVCPU_XER(r3) + PPC_LL r11, SVCPU_XER(r3) mtctr r8 mtlr r9 @@ -237,7 +237,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) mfctr r8 mflr r9 - stw r5, SVCPU_XER(r13) + PPC_STL r5, SVCPU_XER(r13) PPC_STL r6, SVCPU_FAULT_DAR(r13) stw r7, SVCPU_FAULT_DSISR(r13) PPC_STL r8, SVCPU_CTR(r13) diff --git a/kernel/arch/powerpc/kvm/book3s_xics.c b/kernel/arch/powerpc/kvm/book3s_xics.c index c6ca7db64..905e94a13 100644 --- a/kernel/arch/powerpc/kvm/book3s_xics.c +++ b/kernel/arch/powerpc/kvm/book3s_xics.c @@ -41,7 +41,7 @@ * ======= * * Each ICS has a spin lock protecting the information about the IRQ - * sources and avoiding simultaneous deliveries if the same interrupt. + * sources and avoiding simultaneous deliveries of the same interrupt. * * ICP operations are done via a single compare & swap transaction * (most ICP state fits in the union kvmppc_icp_state) diff --git a/kernel/arch/powerpc/kvm/booke.c b/kernel/arch/powerpc/kvm/booke.c index 6c1316a15..fd5875179 100644 --- a/kernel/arch/powerpc/kvm/booke.c +++ b/kernel/arch/powerpc/kvm/booke.c @@ -63,6 +63,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "dec", VCPU_STAT(dec_exits) }, { "ext_intr", VCPU_STAT(ext_intr_exits) }, { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, + { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) }, { "halt_wakeup", VCPU_STAT(halt_wakeup) }, { "doorbell", VCPU_STAT(dbell_exits) }, { "guest doorbell", VCPU_STAT(gdbell_exits) }, @@ -933,6 +934,7 @@ static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, #endif break; case BOOKE_INTERRUPT_CRITICAL: + kvmppc_fill_pt_regs(®s); unknown_exception(®s); break; case BOOKE_INTERRUPT_DEBUG: @@ -1004,10 +1006,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } - local_irq_enable(); - trace_kvm_exit(exit_nr, vcpu); - kvm_guest_exit(); + __kvm_guest_exit(); + + local_irq_enable(); run->exit_reason = KVM_EXIT_UNKNOWN; run->ready_for_interrupt_injection = 1; @@ -1784,14 +1786,15 @@ int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, int kvmppc_core_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem) + const struct kvm_userspace_memory_region *mem) { return 0; } void kvmppc_core_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, - const struct kvm_memory_slot *old) + const struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new) { } diff --git a/kernel/arch/powerpc/kvm/e500.c b/kernel/arch/powerpc/kvm/e500.c index b29ce752c..32fdab57d 100644 --- a/kernel/arch/powerpc/kvm/e500.c +++ b/kernel/arch/powerpc/kvm/e500.c @@ -237,7 +237,8 @@ void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, struct kvm_book3e_206_tlb_entry *gtlbe) { struct vcpu_id_table *idt = vcpu_e500->idt; - unsigned int pr, tid, ts, pid; + unsigned int pr, tid, ts; + int pid; u32 val, eaddr; unsigned long flags; diff --git a/kernel/arch/powerpc/kvm/e500_emulate.c b/kernel/arch/powerpc/kvm/e500_emulate.c index ce7291c79..990db69a1 100644 --- a/kernel/arch/powerpc/kvm/e500_emulate.c +++ b/kernel/arch/powerpc/kvm/e500_emulate.c @@ -15,6 +15,7 @@ #include <asm/kvm_ppc.h> #include <asm/disassemble.h> #include <asm/dbell.h> +#include <asm/reg_booke.h> #include "booke.h" #include "e500.h" @@ -22,6 +23,7 @@ #define XOP_DCBTLS 166 #define XOP_MSGSND 206 #define XOP_MSGCLR 238 +#define XOP_MFTMR 366 #define XOP_TLBIVAX 786 #define XOP_TLBSX 914 #define XOP_TLBRE 946 @@ -113,6 +115,19 @@ static int kvmppc_e500_emul_dcbtls(struct kvm_vcpu *vcpu) return EMULATE_DONE; } +static int kvmppc_e500_emul_mftmr(struct kvm_vcpu *vcpu, unsigned int inst, + int rt) +{ + /* Expose one thread per vcpu */ + if (get_tmrn(inst) == TMRN_TMCFG0) { + kvmppc_set_gpr(vcpu, rt, + 1 | (1 << TMRN_TMCFG0_NATHRD_SHIFT)); + return EMULATE_DONE; + } + + return EMULATE_FAIL; +} + int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int inst, int *advance) { @@ -165,6 +180,10 @@ int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); break; + case XOP_MFTMR: + emulated = kvmppc_e500_emul_mftmr(vcpu, inst, rt); + break; + case XOP_EHPRIV: emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst, advance); diff --git a/kernel/arch/powerpc/kvm/e500_mmu.c b/kernel/arch/powerpc/kvm/e500_mmu.c index 50860e919..29911a07b 100644 --- a/kernel/arch/powerpc/kvm/e500_mmu.c +++ b/kernel/arch/powerpc/kvm/e500_mmu.c @@ -377,7 +377,7 @@ int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea) | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); vcpu->arch.shared->mas1 = (vcpu->arch.shared->mas6 & MAS6_SPID0) - | (vcpu->arch.shared->mas6 & (MAS6_SAS ? MAS1_TS : 0)) + | ((vcpu->arch.shared->mas6 & MAS6_SAS) ? MAS1_TS : 0) | (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0)); vcpu->arch.shared->mas2 &= MAS2_EPN; vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 & diff --git a/kernel/arch/powerpc/kvm/e500_mmu_host.c b/kernel/arch/powerpc/kvm/e500_mmu_host.c index 4d33e199e..34c43fff4 100644 --- a/kernel/arch/powerpc/kvm/e500_mmu_host.c +++ b/kernel/arch/powerpc/kvm/e500_mmu_host.c @@ -406,7 +406,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { unsigned long gfn_start, gfn_end; - tsize_pages = 1 << (tsize - 2); + tsize_pages = 1UL << (tsize - 2); gfn_start = gfn & ~(tsize_pages - 1); gfn_end = gfn_start + tsize_pages; @@ -447,7 +447,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, } if (likely(!pfnmap)) { - tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); + tsize_pages = 1UL << (tsize + 10 - PAGE_SHIFT); pfn = gfn_to_pfn_memslot(slot, gfn); if (is_error_noslot_pfn(pfn)) { if (printk_ratelimit()) @@ -476,7 +476,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, * can't run hence pfn won't change. */ local_irq_save(flags); - ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL); + ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL, NULL); if (ptep) { pte_t pte = READ_ONCE(*ptep); diff --git a/kernel/arch/powerpc/kvm/powerpc.c b/kernel/arch/powerpc/kvm/powerpc.c index ac3ddf115..a3b182dcb 100644 --- a/kernel/arch/powerpc/kvm/powerpc.c +++ b/kernel/arch/powerpc/kvm/powerpc.c @@ -115,7 +115,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) continue; } - kvm_guest_enter(); + __kvm_guest_enter(); return 1; } @@ -559,6 +559,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) else r = num_online_cpus(); break; + case KVM_CAP_NR_MEMSLOTS: + r = KVM_USER_MEM_SLOTS; + break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; @@ -595,18 +598,19 @@ int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, enum kvm_mr_change change) { return kvmppc_core_prepare_memory_region(kvm, memslot, mem); } void kvm_arch_commit_memory_region(struct kvm *kvm, - struct kvm_userspace_memory_region *mem, + const struct kvm_userspace_memory_region *mem, const struct kvm_memory_slot *old, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { - kvmppc_core_commit_memory_region(kvm, mem, old); + kvmppc_core_commit_memory_region(kvm, mem, old, new); } void kvm_arch_flush_shadow_memslot(struct kvm *kvm, @@ -659,7 +663,7 @@ int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) return kvmppc_core_pending_dec(vcpu); } -enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) +static enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) { struct kvm_vcpu *vcpu; @@ -915,21 +919,17 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) r = -ENXIO; break; } - vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; + val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0]; break; case KVM_REG_PPC_VSCR: if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { r = -ENXIO; break; } - vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val); + val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]); break; case KVM_REG_PPC_VRSAVE: - if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { - r = -ENXIO; - break; - } - vcpu->arch.vrsave = set_reg_val(reg->id, val); + val = get_reg_val(reg->id, vcpu->arch.vrsave); break; #endif /* CONFIG_ALTIVEC */ default: @@ -970,17 +970,21 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) r = -ENXIO; break; } - val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0]; + vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; break; case KVM_REG_PPC_VSCR: if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { r = -ENXIO; break; } - val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]); + vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val); break; case KVM_REG_PPC_VRSAVE: - val = get_reg_val(reg->id, vcpu->arch.vrsave); + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vrsave = set_reg_val(reg->id, val); break; #endif /* CONFIG_ALTIVEC */ default: |