diff options
author | Yunhong Jiang <yunhong.jiang@intel.com> | 2015-08-04 12:17:53 -0700 |
---|---|---|
committer | Yunhong Jiang <yunhong.jiang@intel.com> | 2015-08-04 15:44:42 -0700 |
commit | 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 (patch) | |
tree | 1c9cafbcd35f783a87880a10f85d1a060db1a563 /kernel/arch/powerpc/kvm | |
parent | 98260f3884f4a202f9ca5eabed40b1354c489b29 (diff) |
Add the rt linux 4.1.3-rt3 as base
Import RT Linux 4.1.3-rt3 as the OPNFV KVM base.
It is taken from the linux-4.1.y-rt branch of git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git, and
the base commit is:
commit 0917f823c59692d751951bf5ea699a2d1e2f26a2
Author: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Sat Jul 25 12:13:34 2015 +0200
Prepare v4.1.3-rt3
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Importing the tree this way loses all of the upstream git history, which is not good. We
should apply for another OPNFV project repo in the future.
Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423
Signed-off-by: Yunhong Jiang <yunhong.jiang@intel.com>
Diffstat (limited to 'kernel/arch/powerpc/kvm')
57 files changed, 33297 insertions, 0 deletions
diff --git a/kernel/arch/powerpc/kvm/Kconfig b/kernel/arch/powerpc/kvm/Kconfig new file mode 100644 index 000000000..d4c48506e --- /dev/null +++ b/kernel/arch/powerpc/kvm/Kconfig @@ -0,0 +1,199 @@ +# +# KVM configuration +# + +source "virt/kvm/Kconfig" + +menuconfig VIRTUALIZATION + bool "Virtualization" + ---help--- + Say Y here to get to see options for using your Linux host to run + other operating systems inside virtual machines (guests). + This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and + disabled. + +if VIRTUALIZATION + +config KVM + bool + select PREEMPT_NOTIFIERS + select ANON_INODES + select HAVE_KVM_EVENTFD + select SRCU + +config KVM_BOOK3S_HANDLER + bool + +config KVM_BOOK3S_32_HANDLER + bool + select KVM_BOOK3S_HANDLER + select KVM_MMIO + +config KVM_BOOK3S_64_HANDLER + bool + select KVM_BOOK3S_HANDLER + +config KVM_BOOK3S_PR_POSSIBLE + bool + select KVM_MMIO + select MMU_NOTIFIER + +config KVM_BOOK3S_HV_POSSIBLE + bool + +config KVM_BOOK3S_32 + tristate "KVM support for PowerPC book3s_32 processors" + depends on PPC_BOOK3S_32 && !SMP && !PTE_64BIT + select KVM + select KVM_BOOK3S_32_HANDLER + select KVM_BOOK3S_PR_POSSIBLE + ---help--- + Support running unmodified book3s_32 guest kernels + in virtual machines on book3s_32 host processors. + + This module provides access to the hardware capabilities through + a character device node named /dev/kvm. + + If unsure, say N. + +config KVM_BOOK3S_64 + tristate "KVM support for PowerPC book3s_64 processors" + depends on PPC_BOOK3S_64 + select KVM_BOOK3S_64_HANDLER + select KVM + select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE + ---help--- + Support running unmodified book3s_64 and book3s_32 guest kernels + in virtual machines on book3s_64 host processors. + + This module provides access to the hardware capabilities through + a character device node named /dev/kvm. + + If unsure, say N. 
+ +config KVM_BOOK3S_64_HV + tristate "KVM support for POWER7 and PPC970 using hypervisor mode in host" + depends on KVM_BOOK3S_64 && PPC_POWERNV + select KVM_BOOK3S_HV_POSSIBLE + select MMU_NOTIFIER + select CMA + ---help--- + Support running unmodified book3s_64 guest kernels in + virtual machines on POWER7 and PPC970 processors that have + hypervisor mode available to the host. + + If you say Y here, KVM will use the hardware virtualization + facilities of POWER7 (and later) processors, meaning that + guest operating systems will run at full hardware speed + using supervisor and user modes. However, this also means + that KVM is not usable under PowerVM (pHyp), is only usable + on POWER7 (or later) processors and PPC970-family processors, + and cannot emulate a different processor from the host processor. + + If unsure, say N. + +config KVM_BOOK3S_64_PR + tristate "KVM support without using hypervisor mode in host" + depends on KVM_BOOK3S_64 + select KVM_BOOK3S_PR_POSSIBLE + ---help--- + Support running guest kernels in virtual machines on processors + without using hypervisor mode in the host, by running the + guest in user mode (problem state) and emulating all + privileged instructions and registers. + + This is not as fast as using hypervisor mode, but works on + machines where hypervisor mode is not available or not usable, + and can emulate processors that are different from the host + processor, including emulating 32-bit processors on a 64-bit + host. + +config KVM_BOOK3S_HV_EXIT_TIMING + bool "Detailed timing for hypervisor real-mode code" + depends on KVM_BOOK3S_HV_POSSIBLE && DEBUG_FS + ---help--- + Calculate time taken for each vcpu in the real-mode guest entry, + exit, and interrupt handling code, plus time spent in the guest + and in nap mode due to idle (cede) while other threads are still + in the guest. The total, minimum and maximum times in nanoseconds + together with the number of executions are reported in debugfs in + kvm/vm#/vcpu#/timings. 
The overhead is of the order of 30 - 40 + ns per exit on POWER8. + + If unsure, say N. + +config KVM_BOOKE_HV + bool + +config KVM_EXIT_TIMING + bool "Detailed exit timing" + depends on KVM_E500V2 || KVM_E500MC + ---help--- + Calculate elapsed time for every exit/enter cycle. A per-vcpu + report is available in debugfs kvm/vm#_vcpu#_timing. + The overhead is relatively small, however it is not recommended for + production environments. + + If unsure, say N. + +config KVM_E500V2 + bool "KVM support for PowerPC E500v2 processors" + depends on E500 && !PPC_E500MC + select KVM + select KVM_MMIO + select MMU_NOTIFIER + ---help--- + Support running unmodified E500 guest kernels in virtual machines on + E500v2 host processors. + + This module provides access to the hardware capabilities through + a character device node named /dev/kvm. + + If unsure, say N. + +config KVM_E500MC + bool "KVM support for PowerPC E500MC/E5500/E6500 processors" + depends on PPC_E500MC + select KVM + select KVM_MMIO + select KVM_BOOKE_HV + select MMU_NOTIFIER + ---help--- + Support running unmodified E500MC/E5500/E6500 guest kernels in + virtual machines on E500MC/E5500/E6500 host processors. + + This module provides access to the hardware capabilities through + a character device node named /dev/kvm. + + If unsure, say N. + +config KVM_MPIC + bool "KVM in-kernel MPIC emulation" + depends on KVM && E500 + depends on !PREEMPT_RT_FULL + select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQFD + select HAVE_KVM_IRQ_ROUTING + select HAVE_KVM_MSI + help + Enable support for emulating MPIC devices inside the + host kernel, rather than relying on userspace to emulate. + Currently, support is limited to certain versions of + Freescale's MPIC implementation. 
+ +config KVM_XICS + bool "KVM in-kernel XICS emulation" + depends on KVM_BOOK3S_64 && !KVM_MPIC + select HAVE_KVM_IRQCHIP + select HAVE_KVM_IRQFD + default y + ---help--- + Include support for the XICS (eXternal Interrupt Controller + Specification) interrupt controller architecture used on + IBM POWER (pSeries) servers. + +source drivers/vhost/Kconfig + +endif # VIRTUALIZATION diff --git a/kernel/arch/powerpc/kvm/Makefile b/kernel/arch/powerpc/kvm/Makefile new file mode 100644 index 000000000..0570eef83 --- /dev/null +++ b/kernel/arch/powerpc/kvm/Makefile @@ -0,0 +1,128 @@ +# +# Makefile for Kernel-based Virtual Machine module +# + +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror + +ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm +KVM := ../../../virt/kvm + +common-objs-y = $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o \ + $(KVM)/eventfd.o + +CFLAGS_e500_mmu.o := -I. +CFLAGS_e500_mmu_host.o := -I. +CFLAGS_emulate.o := -I. +CFLAGS_emulate_loadstore.o := -I. + +common-objs-y += powerpc.o emulate.o emulate_loadstore.o +obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o +obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o + +AFLAGS_booke_interrupts.o := -I$(obj) + +kvm-e500-objs := \ + $(common-objs-y) \ + booke.o \ + booke_emulate.o \ + booke_interrupts.o \ + e500.o \ + e500_mmu.o \ + e500_mmu_host.o \ + e500_emulate.o +kvm-objs-$(CONFIG_KVM_E500V2) := $(kvm-e500-objs) + +kvm-e500mc-objs := \ + $(common-objs-y) \ + booke.o \ + booke_emulate.o \ + bookehv_interrupts.o \ + e500mc.o \ + e500_mmu.o \ + e500_mmu_host.o \ + e500_emulate.o +kvm-objs-$(CONFIG_KVM_E500MC) := $(kvm-e500mc-objs) + +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) := \ + book3s_64_vio_hv.o + +kvm-pr-y := \ + fpu.o \ + emulate.o \ + book3s_paired_singles.o \ + book3s_pr.o \ + book3s_pr_papr.o \ + book3s_emulate.o \ + book3s_interrupts.o \ + book3s_mmu_hpte.o \ + book3s_64_mmu_host.o \ + book3s_64_mmu.o \ + book3s_32_mmu.o + +ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE +kvm-book3s_64-module-objs := \ + 
$(KVM)/coalesced_mmio.o + +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ + book3s_rmhandlers.o +endif + +kvm-hv-y += \ + book3s_hv.o \ + book3s_hv_interrupts.o \ + book3s_64_mmu_hv.o + +kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ + book3s_hv_rm_xics.o + +ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ + book3s_hv_rmhandlers.o \ + book3s_hv_rm_mmu.o \ + book3s_hv_ras.o \ + book3s_hv_builtin.o \ + $(kvm-book3s_64-builtin-xics-objs-y) +endif + +kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \ + book3s_xics.o + +kvm-book3s_64-module-objs += \ + $(KVM)/kvm_main.o \ + $(KVM)/eventfd.o \ + powerpc.o \ + emulate_loadstore.o \ + book3s.o \ + book3s_64_vio.o \ + book3s_rtas.o \ + $(kvm-book3s_64-objs-y) + +kvm-objs-$(CONFIG_KVM_BOOK3S_64) := $(kvm-book3s_64-module-objs) + +kvm-book3s_32-objs := \ + $(common-objs-y) \ + fpu.o \ + book3s_paired_singles.o \ + book3s.o \ + book3s_pr.o \ + book3s_emulate.o \ + book3s_interrupts.o \ + book3s_mmu_hpte.o \ + book3s_32_mmu_host.o \ + book3s_32_mmu.o +kvm-objs-$(CONFIG_KVM_BOOK3S_32) := $(kvm-book3s_32-objs) + +kvm-objs-$(CONFIG_KVM_MPIC) += mpic.o +kvm-objs-$(CONFIG_HAVE_KVM_IRQ_ROUTING) += $(KVM)/irqchip.o + +kvm-objs := $(kvm-objs-m) $(kvm-objs-y) + +obj-$(CONFIG_KVM_E500V2) += kvm.o +obj-$(CONFIG_KVM_E500MC) += kvm.o +obj-$(CONFIG_KVM_BOOK3S_64) += kvm.o +obj-$(CONFIG_KVM_BOOK3S_32) += kvm.o + +obj-$(CONFIG_KVM_BOOK3S_64_PR) += kvm-pr.o +obj-$(CONFIG_KVM_BOOK3S_64_HV) += kvm-hv.o + +obj-y += $(kvm-book3s_64-builtin-objs-y) diff --git a/kernel/arch/powerpc/kvm/book3s.c b/kernel/arch/powerpc/kvm/book3s.c new file mode 100644 index 000000000..453a8a47a --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s.c @@ -0,0 +1,944 @@ +/* + * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved. 
+ * + * Authors: + * Alexander Graf <agraf@suse.de> + * Kevin Wolf <mail@kevin-wolf.de> + * + * Description: + * This file is derived from arch/powerpc/kvm/44x.c, + * by Hollis Blanchard <hollisb@us.ibm.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kvm_host.h> +#include <linux/err.h> +#include <linux/export.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/miscdevice.h> + +#include <asm/reg.h> +#include <asm/cputable.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu_context.h> +#include <asm/page.h> +#include <linux/gfp.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> + +#include "book3s.h" +#include "trace.h" + +#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU + +/* #define EXIT_DEBUG */ + +struct kvm_stats_debugfs_item debugfs_entries[] = { + { "exits", VCPU_STAT(sum_exits) }, + { "mmio", VCPU_STAT(mmio_exits) }, + { "sig", VCPU_STAT(signal_exits) }, + { "sysc", VCPU_STAT(syscall_exits) }, + { "inst_emu", VCPU_STAT(emulated_inst_exits) }, + { "dec", VCPU_STAT(dec_exits) }, + { "ext_intr", VCPU_STAT(ext_intr_exits) }, + { "queue_intr", VCPU_STAT(queue_intr) }, + { "halt_successful_poll", VCPU_STAT(halt_successful_poll), }, + { "halt_wakeup", VCPU_STAT(halt_wakeup) }, + { "pf_storage", VCPU_STAT(pf_storage) }, + { "sp_storage", VCPU_STAT(sp_storage) }, + { "pf_instruc", VCPU_STAT(pf_instruc) }, + { "sp_instruc", VCPU_STAT(sp_instruc) }, + { "ld", VCPU_STAT(ld) }, + { "ld_slow", VCPU_STAT(ld_slow) }, + { "st", VCPU_STAT(st) }, + { "st_slow", VCPU_STAT(st_slow) }, + { NULL } +}; + +void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.hflags & 
BOOK3S_HFLAG_SPLIT_HACK) { + ulong pc = kvmppc_get_pc(vcpu); + if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS) + kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK); + vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK; + } +} +EXPORT_SYMBOL_GPL(kvmppc_unfixup_split_real); + +static inline unsigned long kvmppc_interrupt_offset(struct kvm_vcpu *vcpu) +{ + if (!is_kvmppc_hv_enabled(vcpu->kvm)) + return to_book3s(vcpu)->hior; + return 0; +} + +static inline void kvmppc_update_int_pending(struct kvm_vcpu *vcpu, + unsigned long pending_now, unsigned long old_pending) +{ + if (is_kvmppc_hv_enabled(vcpu->kvm)) + return; + if (pending_now) + kvmppc_set_int_pending(vcpu, 1); + else if (old_pending) + kvmppc_set_int_pending(vcpu, 0); +} + +static inline bool kvmppc_critical_section(struct kvm_vcpu *vcpu) +{ + ulong crit_raw; + ulong crit_r1; + bool crit; + + if (is_kvmppc_hv_enabled(vcpu->kvm)) + return false; + + crit_raw = kvmppc_get_critical(vcpu); + crit_r1 = kvmppc_get_gpr(vcpu, 1); + + /* Truncate crit indicators in 32 bit mode */ + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) { + crit_raw &= 0xffffffff; + crit_r1 &= 0xffffffff; + } + + /* Critical section when crit == r1 */ + crit = (crit_raw == crit_r1); + /* ... 
and we're in supervisor mode */ + crit = crit && !(kvmppc_get_msr(vcpu) & MSR_PR); + + return crit; +} + +void kvmppc_inject_interrupt(struct kvm_vcpu *vcpu, int vec, u64 flags) +{ + kvmppc_unfixup_split_real(vcpu); + kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu)); + kvmppc_set_srr1(vcpu, kvmppc_get_msr(vcpu) | flags); + kvmppc_set_pc(vcpu, kvmppc_interrupt_offset(vcpu) + vec); + vcpu->arch.mmu.reset_msr(vcpu); +} + +static int kvmppc_book3s_vec2irqprio(unsigned int vec) +{ + unsigned int prio; + + switch (vec) { + case 0x100: prio = BOOK3S_IRQPRIO_SYSTEM_RESET; break; + case 0x200: prio = BOOK3S_IRQPRIO_MACHINE_CHECK; break; + case 0x300: prio = BOOK3S_IRQPRIO_DATA_STORAGE; break; + case 0x380: prio = BOOK3S_IRQPRIO_DATA_SEGMENT; break; + case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE; break; + case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT; break; + case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL; break; + case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL; break; + case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT; break; + case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM; break; + case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL; break; + case 0x900: prio = BOOK3S_IRQPRIO_DECREMENTER; break; + case 0xc00: prio = BOOK3S_IRQPRIO_SYSCALL; break; + case 0xd00: prio = BOOK3S_IRQPRIO_DEBUG; break; + case 0xf20: prio = BOOK3S_IRQPRIO_ALTIVEC; break; + case 0xf40: prio = BOOK3S_IRQPRIO_VSX; break; + case 0xf60: prio = BOOK3S_IRQPRIO_FAC_UNAVAIL; break; + default: prio = BOOK3S_IRQPRIO_MAX; break; + } + + return prio; +} + +void kvmppc_book3s_dequeue_irqprio(struct kvm_vcpu *vcpu, + unsigned int vec) +{ + unsigned long old_pending = vcpu->arch.pending_exceptions; + + clear_bit(kvmppc_book3s_vec2irqprio(vec), + &vcpu->arch.pending_exceptions); + + kvmppc_update_int_pending(vcpu, vcpu->arch.pending_exceptions, + old_pending); +} + +void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec) +{ + vcpu->stat.queue_intr++; + + set_bit(kvmppc_book3s_vec2irqprio(vec), + 
&vcpu->arch.pending_exceptions); +#ifdef EXIT_DEBUG + printk(KERN_INFO "Queueing interrupt %x\n", vec); +#endif +} +EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio); + +void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) +{ + /* might as well deliver this straight away */ + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_PROGRAM, flags); +} +EXPORT_SYMBOL_GPL(kvmppc_core_queue_program); + +void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) +{ + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); +} +EXPORT_SYMBOL_GPL(kvmppc_core_queue_dec); + +int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu) +{ + return test_bit(BOOK3S_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions); +} +EXPORT_SYMBOL_GPL(kvmppc_core_pending_dec); + +void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu) +{ + kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); +} +EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec); + +void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, + struct kvm_interrupt *irq) +{ + unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL; + + if (irq->irq == KVM_INTERRUPT_SET_LEVEL) + vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL; + + kvmppc_book3s_queue_irqprio(vcpu, vec); +} + +void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu) +{ + kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL); + kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL); +} + +void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar, + ulong flags) +{ + kvmppc_set_dar(vcpu, dar); + kvmppc_set_dsisr(vcpu, flags); + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); +} + +void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags) +{ + u64 msr = kvmppc_get_msr(vcpu); + msr &= ~(SRR1_ISI_NOPT | SRR1_ISI_N_OR_G | SRR1_ISI_PROT); + msr |= flags & (SRR1_ISI_NOPT | SRR1_ISI_N_OR_G | SRR1_ISI_PROT); + kvmppc_set_msr_fast(vcpu, msr); + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_INST_STORAGE); +} + +int 
kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu, unsigned int priority) +{ + int deliver = 1; + int vec = 0; + bool crit = kvmppc_critical_section(vcpu); + + switch (priority) { + case BOOK3S_IRQPRIO_DECREMENTER: + deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit; + vec = BOOK3S_INTERRUPT_DECREMENTER; + break; + case BOOK3S_IRQPRIO_EXTERNAL: + case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: + deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit; + vec = BOOK3S_INTERRUPT_EXTERNAL; + break; + case BOOK3S_IRQPRIO_SYSTEM_RESET: + vec = BOOK3S_INTERRUPT_SYSTEM_RESET; + break; + case BOOK3S_IRQPRIO_MACHINE_CHECK: + vec = BOOK3S_INTERRUPT_MACHINE_CHECK; + break; + case BOOK3S_IRQPRIO_DATA_STORAGE: + vec = BOOK3S_INTERRUPT_DATA_STORAGE; + break; + case BOOK3S_IRQPRIO_INST_STORAGE: + vec = BOOK3S_INTERRUPT_INST_STORAGE; + break; + case BOOK3S_IRQPRIO_DATA_SEGMENT: + vec = BOOK3S_INTERRUPT_DATA_SEGMENT; + break; + case BOOK3S_IRQPRIO_INST_SEGMENT: + vec = BOOK3S_INTERRUPT_INST_SEGMENT; + break; + case BOOK3S_IRQPRIO_ALIGNMENT: + vec = BOOK3S_INTERRUPT_ALIGNMENT; + break; + case BOOK3S_IRQPRIO_PROGRAM: + vec = BOOK3S_INTERRUPT_PROGRAM; + break; + case BOOK3S_IRQPRIO_VSX: + vec = BOOK3S_INTERRUPT_VSX; + break; + case BOOK3S_IRQPRIO_ALTIVEC: + vec = BOOK3S_INTERRUPT_ALTIVEC; + break; + case BOOK3S_IRQPRIO_FP_UNAVAIL: + vec = BOOK3S_INTERRUPT_FP_UNAVAIL; + break; + case BOOK3S_IRQPRIO_SYSCALL: + vec = BOOK3S_INTERRUPT_SYSCALL; + break; + case BOOK3S_IRQPRIO_DEBUG: + vec = BOOK3S_INTERRUPT_TRACE; + break; + case BOOK3S_IRQPRIO_PERFORMANCE_MONITOR: + vec = BOOK3S_INTERRUPT_PERFMON; + break; + case BOOK3S_IRQPRIO_FAC_UNAVAIL: + vec = BOOK3S_INTERRUPT_FAC_UNAVAIL; + break; + default: + deliver = 0; + printk(KERN_ERR "KVM: Unknown interrupt: 0x%x\n", priority); + break; + } + +#if 0 + printk(KERN_INFO "Deliver interrupt 0x%x? 
%x\n", vec, deliver); +#endif + + if (deliver) + kvmppc_inject_interrupt(vcpu, vec, 0); + + return deliver; +} + +/* + * This function determines if an irqprio should be cleared once issued. + */ +static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority) +{ + switch (priority) { + case BOOK3S_IRQPRIO_DECREMENTER: + /* DEC interrupts get cleared by mtdec */ + return false; + case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: + /* External interrupts get cleared by userspace */ + return false; + } + + return true; +} + +int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) +{ + unsigned long *pending = &vcpu->arch.pending_exceptions; + unsigned long old_pending = vcpu->arch.pending_exceptions; + unsigned int priority; + +#ifdef EXIT_DEBUG + if (vcpu->arch.pending_exceptions) + printk(KERN_EMERG "KVM: Check pending: %lx\n", vcpu->arch.pending_exceptions); +#endif + priority = __ffs(*pending); + while (priority < BOOK3S_IRQPRIO_MAX) { + if (kvmppc_book3s_irqprio_deliver(vcpu, priority) && + clear_irqprio(vcpu, priority)) { + clear_bit(priority, &vcpu->arch.pending_exceptions); + break; + } + + priority = find_next_bit(pending, + BITS_PER_BYTE * sizeof(*pending), + priority + 1); + } + + /* Tell the guest about our interrupt status */ + kvmppc_update_int_pending(vcpu, *pending, old_pending); + + return 0; +} +EXPORT_SYMBOL_GPL(kvmppc_core_prepare_to_enter); + +pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa, bool writing, + bool *writable) +{ + ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM; + gfn_t gfn = gpa >> PAGE_SHIFT; + + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) + mp_pa = (uint32_t)mp_pa; + + /* Magic page override */ + gpa &= ~0xFFFULL; + if (unlikely(mp_pa) && unlikely((gpa & KVM_PAM) == mp_pa)) { + ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; + pfn_t pfn; + + pfn = (pfn_t)virt_to_phys((void*)shared_page) >> PAGE_SHIFT; + get_page(pfn_to_page(pfn)); + if (writable) + *writable = true; + return pfn; + } + + return 
gfn_to_pfn_prot(vcpu->kvm, gfn, writing, writable); +} +EXPORT_SYMBOL_GPL(kvmppc_gpa_to_pfn); + +int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, enum xlate_instdata xlid, + enum xlate_readwrite xlrw, struct kvmppc_pte *pte) +{ + bool data = (xlid == XLATE_DATA); + bool iswrite = (xlrw == XLATE_WRITE); + int relocated = (kvmppc_get_msr(vcpu) & (data ? MSR_DR : MSR_IR)); + int r; + + if (relocated) { + r = vcpu->arch.mmu.xlate(vcpu, eaddr, pte, data, iswrite); + } else { + pte->eaddr = eaddr; + pte->raddr = eaddr & KVM_PAM; + pte->vpage = VSID_REAL | eaddr >> 12; + pte->may_read = true; + pte->may_write = true; + pte->may_execute = true; + r = 0; + + if ((kvmppc_get_msr(vcpu) & (MSR_IR | MSR_DR)) == MSR_DR && + !data) { + if ((vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) && + ((eaddr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)) + pte->raddr &= ~SPLIT_HACK_MASK; + } + } + + return r; +} + +int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type, + u32 *inst) +{ + ulong pc = kvmppc_get_pc(vcpu); + int r; + + if (type == INST_SC) + pc -= 4; + + r = kvmppc_ld(vcpu, &pc, sizeof(u32), inst, false); + if (r == EMULATE_DONE) + return r; + else + return EMULATE_AGAIN; +} +EXPORT_SYMBOL_GPL(kvmppc_load_last_inst); + +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +{ + return 0; +} + +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) +{ + return 0; +} + +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ +} + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); +} + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + int i; + + regs->pc = kvmppc_get_pc(vcpu); + regs->cr = kvmppc_get_cr(vcpu); + regs->ctr = kvmppc_get_ctr(vcpu); + regs->lr = kvmppc_get_lr(vcpu); + regs->xer = 
kvmppc_get_xer(vcpu); + regs->msr = kvmppc_get_msr(vcpu); + regs->srr0 = kvmppc_get_srr0(vcpu); + regs->srr1 = kvmppc_get_srr1(vcpu); + regs->pid = vcpu->arch.pid; + regs->sprg0 = kvmppc_get_sprg0(vcpu); + regs->sprg1 = kvmppc_get_sprg1(vcpu); + regs->sprg2 = kvmppc_get_sprg2(vcpu); + regs->sprg3 = kvmppc_get_sprg3(vcpu); + regs->sprg4 = kvmppc_get_sprg4(vcpu); + regs->sprg5 = kvmppc_get_sprg5(vcpu); + regs->sprg6 = kvmppc_get_sprg6(vcpu); + regs->sprg7 = kvmppc_get_sprg7(vcpu); + + for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) + regs->gpr[i] = kvmppc_get_gpr(vcpu, i); + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + int i; + + kvmppc_set_pc(vcpu, regs->pc); + kvmppc_set_cr(vcpu, regs->cr); + kvmppc_set_ctr(vcpu, regs->ctr); + kvmppc_set_lr(vcpu, regs->lr); + kvmppc_set_xer(vcpu, regs->xer); + kvmppc_set_msr(vcpu, regs->msr); + kvmppc_set_srr0(vcpu, regs->srr0); + kvmppc_set_srr1(vcpu, regs->srr1); + kvmppc_set_sprg0(vcpu, regs->sprg0); + kvmppc_set_sprg1(vcpu, regs->sprg1); + kvmppc_set_sprg2(vcpu, regs->sprg2); + kvmppc_set_sprg3(vcpu, regs->sprg3); + kvmppc_set_sprg4(vcpu, regs->sprg4); + kvmppc_set_sprg5(vcpu, regs->sprg5); + kvmppc_set_sprg6(vcpu, regs->sprg6); + kvmppc_set_sprg7(vcpu, regs->sprg7); + + for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) + kvmppc_set_gpr(vcpu, i, regs->gpr[i]); + + return 0; +} + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + return -ENOTSUPP; +} + +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + return -ENOTSUPP; +} + +int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + long int i; + + r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, id, val); + if (r == -EINVAL) { + r = 0; + switch (id) { + case KVM_REG_PPC_DAR: + *val = get_reg_val(id, kvmppc_get_dar(vcpu)); + break; + case KVM_REG_PPC_DSISR: + *val = get_reg_val(id, kvmppc_get_dsisr(vcpu)); + break; + case 
KVM_REG_PPC_FPR0 ... KVM_REG_PPC_FPR31: + i = id - KVM_REG_PPC_FPR0; + *val = get_reg_val(id, VCPU_FPR(vcpu, i)); + break; + case KVM_REG_PPC_FPSCR: + *val = get_reg_val(id, vcpu->arch.fp.fpscr); + break; +#ifdef CONFIG_VSX + case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: + if (cpu_has_feature(CPU_FTR_VSX)) { + i = id - KVM_REG_PPC_VSR0; + val->vsxval[0] = vcpu->arch.fp.fpr[i][0]; + val->vsxval[1] = vcpu->arch.fp.fpr[i][1]; + } else { + r = -ENXIO; + } + break; +#endif /* CONFIG_VSX */ + case KVM_REG_PPC_DEBUG_INST: + *val = get_reg_val(id, INS_TW); + break; +#ifdef CONFIG_KVM_XICS + case KVM_REG_PPC_ICP_STATE: + if (!vcpu->arch.icp) { + r = -ENXIO; + break; + } + *val = get_reg_val(id, kvmppc_xics_get_icp(vcpu)); + break; +#endif /* CONFIG_KVM_XICS */ + case KVM_REG_PPC_FSCR: + *val = get_reg_val(id, vcpu->arch.fscr); + break; + case KVM_REG_PPC_TAR: + *val = get_reg_val(id, vcpu->arch.tar); + break; + case KVM_REG_PPC_EBBHR: + *val = get_reg_val(id, vcpu->arch.ebbhr); + break; + case KVM_REG_PPC_EBBRR: + *val = get_reg_val(id, vcpu->arch.ebbrr); + break; + case KVM_REG_PPC_BESCR: + *val = get_reg_val(id, vcpu->arch.bescr); + break; + case KVM_REG_PPC_VTB: + *val = get_reg_val(id, vcpu->arch.vtb); + break; + case KVM_REG_PPC_IC: + *val = get_reg_val(id, vcpu->arch.ic); + break; + default: + r = -EINVAL; + break; + } + } + + return r; +} + +int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + long int i; + + r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, id, val); + if (r == -EINVAL) { + r = 0; + switch (id) { + case KVM_REG_PPC_DAR: + kvmppc_set_dar(vcpu, set_reg_val(id, *val)); + break; + case KVM_REG_PPC_DSISR: + kvmppc_set_dsisr(vcpu, set_reg_val(id, *val)); + break; + case KVM_REG_PPC_FPR0 ... 
KVM_REG_PPC_FPR31: + i = id - KVM_REG_PPC_FPR0; + VCPU_FPR(vcpu, i) = set_reg_val(id, *val); + break; + case KVM_REG_PPC_FPSCR: + vcpu->arch.fp.fpscr = set_reg_val(id, *val); + break; +#ifdef CONFIG_VSX + case KVM_REG_PPC_VSR0 ... KVM_REG_PPC_VSR31: + if (cpu_has_feature(CPU_FTR_VSX)) { + i = id - KVM_REG_PPC_VSR0; + vcpu->arch.fp.fpr[i][0] = val->vsxval[0]; + vcpu->arch.fp.fpr[i][1] = val->vsxval[1]; + } else { + r = -ENXIO; + } + break; +#endif /* CONFIG_VSX */ +#ifdef CONFIG_KVM_XICS + case KVM_REG_PPC_ICP_STATE: + if (!vcpu->arch.icp) { + r = -ENXIO; + break; + } + r = kvmppc_xics_set_icp(vcpu, + set_reg_val(id, *val)); + break; +#endif /* CONFIG_KVM_XICS */ + case KVM_REG_PPC_FSCR: + vcpu->arch.fscr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TAR: + vcpu->arch.tar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_EBBHR: + vcpu->arch.ebbhr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_EBBRR: + vcpu->arch.ebbrr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_BESCR: + vcpu->arch.bescr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_VTB: + vcpu->arch.vtb = set_reg_val(id, *val); + break; + case KVM_REG_PPC_IC: + vcpu->arch.ic = set_reg_val(id, *val); + break; + default: + r = -EINVAL; + break; + } + } + + return r; +} + +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu); +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu); +} + +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u64 msr) +{ + vcpu->kvm->arch.kvm_ops->set_msr(vcpu, msr); +} +EXPORT_SYMBOL_GPL(kvmppc_set_msr); + +int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.kvm_ops->vcpu_run(kvm_run, vcpu); +} + +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + return 0; +} + +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + vcpu->guest_debug = 
dbg->control; + return 0; +} + +void kvmppc_decrementer_func(struct kvm_vcpu *vcpu) +{ + kvmppc_core_queue_dec(vcpu); + kvm_vcpu_kick(vcpu); +} + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + return kvm->arch.kvm_ops->vcpu_create(kvm, id); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); +} + +int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +{ + return vcpu->kvm->arch.kvm_ops->check_requests(vcpu); +} + +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +{ + return kvm->arch.kvm_ops->get_dirty_log(kvm, log); +} + +void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + kvm->arch.kvm_ops->free_memslot(free, dont); +} + +int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) +{ + return kvm->arch.kvm_ops->create_memslot(slot, npages); +} + +void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ + kvm->arch.kvm_ops->flush_memslot(kvm, memslot); +} + +int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) +{ + return kvm->arch.kvm_ops->prepare_memory_region(kvm, memslot, mem); +} + +void kvmppc_core_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) +{ + kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old); +} + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm->arch.kvm_ops->unmap_hva(kvm, hva); +} +EXPORT_SYMBOL_GPL(kvm_unmap_hva); + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + return kvm->arch.kvm_ops->unmap_hva_range(kvm, start, end); +} + +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +{ + return kvm->arch.kvm_ops->age_hva(kvm, start, end); +} + +int kvm_test_age_hva(struct kvm 
*kvm, unsigned long hva) +{ + return kvm->arch.kvm_ops->test_age_hva(kvm, hva); +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte); +} + +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu); +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ + +#ifdef CONFIG_PPC64 + INIT_LIST_HEAD(&kvm->arch.spapr_tce_tables); + INIT_LIST_HEAD(&kvm->arch.rtas_tokens); +#endif + + return kvm->arch.kvm_ops->init_vm(kvm); +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ + kvm->arch.kvm_ops->destroy_vm(kvm); + +#ifdef CONFIG_PPC64 + kvmppc_rtas_tokens_free(kvm); + WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); +#endif +} + +int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu) +{ + unsigned long size = kvmppc_get_gpr(vcpu, 4); + unsigned long addr = kvmppc_get_gpr(vcpu, 5); + u64 buf; + int ret; + + if (!is_power_of_2(size) || (size > sizeof(buf))) + return H_TOO_HARD; + + ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, size, &buf); + if (ret != 0) + return H_TOO_HARD; + + switch (size) { + case 1: + kvmppc_set_gpr(vcpu, 4, *(u8 *)&buf); + break; + + case 2: + kvmppc_set_gpr(vcpu, 4, be16_to_cpu(*(__be16 *)&buf)); + break; + + case 4: + kvmppc_set_gpr(vcpu, 4, be32_to_cpu(*(__be32 *)&buf)); + break; + + case 8: + kvmppc_set_gpr(vcpu, 4, be64_to_cpu(*(__be64 *)&buf)); + break; + + default: + BUG(); + } + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_load); + +int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu) +{ + unsigned long size = kvmppc_get_gpr(vcpu, 4); + unsigned long addr = kvmppc_get_gpr(vcpu, 5); + unsigned long val = kvmppc_get_gpr(vcpu, 6); + u64 buf; + int ret; + + switch (size) { + case 1: + *(u8 *)&buf = val; + break; + + case 2: + *(__be16 *)&buf = cpu_to_be16(val); + break; + + case 4: + *(__be32 *)&buf = cpu_to_be32(val); + break; + + case 8: + *(__be64 *)&buf = cpu_to_be64(val); + break; + + default: + return H_TOO_HARD; + } + 
+ ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, size, &buf); + if (ret != 0) + return H_TOO_HARD; + + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_h_logical_ci_store); + +int kvmppc_core_check_processor_compat(void) +{ + /* + * We always return 0 for book3s. We check + * for compatability while loading the HV + * or PR module + */ + return 0; +} + +int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hcall) +{ + return kvm->arch.kvm_ops->hcall_implemented(hcall); +} + +static int kvmppc_book3s_init(void) +{ + int r; + + r = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE); + if (r) + return r; +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + r = kvmppc_book3s_init_pr(); +#endif + return r; + +} + +static void kvmppc_book3s_exit(void) +{ +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + kvmppc_book3s_exit_pr(); +#endif + kvm_exit(); +} + +module_init(kvmppc_book3s_init); +module_exit(kvmppc_book3s_exit); + +/* On 32bit this is our one and only kernel module */ +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); +#endif diff --git a/kernel/arch/powerpc/kvm/book3s.h b/kernel/arch/powerpc/kvm/book3s.h new file mode 100644 index 000000000..d2b3ec088 --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s.h @@ -0,0 +1,35 @@ +/* + * Copyright IBM Corporation, 2013 + * Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. 
+ * + */ + +#ifndef __POWERPC_KVM_BOOK3S_H__ +#define __POWERPC_KVM_BOOK3S_H__ + +extern void kvmppc_core_flush_memslot_hv(struct kvm *kvm, + struct kvm_memory_slot *memslot); +extern int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva); +extern int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, + unsigned long end); +extern int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, + unsigned long end); +extern int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva); +extern void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte); + +extern void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, + int sprn, ulong spr_val); +extern int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, + int sprn, ulong *spr_val); +extern int kvmppc_book3s_init_pr(void); +extern void kvmppc_book3s_exit_pr(void); + +#endif diff --git a/kernel/arch/powerpc/kvm/book3s_32_mmu.c b/kernel/arch/powerpc/kvm/book3s_32_mmu.c new file mode 100644 index 000000000..a2eb6d354 --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_32_mmu.c @@ -0,0 +1,430 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> + +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> + +/* #define DEBUG_MMU */ +/* #define DEBUG_MMU_PTE */ +/* #define DEBUG_MMU_PTE_IP 0xfff14c40 */ + +#ifdef DEBUG_MMU +#define dprintk(X...) printk(KERN_INFO X) +#else +#define dprintk(X...) do { } while(0) +#endif + +#ifdef DEBUG_MMU_PTE +#define dprintk_pte(X...) printk(KERN_INFO X) +#else +#define dprintk_pte(X...) do { } while(0) +#endif + +#define PTEG_FLAG_ACCESSED 0x00000100 +#define PTEG_FLAG_DIRTY 0x00000080 +#ifndef SID_SHIFT +#define SID_SHIFT 28 +#endif + +static inline bool check_debug_ip(struct kvm_vcpu *vcpu) +{ +#ifdef DEBUG_MMU_PTE_IP + return vcpu->arch.pc == DEBUG_MMU_PTE_IP; +#else + return true; +#endif +} + +static inline u32 sr_vsid(u32 sr_raw) +{ + return sr_raw & 0x0fffffff; +} + +static inline bool sr_valid(u32 sr_raw) +{ + return (sr_raw & 0x80000000) ? false : true; +} + +static inline bool sr_ks(u32 sr_raw) +{ + return (sr_raw & 0x40000000) ? true: false; +} + +static inline bool sr_kp(u32 sr_raw) +{ + return (sr_raw & 0x20000000) ? 
true: false; +} + +static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *pte, bool data, + bool iswrite); +static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, + u64 *vsid); + +static u32 find_sr(struct kvm_vcpu *vcpu, gva_t eaddr) +{ + return kvmppc_get_sr(vcpu, (eaddr >> 28) & 0xf); +} + +static u64 kvmppc_mmu_book3s_32_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, + bool data) +{ + u64 vsid; + struct kvmppc_pte pte; + + if (!kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, &pte, data, false)) + return pte.vpage; + + kvmppc_mmu_book3s_32_esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); + return (((u64)eaddr >> 12) & 0xffff) | (vsid << 16); +} + +static void kvmppc_mmu_book3s_32_reset_msr(struct kvm_vcpu *vcpu) +{ + kvmppc_set_msr(vcpu, 0); +} + +static hva_t kvmppc_mmu_book3s_32_get_pteg(struct kvm_vcpu *vcpu, + u32 sre, gva_t eaddr, + bool primary) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + u32 page, hash, pteg, htabmask; + hva_t r; + + page = (eaddr & 0x0FFFFFFF) >> 12; + htabmask = ((vcpu_book3s->sdr1 & 0x1FF) << 16) | 0xFFC0; + + hash = ((sr_vsid(sre) ^ page) << 6); + if (!primary) + hash = ~hash; + hash &= htabmask; + + pteg = (vcpu_book3s->sdr1 & 0xffff0000) | hash; + + dprintk("MMU: pc=0x%lx eaddr=0x%lx sdr1=0x%llx pteg=0x%x vsid=0x%x\n", + kvmppc_get_pc(vcpu), eaddr, vcpu_book3s->sdr1, pteg, + sr_vsid(sre)); + + r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT); + if (kvm_is_error_hva(r)) + return r; + return r | (pteg & ~PAGE_MASK); +} + +static u32 kvmppc_mmu_book3s_32_get_ptem(u32 sre, gva_t eaddr, bool primary) +{ + return ((eaddr & 0x0fffffff) >> 22) | (sr_vsid(sre) << 7) | + (primary ? 
0 : 0x40) | 0x80000000; +} + +static int kvmppc_mmu_book3s_32_xlate_bat(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *pte, bool data, + bool iswrite) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + struct kvmppc_bat *bat; + int i; + + for (i = 0; i < 8; i++) { + if (data) + bat = &vcpu_book3s->dbat[i]; + else + bat = &vcpu_book3s->ibat[i]; + + if (kvmppc_get_msr(vcpu) & MSR_PR) { + if (!bat->vp) + continue; + } else { + if (!bat->vs) + continue; + } + + if (check_debug_ip(vcpu)) + { + dprintk_pte("%cBAT %02d: 0x%lx - 0x%x (0x%x)\n", + data ? 'd' : 'i', i, eaddr, bat->bepi, + bat->bepi_mask); + } + if ((eaddr & bat->bepi_mask) == bat->bepi) { + u64 vsid; + kvmppc_mmu_book3s_32_esid_to_vsid(vcpu, + eaddr >> SID_SHIFT, &vsid); + vsid <<= 16; + pte->vpage = (((u64)eaddr >> 12) & 0xffff) | vsid; + + pte->raddr = bat->brpn | (eaddr & ~bat->bepi_mask); + pte->may_read = bat->pp; + pte->may_write = bat->pp > 1; + pte->may_execute = true; + if (!pte->may_read) { + printk(KERN_INFO "BAT is not readable!\n"); + continue; + } + if (iswrite && !pte->may_write) { + dprintk_pte("BAT is read-only!\n"); + continue; + } + + return 0; + } + } + + return -ENOENT; +} + +static int kvmppc_mmu_book3s_32_xlate_pte(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *pte, bool data, + bool iswrite, bool primary) +{ + u32 sre; + hva_t ptegp; + u32 pteg[16]; + u32 pte0, pte1; + u32 ptem = 0; + int i; + int found = 0; + + sre = find_sr(vcpu, eaddr); + + dprintk_pte("SR 0x%lx: vsid=0x%x, raw=0x%x\n", eaddr >> 28, + sr_vsid(sre), sre); + + pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data); + + ptegp = kvmppc_mmu_book3s_32_get_pteg(vcpu, sre, eaddr, primary); + if (kvm_is_error_hva(ptegp)) { + printk(KERN_INFO "KVM: Invalid PTEG!\n"); + goto no_page_found; + } + + ptem = kvmppc_mmu_book3s_32_get_ptem(sre, eaddr, primary); + + if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) { + printk(KERN_ERR "KVM: Can't copy data from 0x%lx!\n", ptegp); + 
goto no_page_found; + } + + for (i=0; i<16; i+=2) { + pte0 = be32_to_cpu(pteg[i]); + pte1 = be32_to_cpu(pteg[i + 1]); + if (ptem == pte0) { + u8 pp; + + pte->raddr = (pte1 & ~(0xFFFULL)) | (eaddr & 0xFFF); + pp = pte1 & 3; + + if ((sr_kp(sre) && (kvmppc_get_msr(vcpu) & MSR_PR)) || + (sr_ks(sre) && !(kvmppc_get_msr(vcpu) & MSR_PR))) + pp |= 4; + + pte->may_write = false; + pte->may_read = false; + pte->may_execute = true; + switch (pp) { + case 0: + case 1: + case 2: + case 6: + pte->may_write = true; + case 3: + case 5: + case 7: + pte->may_read = true; + break; + } + + dprintk_pte("MMU: Found PTE -> %x %x - %x\n", + pte0, pte1, pp); + found = 1; + break; + } + } + + /* Update PTE C and A bits, so the guest's swapper knows we used the + page */ + if (found) { + u32 pte_r = pte1; + char __user *addr = (char __user *) (ptegp + (i+1) * sizeof(u32)); + + /* + * Use single-byte writes to update the HPTE, to + * conform to what real hardware does. + */ + if (pte->may_read && !(pte_r & PTEG_FLAG_ACCESSED)) { + pte_r |= PTEG_FLAG_ACCESSED; + put_user(pte_r >> 8, addr + 2); + } + if (iswrite && pte->may_write && !(pte_r & PTEG_FLAG_DIRTY)) { + pte_r |= PTEG_FLAG_DIRTY; + put_user(pte_r, addr + 3); + } + if (!pte->may_read || (iswrite && !pte->may_write)) + return -EPERM; + return 0; + } + +no_page_found: + + if (check_debug_ip(vcpu)) { + dprintk_pte("KVM MMU: No PTE found (sdr1=0x%llx ptegp=0x%lx)\n", + to_book3s(vcpu)->sdr1, ptegp); + for (i=0; i<16; i+=2) { + dprintk_pte(" %02d: 0x%x - 0x%x (0x%x)\n", + i, be32_to_cpu(pteg[i]), + be32_to_cpu(pteg[i+1]), ptem); + } + } + + return -ENOENT; +} + +static int kvmppc_mmu_book3s_32_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *pte, bool data, + bool iswrite) +{ + int r; + ulong mp_ea = vcpu->arch.magic_page_ea; + + pte->eaddr = eaddr; + pte->page_size = MMU_PAGE_4K; + + /* Magic page override */ + if (unlikely(mp_ea) && + unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) && + !(kvmppc_get_msr(vcpu) & MSR_PR)) 
{ + pte->vpage = kvmppc_mmu_book3s_32_ea_to_vp(vcpu, eaddr, data); + pte->raddr = vcpu->arch.magic_page_pa | (pte->raddr & 0xfff); + pte->raddr &= KVM_PAM; + pte->may_execute = true; + pte->may_read = true; + pte->may_write = true; + + return 0; + } + + r = kvmppc_mmu_book3s_32_xlate_bat(vcpu, eaddr, pte, data, iswrite); + if (r < 0) + r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, + data, iswrite, true); + if (r == -ENOENT) + r = kvmppc_mmu_book3s_32_xlate_pte(vcpu, eaddr, pte, + data, iswrite, false); + + return r; +} + + +static u32 kvmppc_mmu_book3s_32_mfsrin(struct kvm_vcpu *vcpu, u32 srnum) +{ + return kvmppc_get_sr(vcpu, srnum); +} + +static void kvmppc_mmu_book3s_32_mtsrin(struct kvm_vcpu *vcpu, u32 srnum, + ulong value) +{ + kvmppc_set_sr(vcpu, srnum, value); + kvmppc_mmu_map_segment(vcpu, srnum << SID_SHIFT); +} + +static void kvmppc_mmu_book3s_32_tlbie(struct kvm_vcpu *vcpu, ulong ea, bool large) +{ + int i; + struct kvm_vcpu *v; + + /* flush this VA on all cpus */ + kvm_for_each_vcpu(i, v, vcpu->kvm) + kvmppc_mmu_pte_flush(v, ea, 0x0FFFF000); +} + +static int kvmppc_mmu_book3s_32_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, + u64 *vsid) +{ + ulong ea = esid << SID_SHIFT; + u32 sr; + u64 gvsid = esid; + u64 msr = kvmppc_get_msr(vcpu); + + if (msr & (MSR_DR|MSR_IR)) { + sr = find_sr(vcpu, ea); + if (sr_valid(sr)) + gvsid = sr_vsid(sr); + } + + /* In case we only have one of MSR_IR or MSR_DR set, let's put + that in the real-mode context (and hope RM doesn't access + high memory) */ + switch (msr & (MSR_DR|MSR_IR)) { + case 0: + *vsid = VSID_REAL | esid; + break; + case MSR_IR: + *vsid = VSID_REAL_IR | gvsid; + break; + case MSR_DR: + *vsid = VSID_REAL_DR | gvsid; + break; + case MSR_DR|MSR_IR: + if (sr_valid(sr)) + *vsid = sr_vsid(sr); + else + *vsid = VSID_BAT | gvsid; + break; + default: + BUG(); + } + + if (msr & MSR_PR) + *vsid |= VSID_PR; + + return 0; +} + +static bool kvmppc_mmu_book3s_32_is_dcbz32(struct kvm_vcpu *vcpu) +{ + return true; 
+} + + +void kvmppc_mmu_book3s_32_init(struct kvm_vcpu *vcpu) +{ + struct kvmppc_mmu *mmu = &vcpu->arch.mmu; + + mmu->mtsrin = kvmppc_mmu_book3s_32_mtsrin; + mmu->mfsrin = kvmppc_mmu_book3s_32_mfsrin; + mmu->xlate = kvmppc_mmu_book3s_32_xlate; + mmu->reset_msr = kvmppc_mmu_book3s_32_reset_msr; + mmu->tlbie = kvmppc_mmu_book3s_32_tlbie; + mmu->esid_to_vsid = kvmppc_mmu_book3s_32_esid_to_vsid; + mmu->ea_to_vp = kvmppc_mmu_book3s_32_ea_to_vp; + mmu->is_dcbz32 = kvmppc_mmu_book3s_32_is_dcbz32; + + mmu->slbmte = NULL; + mmu->slbmfee = NULL; + mmu->slbmfev = NULL; + mmu->slbie = NULL; + mmu->slbia = NULL; +} diff --git a/kernel/arch/powerpc/kvm/book3s_32_mmu_host.c b/kernel/arch/powerpc/kvm/book3s_32_mmu_host.c new file mode 100644 index 000000000..2035d16a9 --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_32_mmu_host.c @@ -0,0 +1,409 @@ +/* + * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved. + * + * Authors: + * Alexander Graf <agraf@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/kvm_host.h> + +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu-hash32.h> +#include <asm/machdep.h> +#include <asm/mmu_context.h> +#include <asm/hw_irq.h> + +/* #define DEBUG_MMU */ +/* #define DEBUG_SR */ + +#ifdef DEBUG_MMU +#define dprintk_mmu(a, ...) 
printk(KERN_INFO a, __VA_ARGS__) +#else +#define dprintk_mmu(a, ...) do { } while(0) +#endif + +#ifdef DEBUG_SR +#define dprintk_sr(a, ...) printk(KERN_INFO a, __VA_ARGS__) +#else +#define dprintk_sr(a, ...) do { } while(0) +#endif + +#if PAGE_SHIFT != 12 +#error Unknown page size +#endif + +#ifdef CONFIG_SMP +#error XXX need to grab mmu_hash_lock +#endif + +#ifdef CONFIG_PTE_64BIT +#error Only 32 bit pages are supported for now +#endif + +static ulong htab; +static u32 htabmask; + +void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) +{ + volatile u32 *pteg; + + /* Remove from host HTAB */ + pteg = (u32*)pte->slot; + pteg[0] = 0; + + /* And make sure it's gone from the TLB too */ + asm volatile ("sync"); + asm volatile ("tlbie %0" : : "r" (pte->pte.eaddr) : "memory"); + asm volatile ("sync"); + asm volatile ("tlbsync"); +} + +/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using + * a hash, so we don't waste cycles on looping */ +static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid) +{ + return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK)); +} + + +static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) +{ + struct kvmppc_sid_map *map; + u16 sid_map_mask; + + if (kvmppc_get_msr(vcpu) & MSR_PR) + gvsid |= VSID_PR; + + sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); + map = &to_book3s(vcpu)->sid_map[sid_map_mask]; + if (map->guest_vsid == gvsid) { + dprintk_sr("SR: Searching 0x%llx -> 0x%llx\n", + gvsid, map->host_vsid); + return map; + } + + map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK - sid_map_mask]; + if (map->guest_vsid == 
gvsid) { + dprintk_sr("SR: Searching 0x%llx -> 0x%llx\n", + gvsid, map->host_vsid); + return map; + } + + dprintk_sr("SR: Searching 0x%llx -> not found\n", gvsid); + return NULL; +} + +static u32 *kvmppc_mmu_get_pteg(struct kvm_vcpu *vcpu, u32 vsid, u32 eaddr, + bool primary) +{ + u32 page, hash; + ulong pteg = htab; + + page = (eaddr & ~ESID_MASK) >> 12; + + hash = ((vsid ^ page) << 6); + if (!primary) + hash = ~hash; + + hash &= htabmask; + + pteg |= hash; + + dprintk_mmu("htab: %lx | hash: %x | htabmask: %x | pteg: %lx\n", + htab, hash, htabmask, pteg); + + return (u32*)pteg; +} + +extern char etext[]; + +int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, + bool iswrite) +{ + pfn_t hpaddr; + u64 vpn; + u64 vsid; + struct kvmppc_sid_map *map; + volatile u32 *pteg; + u32 eaddr = orig_pte->eaddr; + u32 pteg0, pteg1; + register int rr = 0; + bool primary = false; + bool evict = false; + struct hpte_cache *pte; + int r = 0; + bool writable; + + /* Get host physical address for gpa */ + hpaddr = kvmppc_gpa_to_pfn(vcpu, orig_pte->raddr, iswrite, &writable); + if (is_error_noslot_pfn(hpaddr)) { + printk(KERN_INFO "Couldn't get guest page for gpa %lx!\n", + orig_pte->raddr); + r = -EINVAL; + goto out; + } + hpaddr <<= PAGE_SHIFT; + + /* and write the mapping ea -> hpa into the pt */ + vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid); + map = find_sid_vsid(vcpu, vsid); + if (!map) { + kvmppc_mmu_map_segment(vcpu, eaddr); + map = find_sid_vsid(vcpu, vsid); + } + BUG_ON(!map); + + vsid = map->host_vsid; + vpn = (vsid << (SID_SHIFT - VPN_SHIFT)) | + ((eaddr & ~ESID_MASK) >> VPN_SHIFT); +next_pteg: + if (rr == 16) { + primary = !primary; + evict = true; + rr = 0; + } + + pteg = kvmppc_mmu_get_pteg(vcpu, vsid, eaddr, primary); + + /* not evicting yet */ + if (!evict && (pteg[rr] & PTE_V)) { + rr += 2; + goto next_pteg; + } + + dprintk_mmu("KVM: old PTEG: %p (%d)\n", pteg, rr); + dprintk_mmu("KVM: %08x - %08x\n", pteg[0], pteg[1]); 
+ dprintk_mmu("KVM: %08x - %08x\n", pteg[2], pteg[3]); + dprintk_mmu("KVM: %08x - %08x\n", pteg[4], pteg[5]); + dprintk_mmu("KVM: %08x - %08x\n", pteg[6], pteg[7]); + dprintk_mmu("KVM: %08x - %08x\n", pteg[8], pteg[9]); + dprintk_mmu("KVM: %08x - %08x\n", pteg[10], pteg[11]); + dprintk_mmu("KVM: %08x - %08x\n", pteg[12], pteg[13]); + dprintk_mmu("KVM: %08x - %08x\n", pteg[14], pteg[15]); + + pteg0 = ((eaddr & 0x0fffffff) >> 22) | (vsid << 7) | PTE_V | + (primary ? 0 : PTE_SEC); + pteg1 = hpaddr | PTE_M | PTE_R | PTE_C; + + if (orig_pte->may_write && writable) { + pteg1 |= PP_RWRW; + mark_page_dirty(vcpu->kvm, orig_pte->raddr >> PAGE_SHIFT); + } else { + pteg1 |= PP_RWRX; + } + + if (orig_pte->may_execute) + kvmppc_mmu_flush_icache(hpaddr >> PAGE_SHIFT); + + local_irq_disable(); + + if (pteg[rr]) { + pteg[rr] = 0; + asm volatile ("sync"); + } + pteg[rr + 1] = pteg1; + pteg[rr] = pteg0; + asm volatile ("sync"); + + local_irq_enable(); + + dprintk_mmu("KVM: new PTEG: %p\n", pteg); + dprintk_mmu("KVM: %08x - %08x\n", pteg[0], pteg[1]); + dprintk_mmu("KVM: %08x - %08x\n", pteg[2], pteg[3]); + dprintk_mmu("KVM: %08x - %08x\n", pteg[4], pteg[5]); + dprintk_mmu("KVM: %08x - %08x\n", pteg[6], pteg[7]); + dprintk_mmu("KVM: %08x - %08x\n", pteg[8], pteg[9]); + dprintk_mmu("KVM: %08x - %08x\n", pteg[10], pteg[11]); + dprintk_mmu("KVM: %08x - %08x\n", pteg[12], pteg[13]); + dprintk_mmu("KVM: %08x - %08x\n", pteg[14], pteg[15]); + + + /* Now tell our Shadow PTE code about the new page */ + + pte = kvmppc_mmu_hpte_cache_next(vcpu); + if (!pte) { + kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); + r = -EAGAIN; + goto out; + } + + dprintk_mmu("KVM: %c%c Map 0x%llx: [%lx] 0x%llx (0x%llx) -> %lx\n", + orig_pte->may_write ? 'w' : '-', + orig_pte->may_execute ? 
'x' : '-', + orig_pte->eaddr, (ulong)pteg, vpn, + orig_pte->vpage, hpaddr); + + pte->slot = (ulong)&pteg[rr]; + pte->host_vpn = vpn; + pte->pte = *orig_pte; + pte->pfn = hpaddr >> PAGE_SHIFT; + + kvmppc_mmu_hpte_cache_map(vcpu, pte); + + kvm_release_pfn_clean(hpaddr >> PAGE_SHIFT); +out: + return r; +} + +void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ + kvmppc_mmu_pte_vflush(vcpu, pte->vpage, 0xfffffffffULL); +} + +static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) +{ + struct kvmppc_sid_map *map; + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + u16 sid_map_mask; + static int backwards_map = 0; + + if (kvmppc_get_msr(vcpu) & MSR_PR) + gvsid |= VSID_PR; + + /* We might get collisions that trap in preceding order, so let's + map them differently */ + + sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); + if (backwards_map) + sid_map_mask = SID_MAP_MASK - sid_map_mask; + + map = &to_book3s(vcpu)->sid_map[sid_map_mask]; + + /* Make sure we're taking the other map next time */ + backwards_map = !backwards_map; + + /* Uh-oh ... out of mappings. Let's flush! 
*/ + if (vcpu_book3s->vsid_next >= VSID_POOL_SIZE) { + vcpu_book3s->vsid_next = 0; + memset(vcpu_book3s->sid_map, 0, + sizeof(struct kvmppc_sid_map) * SID_MAP_NUM); + kvmppc_mmu_pte_flush(vcpu, 0, 0); + kvmppc_mmu_flush_segments(vcpu); + } + map->host_vsid = vcpu_book3s->vsid_pool[vcpu_book3s->vsid_next]; + vcpu_book3s->vsid_next++; + + map->guest_vsid = gvsid; + map->valid = true; + + return map; +} + +int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) +{ + u32 esid = eaddr >> SID_SHIFT; + u64 gvsid; + u32 sr; + struct kvmppc_sid_map *map; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + int r = 0; + + if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) { + /* Invalidate an entry */ + svcpu->sr[esid] = SR_INVALID; + r = -ENOENT; + goto out; + } + + map = find_sid_vsid(vcpu, gvsid); + if (!map) + map = create_sid_map(vcpu, gvsid); + + map->guest_esid = esid; + sr = map->host_vsid | SR_KP; + svcpu->sr[esid] = sr; + + dprintk_sr("MMU: mtsr %d, 0x%x\n", esid, sr); + +out: + svcpu_put(svcpu); + return r; +} + +void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) +{ + int i; + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + + dprintk_sr("MMU: flushing all segments (%d)\n", ARRAY_SIZE(svcpu->sr)); + for (i = 0; i < ARRAY_SIZE(svcpu->sr); i++) + svcpu->sr[i] = SR_INVALID; + + svcpu_put(svcpu); +} + +void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu) +{ + int i; + + kvmppc_mmu_hpte_destroy(vcpu); + preempt_disable(); + for (i = 0; i < SID_CONTEXTS; i++) + __destroy_context(to_book3s(vcpu)->context_id[i]); + preempt_enable(); +} + +/* From mm/mmu_context_hash32.c */ +#define CTX_TO_VSID(c, id) ((((c) * (897 * 16)) + (id * 0x111)) & 0xffffff) + +int kvmppc_mmu_init(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + int err; + ulong sdr1; + int i; + int j; + + for (i = 0; i < SID_CONTEXTS; i++) { + err = __init_new_context(); + if (err < 0) + goto init_fail; + vcpu3s->context_id[i] = err; + + /* Remember 
context id for this combination */ + for (j = 0; j < 16; j++) + vcpu3s->vsid_pool[(i * 16) + j] = CTX_TO_VSID(err, j); + } + + vcpu3s->vsid_next = 0; + + /* Remember where the HTAB is */ + asm ( "mfsdr1 %0" : "=r"(sdr1) ); + htabmask = ((sdr1 & 0x1FF) << 16) | 0xFFC0; + htab = (ulong)__va(sdr1 & 0xffff0000); + + kvmppc_mmu_hpte_init(vcpu); + + return 0; + +init_fail: + for (j = 0; j < i; j++) { + if (!vcpu3s->context_id[j]) + continue; + + __destroy_context(to_book3s(vcpu)->context_id[j]); + } + + return -1; +} diff --git a/kernel/arch/powerpc/kvm/book3s_32_sr.S b/kernel/arch/powerpc/kvm/book3s_32_sr.S new file mode 100644 index 000000000..7e06a6fc8 --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_32_sr.S @@ -0,0 +1,143 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +/****************************************************************************** + * * + * Entry code * + * * + *****************************************************************************/ + +.macro LOAD_GUEST_SEGMENTS + + /* Required state: + * + * MSR = ~IR|DR + * R1 = host R1 + * R2 = host R2 + * R3 = shadow vcpu + * all other volatile GPRS = free except R4, R6 + * SVCPU[CR] = guest CR + * SVCPU[XER] = guest XER + * SVCPU[CTR] = guest CTR + * SVCPU[LR] = guest LR + */ + +#define XCHG_SR(n) lwz r9, (SVCPU_SR+(n*4))(r3); \ + mtsr n, r9 + + XCHG_SR(0) + XCHG_SR(1) + XCHG_SR(2) + XCHG_SR(3) + XCHG_SR(4) + XCHG_SR(5) + XCHG_SR(6) + XCHG_SR(7) + XCHG_SR(8) + XCHG_SR(9) + XCHG_SR(10) + XCHG_SR(11) + XCHG_SR(12) + XCHG_SR(13) + XCHG_SR(14) + XCHG_SR(15) + + /* Clear BATs. */ + +#define KVM_KILL_BAT(n, reg) \ + mtspr SPRN_IBAT##n##U,reg; \ + mtspr SPRN_IBAT##n##L,reg; \ + mtspr SPRN_DBAT##n##U,reg; \ + mtspr SPRN_DBAT##n##L,reg; \ + + li r9, 0 + KVM_KILL_BAT(0, r9) + KVM_KILL_BAT(1, r9) + KVM_KILL_BAT(2, r9) + KVM_KILL_BAT(3, r9) + +.endm + +/****************************************************************************** + * * + * Exit code * + * * + *****************************************************************************/ + +.macro LOAD_HOST_SEGMENTS + + /* Register usage at this point: + * + * R1 = host R1 + * R2 = host R2 + * R12 = exit handler id + * R13 = shadow vcpu - SHADOW_VCPU_OFF + * SVCPU.* = guest * + * SVCPU[CR] = guest CR + * SVCPU[XER] = guest XER + * SVCPU[CTR] = guest CTR + * SVCPU[LR] = guest LR + * + */ + + /* Restore BATs */ + + /* We only overwrite the upper part, so we only restoree + the upper part. 
*/ +#define KVM_LOAD_BAT(n, reg, RA, RB) \ + lwz RA,(n*16)+0(reg); \ + lwz RB,(n*16)+4(reg); \ + mtspr SPRN_IBAT##n##U,RA; \ + mtspr SPRN_IBAT##n##L,RB; \ + lwz RA,(n*16)+8(reg); \ + lwz RB,(n*16)+12(reg); \ + mtspr SPRN_DBAT##n##U,RA; \ + mtspr SPRN_DBAT##n##L,RB; \ + + lis r9, BATS@ha + addi r9, r9, BATS@l + tophys(r9, r9) + KVM_LOAD_BAT(0, r9, r10, r11) + KVM_LOAD_BAT(1, r9, r10, r11) + KVM_LOAD_BAT(2, r9, r10, r11) + KVM_LOAD_BAT(3, r9, r10, r11) + + /* Restore Segment Registers */ + + /* 0xc - 0xf */ + + li r0, 4 + mtctr r0 + LOAD_REG_IMMEDIATE(r3, 0x20000000 | (0x111 * 0xc)) + lis r4, 0xc000 +3: mtsrin r3, r4 + addi r3, r3, 0x111 /* increment VSID */ + addis r4, r4, 0x1000 /* address of next segment */ + bdnz 3b + + /* 0x0 - 0xb */ + + /* 'current->mm' needs to be in r4 */ + tophys(r4, r2) + lwz r4, MM(r4) + tophys(r4, r4) + /* This only clobbers r0, r3, r4 and r5 */ + bl switch_mmu_context + +.endm diff --git a/kernel/arch/powerpc/kvm/book3s_64_mmu.c b/kernel/arch/powerpc/kvm/book3s_64_mmu.c new file mode 100644 index 000000000..774a253ca --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_64_mmu.c @@ -0,0 +1,675 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> + +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> + +/* #define DEBUG_MMU */ + +#ifdef DEBUG_MMU +#define dprintk(X...) printk(KERN_INFO X) +#else +#define dprintk(X...) do { } while(0) +#endif + +static void kvmppc_mmu_book3s_64_reset_msr(struct kvm_vcpu *vcpu) +{ + kvmppc_set_msr(vcpu, vcpu->arch.intr_msr); +} + +static struct kvmppc_slb *kvmppc_mmu_book3s_64_find_slbe( + struct kvm_vcpu *vcpu, + gva_t eaddr) +{ + int i; + u64 esid = GET_ESID(eaddr); + u64 esid_1t = GET_ESID_1T(eaddr); + + for (i = 0; i < vcpu->arch.slb_nr; i++) { + u64 cmp_esid = esid; + + if (!vcpu->arch.slb[i].valid) + continue; + + if (vcpu->arch.slb[i].tb) + cmp_esid = esid_1t; + + if (vcpu->arch.slb[i].esid == cmp_esid) + return &vcpu->arch.slb[i]; + } + + dprintk("KVM: No SLB entry found for 0x%lx [%llx | %llx]\n", + eaddr, esid, esid_1t); + for (i = 0; i < vcpu->arch.slb_nr; i++) { + if (vcpu->arch.slb[i].vsid) + dprintk(" %d: %c%c%c %llx %llx\n", i, + vcpu->arch.slb[i].valid ? 'v' : ' ', + vcpu->arch.slb[i].large ? 'l' : ' ', + vcpu->arch.slb[i].tb ? 't' : ' ', + vcpu->arch.slb[i].esid, + vcpu->arch.slb[i].vsid); + } + + return NULL; +} + +static int kvmppc_slb_sid_shift(struct kvmppc_slb *slbe) +{ + return slbe->tb ? 
SID_SHIFT_1T : SID_SHIFT; +} + +static u64 kvmppc_slb_offset_mask(struct kvmppc_slb *slbe) +{ + return (1ul << kvmppc_slb_sid_shift(slbe)) - 1; +} + +static u64 kvmppc_slb_calc_vpn(struct kvmppc_slb *slb, gva_t eaddr) +{ + eaddr &= kvmppc_slb_offset_mask(slb); + + return (eaddr >> VPN_SHIFT) | + ((slb->vsid) << (kvmppc_slb_sid_shift(slb) - VPN_SHIFT)); +} + +static u64 kvmppc_mmu_book3s_64_ea_to_vp(struct kvm_vcpu *vcpu, gva_t eaddr, + bool data) +{ + struct kvmppc_slb *slb; + + slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr); + if (!slb) + return 0; + + return kvmppc_slb_calc_vpn(slb, eaddr); +} + +static int mmu_pagesize(int mmu_pg) +{ + switch (mmu_pg) { + case MMU_PAGE_64K: + return 16; + case MMU_PAGE_16M: + return 24; + } + return 12; +} + +static int kvmppc_mmu_book3s_64_get_pagesize(struct kvmppc_slb *slbe) +{ + return mmu_pagesize(slbe->base_page_size); +} + +static u32 kvmppc_mmu_book3s_64_get_page(struct kvmppc_slb *slbe, gva_t eaddr) +{ + int p = kvmppc_mmu_book3s_64_get_pagesize(slbe); + + return ((eaddr & kvmppc_slb_offset_mask(slbe)) >> p); +} + +static hva_t kvmppc_mmu_book3s_64_get_pteg(struct kvm_vcpu *vcpu, + struct kvmppc_slb *slbe, gva_t eaddr, + bool second) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + u64 hash, pteg, htabsize; + u32 ssize; + hva_t r; + u64 vpn; + + htabsize = ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1); + + vpn = kvmppc_slb_calc_vpn(slbe, eaddr); + ssize = slbe->tb ? 
MMU_SEGSIZE_1T : MMU_SEGSIZE_256M; + hash = hpt_hash(vpn, kvmppc_mmu_book3s_64_get_pagesize(slbe), ssize); + if (second) + hash = ~hash; + hash &= ((1ULL << 39ULL) - 1ULL); + hash &= htabsize; + hash <<= 7ULL; + + pteg = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL; + pteg |= hash; + + dprintk("MMU: page=0x%x sdr1=0x%llx pteg=0x%llx vsid=0x%llx\n", + page, vcpu_book3s->sdr1, pteg, slbe->vsid); + + /* When running a PAPR guest, SDR1 contains a HVA address instead + of a GPA */ + if (vcpu->arch.papr_enabled) + r = pteg; + else + r = gfn_to_hva(vcpu->kvm, pteg >> PAGE_SHIFT); + + if (kvm_is_error_hva(r)) + return r; + return r | (pteg & ~PAGE_MASK); +} + +static u64 kvmppc_mmu_book3s_64_get_avpn(struct kvmppc_slb *slbe, gva_t eaddr) +{ + int p = kvmppc_mmu_book3s_64_get_pagesize(slbe); + u64 avpn; + + avpn = kvmppc_mmu_book3s_64_get_page(slbe, eaddr); + avpn |= slbe->vsid << (kvmppc_slb_sid_shift(slbe) - p); + + if (p < 16) + avpn >>= ((80 - p) - 56) - 8; /* 16 - p */ + else + avpn <<= p - 16; + + return avpn; +} + +/* + * Return page size encoded in the second word of a HPTE, or + * -1 for an invalid encoding for the base page size indicated by + * the SLB entry. This doesn't handle mixed pagesize segments yet. 
+ */ +static int decode_pagesize(struct kvmppc_slb *slbe, u64 r) +{ + switch (slbe->base_page_size) { + case MMU_PAGE_64K: + if ((r & 0xf000) == 0x1000) + return MMU_PAGE_64K; + break; + case MMU_PAGE_16M: + if ((r & 0xff000) == 0) + return MMU_PAGE_16M; + break; + } + return -1; +} + +static int kvmppc_mmu_book3s_64_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, bool data, + bool iswrite) +{ + struct kvmppc_slb *slbe; + hva_t ptegp; + u64 pteg[16]; + u64 avpn = 0; + u64 v, r; + u64 v_val, v_mask; + u64 eaddr_mask; + int i; + u8 pp, key = 0; + bool found = false; + bool second = false; + int pgsize; + ulong mp_ea = vcpu->arch.magic_page_ea; + + /* Magic page override */ + if (unlikely(mp_ea) && + unlikely((eaddr & ~0xfffULL) == (mp_ea & ~0xfffULL)) && + !(kvmppc_get_msr(vcpu) & MSR_PR)) { + gpte->eaddr = eaddr; + gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data); + gpte->raddr = vcpu->arch.magic_page_pa | (gpte->raddr & 0xfff); + gpte->raddr &= KVM_PAM; + gpte->may_execute = true; + gpte->may_read = true; + gpte->may_write = true; + gpte->page_size = MMU_PAGE_4K; + + return 0; + } + + slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr); + if (!slbe) + goto no_seg_found; + + avpn = kvmppc_mmu_book3s_64_get_avpn(slbe, eaddr); + v_val = avpn & HPTE_V_AVPN; + + if (slbe->tb) + v_val |= SLB_VSID_B_1T; + if (slbe->large) + v_val |= HPTE_V_LARGE; + v_val |= HPTE_V_VALID; + + v_mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_LARGE | HPTE_V_VALID | + HPTE_V_SECONDARY; + + pgsize = slbe->large ? 
MMU_PAGE_16M : MMU_PAGE_4K; + + mutex_lock(&vcpu->kvm->arch.hpt_mutex); + +do_second: + ptegp = kvmppc_mmu_book3s_64_get_pteg(vcpu, slbe, eaddr, second); + if (kvm_is_error_hva(ptegp)) + goto no_page_found; + + if(copy_from_user(pteg, (void __user *)ptegp, sizeof(pteg))) { + printk(KERN_ERR "KVM can't copy data from 0x%lx!\n", ptegp); + goto no_page_found; + } + + if ((kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Kp) + key = 4; + else if (!(kvmppc_get_msr(vcpu) & MSR_PR) && slbe->Ks) + key = 4; + + for (i=0; i<16; i+=2) { + u64 pte0 = be64_to_cpu(pteg[i]); + u64 pte1 = be64_to_cpu(pteg[i + 1]); + + /* Check all relevant fields of 1st dword */ + if ((pte0 & v_mask) == v_val) { + /* If large page bit is set, check pgsize encoding */ + if (slbe->large && + (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) { + pgsize = decode_pagesize(slbe, pte1); + if (pgsize < 0) + continue; + } + found = true; + break; + } + } + + if (!found) { + if (second) + goto no_page_found; + v_val |= HPTE_V_SECONDARY; + second = true; + goto do_second; + } + + v = be64_to_cpu(pteg[i]); + r = be64_to_cpu(pteg[i+1]); + pp = (r & HPTE_R_PP) | key; + if (r & HPTE_R_PP0) + pp |= 8; + + gpte->eaddr = eaddr; + gpte->vpage = kvmppc_mmu_book3s_64_ea_to_vp(vcpu, eaddr, data); + + eaddr_mask = (1ull << mmu_pagesize(pgsize)) - 1; + gpte->raddr = (r & HPTE_R_RPN & ~eaddr_mask) | (eaddr & eaddr_mask); + gpte->page_size = pgsize; + gpte->may_execute = ((r & HPTE_R_N) ? 
false : true); + if (unlikely(vcpu->arch.disable_kernel_nx) && + !(kvmppc_get_msr(vcpu) & MSR_PR)) + gpte->may_execute = true; + gpte->may_read = false; + gpte->may_write = false; + + switch (pp) { + case 0: + case 1: + case 2: + case 6: + gpte->may_write = true; + /* fall through */ + case 3: + case 5: + case 7: + case 10: + gpte->may_read = true; + break; + } + + dprintk("KVM MMU: Translated 0x%lx [0x%llx] -> 0x%llx " + "-> 0x%lx\n", + eaddr, avpn, gpte->vpage, gpte->raddr); + + /* Update PTE R and C bits, so the guest's swapper knows we used the + * page */ + if (gpte->may_read && !(r & HPTE_R_R)) { + /* + * Set the accessed flag. + * We have to write this back with a single byte write + * because another vcpu may be accessing this on + * non-PAPR platforms such as mac99, and this is + * what real hardware does. + */ + char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64)); + r |= HPTE_R_R; + put_user(r >> 8, addr + 6); + } + if (iswrite && gpte->may_write && !(r & HPTE_R_C)) { + /* Set the dirty flag */ + /* Use a single byte write */ + char __user *addr = (char __user *) (ptegp + (i + 1) * sizeof(u64)); + r |= HPTE_R_C; + put_user(r, addr + 7); + } + + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + + if (!gpte->may_read || (iswrite && !gpte->may_write)) + return -EPERM; + return 0; + +no_page_found: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + return -ENOENT; + +no_seg_found: + dprintk("KVM MMU: Trigger segment fault\n"); + return -EINVAL; +} + +static void kvmppc_mmu_book3s_64_slbmte(struct kvm_vcpu *vcpu, u64 rs, u64 rb) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s; + u64 esid, esid_1t; + int slb_nr; + struct kvmppc_slb *slbe; + + dprintk("KVM MMU: slbmte(0x%llx, 0x%llx)\n", rs, rb); + + vcpu_book3s = to_book3s(vcpu); + + esid = GET_ESID(rb); + esid_1t = GET_ESID_1T(rb); + slb_nr = rb & 0xfff; + + if (slb_nr > vcpu->arch.slb_nr) + return; + + slbe = &vcpu->arch.slb[slb_nr]; + + slbe->large = (rs & SLB_VSID_L) ? 
1 : 0; + slbe->tb = (rs & SLB_VSID_B_1T) ? 1 : 0; + slbe->esid = slbe->tb ? esid_1t : esid; + slbe->vsid = (rs & ~SLB_VSID_B) >> (kvmppc_slb_sid_shift(slbe) - 16); + slbe->valid = (rb & SLB_ESID_V) ? 1 : 0; + slbe->Ks = (rs & SLB_VSID_KS) ? 1 : 0; + slbe->Kp = (rs & SLB_VSID_KP) ? 1 : 0; + slbe->nx = (rs & SLB_VSID_N) ? 1 : 0; + slbe->class = (rs & SLB_VSID_C) ? 1 : 0; + + slbe->base_page_size = MMU_PAGE_4K; + if (slbe->large) { + if (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE) { + switch (rs & SLB_VSID_LP) { + case SLB_VSID_LP_00: + slbe->base_page_size = MMU_PAGE_16M; + break; + case SLB_VSID_LP_01: + slbe->base_page_size = MMU_PAGE_64K; + break; + } + } else + slbe->base_page_size = MMU_PAGE_16M; + } + + slbe->orige = rb & (ESID_MASK | SLB_ESID_V); + slbe->origv = rs; + + /* Map the new segment */ + kvmppc_mmu_map_segment(vcpu, esid << SID_SHIFT); +} + +static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr) +{ + struct kvmppc_slb *slbe; + + if (slb_nr > vcpu->arch.slb_nr) + return 0; + + slbe = &vcpu->arch.slb[slb_nr]; + + return slbe->orige; +} + +static u64 kvmppc_mmu_book3s_64_slbmfev(struct kvm_vcpu *vcpu, u64 slb_nr) +{ + struct kvmppc_slb *slbe; + + if (slb_nr > vcpu->arch.slb_nr) + return 0; + + slbe = &vcpu->arch.slb[slb_nr]; + + return slbe->origv; +} + +static void kvmppc_mmu_book3s_64_slbie(struct kvm_vcpu *vcpu, u64 ea) +{ + struct kvmppc_slb *slbe; + u64 seg_size; + + dprintk("KVM MMU: slbie(0x%llx)\n", ea); + + slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea); + + if (!slbe) + return; + + dprintk("KVM MMU: slbie(0x%llx, 0x%llx)\n", ea, slbe->esid); + + slbe->valid = false; + slbe->orige = 0; + slbe->origv = 0; + + seg_size = 1ull << kvmppc_slb_sid_shift(slbe); + kvmppc_mmu_flush_segment(vcpu, ea & ~(seg_size - 1), seg_size); +} + +static void kvmppc_mmu_book3s_64_slbia(struct kvm_vcpu *vcpu) +{ + int i; + + dprintk("KVM MMU: slbia()\n"); + + for (i = 1; i < vcpu->arch.slb_nr; i++) { + vcpu->arch.slb[i].valid = false; + 
vcpu->arch.slb[i].orige = 0; + vcpu->arch.slb[i].origv = 0; + } + + if (kvmppc_get_msr(vcpu) & MSR_IR) { + kvmppc_mmu_flush_segments(vcpu); + kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); + } +} + +static void kvmppc_mmu_book3s_64_mtsrin(struct kvm_vcpu *vcpu, u32 srnum, + ulong value) +{ + u64 rb = 0, rs = 0; + + /* + * According to Book3 2.01 mtsrin is implemented as: + * + * The SLB entry specified by (RB)32:35 is loaded from register + * RS, as follows. + * + * SLBE Bit Source SLB Field + * + * 0:31 0x0000_0000 ESID-0:31 + * 32:35 (RB)32:35 ESID-32:35 + * 36 0b1 V + * 37:61 0x00_0000|| 0b0 VSID-0:24 + * 62:88 (RS)37:63 VSID-25:51 + * 89:91 (RS)33:35 Ks Kp N + * 92 (RS)36 L ((RS)36 must be 0b0) + * 93 0b0 C + */ + + dprintk("KVM MMU: mtsrin(0x%x, 0x%lx)\n", srnum, value); + + /* ESID = srnum */ + rb |= (srnum & 0xf) << 28; + /* Set the valid bit */ + rb |= 1 << 27; + /* Index = ESID */ + rb |= srnum; + + /* VSID = VSID */ + rs |= (value & 0xfffffff) << 12; + /* flags = flags */ + rs |= ((value >> 28) & 0x7) << 9; + + kvmppc_mmu_book3s_64_slbmte(vcpu, rs, rb); +} + +static void kvmppc_mmu_book3s_64_tlbie(struct kvm_vcpu *vcpu, ulong va, + bool large) +{ + u64 mask = 0xFFFFFFFFFULL; + long i; + struct kvm_vcpu *v; + + dprintk("KVM MMU: tlbie(0x%lx)\n", va); + + /* + * The tlbie instruction changed behaviour starting with + * POWER6. POWER6 and later don't have the large page flag + * in the instruction but in the RB value, along with bits + * indicating page and segment sizes. + */ + if (vcpu->arch.hflags & BOOK3S_HFLAG_NEW_TLBIE) { + /* POWER6 or later */ + if (va & 1) { /* L bit */ + if ((va & 0xf000) == 0x1000) + mask = 0xFFFFFFFF0ULL; /* 64k page */ + else + mask = 0xFFFFFF000ULL; /* 16M page */ + } + } else { + /* older processors, e.g. 
PPC970 */ + if (large) + mask = 0xFFFFFF000ULL; + } + /* flush this VA on all vcpus */ + kvm_for_each_vcpu(i, v, vcpu->kvm) + kvmppc_mmu_pte_vflush(v, va >> 12, mask); +} + +#ifdef CONFIG_PPC_64K_PAGES +static int segment_contains_magic_page(struct kvm_vcpu *vcpu, ulong esid) +{ + ulong mp_ea = vcpu->arch.magic_page_ea; + + return mp_ea && !(kvmppc_get_msr(vcpu) & MSR_PR) && + (mp_ea >> SID_SHIFT) == esid; +} +#endif + +static int kvmppc_mmu_book3s_64_esid_to_vsid(struct kvm_vcpu *vcpu, ulong esid, + u64 *vsid) +{ + ulong ea = esid << SID_SHIFT; + struct kvmppc_slb *slb; + u64 gvsid = esid; + ulong mp_ea = vcpu->arch.magic_page_ea; + int pagesize = MMU_PAGE_64K; + u64 msr = kvmppc_get_msr(vcpu); + + if (msr & (MSR_DR|MSR_IR)) { + slb = kvmppc_mmu_book3s_64_find_slbe(vcpu, ea); + if (slb) { + gvsid = slb->vsid; + pagesize = slb->base_page_size; + if (slb->tb) { + gvsid <<= SID_SHIFT_1T - SID_SHIFT; + gvsid |= esid & ((1ul << (SID_SHIFT_1T - SID_SHIFT)) - 1); + gvsid |= VSID_1T; + } + } + } + + switch (msr & (MSR_DR|MSR_IR)) { + case 0: + gvsid = VSID_REAL | esid; + break; + case MSR_IR: + gvsid |= VSID_REAL_IR; + break; + case MSR_DR: + gvsid |= VSID_REAL_DR; + break; + case MSR_DR|MSR_IR: + if (!slb) + goto no_slb; + + break; + default: + BUG(); + break; + } + +#ifdef CONFIG_PPC_64K_PAGES + /* + * Mark this as a 64k segment if the host is using + * 64k pages, the host MMU supports 64k pages and + * the guest segment page size is >= 64k, + * but not if this segment contains the magic page. 
+ */ + if (pagesize >= MMU_PAGE_64K && + mmu_psize_defs[MMU_PAGE_64K].shift && + !segment_contains_magic_page(vcpu, esid)) + gvsid |= VSID_64K; +#endif + + if (kvmppc_get_msr(vcpu) & MSR_PR) + gvsid |= VSID_PR; + + *vsid = gvsid; + return 0; + +no_slb: + /* Catch magic page case */ + if (unlikely(mp_ea) && + unlikely(esid == (mp_ea >> SID_SHIFT)) && + !(kvmppc_get_msr(vcpu) & MSR_PR)) { + *vsid = VSID_REAL | esid; + return 0; + } + + return -EINVAL; +} + +static bool kvmppc_mmu_book3s_64_is_dcbz32(struct kvm_vcpu *vcpu) +{ + return (to_book3s(vcpu)->hid[5] & 0x80); +} + +void kvmppc_mmu_book3s_64_init(struct kvm_vcpu *vcpu) +{ + struct kvmppc_mmu *mmu = &vcpu->arch.mmu; + + mmu->mfsrin = NULL; + mmu->mtsrin = kvmppc_mmu_book3s_64_mtsrin; + mmu->slbmte = kvmppc_mmu_book3s_64_slbmte; + mmu->slbmfee = kvmppc_mmu_book3s_64_slbmfee; + mmu->slbmfev = kvmppc_mmu_book3s_64_slbmfev; + mmu->slbie = kvmppc_mmu_book3s_64_slbie; + mmu->slbia = kvmppc_mmu_book3s_64_slbia; + mmu->xlate = kvmppc_mmu_book3s_64_xlate; + mmu->reset_msr = kvmppc_mmu_book3s_64_reset_msr; + mmu->tlbie = kvmppc_mmu_book3s_64_tlbie; + mmu->esid_to_vsid = kvmppc_mmu_book3s_64_esid_to_vsid; + mmu->ea_to_vp = kvmppc_mmu_book3s_64_ea_to_vp; + mmu->is_dcbz32 = kvmppc_mmu_book3s_64_is_dcbz32; + + vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; +} diff --git a/kernel/arch/powerpc/kvm/book3s_64_mmu_host.c b/kernel/arch/powerpc/kvm/book3s_64_mmu_host.c new file mode 100644 index 000000000..b982d925c --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -0,0 +1,403 @@ +/* + * Copyright (C) 2009 SUSE Linux Products GmbH. All rights reserved. + * + * Authors: + * Alexander Graf <agraf@suse.de> + * Kevin Wolf <mail@kevin-wolf.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/kvm_host.h> + +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> +#include <asm/machdep.h> +#include <asm/mmu_context.h> +#include <asm/hw_irq.h> +#include "trace_pr.h" + +#define PTE_SIZE 12 + +void kvmppc_mmu_invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) +{ + ppc_md.hpte_invalidate(pte->slot, pte->host_vpn, + pte->pagesize, pte->pagesize, MMU_SEGSIZE_256M, + false); +} + +/* We keep 512 gvsid->hvsid entries, mapping the guest ones to the array using + * a hash, so we don't waste cycles on looping */ +static u16 kvmppc_sid_hash(struct kvm_vcpu *vcpu, u64 gvsid) +{ + return (u16)(((gvsid >> (SID_MAP_BITS * 7)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 6)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 5)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 4)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 3)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 2)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 1)) & SID_MAP_MASK) ^ + ((gvsid >> (SID_MAP_BITS * 0)) & SID_MAP_MASK)); +} + + +static struct kvmppc_sid_map *find_sid_vsid(struct kvm_vcpu *vcpu, u64 gvsid) +{ + struct kvmppc_sid_map *map; + u16 sid_map_mask; + + if (kvmppc_get_msr(vcpu) & MSR_PR) + gvsid |= VSID_PR; + + sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); + map = &to_book3s(vcpu)->sid_map[sid_map_mask]; + if (map->valid && (map->guest_vsid == gvsid)) { + trace_kvm_book3s_slb_found(gvsid, map->host_vsid); + return map; + } + + map = &to_book3s(vcpu)->sid_map[SID_MAP_MASK 
- sid_map_mask]; + if (map->valid && (map->guest_vsid == gvsid)) { + trace_kvm_book3s_slb_found(gvsid, map->host_vsid); + return map; + } + + trace_kvm_book3s_slb_fail(sid_map_mask, gvsid); + return NULL; +} + +int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, + bool iswrite) +{ + unsigned long vpn; + pfn_t hpaddr; + ulong hash, hpteg; + u64 vsid; + int ret; + int rflags = 0x192; + int vflags = 0; + int attempt = 0; + struct kvmppc_sid_map *map; + int r = 0; + int hpsize = MMU_PAGE_4K; + bool writable; + unsigned long mmu_seq; + struct kvm *kvm = vcpu->kvm; + struct hpte_cache *cpte; + unsigned long gfn = orig_pte->raddr >> PAGE_SHIFT; + unsigned long pfn; + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* Get host physical address for gpa */ + pfn = kvmppc_gpa_to_pfn(vcpu, orig_pte->raddr, iswrite, &writable); + if (is_error_noslot_pfn(pfn)) { + printk(KERN_INFO "Couldn't get guest page for gpa %lx!\n", + orig_pte->raddr); + r = -EINVAL; + goto out; + } + hpaddr = pfn << PAGE_SHIFT; + + /* and write the mapping ea -> hpa into the pt */ + vcpu->arch.mmu.esid_to_vsid(vcpu, orig_pte->eaddr >> SID_SHIFT, &vsid); + map = find_sid_vsid(vcpu, vsid); + if (!map) { + ret = kvmppc_mmu_map_segment(vcpu, orig_pte->eaddr); + WARN_ON(ret < 0); + map = find_sid_vsid(vcpu, vsid); + } + if (!map) { + printk(KERN_ERR "KVM: Segment map for 0x%llx (0x%lx) failed\n", + vsid, orig_pte->eaddr); + WARN_ON(true); + r = -EINVAL; + goto out; + } + + vpn = hpt_vpn(orig_pte->eaddr, map->host_vsid, MMU_SEGSIZE_256M); + + kvm_set_pfn_accessed(pfn); + if (!orig_pte->may_write || !writable) + rflags |= PP_RXRX; + else { + mark_page_dirty(vcpu->kvm, gfn); + kvm_set_pfn_dirty(pfn); + } + + if (!orig_pte->may_execute) + rflags |= HPTE_R_N; + else + kvmppc_mmu_flush_icache(pfn); + + /* + * Use 64K pages if possible; otherwise, on 64K page kernels, + * we need to transfer 4 more bits from guest real to host real addr. 
+ */ + if (vsid & VSID_64K) + hpsize = MMU_PAGE_64K; + else + hpaddr |= orig_pte->raddr & (~0xfffULL & ~PAGE_MASK); + + hash = hpt_hash(vpn, mmu_psize_defs[hpsize].shift, MMU_SEGSIZE_256M); + + cpte = kvmppc_mmu_hpte_cache_next(vcpu); + + spin_lock(&kvm->mmu_lock); + if (!cpte || mmu_notifier_retry(kvm, mmu_seq)) { + r = -EAGAIN; + goto out_unlock; + } + +map_again: + hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); + + /* In case we tried normal mapping already, let's nuke old entries */ + if (attempt > 1) + if (ppc_md.hpte_remove(hpteg) < 0) { + r = -1; + goto out_unlock; + } + + ret = ppc_md.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags, + hpsize, hpsize, MMU_SEGSIZE_256M); + + if (ret < 0) { + /* If we couldn't map a primary PTE, try a secondary */ + hash = ~hash; + vflags ^= HPTE_V_SECONDARY; + attempt++; + goto map_again; + } else { + trace_kvm_book3s_64_mmu_map(rflags, hpteg, + vpn, hpaddr, orig_pte); + + /* The ppc_md code may give us a secondary entry even though we + asked for a primary. Fix up. 
*/ + if ((ret & _PTEIDX_SECONDARY) && !(vflags & HPTE_V_SECONDARY)) { + hash = ~hash; + hpteg = ((hash & htab_hash_mask) * HPTES_PER_GROUP); + } + + cpte->slot = hpteg + (ret & 7); + cpte->host_vpn = vpn; + cpte->pte = *orig_pte; + cpte->pfn = pfn; + cpte->pagesize = hpsize; + + kvmppc_mmu_hpte_cache_map(vcpu, cpte); + cpte = NULL; + } + +out_unlock: + spin_unlock(&kvm->mmu_lock); + kvm_release_pfn_clean(pfn); + if (cpte) + kvmppc_mmu_hpte_cache_free(cpte); + +out: + return r; +} + +void kvmppc_mmu_unmap_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ + u64 mask = 0xfffffffffULL; + u64 vsid; + + vcpu->arch.mmu.esid_to_vsid(vcpu, pte->eaddr >> SID_SHIFT, &vsid); + if (vsid & VSID_64K) + mask = 0xffffffff0ULL; + kvmppc_mmu_pte_vflush(vcpu, pte->vpage, mask); +} + +static struct kvmppc_sid_map *create_sid_map(struct kvm_vcpu *vcpu, u64 gvsid) +{ + struct kvmppc_sid_map *map; + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + u16 sid_map_mask; + static int backwards_map = 0; + + if (kvmppc_get_msr(vcpu) & MSR_PR) + gvsid |= VSID_PR; + + /* We might get collisions that trap in preceding order, so let's + map them differently */ + + sid_map_mask = kvmppc_sid_hash(vcpu, gvsid); + if (backwards_map) + sid_map_mask = SID_MAP_MASK - sid_map_mask; + + map = &to_book3s(vcpu)->sid_map[sid_map_mask]; + + /* Make sure we're taking the other map next time */ + backwards_map = !backwards_map; + + /* Uh-oh ... out of mappings. Let's flush! 
*/ + if (vcpu_book3s->proto_vsid_next == vcpu_book3s->proto_vsid_max) { + vcpu_book3s->proto_vsid_next = vcpu_book3s->proto_vsid_first; + memset(vcpu_book3s->sid_map, 0, + sizeof(struct kvmppc_sid_map) * SID_MAP_NUM); + kvmppc_mmu_pte_flush(vcpu, 0, 0); + kvmppc_mmu_flush_segments(vcpu); + } + map->host_vsid = vsid_scramble(vcpu_book3s->proto_vsid_next++, 256M); + + map->guest_vsid = gvsid; + map->valid = true; + + trace_kvm_book3s_slb_map(sid_map_mask, gvsid, map->host_vsid); + + return map; +} + +static int kvmppc_mmu_next_segment(struct kvm_vcpu *vcpu, ulong esid) +{ + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + int i; + int max_slb_size = 64; + int found_inval = -1; + int r; + + /* Are we overwriting? */ + for (i = 0; i < svcpu->slb_max; i++) { + if (!(svcpu->slb[i].esid & SLB_ESID_V)) + found_inval = i; + else if ((svcpu->slb[i].esid & ESID_MASK) == esid) { + r = i; + goto out; + } + } + + /* Found a spare entry that was invalidated before */ + if (found_inval >= 0) { + r = found_inval; + goto out; + } + + /* No spare invalid entry, so create one */ + + if (mmu_slb_size < 64) + max_slb_size = mmu_slb_size; + + /* Overflowing -> purge */ + if ((svcpu->slb_max) == max_slb_size) + kvmppc_mmu_flush_segments(vcpu); + + r = svcpu->slb_max; + svcpu->slb_max++; + +out: + svcpu_put(svcpu); + return r; +} + +int kvmppc_mmu_map_segment(struct kvm_vcpu *vcpu, ulong eaddr) +{ + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + u64 esid = eaddr >> SID_SHIFT; + u64 slb_esid = (eaddr & ESID_MASK) | SLB_ESID_V; + u64 slb_vsid = SLB_VSID_USER; + u64 gvsid; + int slb_index; + struct kvmppc_sid_map *map; + int r = 0; + + slb_index = kvmppc_mmu_next_segment(vcpu, eaddr & ESID_MASK); + + if (vcpu->arch.mmu.esid_to_vsid(vcpu, esid, &gvsid)) { + /* Invalidate an entry */ + svcpu->slb[slb_index].esid = 0; + r = -ENOENT; + goto out; + } + + map = find_sid_vsid(vcpu, gvsid); + if (!map) + map = create_sid_map(vcpu, gvsid); + + map->guest_esid = esid; + + 
slb_vsid |= (map->host_vsid << 12); + slb_vsid &= ~SLB_VSID_KP; + slb_esid |= slb_index; + +#ifdef CONFIG_PPC_64K_PAGES + /* Set host segment base page size to 64K if possible */ + if (gvsid & VSID_64K) + slb_vsid |= mmu_psize_defs[MMU_PAGE_64K].sllp; +#endif + + svcpu->slb[slb_index].esid = slb_esid; + svcpu->slb[slb_index].vsid = slb_vsid; + + trace_kvm_book3s_slbmte(slb_vsid, slb_esid); + +out: + svcpu_put(svcpu); + return r; +} + +void kvmppc_mmu_flush_segment(struct kvm_vcpu *vcpu, ulong ea, ulong seg_size) +{ + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + ulong seg_mask = -seg_size; + int i; + + for (i = 0; i < svcpu->slb_max; i++) { + if ((svcpu->slb[i].esid & SLB_ESID_V) && + (svcpu->slb[i].esid & seg_mask) == ea) { + /* Invalidate this entry */ + svcpu->slb[i].esid = 0; + } + } + + svcpu_put(svcpu); +} + +void kvmppc_mmu_flush_segments(struct kvm_vcpu *vcpu) +{ + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + svcpu->slb_max = 0; + svcpu->slb[0].esid = 0; + svcpu_put(svcpu); +} + +void kvmppc_mmu_destroy_pr(struct kvm_vcpu *vcpu) +{ + kvmppc_mmu_hpte_destroy(vcpu); + __destroy_context(to_book3s(vcpu)->context_id[0]); +} + +int kvmppc_mmu_init(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + int err; + + err = __init_new_context(); + if (err < 0) + return -1; + vcpu3s->context_id[0] = err; + + vcpu3s->proto_vsid_max = ((u64)(vcpu3s->context_id[0] + 1) + << ESID_BITS) - 1; + vcpu3s->proto_vsid_first = (u64)vcpu3s->context_id[0] << ESID_BITS; + vcpu3s->proto_vsid_next = vcpu3s->proto_vsid_first; + + kvmppc_mmu_hpte_init(vcpu); + + return 0; +} diff --git a/kernel/arch/powerpc/kvm/book3s_64_mmu_hv.c b/kernel/arch/powerpc/kvm/book3s_64_mmu_hv.c new file mode 100644 index 000000000..1a4acf8bf --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -0,0 +1,1637 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public 
License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/hugetlb.h> +#include <linux/vmalloc.h> +#include <linux/srcu.h> +#include <linux/anon_inodes.h> +#include <linux/file.h> +#include <linux/debugfs.h> + +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> +#include <asm/hvcall.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> +#include <asm/cputable.h> + +#include "trace_hv.h" + +/* Power architecture requires HPT is at least 256kB */ +#define PPC_MIN_HPT_ORDER 18 + +static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, + long pte_index, unsigned long pteh, + unsigned long ptel, unsigned long *pte_idx_ret); +static void kvmppc_rmap_reset(struct kvm *kvm); + +long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) +{ + unsigned long hpt = 0; + struct revmap_entry *rev; + struct page *page = NULL; + long order = KVM_DEFAULT_HPT_ORDER; + + if (htab_orderp) { + order = *htab_orderp; + if (order < PPC_MIN_HPT_ORDER) + order = PPC_MIN_HPT_ORDER; + } + + kvm->arch.hpt_cma_alloc = 0; + page = kvm_alloc_hpt(1ul << (order - PAGE_SHIFT)); + if (page) { + hpt = (unsigned long)pfn_to_kaddr(page_to_pfn(page)); + memset((void *)hpt, 0, (1ul << 
order)); + kvm->arch.hpt_cma_alloc = 1; + } + + /* Lastly try successively smaller sizes from the page allocator */ + while (!hpt && order > PPC_MIN_HPT_ORDER) { + hpt = __get_free_pages(GFP_KERNEL|__GFP_ZERO|__GFP_REPEAT| + __GFP_NOWARN, order - PAGE_SHIFT); + if (!hpt) + --order; + } + + if (!hpt) + return -ENOMEM; + + kvm->arch.hpt_virt = hpt; + kvm->arch.hpt_order = order; + /* HPTEs are 2**4 bytes long */ + kvm->arch.hpt_npte = 1ul << (order - 4); + /* 128 (2**7) bytes in each HPTEG */ + kvm->arch.hpt_mask = (1ul << (order - 7)) - 1; + + /* Allocate reverse map array */ + rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte); + if (!rev) { + pr_err("kvmppc_alloc_hpt: Couldn't alloc reverse map array\n"); + goto out_freehpt; + } + kvm->arch.revmap = rev; + kvm->arch.sdr1 = __pa(hpt) | (order - 18); + + pr_info("KVM guest htab at %lx (order %ld), LPID %x\n", + hpt, order, kvm->arch.lpid); + + if (htab_orderp) + *htab_orderp = order; + return 0; + + out_freehpt: + if (kvm->arch.hpt_cma_alloc) + kvm_release_hpt(page, 1 << (order - PAGE_SHIFT)); + else + free_pages(hpt, order - PAGE_SHIFT); + return -ENOMEM; +} + +long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) +{ + long err = -EBUSY; + long order; + + mutex_lock(&kvm->lock); + if (kvm->arch.hpte_setup_done) { + kvm->arch.hpte_setup_done = 0; + /* order hpte_setup_done vs. vcpus_running */ + smp_mb(); + if (atomic_read(&kvm->arch.vcpus_running)) { + kvm->arch.hpte_setup_done = 1; + goto out; + } + } + if (kvm->arch.hpt_virt) { + order = kvm->arch.hpt_order; + /* Set the entire HPT to 0, i.e. invalid HPTEs */ + memset((void *)kvm->arch.hpt_virt, 0, 1ul << order); + /* + * Reset all the reverse-mapping chains for all memslots + */ + kvmppc_rmap_reset(kvm); + /* Ensure that each vcpu will flush its TLB on next entry. 
*/ + cpumask_setall(&kvm->arch.need_tlb_flush); + *htab_orderp = order; + err = 0; + } else { + err = kvmppc_alloc_hpt(kvm, htab_orderp); + order = *htab_orderp; + } + out: + mutex_unlock(&kvm->lock); + return err; +} + +void kvmppc_free_hpt(struct kvm *kvm) +{ + kvmppc_free_lpid(kvm->arch.lpid); + vfree(kvm->arch.revmap); + if (kvm->arch.hpt_cma_alloc) + kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt), + 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); + else + free_pages(kvm->arch.hpt_virt, + kvm->arch.hpt_order - PAGE_SHIFT); +} + +/* Bits in first HPTE dword for pagesize 4k, 64k or 16M */ +static inline unsigned long hpte0_pgsize_encoding(unsigned long pgsize) +{ + return (pgsize > 0x1000) ? HPTE_V_LARGE : 0; +} + +/* Bits in second HPTE dword for pagesize 4k, 64k or 16M */ +static inline unsigned long hpte1_pgsize_encoding(unsigned long pgsize) +{ + return (pgsize == 0x10000) ? 0x1000 : 0; +} + +void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, + unsigned long porder) +{ + unsigned long i; + unsigned long npages; + unsigned long hp_v, hp_r; + unsigned long addr, hash; + unsigned long psize; + unsigned long hp0, hp1; + unsigned long idx_ret; + long ret; + struct kvm *kvm = vcpu->kvm; + + psize = 1ul << porder; + npages = memslot->npages >> (porder - PAGE_SHIFT); + + /* VRMA can't be > 1TB */ + if (npages > 1ul << (40 - porder)) + npages = 1ul << (40 - porder); + /* Can't use more than 1 HPTE per HPTEG */ + if (npages > kvm->arch.hpt_mask + 1) + npages = kvm->arch.hpt_mask + 1; + + hp0 = HPTE_V_1TB_SEG | (VRMA_VSID << (40 - 16)) | + HPTE_V_BOLTED | hpte0_pgsize_encoding(psize); + hp1 = hpte1_pgsize_encoding(psize) | + HPTE_R_R | HPTE_R_C | HPTE_R_M | PP_RWXX; + + for (i = 0; i < npages; ++i) { + addr = i << porder; + /* can't use hpt_hash since va > 64 bits */ + hash = (i ^ (VRMA_VSID ^ (VRMA_VSID << 25))) & kvm->arch.hpt_mask; + /* + * We assume that the hash table is empty and no + * vcpus are using it at this stage. 
Since we create + * at most one HPTE per HPTEG, we just assume entry 7 + * is available and use it. + */ + hash = (hash << 3) + 7; + hp_v = hp0 | ((addr >> 16) & ~0x7fUL); + hp_r = hp1 | addr; + ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, hash, hp_v, hp_r, + &idx_ret); + if (ret != H_SUCCESS) { + pr_err("KVM: map_vrma at %lx failed, ret=%ld\n", + addr, ret); + break; + } + } +} + +int kvmppc_mmu_hv_init(void) +{ + unsigned long host_lpid, rsvd_lpid; + + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return -EINVAL; + + /* POWER7 has 10-bit LPIDs (12-bit in POWER8) */ + host_lpid = mfspr(SPRN_LPID); + rsvd_lpid = LPID_RSVD; + + kvmppc_init_lpid(rsvd_lpid + 1); + + kvmppc_claim_lpid(host_lpid); + /* rsvd_lpid is reserved for use in partition switching */ + kvmppc_claim_lpid(rsvd_lpid); + + return 0; +} + +static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) +{ + unsigned long msr = vcpu->arch.intr_msr; + + /* If transactional, change to suspend mode on IRQ delivery */ + if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr)) + msr |= MSR_TS_S; + else + msr |= vcpu->arch.shregs.msr & MSR_TS_MASK; + kvmppc_set_msr(vcpu, msr); +} + +long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, + long pte_index, unsigned long pteh, + unsigned long ptel, unsigned long *pte_idx_ret) +{ + long ret; + + /* Protect linux PTE lookup from page table destruction */ + rcu_read_lock_sched(); /* this disables preemption too */ + ret = kvmppc_do_h_enter(kvm, flags, pte_index, pteh, ptel, + current->mm->pgd, false, pte_idx_ret); + rcu_read_unlock_sched(); + if (ret == H_TOO_HARD) { + /* this can't happen */ + pr_err("KVM: Oops, kvmppc_h_enter returned too hard!\n"); + ret = H_RESOURCE; /* or something */ + } + return ret; + +} + +static struct kvmppc_slb *kvmppc_mmu_book3s_hv_find_slbe(struct kvm_vcpu *vcpu, + gva_t eaddr) +{ + u64 mask; + int i; + + for (i = 0; i < vcpu->arch.slb_nr; i++) { + if (!(vcpu->arch.slb[i].orige & SLB_ESID_V)) + continue; + + if 
(vcpu->arch.slb[i].origv & SLB_VSID_B_1T) + mask = ESID_MASK_1T; + else + mask = ESID_MASK; + + if (((vcpu->arch.slb[i].orige ^ eaddr) & mask) == 0) + return &vcpu->arch.slb[i]; + } + return NULL; +} + +static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r, + unsigned long ea) +{ + unsigned long ra_mask; + + ra_mask = hpte_page_size(v, r) - 1; + return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask); +} + +static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, bool data, bool iswrite) +{ + struct kvm *kvm = vcpu->kvm; + struct kvmppc_slb *slbe; + unsigned long slb_v; + unsigned long pp, key; + unsigned long v, gr; + __be64 *hptep; + int index; + int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR); + + /* Get SLB entry */ + if (virtmode) { + slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr); + if (!slbe) + return -EINVAL; + slb_v = slbe->origv; + } else { + /* real mode access */ + slb_v = vcpu->kvm->arch.vrma_slb_v; + } + + preempt_disable(); + /* Find the HPTE in the hash table */ + index = kvmppc_hv_find_lock_hpte(kvm, eaddr, slb_v, + HPTE_V_VALID | HPTE_V_ABSENT); + if (index < 0) { + preempt_enable(); + return -ENOENT; + } + hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; + gr = kvm->arch.revmap[index].guest_rpte; + + unlock_hpte(hptep, v); + preempt_enable(); + + gpte->eaddr = eaddr; + gpte->vpage = ((v & HPTE_V_AVPN) << 4) | ((eaddr >> 12) & 0xfff); + + /* Get PP bits and key for permission check */ + pp = gr & (HPTE_R_PP0 | HPTE_R_PP); + key = (vcpu->arch.shregs.msr & MSR_PR) ? 
SLB_VSID_KP : SLB_VSID_KS; + key &= slb_v; + + /* Calculate permissions */ + gpte->may_read = hpte_read_permission(pp, key); + gpte->may_write = hpte_write_permission(pp, key); + gpte->may_execute = gpte->may_read && !(gr & (HPTE_R_N | HPTE_R_G)); + + /* Storage key permission check for POWER7 */ + if (data && virtmode) { + int amrfield = hpte_get_skey_perm(gr, vcpu->arch.amr); + if (amrfield & 1) + gpte->may_read = 0; + if (amrfield & 2) + gpte->may_write = 0; + } + + /* Get the guest physical address */ + gpte->raddr = kvmppc_mmu_get_real_addr(v, gr, eaddr); + return 0; +} + +/* + * Quick test for whether an instruction is a load or a store. + * If the instruction is a load or a store, then this will indicate + * which it is, at least on server processors. (Embedded processors + * have some external PID instructions that don't follow the rule + * embodied here.) If the instruction isn't a load or store, then + * this doesn't return anything useful. + */ +static int instruction_is_store(unsigned int instr) +{ + unsigned int mask; + + mask = 0x10000000; + if ((instr & 0xfc000000) == 0x7c000000) + mask = 0x100; /* major opcode 31 */ + return (instr & mask) != 0; +} + +static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long gpa, gva_t ea, int is_store) +{ + u32 last_inst; + + /* + * If we fail, we just return to the guest and try executing it again. + */ + if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) != + EMULATE_DONE) + return RESUME_GUEST; + + /* + * WARNING: We do not know for sure whether the instruction we just + * read from memory is the same that caused the fault in the first + * place. If the instruction we read is neither a load nor a store, + * then it can't access memory, so we don't need to worry about + * enforcing access permissions.
So, assuming it is a load or + * store, we just check that its direction (load or store) is + * consistent with the original fault, since that's what we + * checked the access permissions against. If there is a mismatch + * we just return and retry the instruction. + */ + + if (instruction_is_store(last_inst) != !!is_store) + return RESUME_GUEST; + + /* + * Emulated accesses are emulated by looking at the hash for + * translation once, then performing the access later. The + * translation could be invalidated in the meantime, at which + * point performing the subsequent memory access on the old + * physical address could possibly be a security hole for the + * guest (but not the host). + * + * This is less of an issue for MMIO stores since they aren't + * globally visible. It could be an issue for MMIO loads to + * a certain extent but we'll ignore it for now. + */ + + vcpu->arch.paddr_accessed = gpa; + vcpu->arch.vaddr_accessed = ea; + return kvmppc_emulate_mmio(run, vcpu); +} + +int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long hpte[3], r; + __be64 *hptep; + unsigned long mmu_seq, psize, pte_size; + unsigned long gpa_base, gfn_base; + unsigned long gpa, gfn, hva, pfn; + struct kvm_memory_slot *memslot; + unsigned long *rmap; + struct revmap_entry *rev; + struct page *page, *pages[1]; + long index, ret, npages; + unsigned long is_io; + unsigned int writing, write_ok; + struct vm_area_struct *vma; + unsigned long rcbits; + + /* + * Real-mode code has already searched the HPT and found the + * entry we're interested in. Lock the entry and check that + * it hasn't changed. If it has, just return and re-execute the + * instruction.
+ */ + if (ea != vcpu->arch.pgfault_addr) + return RESUME_GUEST; + index = vcpu->arch.pgfault_index; + hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + rev = &kvm->arch.revmap[index]; + preempt_disable(); + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + hpte[0] = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; + hpte[1] = be64_to_cpu(hptep[1]); + hpte[2] = r = rev->guest_rpte; + unlock_hpte(hptep, hpte[0]); + preempt_enable(); + + if (hpte[0] != vcpu->arch.pgfault_hpte[0] || + hpte[1] != vcpu->arch.pgfault_hpte[1]) + return RESUME_GUEST; + + /* Translate the logical address and get the page */ + psize = hpte_page_size(hpte[0], r); + gpa_base = r & HPTE_R_RPN & ~(psize - 1); + gfn_base = gpa_base >> PAGE_SHIFT; + gpa = gpa_base | (ea & (psize - 1)); + gfn = gpa >> PAGE_SHIFT; + memslot = gfn_to_memslot(kvm, gfn); + + trace_kvm_page_fault_enter(vcpu, hpte, memslot, ea, dsisr); + + /* No memslot means it's an emulated MMIO region */ + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, + dsisr & DSISR_ISSTORE); + + /* + * This should never happen, because of the slot_is_aligned() + * check in kvmppc_do_h_enter(). 
+ */ + if (gfn_base < memslot->base_gfn) + return -EFAULT; + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + ret = -EFAULT; + is_io = 0; + pfn = 0; + page = NULL; + pte_size = PAGE_SIZE; + writing = (dsisr & DSISR_ISSTORE) != 0; + /* If writing != 0, then the HPTE must allow writing, if we get here */ + write_ok = writing; + hva = gfn_to_hva_memslot(memslot, gfn); + npages = get_user_pages_fast(hva, 1, writing, pages); + if (npages < 1) { + /* Check if it's an I/O mapping */ + down_read(&current->mm->mmap_sem); + vma = find_vma(current->mm, hva); + if (vma && vma->vm_start <= hva && hva + psize <= vma->vm_end && + (vma->vm_flags & VM_PFNMAP)) { + pfn = vma->vm_pgoff + + ((hva - vma->vm_start) >> PAGE_SHIFT); + pte_size = psize; + is_io = hpte_cache_bits(pgprot_val(vma->vm_page_prot)); + write_ok = vma->vm_flags & VM_WRITE; + } + up_read(&current->mm->mmap_sem); + if (!pfn) + goto out_put; + } else { + page = pages[0]; + pfn = page_to_pfn(page); + if (PageHuge(page)) { + page = compound_head(page); + pte_size <<= compound_order(page); + } + /* if the guest wants write access, see if that is OK */ + if (!writing && hpte_is_writable(r)) { + pte_t *ptep, pte; + unsigned long flags; + /* + * We need to protect against page table destruction, + * hugepage split and collapse. + */ + local_irq_save(flags); + ptep = find_linux_pte_or_hugepte(current->mm->pgd, + hva, NULL); + if (ptep) { + pte = kvmppc_read_update_linux_pte(ptep, 1); + if (pte_write(pte)) + write_ok = 1; + } + local_irq_restore(flags); + } + } + + if (psize > pte_size) + goto out_put; + + /* Check WIMG vs. the actual page we're accessing */ + if (!hpte_cache_flags_ok(r, is_io)) { + if (is_io) + goto out_put; + + /* + * Allow guest to map emulated device memory as + * uncacheable, but actually make it cacheable. + */ + r = (r & ~(HPTE_R_W|HPTE_R_I|HPTE_R_G)) | HPTE_R_M; + } + + /* + * Set the HPTE to point to pfn.
+ * Since the pfn is at PAGE_SIZE granularity, make sure we + * don't mask out lower-order bits if psize < PAGE_SIZE. + */ + if (psize < PAGE_SIZE) + psize = PAGE_SIZE; + r = (r & ~(HPTE_R_PP0 - psize)) | ((pfn << PAGE_SHIFT) & ~(psize - 1)); + if (hpte_is_writable(r) && !write_ok) + r = hpte_make_readonly(r); + ret = RESUME_GUEST; + preempt_disable(); + while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) + cpu_relax(); + if ((be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK) != hpte[0] || + be64_to_cpu(hptep[1]) != hpte[1] || + rev->guest_rpte != hpte[2]) + /* HPTE has been changed under us; let the guest retry */ + goto out_unlock; + hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; + + /* Always put the HPTE in the rmap chain for the page base address */ + rmap = &memslot->arch.rmap[gfn_base - memslot->base_gfn]; + lock_rmap(rmap); + + /* Check if we might have been invalidated; let the guest retry if so */ + ret = RESUME_GUEST; + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { + unlock_rmap(rmap); + goto out_unlock; + } + + /* Only set R/C in real HPTE if set in both *rmap and guest_rpte */ + rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; + r &= rcbits | ~(HPTE_R_R | HPTE_R_C); + + if (be64_to_cpu(hptep[0]) & HPTE_V_VALID) { + /* HPTE was previously valid, so we need to invalidate it */ + unlock_rmap(rmap); + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); + kvmppc_invalidate_hpte(kvm, hptep, index); + /* don't lose previous R and C bits */ + r |= be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); + } else { + kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); + } + + hptep[1] = cpu_to_be64(r); + eieio(); + __unlock_hpte(hptep, hpte[0]); + asm volatile("ptesync" : : : "memory"); + preempt_enable(); + if (page && hpte_is_writable(r)) + SetPageDirty(page); + + out_put: + trace_kvm_page_fault_exit(vcpu, hpte, ret); + + if (page) { + /* + * We drop pages[0] here, not page because page might + * have been set to the head page of a compound, but + * we have to drop the reference on the correct tail + 
* page to match the get inside gup() + */ + put_page(pages[0]); + } + return ret; + + out_unlock: + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); + preempt_enable(); + goto out_put; +} + +static void kvmppc_rmap_reset(struct kvm *kvm) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int srcu_idx; + + srcu_idx = srcu_read_lock(&kvm->srcu); + slots = kvm->memslots; + kvm_for_each_memslot(memslot, slots) { + /* + * This assumes it is acceptable to lose reference and + * change bits across a reset. + */ + memset(memslot->arch.rmap, 0, + memslot->npages * sizeof(*memslot->arch.rmap)); + } + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +static int kvm_handle_hva_range(struct kvm *kvm, + unsigned long start, + unsigned long end, + int (*handler)(struct kvm *kvm, + unsigned long *rmapp, + unsigned long gfn)) +{ + int ret; + int retval = 0; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn, gfn+1, ..., gfn_end-1}. 
+ */ + gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + for (; gfn < gfn_end; ++gfn) { + gfn_t gfn_offset = gfn - memslot->base_gfn; + + ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn); + retval |= ret; + } + } + + return retval; +} + +static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, + int (*handler)(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn)) +{ + return kvm_handle_hva_range(kvm, hva, hva + 1, handler); +} + +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long h, i, j; + __be64 *hptep; + unsigned long ptel, psize, rcbits; + + for (;;) { + lock_rmap(rmapp); + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + unlock_rmap(rmapp); + break; + } + + /* + * To avoid an ABBA deadlock with the HPTE lock bit, + * we can't spin on the HPTE lock while holding the + * rmap chain lock. + */ + i = *rmapp & KVMPPC_RMAP_INDEX; + hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); + if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { + /* unlock rmap before spinning on the HPTE lock */ + unlock_rmap(rmapp); + while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) + cpu_relax(); + continue; + } + j = rev[i].forw; + if (j == i) { + /* chain is now empty */ + *rmapp &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); + } else { + /* remove i from chain */ + h = rev[i].back; + rev[h].forw = j; + rev[j].back = h; + rev[i].forw = rev[i].back = i; + *rmapp = (*rmapp & ~KVMPPC_RMAP_INDEX) | j; + } + + /* Now check and modify the HPTE */ + ptel = rev[i].guest_rpte; + psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel); + if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && + hpte_rpn(ptel, psize) == gfn) { + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); + kvmppc_invalidate_hpte(kvm, hptep, i); + /* Harvest R and C */ + rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); + *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; + if (rcbits 
& ~rev[i].guest_rpte) { + rev[i].guest_rpte = ptel | rcbits; + note_hpte_modification(kvm, &rev[i]); + } + } + unlock_rmap(rmapp); + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); + } + return 0; +} + +int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva) +{ + kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + return 0; +} + +int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) +{ + kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp); + return 0; +} + +void kvmppc_core_flush_memslot_hv(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + unsigned long *rmapp; + unsigned long gfn; + unsigned long n; + + rmapp = memslot->arch.rmap; + gfn = memslot->base_gfn; + for (n = memslot->npages; n; --n) { + /* + * Testing the present bit without locking is OK because + * the memslot has been marked invalid already, and hence + * no new HPTEs referencing this page can be created, + * thus the present bit can't go from 0 to 1. + */ + if (*rmapp & KVMPPC_RMAP_PRESENT) + kvm_unmap_rmapp(kvm, rmapp, gfn); + ++rmapp; + ++gfn; + } +} + +static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long head, i, j; + __be64 *hptep; + int ret = 0; + + retry: + lock_rmap(rmapp); + if (*rmapp & KVMPPC_RMAP_REFERENCED) { + *rmapp &= ~KVMPPC_RMAP_REFERENCED; + ret = 1; + } + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + unlock_rmap(rmapp); + return ret; + } + + i = head = *rmapp & KVMPPC_RMAP_INDEX; + do { + hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); + j = rev[i].forw; + + /* If this HPTE isn't referenced, ignore it */ + if (!(be64_to_cpu(hptep[1]) & HPTE_R_R)) + continue; + + if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { + /* unlock rmap before spinning on the HPTE lock */ + unlock_rmap(rmapp); + while (be64_to_cpu(hptep[0]) & HPTE_V_HVLOCK) + cpu_relax(); + goto retry; + } + + /* Now check and modify the HPTE */ + if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) && + 
(be64_to_cpu(hptep[1]) & HPTE_R_R)) { + kvmppc_clear_ref_hpte(kvm, hptep, i); + if (!(rev[i].guest_rpte & HPTE_R_R)) { + rev[i].guest_rpte |= HPTE_R_R; + note_hpte_modification(kvm, &rev[i]); + } + ret = 1; + } + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); + } while ((i = j) != head); + + unlock_rmap(rmapp); + return ret; +} + +int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) +{ + return kvm_handle_hva_range(kvm, start, end, kvm_age_rmapp); +} + +static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long gfn) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long head, i, j; + unsigned long *hp; + int ret = 1; + + if (*rmapp & KVMPPC_RMAP_REFERENCED) + return 1; + + lock_rmap(rmapp); + if (*rmapp & KVMPPC_RMAP_REFERENCED) + goto out; + + if (*rmapp & KVMPPC_RMAP_PRESENT) { + i = head = *rmapp & KVMPPC_RMAP_INDEX; + do { + hp = (unsigned long *)(kvm->arch.hpt_virt + (i << 4)); + j = rev[i].forw; + if (be64_to_cpu(hp[1]) & HPTE_R_R) + goto out; + } while ((i = j) != head); + } + ret = 0; + + out: + unlock_rmap(rmapp); + return ret; +} + +int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp); +} + +void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); +} + +static int vcpus_running(struct kvm *kvm) +{ + return atomic_read(&kvm->arch.vcpus_running) != 0; +} + +/* + * Returns the number of system pages that are dirty. + * This can be more than 1 if we find a huge-page HPTE. 
+ */ +static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) +{ + struct revmap_entry *rev = kvm->arch.revmap; + unsigned long head, i, j; + unsigned long n; + unsigned long v, r; + __be64 *hptep; + int npages_dirty = 0; + + retry: + lock_rmap(rmapp); + if (*rmapp & KVMPPC_RMAP_CHANGED) { + *rmapp &= ~KVMPPC_RMAP_CHANGED; + npages_dirty = 1; + } + if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { + unlock_rmap(rmapp); + return npages_dirty; + } + + i = head = *rmapp & KVMPPC_RMAP_INDEX; + do { + unsigned long hptep1; + hptep = (__be64 *) (kvm->arch.hpt_virt + (i << 4)); + j = rev[i].forw; + + /* + * Checking the C (changed) bit here is racy since there + * is no guarantee about when the hardware writes it back. + * If the HPTE is not writable then it is stable since the + * page can't be written to, and we would have done a tlbie + * (which forces the hardware to complete any writeback) + * when making the HPTE read-only. + * If vcpus are running then this call is racy anyway + * since the page could get dirtied subsequently, so we + * expect there to be a further call which would pick up + * any delayed C bit writeback. + * Otherwise we need to do the tlbie even if C==0 in + * order to pick up any delayed writeback of C. 
+ */ + hptep1 = be64_to_cpu(hptep[1]); + if (!(hptep1 & HPTE_R_C) && + (!hpte_is_writable(hptep1) || vcpus_running(kvm))) + continue; + + if (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) { + /* unlock rmap before spinning on the HPTE lock */ + unlock_rmap(rmapp); + while (hptep[0] & cpu_to_be64(HPTE_V_HVLOCK)) + cpu_relax(); + goto retry; + } + + /* Now check and modify the HPTE */ + if (!(hptep[0] & cpu_to_be64(HPTE_V_VALID))) { + __unlock_hpte(hptep, be64_to_cpu(hptep[0])); + continue; + } + + /* need to make it temporarily absent so C is stable */ + hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); + kvmppc_invalidate_hpte(kvm, hptep, i); + v = be64_to_cpu(hptep[0]); + r = be64_to_cpu(hptep[1]); + if (r & HPTE_R_C) { + hptep[1] = cpu_to_be64(r & ~HPTE_R_C); + if (!(rev[i].guest_rpte & HPTE_R_C)) { + rev[i].guest_rpte |= HPTE_R_C; + note_hpte_modification(kvm, &rev[i]); + } + n = hpte_page_size(v, r); + n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (n > npages_dirty) + npages_dirty = n; + eieio(); + } + v &= ~HPTE_V_ABSENT; + v |= HPTE_V_VALID; + __unlock_hpte(hptep, v); + } while ((i = j) != head); + + unlock_rmap(rmapp); + return npages_dirty; +} + +static void harvest_vpa_dirty(struct kvmppc_vpa *vpa, + struct kvm_memory_slot *memslot, + unsigned long *map) +{ + unsigned long gfn; + + if (!vpa->dirty || !vpa->pinned_addr) + return; + gfn = vpa->gpa >> PAGE_SHIFT; + if (gfn < memslot->base_gfn || + gfn >= memslot->base_gfn + memslot->npages) + return; + + vpa->dirty = false; + if (map) + __set_bit_le(gfn - memslot->base_gfn, map); +} + +long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long *map) +{ + unsigned long i, j; + unsigned long *rmapp; + struct kvm_vcpu *vcpu; + + preempt_disable(); + rmapp = memslot->arch.rmap; + for (i = 0; i < memslot->npages; ++i) { + int npages = kvm_test_clear_dirty_npages(kvm, rmapp); + /* + * Note that if npages > 0 then i must be a multiple of npages, + * since we always put huge-page HPTEs in the rmap 
chain + * corresponding to their page base address. + */ + if (npages && map) + for (j = i; npages; ++j, --npages) + __set_bit_le(j, map); + ++rmapp; + } + + /* Harvest dirty bits from VPA and DTL updates */ + /* Note: we never modify the SLB shadow buffer areas */ + kvm_for_each_vcpu(i, vcpu, kvm) { + spin_lock(&vcpu->arch.vpa_update_lock); + harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map); + harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map); + spin_unlock(&vcpu->arch.vpa_update_lock); + } + preempt_enable(); + return 0; +} + +void *kvmppc_pin_guest_page(struct kvm *kvm, unsigned long gpa, + unsigned long *nb_ret) +{ + struct kvm_memory_slot *memslot; + unsigned long gfn = gpa >> PAGE_SHIFT; + struct page *page, *pages[1]; + int npages; + unsigned long hva, offset; + int srcu_idx; + + srcu_idx = srcu_read_lock(&kvm->srcu); + memslot = gfn_to_memslot(kvm, gfn); + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + goto err; + hva = gfn_to_hva_memslot(memslot, gfn); + npages = get_user_pages_fast(hva, 1, 1, pages); + if (npages < 1) + goto err; + page = pages[0]; + srcu_read_unlock(&kvm->srcu, srcu_idx); + + offset = gpa & (PAGE_SIZE - 1); + if (nb_ret) + *nb_ret = PAGE_SIZE - offset; + return page_address(page) + offset; + + err: + srcu_read_unlock(&kvm->srcu, srcu_idx); + return NULL; +} + +void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, + bool dirty) +{ + struct page *page = virt_to_page(va); + struct kvm_memory_slot *memslot; + unsigned long gfn; + unsigned long *rmap; + int srcu_idx; + + put_page(page); + + if (!dirty) + return; + + /* We need to mark this page dirty in the rmap chain */ + gfn = gpa >> PAGE_SHIFT; + srcu_idx = srcu_read_lock(&kvm->srcu); + memslot = gfn_to_memslot(kvm, gfn); + if (memslot) { + rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; + lock_rmap(rmap); + *rmap |= KVMPPC_RMAP_CHANGED; + unlock_rmap(rmap); + } + srcu_read_unlock(&kvm->srcu, srcu_idx); +} + +/* + * Functions for reading and writing the 
hash table via reads and + * writes on a file descriptor. + * + * Reads return the guest view of the hash table, which has to be + * pieced together from the real hash table and the guest_rpte + * values in the revmap array. + * + * On writes, each HPTE written is considered in turn, and if it + * is valid, it is written to the HPT as if an H_ENTER with the + * exact flag set was done. When the invalid count is non-zero + * in the header written to the stream, the kernel will make + * sure that that many HPTEs are invalid, and invalidate them + * if not. + */ + +struct kvm_htab_ctx { + unsigned long index; + unsigned long flags; + struct kvm *kvm; + int first_pass; +}; + +#define HPTE_SIZE (2 * sizeof(unsigned long)) + +/* + * Returns 1 if this HPT entry has been modified or has pending + * R/C bit changes. + */ +static int hpte_dirty(struct revmap_entry *revp, __be64 *hptp) +{ + unsigned long rcbits_unset; + + if (revp->guest_rpte & HPTE_GR_MODIFIED) + return 1; + + /* Also need to consider changes in reference and changed bits */ + rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); + if ((be64_to_cpu(hptp[0]) & HPTE_V_VALID) && + (be64_to_cpu(hptp[1]) & rcbits_unset)) + return 1; + + return 0; +} + +static long record_hpte(unsigned long flags, __be64 *hptp, + unsigned long *hpte, struct revmap_entry *revp, + int want_valid, int first_pass) +{ + unsigned long v, r; + unsigned long rcbits_unset; + int ok = 1; + int valid, dirty; + + /* Unmodified entries are uninteresting except on the first pass */ + dirty = hpte_dirty(revp, hptp); + if (!first_pass && !dirty) + return 0; + + valid = 0; + if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) { + valid = 1; + if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && + !(be64_to_cpu(hptp[0]) & HPTE_V_BOLTED)) + valid = 0; + } + if (valid != want_valid) + return 0; + + v = r = 0; + if (valid || dirty) { + /* lock the HPTE so it's stable and read it */ + preempt_disable(); + while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) + 
cpu_relax(); + v = be64_to_cpu(hptp[0]); + + /* re-evaluate valid and dirty from synchronized HPTE value */ + valid = !!(v & HPTE_V_VALID); + dirty = !!(revp->guest_rpte & HPTE_GR_MODIFIED); + + /* Harvest R and C into guest view if necessary */ + rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); + if (valid && (rcbits_unset & be64_to_cpu(hptp[1]))) { + revp->guest_rpte |= (be64_to_cpu(hptp[1]) & + (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED; + dirty = 1; + } + + if (v & HPTE_V_ABSENT) { + v &= ~HPTE_V_ABSENT; + v |= HPTE_V_VALID; + valid = 1; + } + if ((flags & KVM_GET_HTAB_BOLTED_ONLY) && !(v & HPTE_V_BOLTED)) + valid = 0; + + r = revp->guest_rpte; + /* only clear modified if this is the right sort of entry */ + if (valid == want_valid && dirty) { + r &= ~HPTE_GR_MODIFIED; + revp->guest_rpte = r; + } + unlock_hpte(hptp, be64_to_cpu(hptp[0])); + preempt_enable(); + if (!(valid == want_valid && (first_pass || dirty))) + ok = 0; + } + hpte[0] = cpu_to_be64(v); + hpte[1] = cpu_to_be64(r); + return ok; +} + +static ssize_t kvm_htab_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct kvm_htab_ctx *ctx = file->private_data; + struct kvm *kvm = ctx->kvm; + struct kvm_get_htab_header hdr; + __be64 *hptp; + struct revmap_entry *revp; + unsigned long i, nb, nw; + unsigned long __user *lbuf; + struct kvm_get_htab_header __user *hptr; + unsigned long flags; + int first_pass; + unsigned long hpte[2]; + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + first_pass = ctx->first_pass; + flags = ctx->flags; + + i = ctx->index; + hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + revp = kvm->arch.revmap + i; + lbuf = (unsigned long __user *)buf; + + nb = 0; + while (nb + sizeof(hdr) + HPTE_SIZE < count) { + /* Initialize header */ + hptr = (struct kvm_get_htab_header __user *)buf; + hdr.n_valid = 0; + hdr.n_invalid = 0; + nw = nb; + nb += sizeof(hdr); + lbuf = (unsigned long __user *)(buf + sizeof(hdr)); + + /* Skip 
uninteresting entries, i.e. clean on not-first pass */ + if (!first_pass) { + while (i < kvm->arch.hpt_npte && + !hpte_dirty(revp, hptp)) { + ++i; + hptp += 2; + ++revp; + } + } + hdr.index = i; + + /* Grab a series of valid entries */ + while (i < kvm->arch.hpt_npte && + hdr.n_valid < 0xffff && + nb + HPTE_SIZE < count && + record_hpte(flags, hptp, hpte, revp, 1, first_pass)) { + /* valid entry, write it out */ + ++hdr.n_valid; + if (__put_user(hpte[0], lbuf) || + __put_user(hpte[1], lbuf + 1)) + return -EFAULT; + nb += HPTE_SIZE; + lbuf += 2; + ++i; + hptp += 2; + ++revp; + } + /* Now skip invalid entries while we can */ + while (i < kvm->arch.hpt_npte && + hdr.n_invalid < 0xffff && + record_hpte(flags, hptp, hpte, revp, 0, first_pass)) { + /* found an invalid entry */ + ++hdr.n_invalid; + ++i; + hptp += 2; + ++revp; + } + + if (hdr.n_valid || hdr.n_invalid) { + /* write back the header */ + if (__copy_to_user(hptr, &hdr, sizeof(hdr))) + return -EFAULT; + nw = nb; + buf = (char __user *)lbuf; + } else { + nb = nw; + } + + /* Check if we've wrapped around the hash table */ + if (i >= kvm->arch.hpt_npte) { + i = 0; + ctx->first_pass = 0; + break; + } + } + + ctx->index = i; + + return nb; +} + +static ssize_t kvm_htab_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct kvm_htab_ctx *ctx = file->private_data; + struct kvm *kvm = ctx->kvm; + struct kvm_get_htab_header hdr; + unsigned long i, j; + unsigned long v, r; + unsigned long __user *lbuf; + __be64 *hptp; + unsigned long tmp[2]; + ssize_t nb; + long int err, ret; + int hpte_setup; + + if (!access_ok(VERIFY_READ, buf, count)) + return -EFAULT; + + /* lock out vcpus from running while we're doing this */ + mutex_lock(&kvm->lock); + hpte_setup = kvm->arch.hpte_setup_done; + if (hpte_setup) { + kvm->arch.hpte_setup_done = 0; /* temporarily */ + /* order hpte_setup_done vs. 
vcpus_running */ + smp_mb(); + if (atomic_read(&kvm->arch.vcpus_running)) { + kvm->arch.hpte_setup_done = 1; + mutex_unlock(&kvm->lock); + return -EBUSY; + } + } + + err = 0; + for (nb = 0; nb + sizeof(hdr) <= count; ) { + err = -EFAULT; + if (__copy_from_user(&hdr, buf, sizeof(hdr))) + break; + + err = 0; + if (nb + hdr.n_valid * HPTE_SIZE > count) + break; + + nb += sizeof(hdr); + buf += sizeof(hdr); + + err = -EINVAL; + i = hdr.index; + if (i >= kvm->arch.hpt_npte || + i + hdr.n_valid + hdr.n_invalid > kvm->arch.hpt_npte) + break; + + hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE)); + lbuf = (unsigned long __user *)buf; + for (j = 0; j < hdr.n_valid; ++j) { + __be64 hpte_v; + __be64 hpte_r; + + err = -EFAULT; + if (__get_user(hpte_v, lbuf) || + __get_user(hpte_r, lbuf + 1)) + goto out; + v = be64_to_cpu(hpte_v); + r = be64_to_cpu(hpte_r); + err = -EINVAL; + if (!(v & HPTE_V_VALID)) + goto out; + lbuf += 2; + nb += HPTE_SIZE; + + if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) + kvmppc_do_h_remove(kvm, 0, i, 0, tmp); + err = -EIO; + ret = kvmppc_virtmode_do_h_enter(kvm, H_EXACT, i, v, r, + tmp); + if (ret != H_SUCCESS) { + pr_err("kvm_htab_write ret %ld i=%ld v=%lx " + "r=%lx\n", ret, i, v, r); + goto out; + } + if (!hpte_setup && is_vrma_hpte(v)) { + unsigned long psize = hpte_base_page_size(v, r); + unsigned long senc = slb_pgsize_encoding(psize); + unsigned long lpcr; + + kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | + (VRMA_VSID << SLB_VSID_SHIFT_1T); + lpcr = senc << (LPCR_VRMASD_SH - 4); + kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); + hpte_setup = 1; + } + ++i; + hptp += 2; + } + + for (j = 0; j < hdr.n_invalid; ++j) { + if (be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)) + kvmppc_do_h_remove(kvm, 0, i, 0, tmp); + ++i; + hptp += 2; + } + err = 0; + } + + out: + /* Order HPTE updates vs. 
hpte_setup_done */
+	smp_wmb();
+	kvm->arch.hpte_setup_done = hpte_setup;
+	mutex_unlock(&kvm->lock);
+
+	if (err)
+		return err;
+	return nb;
+}
+
+/*
+ * ->release handler for the "kvm-htab" file.  Read-only descriptors
+ * drop the hpte_mod_interest count they took at creation; every
+ * descriptor drops its VM reference and frees its context.
+ */
+static int kvm_htab_release(struct inode *inode, struct file *filp)
+{
+	struct kvm_htab_ctx *ctx = filp->private_data;
+
+	filp->private_data = NULL;
+	if (!(ctx->flags & KVM_GET_HTAB_WRITE))
+		atomic_dec(&ctx->kvm->arch.hpte_mod_interest);
+	kvm_put_kvm(ctx->kvm);
+	kfree(ctx);
+	return 0;
+}
+
+static const struct file_operations kvm_htab_fops = {
+	.read		= kvm_htab_read,
+	.write		= kvm_htab_write,
+	.llseek		= default_llseek,
+	.release	= kvm_htab_release,
+};
+
+/*
+ * Create an anonymous "kvm-htab" file descriptor for @kvm.
+ *
+ * @ghf->flags may contain only KVM_GET_HTAB_BOLTED_ONLY and
+ * KVM_GET_HTAB_WRITE; anything else yields -EINVAL.  The descriptor is
+ * write-only when KVM_GET_HTAB_WRITE is set and read-only otherwise.
+ *
+ * Returns the new descriptor, -ENOMEM if the context cannot be
+ * allocated, or the error from anon_inode_getfd().
+ */
+int kvm_vm_ioctl_get_htab_fd(struct kvm *kvm, struct kvm_get_htab_fd *ghf)
+{
+	int ret;
+	struct kvm_htab_ctx *ctx;
+	int rwflag;
+
+	/* reject flags we don't recognize */
+	if (ghf->flags & ~(KVM_GET_HTAB_BOLTED_ONLY | KVM_GET_HTAB_WRITE))
+		return -EINVAL;
+	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+	if (!ctx)
+		return -ENOMEM;
+	kvm_get_kvm(kvm);
+	ctx->kvm = kvm;
+	ctx->index = ghf->start_index;
+	ctx->flags = ghf->flags;
+	ctx->first_pass = 1;
+
+	rwflag = (ghf->flags & KVM_GET_HTAB_WRITE) ? O_WRONLY : O_RDONLY;
+	ret = anon_inode_getfd("kvm-htab", &kvm_htab_fops, ctx, rwflag | O_CLOEXEC);
+	if (ret < 0) {
+		/*
+		 * No file was created, so kvm_htab_release() will never
+		 * run for this context: free it here along with the VM
+		 * reference taken above.
+		 */
+		kfree(ctx);
+		kvm_put_kvm(kvm);
+		return ret;
+	}
+
+	if (rwflag == O_RDONLY) {
+		mutex_lock(&kvm->slots_lock);
+		atomic_inc(&kvm->arch.hpte_mod_interest);
+		/* make sure kvmppc_do_h_enter etc. see the increment */
+		synchronize_srcu_expedited(&kvm->srcu);
+		mutex_unlock(&kvm->slots_lock);
+	}
+
+	return ret;
+}
+
+/* Per-open state of the debugfs "htab" file. */
+struct debugfs_htab_state {
+	struct kvm	*kvm;
+	struct mutex	mutex;
+	unsigned long	hpt_index;	/* next HPT entry to examine */
+	int		chars_left;	/* formatted bytes in buf not yet copied out */
+	int		buf_index;	/* offset in buf of the first such byte */
+	char		buf[64];
+};
+
+static int debugfs_htab_open(struct inode *inode, struct file *file)
+{
+	struct kvm *kvm = inode->i_private;
+	struct debugfs_htab_state *p;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	kvm_get_kvm(kvm);
+	p->kvm = kvm;
+	mutex_init(&p->mutex);
+	file->private_data = p;
+
+	return nonseekable_open(inode, file);
+}
+
+static int debugfs_htab_release(struct inode *inode, struct file *file)
+{
+	struct debugfs_htab_state *p = file->private_data;
+
+	kvm_put_kvm(p->kvm);
+	kfree(p);
+	return 0;
+}
+
+/*
+ * Emit one text line per HPT entry that is valid or absent.  A line that
+ * does not fit in the caller's buffer is kept in p->buf and finished by
+ * the next read.
+ */
+static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
+				 size_t len, loff_t *ppos)
+{
+	struct debugfs_htab_state *p = file->private_data;
+	ssize_t ret, r;
+	unsigned long i, n;
+	unsigned long v, hr, gr;
+	struct kvm *kvm;
+	__be64 *hptp;
+
+	ret = mutex_lock_interruptible(&p->mutex);
+	if (ret)
+		return ret;
+
+	/* first hand out whatever is left over from the previous read */
+	if (p->chars_left) {
+		n = p->chars_left;
+		if (n > len)
+			n = len;
+		r = copy_to_user(buf, p->buf + p->buf_index, n);
+		n -= r;
+		p->chars_left -= n;
+		p->buf_index += n;
+		buf += n;
+		len -= n;
+		ret = n;
+		if (r) {
+			if (!n)
+				ret = -EFAULT;
+			goto out;
+		}
+	}
+
+	kvm = p->kvm;
+	i = p->hpt_index;
+	hptp = (__be64 *)(kvm->arch.hpt_virt + (i * HPTE_SIZE));
+	for (; len != 0 && i < kvm->arch.hpt_npte; ++i, hptp += 2) {
+		if (!(be64_to_cpu(hptp[0]) & (HPTE_V_VALID | HPTE_V_ABSENT)))
+			continue;
+
+		/* lock the HPTE so it's stable and read it */
+		preempt_disable();
+		while (!try_lock_hpte(hptp, HPTE_V_HVLOCK))
+			cpu_relax();
+		v = be64_to_cpu(hptp[0]) & ~HPTE_V_HVLOCK;
+		hr = be64_to_cpu(hptp[1]);
+		gr = kvm->arch.revmap[i].guest_rpte;
+		unlock_hpte(hptp, v);
+		preempt_enable();
+
+		/* re-check now that the entry was read under its lock */
+		if (!(v & (HPTE_V_VALID | HPTE_V_ABSENT)))
+			continue;
+
+		n = scnprintf(p->buf, sizeof(p->buf),
+			      "%6lx %.16lx %.16lx %.16lx\n",
+			      i, v, hr, gr);
+		p->chars_left = n;
+		if (n > len)
+			n = len;
+		r = copy_to_user(buf, p->buf, n);
+		n -= r;
+		p->chars_left -= n;
+		p->buf_index = n;
+		buf += n;
+		len -= n;
+		ret += n;
+		if (r) {
+			if (!ret)
+				ret = -EFAULT;
+			goto out;
+		}
+	}
+	p->hpt_index = i;
+
+ out:
+	mutex_unlock(&p->mutex);
+	return ret;
+}
+
+/* The debugfs htab file cannot be written: always fails with -EACCES. */
+ssize_t debugfs_htab_write(struct file *file, const char __user *buf,
+			   size_t len, loff_t *ppos)
+{
+	return -EACCES;
+}
+
+static const struct file_operations debugfs_htab_fops = {
+	.owner	 = THIS_MODULE,
+	.open	 = debugfs_htab_open,
+	.release = debugfs_htab_release,
+	.read	 = debugfs_htab_read,
+	.write	 = debugfs_htab_write,
+	.llseek	 = generic_file_llseek,
+};
+
+void kvmppc_mmu_debugfs_init(struct kvm *kvm)
+{
+	kvm->arch.htab_dentry = debugfs_create_file("htab", 0400,
+						    kvm->arch.debugfs_dir, kvm,
+						    &debugfs_htab_fops);
+}
+
+void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_mmu *mmu = &vcpu->arch.mmu;
+
+	vcpu->arch.slb_nr = 32;		/* POWER7/POWER8 */
+
+	mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
+	mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
+
+	vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
+}
diff --git a/kernel/arch/powerpc/kvm/book3s_64_slb.S b/kernel/arch/powerpc/kvm/book3s_64_slb.S
new file mode 100644
index 000000000..3589c4e3d
--- /dev/null
+++ b/kernel/arch/powerpc/kvm/book3s_64_slb.S
@@ -0,0 +1,153 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright SUSE Linux Products GmbH 2009
+ *
+ * Authors: Alexander Graf <agraf@suse.de>
+ */
+
+/* Byte offsets of the ESID/VSID doublewords of shadow SLB entry x */
+#define SHADOW_SLB_ENTRY_LEN	0x10
+#define OFFSET_ESID(x)		(SHADOW_SLB_ENTRY_LEN * (x))
+#define OFFSET_VSID(x)		((SHADOW_SLB_ENTRY_LEN * (x)) + 8)
+
+/******************************************************************************
+ *                                                                            *
+ *                               Entry code                                   *
+ *                                                                            *
+ *****************************************************************************/
+
+.macro LOAD_GUEST_SEGMENTS
+
+	/* Required state:
+	 *
+	 * MSR = ~IR|DR
+	 * R13 = PACA
+	 * R1 = host R1
+	 * R2 = host R2
+	 * R3 = shadow vcpu
+	 * all other volatile GPRS = free except R4, R6
+	 * SVCPU[CR]  = guest CR
+	 * SVCPU[XER] = guest XER
+	 * SVCPU[CTR] = guest CTR
+	 * SVCPU[LR]  = guest LR
+	 */
+
+BEGIN_FW_FTR_SECTION
+
+	/* Declare SLB shadow as 0 entries big */
+
+	ld	r11, PACA_SLBSHADOWPTR(r13)
+	li	r8, 0
+	stb	r8, 3(r11)
+
+END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR)
+
+	/* Flush SLB */
+
+	li	r10, 0
+	slbmte	r10, r10
+	slbia
+
+	/* Fill SLB with our shadow */
+
+	/* r12 = address just past the last shadow entry (16 bytes each) */
+	lbz	r12, SVCPU_SLB_MAX(r3)
+	mulli	r12, r12, 16
+	addi	r12, r12, SVCPU_SLB
+	add	r12, r12, r3
+
+	/* for (r11 = kvm_slb; r11 < kvm_slb + kvm_slb_size; r11+=slb_entry) */
+	li	r11, SVCPU_SLB
+	add	r11, r11, r3
+
+slb_loop_enter:
+
+	ld	r10, 0(r11)
+
+	/* skip entries whose ESID word does not have SLB_ESID_V set */
+	andis.	r9, r10, SLB_ESID_V@h
+	beq	slb_loop_enter_skip
+
+	ld	r9, 8(r11)
+	slbmte	r9, r10
+
+slb_loop_enter_skip:
+	addi	r11, r11, 16
+	cmpd	cr0, r11, r12
+	blt	slb_loop_enter
+
+slb_do_enter:
+
+.endm
+
+/******************************************************************************
+ *                                                                            *
+ *                               Exit code                                    *
+ *                                                                            *
+ *****************************************************************************/
+
+.macro LOAD_HOST_SEGMENTS
+
+	/* Register usage at this point:
+	 *
+	 * R1         = host R1
+	 * R2         = host R2
+	 * R12        = exit handler id
+	 * R13        = shadow vcpu - SHADOW_VCPU_OFF [=PACA on PPC64]
+	 * SVCPU.*    = guest *
+	 * SVCPU[CR]  = guest CR
+	 * SVCPU[XER] = guest XER
+	 * SVCPU[CTR] = guest CTR
+	 * SVCPU[LR]  = guest LR
+	 *
+	 */
+
+	/* Remove all SLB entries that are in use. */
+
+	/*
+	 * li takes an immediate as its second operand: load a literal 0,
+	 * exactly as LOAD_GUEST_SEGMENTS does before its own slbmte/slbia.
+	 */
+	li	r0, 0
+	slbmte	r0, r0
+	slbia
+
+	/* Restore bolted entries from the shadow */
+
+	ld	r11, PACA_SLBSHADOWPTR(r13)
+
+BEGIN_FW_FTR_SECTION
+
+	/* Declare SLB shadow as SLB_NUM_BOLTED entries big */
+
+	li	r8, SLB_NUM_BOLTED
+	stb	r8, 3(r11)
+
+END_FW_FTR_SECTION_IFSET(FW_FEATURE_LPAR)
+
+	/* Manually load all entries from shadow SLB */
+
+	/* r8/r7 = offsets of the two doublewords of the current save-area entry */
+	li	r8, SLBSHADOW_SAVEAREA
+	li	r7, SLBSHADOW_SAVEAREA + 8
+
+	.rept	SLB_NUM_BOLTED
+	LDX_BE	r10, r11, r8
+	cmpdi	r10, 0
+	beq	1f
+	LDX_BE	r9, r11, r7
+	slbmte	r9, r10
+1:	addi	r7, r7, SHADOW_SLB_ENTRY_LEN
+	addi	r8, r8, SHADOW_SLB_ENTRY_LEN
+	.endr
+
+	isync
+	sync
+
+slb_do_exit:
+
+.endm
diff --git a/kernel/arch/powerpc/kvm/book3s_64_vio.c b/kernel/arch/powerpc/kvm/book3s_64_vio.c
new file mode 100644
index 000000000..54cf9bc94
--- /dev/null
+++ b/kernel/arch/powerpc/kvm/book3s_64_vio.c
@@ -0,0 +1,150 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/list.h>
+#include <linux/anon_inodes.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/kvm_host.h>
+#include <asm/udbg.h>
+
+#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
+
+/* Number of pages needed to hold one u64 TCE per SPAPR_TCE_SHIFT-sized unit */
+static long kvmppc_stt_npages(unsigned long window_size)
+{
+	return ALIGN((window_size >> SPAPR_TCE_SHIFT)
+		     * sizeof(u64), PAGE_SIZE) / PAGE_SIZE;
+}
+
+/* Unlink a TCE table, free its pages and drop its VM reference. */
+static void release_spapr_tce_table(struct kvmppc_spapr_tce_table *stt)
+{
+	struct kvm *kvm = stt->kvm;
+	int i;
+
+	mutex_lock(&kvm->lock);
+	list_del(&stt->list);
+	for (i = 0; i < kvmppc_stt_npages(stt->window_size); i++)
+		__free_page(stt->pages[i]);
+	kfree(stt);
+	mutex_unlock(&kvm->lock);
+
+	kvm_put_kvm(kvm);
+}
+
+/* Page-fault handler for mmap()s of the table: hand out the backing page. */
+static int kvm_spapr_tce_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct kvmppc_spapr_tce_table *stt = vma->vm_file->private_data;
+	struct page *page;
+
+	if (vmf->pgoff >= kvmppc_stt_npages(stt->window_size))
+		return VM_FAULT_SIGBUS;
+
+	page = stt->pages[vmf->pgoff];
+	get_page(page);
+	vmf->page = page;
+	return 0;
+}
+
+static const struct vm_operations_struct kvm_spapr_tce_vm_ops = {
+	.fault = kvm_spapr_tce_fault,
+};
+
+static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &kvm_spapr_tce_vm_ops;
+	return 0;
+}
+
+static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
+{
+	struct kvmppc_spapr_tce_table *stt = filp->private_data;
+
+	release_spapr_tce_table(stt);
+	return 0;
+}
+
+static const struct file_operations kvm_spapr_tce_fops = {
+	.mmap           = kvm_spapr_tce_mmap,
+	.release	= kvm_spapr_tce_release,
+};
+
+/*
+ * Create a TCE table for args->liobn covering args->window_size and
+ * return an anonymous "kvm-spapr-tce" file descriptor for it.
+ *
+ * Returns the descriptor, -EBUSY if the LIOBN already has a table,
+ * -ENOMEM on allocation failure, or the error from anon_inode_getfd().
+ *
+ * The duplicate check, descriptor creation and list insertion all happen
+ * under kvm->lock, and the table is only listed (and the VM reference
+ * only taken) once the descriptor exists.  release_spapr_tce_table()
+ * takes the same lock, so it cannot run on a half-published table, and
+ * every failure path below frees exactly what was allocated.
+ */
+long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
+				   struct kvm_create_spapr_tce *args)
+{
+	struct kvmppc_spapr_tce_table *stt;
+	struct kvmppc_spapr_tce_table *siter;
+	long npages;
+	int ret = -ENOMEM;
+	int i;
+
+	npages = kvmppc_stt_npages(args->window_size);
+
+	stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
+		      GFP_KERNEL);
+	if (!stt)
+		return ret;
+
+	stt->liobn = args->liobn;
+	stt->window_size = args->window_size;
+	stt->kvm = kvm;
+
+	for (i = 0; i < npages; i++) {
+		stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
+		if (!stt->pages[i])
+			goto fail;
+	}
+
+	mutex_lock(&kvm->lock);
+
+	/* Check this LIOBN hasn't been previously allocated */
+	ret = 0;
+	list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) {
+		if (siter->liobn == args->liobn) {
+			ret = -EBUSY;
+			break;
+		}
+	}
+
+	if (!ret)
+		ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+				       stt, O_RDWR | O_CLOEXEC);
+
+	if (ret >= 0) {
+		/* the file now owns stt; its ->release undoes both steps */
+		list_add(&stt->list, &kvm->arch.spapr_tce_tables);
+		kvm_get_kvm(kvm);
+	}
+
+	mutex_unlock(&kvm->lock);
+
+	if (ret >= 0)
+		return ret;
+
+fail:
+	/* kzalloc() zeroed pages[], so unallocated slots are NULL */
+	for (i = 0; i < npages; i++)
+		if (stt->pages[i])
+			__free_page(stt->pages[i]);
+
+	kfree(stt);
+	return ret;
+}
diff --git a/kernel/arch/powerpc/kvm/book3s_64_vio_hv.c b/kernel/arch/powerpc/kvm/book3s_64_vio_hv.c
new file mode 100644
index 000000000..89e96b3e0
--- /dev/null
+++ b/kernel/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -0,0 +1,105 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms
of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
+ * Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
+ */
+
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/hugetlb.h>
+#include <linux/list.h>
+
+#include <asm/tlbflush.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu-hash64.h>
+#include <asm/hvcall.h>
+#include <asm/synch.h>
+#include <asm/ppc-opcode.h>
+#include <asm/kvm_host.h>
+#include <asm/udbg.h>
+
+#define TCES_PER_PAGE	(PAGE_SIZE / sizeof(u64))
+
+/* WARNING: This will be called in real-mode on HV KVM and virtual
+ *          mode on PR KVM
+ *
+ * Store @tce into the entry selected by @ioba in the table registered
+ * for @liobn.  Returns H_PARAMETER when @ioba lies outside the table's
+ * window, and H_TOO_HARD when no table matches @liobn.
+ */
+long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+		      unsigned long ioba, unsigned long tce)
+{
+	struct kvmppc_spapr_tce_table *stt;
+	unsigned long idx;
+	u64 *tbl;
+
+	list_for_each_entry(stt, &vcpu->kvm->arch.spapr_tce_tables, list) {
+		if (stt->liobn != liobn)
+			continue;
+
+		if (ioba >= stt->window_size)
+			return H_PARAMETER;
+
+		idx = ioba >> SPAPR_TCE_SHIFT;
+		tbl = (u64 *)page_address(stt->pages[idx / TCES_PER_PAGE]);
+
+		/* FIXME: Need to validate the TCE itself */
+		tbl[idx % TCES_PER_PAGE] = tce;
+		return H_SUCCESS;
+	}
+
+	/* Didn't find the liobn, punt it to userspace */
+	return H_TOO_HARD;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_put_tce);
+
+/*
+ * Read the entry selected by @ioba from the table registered for @liobn
+ * into guest GPR 4.  Return values mirror kvmppc_h_put_tce().
+ */
+long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
+		      unsigned long ioba)
+{
+	struct kvmppc_spapr_tce_table *stt;
+	unsigned long idx;
+	u64 *tbl;
+
+	list_for_each_entry(stt, &vcpu->kvm->arch.spapr_tce_tables, list) {
+		if (stt->liobn != liobn)
+			continue;
+
+		if (ioba >= stt->window_size)
+			return H_PARAMETER;
+
+		idx = ioba >> SPAPR_TCE_SHIFT;
+		tbl = (u64 *)page_address(stt->pages[idx / TCES_PER_PAGE]);
+
+		vcpu->arch.gpr[4] = tbl[idx % TCES_PER_PAGE];
+		return H_SUCCESS;
+	}
+
+	/* Didn't find the liobn, punt it to userspace */
+	return H_TOO_HARD;
+}
+EXPORT_SYMBOL_GPL(kvmppc_h_get_tce);
diff --git a/kernel/arch/powerpc/kvm/book3s_emulate.c b/kernel/arch/powerpc/kvm/book3s_emulate.c
new file mode 100644
index 000000000..5a2bc4b0d
--- /dev/null
+++ b/kernel/arch/powerpc/kvm/book3s_emulate.c
@@ -0,0 +1,696 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <asm/kvm_ppc.h> +#include <asm/disassemble.h> +#include <asm/kvm_book3s.h> +#include <asm/reg.h> +#include <asm/switch_to.h> +#include <asm/time.h> + +#define OP_19_XOP_RFID 18 +#define OP_19_XOP_RFI 50 + +#define OP_31_XOP_MFMSR 83 +#define OP_31_XOP_MTMSR 146 +#define OP_31_XOP_MTMSRD 178 +#define OP_31_XOP_MTSR 210 +#define OP_31_XOP_MTSRIN 242 +#define OP_31_XOP_TLBIEL 274 +#define OP_31_XOP_TLBIE 306 +/* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */ +#define OP_31_XOP_FAKE_SC1 308 +#define OP_31_XOP_SLBMTE 402 +#define OP_31_XOP_SLBIE 434 +#define OP_31_XOP_SLBIA 498 +#define OP_31_XOP_MFSR 595 +#define OP_31_XOP_MFSRIN 659 +#define OP_31_XOP_DCBA 758 +#define OP_31_XOP_SLBMFEV 851 +#define OP_31_XOP_EIOIO 854 +#define OP_31_XOP_SLBMFEE 915 + +/* DCBZ is actually 1014, but we patch it to 1010 so we get a trap */ +#define OP_31_XOP_DCBZ 1010 + +#define OP_LFS 48 +#define OP_LFD 50 +#define OP_STFS 52 +#define OP_STFD 54 + +#define SPRN_GQR0 912 +#define SPRN_GQR1 913 +#define SPRN_GQR2 914 +#define SPRN_GQR3 915 +#define SPRN_GQR4 916 +#define SPRN_GQR5 917 +#define SPRN_GQR6 918 +#define SPRN_GQR7 919 + +/* Book3S_32 defines mfsrin(v) - but that messes up our abstract + * function pointers, so let's just disable the define. 
*/ +#undef mfsrin + +enum priv_level { + PRIV_PROBLEM = 0, + PRIV_SUPER = 1, + PRIV_HYPER = 2, +}; + +static bool spr_allowed(struct kvm_vcpu *vcpu, enum priv_level level) +{ + /* PAPR VMs only access supervisor SPRs */ + if (vcpu->arch.papr_enabled && (level > PRIV_SUPER)) + return false; + + /* Limit user space to its own small SPR set */ + if ((kvmppc_get_msr(vcpu) & MSR_PR) && level > PRIV_PROBLEM) + return false; + + return true; +} + +int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + int emulated = EMULATE_DONE; + int rt = get_rt(inst); + int rs = get_rs(inst); + int ra = get_ra(inst); + int rb = get_rb(inst); + u32 inst_sc = 0x44000002; + + switch (get_op(inst)) { + case 0: + emulated = EMULATE_FAIL; + if ((kvmppc_get_msr(vcpu) & MSR_LE) && + (inst == swab32(inst_sc))) { + /* + * This is the byte reversed syscall instruction of our + * hypercall handler. Early versions of LE Linux didn't + * swap the instructions correctly and ended up in + * illegal instructions. + * Just always fail hypercalls on these broken systems. 
+ */ + kvmppc_set_gpr(vcpu, 3, EV_UNIMPLEMENTED); + kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); + emulated = EMULATE_DONE; + } + break; + case 19: + switch (get_xop(inst)) { + case OP_19_XOP_RFID: + case OP_19_XOP_RFI: + kvmppc_set_pc(vcpu, kvmppc_get_srr0(vcpu)); + kvmppc_set_msr(vcpu, kvmppc_get_srr1(vcpu)); + *advance = 0; + break; + + default: + emulated = EMULATE_FAIL; + break; + } + break; + case 31: + switch (get_xop(inst)) { + case OP_31_XOP_MFMSR: + kvmppc_set_gpr(vcpu, rt, kvmppc_get_msr(vcpu)); + break; + case OP_31_XOP_MTMSRD: + { + ulong rs_val = kvmppc_get_gpr(vcpu, rs); + if (inst & 0x10000) { + ulong new_msr = kvmppc_get_msr(vcpu); + new_msr &= ~(MSR_RI | MSR_EE); + new_msr |= rs_val & (MSR_RI | MSR_EE); + kvmppc_set_msr_fast(vcpu, new_msr); + } else + kvmppc_set_msr(vcpu, rs_val); + break; + } + case OP_31_XOP_MTMSR: + kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs)); + break; + case OP_31_XOP_MFSR: + { + int srnum; + + srnum = kvmppc_get_field(inst, 12 + 32, 15 + 32); + if (vcpu->arch.mmu.mfsrin) { + u32 sr; + sr = vcpu->arch.mmu.mfsrin(vcpu, srnum); + kvmppc_set_gpr(vcpu, rt, sr); + } + break; + } + case OP_31_XOP_MFSRIN: + { + int srnum; + + srnum = (kvmppc_get_gpr(vcpu, rb) >> 28) & 0xf; + if (vcpu->arch.mmu.mfsrin) { + u32 sr; + sr = vcpu->arch.mmu.mfsrin(vcpu, srnum); + kvmppc_set_gpr(vcpu, rt, sr); + } + break; + } + case OP_31_XOP_MTSR: + vcpu->arch.mmu.mtsrin(vcpu, + (inst >> 16) & 0xf, + kvmppc_get_gpr(vcpu, rs)); + break; + case OP_31_XOP_MTSRIN: + vcpu->arch.mmu.mtsrin(vcpu, + (kvmppc_get_gpr(vcpu, rb) >> 28) & 0xf, + kvmppc_get_gpr(vcpu, rs)); + break; + case OP_31_XOP_TLBIE: + case OP_31_XOP_TLBIEL: + { + bool large = (inst & 0x00200000) ? 
true : false; + ulong addr = kvmppc_get_gpr(vcpu, rb); + vcpu->arch.mmu.tlbie(vcpu, addr, large); + break; + } +#ifdef CONFIG_PPC_BOOK3S_64 + case OP_31_XOP_FAKE_SC1: + { + /* SC 1 papr hypercalls */ + ulong cmd = kvmppc_get_gpr(vcpu, 3); + int i; + + if ((kvmppc_get_msr(vcpu) & MSR_PR) || + !vcpu->arch.papr_enabled) { + emulated = EMULATE_FAIL; + break; + } + + if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) + break; + + run->papr_hcall.nr = cmd; + for (i = 0; i < 9; ++i) { + ulong gpr = kvmppc_get_gpr(vcpu, 4 + i); + run->papr_hcall.args[i] = gpr; + } + + run->exit_reason = KVM_EXIT_PAPR_HCALL; + vcpu->arch.hcall_needed = 1; + emulated = EMULATE_EXIT_USER; + break; + } +#endif + case OP_31_XOP_EIOIO: + break; + case OP_31_XOP_SLBMTE: + if (!vcpu->arch.mmu.slbmte) + return EMULATE_FAIL; + + vcpu->arch.mmu.slbmte(vcpu, + kvmppc_get_gpr(vcpu, rs), + kvmppc_get_gpr(vcpu, rb)); + break; + case OP_31_XOP_SLBIE: + if (!vcpu->arch.mmu.slbie) + return EMULATE_FAIL; + + vcpu->arch.mmu.slbie(vcpu, + kvmppc_get_gpr(vcpu, rb)); + break; + case OP_31_XOP_SLBIA: + if (!vcpu->arch.mmu.slbia) + return EMULATE_FAIL; + + vcpu->arch.mmu.slbia(vcpu); + break; + case OP_31_XOP_SLBMFEE: + if (!vcpu->arch.mmu.slbmfee) { + emulated = EMULATE_FAIL; + } else { + ulong t, rb_val; + + rb_val = kvmppc_get_gpr(vcpu, rb); + t = vcpu->arch.mmu.slbmfee(vcpu, rb_val); + kvmppc_set_gpr(vcpu, rt, t); + } + break; + case OP_31_XOP_SLBMFEV: + if (!vcpu->arch.mmu.slbmfev) { + emulated = EMULATE_FAIL; + } else { + ulong t, rb_val; + + rb_val = kvmppc_get_gpr(vcpu, rb); + t = vcpu->arch.mmu.slbmfev(vcpu, rb_val); + kvmppc_set_gpr(vcpu, rt, t); + } + break; + case OP_31_XOP_DCBA: + /* Gets treated as NOP */ + break; + case OP_31_XOP_DCBZ: + { + ulong rb_val = kvmppc_get_gpr(vcpu, rb); + ulong ra_val = 0; + ulong addr, vaddr; + u32 zeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + u32 dsisr; + int r; + + if (ra) + ra_val = kvmppc_get_gpr(vcpu, ra); + + addr = (ra_val + rb_val) & ~31ULL; + if (!(kvmppc_get_msr(vcpu) & 
MSR_SF)) + addr &= 0xffffffff; + vaddr = addr; + + r = kvmppc_st(vcpu, &addr, 32, zeros, true); + if ((r == -ENOENT) || (r == -EPERM)) { + *advance = 0; + kvmppc_set_dar(vcpu, vaddr); + vcpu->arch.fault_dar = vaddr; + + dsisr = DSISR_ISSTORE; + if (r == -ENOENT) + dsisr |= DSISR_NOHPTE; + else if (r == -EPERM) + dsisr |= DSISR_PROTFAULT; + + kvmppc_set_dsisr(vcpu, dsisr); + vcpu->arch.fault_dsisr = dsisr; + + kvmppc_book3s_queue_irqprio(vcpu, + BOOK3S_INTERRUPT_DATA_STORAGE); + } + + break; + } + default: + emulated = EMULATE_FAIL; + } + break; + default: + emulated = EMULATE_FAIL; + } + + if (emulated == EMULATE_FAIL) + emulated = kvmppc_emulate_paired_single(run, vcpu); + + return emulated; +} + +void kvmppc_set_bat(struct kvm_vcpu *vcpu, struct kvmppc_bat *bat, bool upper, + u32 val) +{ + if (upper) { + /* Upper BAT */ + u32 bl = (val >> 2) & 0x7ff; + bat->bepi_mask = (~bl << 17); + bat->bepi = val & 0xfffe0000; + bat->vs = (val & 2) ? 1 : 0; + bat->vp = (val & 1) ? 1 : 0; + bat->raw = (bat->raw & 0xffffffff00000000ULL) | val; + } else { + /* Lower BAT */ + bat->brpn = val & 0xfffe0000; + bat->wimg = (val >> 3) & 0xf; + bat->pp = val & 3; + bat->raw = (bat->raw & 0x00000000ffffffffULL) | ((u64)val << 32); + } +} + +static struct kvmppc_bat *kvmppc_find_bat(struct kvm_vcpu *vcpu, int sprn) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + struct kvmppc_bat *bat; + + switch (sprn) { + case SPRN_IBAT0U ... SPRN_IBAT3L: + bat = &vcpu_book3s->ibat[(sprn - SPRN_IBAT0U) / 2]; + break; + case SPRN_IBAT4U ... SPRN_IBAT7L: + bat = &vcpu_book3s->ibat[4 + ((sprn - SPRN_IBAT4U) / 2)]; + break; + case SPRN_DBAT0U ... SPRN_DBAT3L: + bat = &vcpu_book3s->dbat[(sprn - SPRN_DBAT0U) / 2]; + break; + case SPRN_DBAT4U ... 
SPRN_DBAT7L: + bat = &vcpu_book3s->dbat[4 + ((sprn - SPRN_DBAT4U) / 2)]; + break; + default: + BUG(); + } + + return bat; +} + +int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) +{ + int emulated = EMULATE_DONE; + + switch (sprn) { + case SPRN_SDR1: + if (!spr_allowed(vcpu, PRIV_HYPER)) + goto unprivileged; + to_book3s(vcpu)->sdr1 = spr_val; + break; + case SPRN_DSISR: + kvmppc_set_dsisr(vcpu, spr_val); + break; + case SPRN_DAR: + kvmppc_set_dar(vcpu, spr_val); + break; + case SPRN_HIOR: + to_book3s(vcpu)->hior = spr_val; + break; + case SPRN_IBAT0U ... SPRN_IBAT3L: + case SPRN_IBAT4U ... SPRN_IBAT7L: + case SPRN_DBAT0U ... SPRN_DBAT3L: + case SPRN_DBAT4U ... SPRN_DBAT7L: + { + struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn); + + kvmppc_set_bat(vcpu, bat, !(sprn % 2), (u32)spr_val); + /* BAT writes happen so rarely that we're ok to flush + * everything here */ + kvmppc_mmu_pte_flush(vcpu, 0, 0); + kvmppc_mmu_flush_segments(vcpu); + break; + } + case SPRN_HID0: + to_book3s(vcpu)->hid[0] = spr_val; + break; + case SPRN_HID1: + to_book3s(vcpu)->hid[1] = spr_val; + break; + case SPRN_HID2: + to_book3s(vcpu)->hid[2] = spr_val; + break; + case SPRN_HID2_GEKKO: + to_book3s(vcpu)->hid[2] = spr_val; + /* HID2.PSE controls paired single on gekko */ + switch (vcpu->arch.pvr) { + case 0x00080200: /* lonestar 2.0 */ + case 0x00088202: /* lonestar 2.2 */ + case 0x70000100: /* gekko 1.0 */ + case 0x00080100: /* gekko 2.0 */ + case 0x00083203: /* gekko 2.3a */ + case 0x00083213: /* gekko 2.3b */ + case 0x00083204: /* gekko 2.4 */ + case 0x00083214: /* gekko 2.4e (8SE) - retail HW2 */ + case 0x00087200: /* broadway */ + if (vcpu->arch.hflags & BOOK3S_HFLAG_NATIVE_PS) { + /* Native paired singles */ + } else if (spr_val & (1 << 29)) { /* HID2.PSE */ + vcpu->arch.hflags |= BOOK3S_HFLAG_PAIRED_SINGLE; + kvmppc_giveup_ext(vcpu, MSR_FP); + } else { + vcpu->arch.hflags &= ~BOOK3S_HFLAG_PAIRED_SINGLE; + } + break; + } + break; + case SPRN_HID4: + case 
SPRN_HID4_GEKKO: + to_book3s(vcpu)->hid[4] = spr_val; + break; + case SPRN_HID5: + to_book3s(vcpu)->hid[5] = spr_val; + /* guest HID5 set can change is_dcbz32 */ + if (vcpu->arch.mmu.is_dcbz32(vcpu) && + (mfmsr() & MSR_HV)) + vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; + break; + case SPRN_GQR0: + case SPRN_GQR1: + case SPRN_GQR2: + case SPRN_GQR3: + case SPRN_GQR4: + case SPRN_GQR5: + case SPRN_GQR6: + case SPRN_GQR7: + to_book3s(vcpu)->gqr[sprn - SPRN_GQR0] = spr_val; + break; +#ifdef CONFIG_PPC_BOOK3S_64 + case SPRN_FSCR: + kvmppc_set_fscr(vcpu, spr_val); + break; + case SPRN_BESCR: + vcpu->arch.bescr = spr_val; + break; + case SPRN_EBBHR: + vcpu->arch.ebbhr = spr_val; + break; + case SPRN_EBBRR: + vcpu->arch.ebbrr = spr_val; + break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case SPRN_TFHAR: + vcpu->arch.tfhar = spr_val; + break; + case SPRN_TEXASR: + vcpu->arch.texasr = spr_val; + break; + case SPRN_TFIAR: + vcpu->arch.tfiar = spr_val; + break; +#endif +#endif + case SPRN_ICTC: + case SPRN_THRM1: + case SPRN_THRM2: + case SPRN_THRM3: + case SPRN_CTRLF: + case SPRN_CTRLT: + case SPRN_L2CR: + case SPRN_DSCR: + case SPRN_MMCR0_GEKKO: + case SPRN_MMCR1_GEKKO: + case SPRN_PMC1_GEKKO: + case SPRN_PMC2_GEKKO: + case SPRN_PMC3_GEKKO: + case SPRN_PMC4_GEKKO: + case SPRN_WPAR_GEKKO: + case SPRN_MSSSR0: + case SPRN_DABR: +#ifdef CONFIG_PPC_BOOK3S_64 + case SPRN_MMCRS: + case SPRN_MMCRA: + case SPRN_MMCR0: + case SPRN_MMCR1: + case SPRN_MMCR2: +#endif + break; +unprivileged: + default: + printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn); +#ifndef DEBUG_SPR + emulated = EMULATE_FAIL; +#endif + break; + } + + return emulated; +} + +int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) +{ + int emulated = EMULATE_DONE; + + switch (sprn) { + case SPRN_IBAT0U ... SPRN_IBAT3L: + case SPRN_IBAT4U ... SPRN_IBAT7L: + case SPRN_DBAT0U ... SPRN_DBAT3L: + case SPRN_DBAT4U ... 
SPRN_DBAT7L: + { + struct kvmppc_bat *bat = kvmppc_find_bat(vcpu, sprn); + + if (sprn % 2) + *spr_val = bat->raw >> 32; + else + *spr_val = bat->raw; + + break; + } + case SPRN_SDR1: + if (!spr_allowed(vcpu, PRIV_HYPER)) + goto unprivileged; + *spr_val = to_book3s(vcpu)->sdr1; + break; + case SPRN_DSISR: + *spr_val = kvmppc_get_dsisr(vcpu); + break; + case SPRN_DAR: + *spr_val = kvmppc_get_dar(vcpu); + break; + case SPRN_HIOR: + *spr_val = to_book3s(vcpu)->hior; + break; + case SPRN_HID0: + *spr_val = to_book3s(vcpu)->hid[0]; + break; + case SPRN_HID1: + *spr_val = to_book3s(vcpu)->hid[1]; + break; + case SPRN_HID2: + case SPRN_HID2_GEKKO: + *spr_val = to_book3s(vcpu)->hid[2]; + break; + case SPRN_HID4: + case SPRN_HID4_GEKKO: + *spr_val = to_book3s(vcpu)->hid[4]; + break; + case SPRN_HID5: + *spr_val = to_book3s(vcpu)->hid[5]; + break; + case SPRN_CFAR: + case SPRN_DSCR: + *spr_val = 0; + break; + case SPRN_PURR: + /* + * On exit we would have updated purr + */ + *spr_val = vcpu->arch.purr; + break; + case SPRN_SPURR: + /* + * On exit we would have updated spurr + */ + *spr_val = vcpu->arch.spurr; + break; + case SPRN_VTB: + *spr_val = vcpu->arch.vtb; + break; + case SPRN_IC: + *spr_val = vcpu->arch.ic; + break; + case SPRN_GQR0: + case SPRN_GQR1: + case SPRN_GQR2: + case SPRN_GQR3: + case SPRN_GQR4: + case SPRN_GQR5: + case SPRN_GQR6: + case SPRN_GQR7: + *spr_val = to_book3s(vcpu)->gqr[sprn - SPRN_GQR0]; + break; +#ifdef CONFIG_PPC_BOOK3S_64 + case SPRN_FSCR: + *spr_val = vcpu->arch.fscr; + break; + case SPRN_BESCR: + *spr_val = vcpu->arch.bescr; + break; + case SPRN_EBBHR: + *spr_val = vcpu->arch.ebbhr; + break; + case SPRN_EBBRR: + *spr_val = vcpu->arch.ebbrr; + break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case SPRN_TFHAR: + *spr_val = vcpu->arch.tfhar; + break; + case SPRN_TEXASR: + *spr_val = vcpu->arch.texasr; + break; + case SPRN_TFIAR: + *spr_val = vcpu->arch.tfiar; + break; +#endif +#endif + case SPRN_THRM1: + case SPRN_THRM2: + case SPRN_THRM3: + case 
SPRN_CTRLF: + case SPRN_CTRLT: + case SPRN_L2CR: + case SPRN_MMCR0_GEKKO: + case SPRN_MMCR1_GEKKO: + case SPRN_PMC1_GEKKO: + case SPRN_PMC2_GEKKO: + case SPRN_PMC3_GEKKO: + case SPRN_PMC4_GEKKO: + case SPRN_WPAR_GEKKO: + case SPRN_MSSSR0: + case SPRN_DABR: +#ifdef CONFIG_PPC_BOOK3S_64 + case SPRN_MMCRS: + case SPRN_MMCRA: + case SPRN_MMCR0: + case SPRN_MMCR1: + case SPRN_MMCR2: + case SPRN_TIR: +#endif + *spr_val = 0; + break; + default: +unprivileged: + printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn); +#ifndef DEBUG_SPR + emulated = EMULATE_FAIL; +#endif + break; + } + + return emulated; +} + +u32 kvmppc_alignment_dsisr(struct kvm_vcpu *vcpu, unsigned int inst) +{ + return make_dsisr(inst); +} + +ulong kvmppc_alignment_dar(struct kvm_vcpu *vcpu, unsigned int inst) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * Linux's fix_alignment() assumes that DAR is valid, so can we + */ + return vcpu->arch.fault_dar; +#else + ulong dar = 0; + ulong ra = get_ra(inst); + ulong rb = get_rb(inst); + + switch (get_op(inst)) { + case OP_LFS: + case OP_LFD: + case OP_STFD: + case OP_STFS: + if (ra) + dar = kvmppc_get_gpr(vcpu, ra); + dar += (s32)((s16)inst); + break; + case 31: + if (ra) + dar = kvmppc_get_gpr(vcpu, ra); + dar += kvmppc_get_gpr(vcpu, rb); + break; + default: + printk(KERN_INFO "KVM: Unaligned instruction 0x%x\n", inst); + break; + } + + return dar; +#endif +} diff --git a/kernel/arch/powerpc/kvm/book3s_exports.c b/kernel/arch/powerpc/kvm/book3s_exports.c new file mode 100644 index 000000000..0d013fbc2 --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_exports.c @@ -0,0 +1,30 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <linux/export.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +EXPORT_SYMBOL_GPL(kvmppc_hv_entry_trampoline); +#endif +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE +EXPORT_SYMBOL_GPL(kvmppc_entry_trampoline); +#endif + diff --git a/kernel/arch/powerpc/kvm/book3s_hv.c b/kernel/arch/powerpc/kvm/book3s_hv.c new file mode 100644 index 000000000..3e9087f45 --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_hv.c @@ -0,0 +1,2765 @@ +/* + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved. + * + * Authors: + * Paul Mackerras <paulus@au1.ibm.com> + * Alexander Graf <agraf@suse.de> + * Kevin Wolf <mail@kevin-wolf.de> + * + * Description: KVM functions specific to running on Book 3S + * processors in hypervisor mode (specifically POWER7 and later). + * + * This file is derived from arch/powerpc/kvm/book3s.c, + * by Alexander Graf <agraf@suse.de>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ */
+
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/preempt.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/export.h>
+#include <linux/fs.h>
+#include <linux/anon_inodes.h>
+#include <linux/cpumask.h>
+#include <linux/spinlock.h>
+#include <linux/page-flags.h>
+#include <linux/srcu.h>
+#include <linux/miscdevice.h>
+#include <linux/debugfs.h>
+
+#include <asm/reg.h>
+#include <asm/cputable.h>
+#include <asm/cache.h>
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu_context.h>
+#include <asm/lppaca.h>
+#include <asm/processor.h>
+#include <asm/cputhreads.h>
+#include <asm/page.h>
+#include <asm/hvcall.h>
+#include <asm/switch_to.h>
+#include <asm/smp.h>
+#include <asm/dbell.h>
+#include <linux/gfp.h>
+#include <linux/vmalloc.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/module.h>
+
+#include "book3s.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace_hv.h"
+
+/* #define EXIT_DEBUG */
+/* #define EXIT_DEBUG_SIMPLE */
+/* #define EXIT_DEBUG_INT */
+
+/* Used to indicate that a guest page fault needs to be handled */
+#define RESUME_PAGE_FAULT	(RESUME_GUEST | RESUME_FLAG_ARCH1)
+
+/* Used as a "null" value for timebase values */
+#define TB_NIL	(~(u64)0)
+
+static DECLARE_BITMAP(default_enabled_hcalls, MAX_HCALL_OPCODE/4 + 1);
+
+#if defined(CONFIG_PPC_64K_PAGES)
+#define MPP_BUFFER_ORDER	0
+#elif defined(CONFIG_PPC_4K_PAGES)
+#define MPP_BUFFER_ORDER	3
+#endif
+
+
+static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
+static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
+
+/*
+ * Try to interrupt hardware thread @cpu directly.
+ *
+ * Returns true if a doorbell message was sent (CPU_FTR_ARCH_207S and
+ * @cpu shares a core with the current CPU) or, when built with
+ * CONFIG_PPC_ICP_NATIVE and CONFIG_SMP, if xics_wake_cpu() was called
+ * for it.  Returns false when neither path applied.
+ */
+static bool kvmppc_ipi_thread(int cpu)
+{
+	/* On POWER8 for IPIs to threads in the same core, use msgsnd */
+	if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
+		/* keep smp_processor_id() stable across the comparison and send */
+		preempt_disable();
+		if (cpu_first_thread_sibling(cpu) ==
+		    cpu_first_thread_sibling(smp_processor_id())) {
+			unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+			msg |= cpu_thread_in_core(cpu);
+			smp_mb();
+			__asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg));
+			preempt_enable();
+			return true;
+		}
+		preempt_enable();
+	}
+
+#if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP)
+	if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) {
+		xics_wake_cpu(cpu);
+		return true;
+	}
+#endif
+
+	return false;
+}
+
+/*
+ * Kick @vcpu: wake its wait queue if anyone is sleeping on it, then try
+ * to interrupt the hardware thread it occupies, falling back to a
+ * reschedule IPI to the recorded CPU.
+ */
+static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu *vcpu)
+{
+	int cpu = vcpu->cpu;
+	struct swait_head *wqp;
+
+	wqp = kvm_arch_vcpu_wq(vcpu);
+	if (swaitqueue_active(wqp)) {
+		swait_wake_interruptible(wqp);
+		++vcpu->stat.halt_wakeup;
+	}
+
+	/*
+	 * Only add the thread id to a valid CPU number.  The range check
+	 * below allows for vcpu->cpu being negative; in that case
+	 * cpu + ptid could alias an unrelated, valid CPU and interrupt it.
+	 */
+	if (cpu >= 0 && kvmppc_ipi_thread(cpu + vcpu->arch.ptid))
+		return;
+
+	/* CPU points to the first thread of the core */
+	if (cpu >= 0 && cpu < nr_cpu_ids && cpu_online(cpu))
+		smp_send_reschedule(cpu);
+}
+
+/*
+ * We use the vcpu_load/put functions to measure stolen time.
+ * Stolen time is counted as time when either the vcpu is able to
+ * run as part of a virtual core, but the task running the vcore
+ * is preempted or sleeping, or when the vcpu needs something done
+ * in the kernel by the task running the vcpu, but that task is
+ * preempted or sleeping.  Those two things have to be counted
+ * separately, since one of the vcpu tasks will take on the job
+ * of running the core, and the other vcpu tasks in the vcore will
+ * sleep waiting for it to do that, but that sleep shouldn't count
+ * as stolen time.
+ *
+ * Hence we accumulate stolen time when the vcpu can run as part of
+ * a vcore using vc->stolen_tb, and the stolen time when the vcpu
+ * needs its task to do other things in the kernel (for example,
+ * service a page fault) in busy_stolen.  We don't accumulate
+ * stolen time for a vcore when it is inactive, or for a vcpu
+ * when it is in state RUNNING or NOTREADY.  NOTREADY is a bit of
+ * a misnomer; it means that the vcpu task is not executing in
+ * the KVM_VCPU_RUN ioctl, i.e.
it is in userspace or elsewhere in + * the kernel. We don't have any way of dividing up that time + * between time that the vcpu is genuinely stopped, time that + * the task is actively working on behalf of the vcpu, and time + * that the task is preempted, so we don't count any of it as + * stolen. + * + * Updates to busy_stolen are protected by arch.tbacct_lock; + * updates to vc->stolen_tb are protected by the vcore->stoltb_lock + * lock. The stolen times are measured in units of timebase ticks. + * (Note that the != TB_NIL checks below are purely defensive; + * they should never fail.) + */ + +static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, int cpu) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + unsigned long flags; + + /* + * We can test vc->runner without taking the vcore lock, + * because only this task ever sets vc->runner to this + * vcpu, and once it is set to this vcpu, only this task + * ever sets it to NULL. + */ + if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) { + spin_lock_irqsave(&vc->stoltb_lock, flags); + if (vc->preempt_tb != TB_NIL) { + vc->stolen_tb += mftb() - vc->preempt_tb; + vc->preempt_tb = TB_NIL; + } + spin_unlock_irqrestore(&vc->stoltb_lock, flags); + } + spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); + if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST && + vcpu->arch.busy_preempt != TB_NIL) { + vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt; + vcpu->arch.busy_preempt = TB_NIL; + } + spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); +} + +static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + unsigned long flags; + + if (vc->runner == vcpu && vc->vcore_state != VCORE_INACTIVE) { + spin_lock_irqsave(&vc->stoltb_lock, flags); + vc->preempt_tb = mftb(); + spin_unlock_irqrestore(&vc->stoltb_lock, flags); + } + spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags); + if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST) + vcpu->arch.busy_preempt 
= mftb(); + spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags); +} + +static void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr) +{ + vcpu->arch.shregs.msr = msr; + kvmppc_end_cede(vcpu); +} + +void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr) +{ + vcpu->arch.pvr = pvr; +} + +int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) +{ + unsigned long pcr = 0; + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + if (arch_compat) { + switch (arch_compat) { + case PVR_ARCH_205: + /* + * If an arch bit is set in PCR, all the defined + * higher-order arch bits also have to be set. + */ + pcr = PCR_ARCH_206 | PCR_ARCH_205; + break; + case PVR_ARCH_206: + case PVR_ARCH_206p: + pcr = PCR_ARCH_206; + break; + case PVR_ARCH_207: + break; + default: + return -EINVAL; + } + + if (!cpu_has_feature(CPU_FTR_ARCH_207S)) { + /* POWER7 can't emulate POWER8 */ + if (!(pcr & PCR_ARCH_206)) + return -EINVAL; + pcr &= ~PCR_ARCH_206; + } + } + + spin_lock(&vc->lock); + vc->arch_compat = arch_compat; + vc->pcr = pcr; + spin_unlock(&vc->lock); + + return 0; +} + +void kvmppc_dump_regs(struct kvm_vcpu *vcpu) +{ + int r; + + pr_err("vcpu %p (%d):\n", vcpu, vcpu->vcpu_id); + pr_err("pc = %.16lx msr = %.16llx trap = %x\n", + vcpu->arch.pc, vcpu->arch.shregs.msr, vcpu->arch.trap); + for (r = 0; r < 16; ++r) + pr_err("r%2d = %.16lx r%d = %.16lx\n", + r, kvmppc_get_gpr(vcpu, r), + r+16, kvmppc_get_gpr(vcpu, r+16)); + pr_err("ctr = %.16lx lr = %.16lx\n", + vcpu->arch.ctr, vcpu->arch.lr); + pr_err("srr0 = %.16llx srr1 = %.16llx\n", + vcpu->arch.shregs.srr0, vcpu->arch.shregs.srr1); + pr_err("sprg0 = %.16llx sprg1 = %.16llx\n", + vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1); + pr_err("sprg2 = %.16llx sprg3 = %.16llx\n", + vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3); + pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n", + vcpu->arch.cr, vcpu->arch.xer, vcpu->arch.shregs.dsisr); + pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar); + pr_err("fault dar = %.16lx dsisr = %.8x\n", 
+ vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); + pr_err("SLB (%d entries):\n", vcpu->arch.slb_max); + for (r = 0; r < vcpu->arch.slb_max; ++r) + pr_err(" ESID = %.16llx VSID = %.16llx\n", + vcpu->arch.slb[r].orige, vcpu->arch.slb[r].origv); + pr_err("lpcr = %.16lx sdr1 = %.16lx last_inst = %.8x\n", + vcpu->arch.vcore->lpcr, vcpu->kvm->arch.sdr1, + vcpu->arch.last_inst); +} + +struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id) +{ + int r; + struct kvm_vcpu *v, *ret = NULL; + + mutex_lock(&kvm->lock); + kvm_for_each_vcpu(r, v, kvm) { + if (v->vcpu_id == id) { + ret = v; + break; + } + } + mutex_unlock(&kvm->lock); + return ret; +} + +static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa) +{ + vpa->__old_status |= LPPACA_OLD_SHARED_PROC; + vpa->yield_count = cpu_to_be32(1); +} + +static int set_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *v, + unsigned long addr, unsigned long len) +{ + /* check address is cacheline aligned */ + if (addr & (L1_CACHE_BYTES - 1)) + return -EINVAL; + spin_lock(&vcpu->arch.vpa_update_lock); + if (v->next_gpa != addr || v->len != len) { + v->next_gpa = addr; + v->len = addr ? 
len : 0; + v->update_pending = 1; + } + spin_unlock(&vcpu->arch.vpa_update_lock); + return 0; +} + +/* Length for a per-processor buffer is passed in at offset 4 in the buffer */ +struct reg_vpa { + u32 dummy; + union { + __be16 hword; + __be32 word; + } length; +}; + +static int vpa_is_registered(struct kvmppc_vpa *vpap) +{ + if (vpap->update_pending) + return vpap->next_gpa != 0; + return vpap->pinned_addr != NULL; +} + +static unsigned long do_h_register_vpa(struct kvm_vcpu *vcpu, + unsigned long flags, + unsigned long vcpuid, unsigned long vpa) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long len, nb; + void *va; + struct kvm_vcpu *tvcpu; + int err; + int subfunc; + struct kvmppc_vpa *vpap; + + tvcpu = kvmppc_find_vcpu(kvm, vcpuid); + if (!tvcpu) + return H_PARAMETER; + + subfunc = (flags >> H_VPA_FUNC_SHIFT) & H_VPA_FUNC_MASK; + if (subfunc == H_VPA_REG_VPA || subfunc == H_VPA_REG_DTL || + subfunc == H_VPA_REG_SLB) { + /* Registering new area - address must be cache-line aligned */ + if ((vpa & (L1_CACHE_BYTES - 1)) || !vpa) + return H_PARAMETER; + + /* convert logical addr to kernel addr and read length */ + va = kvmppc_pin_guest_page(kvm, vpa, &nb); + if (va == NULL) + return H_PARAMETER; + if (subfunc == H_VPA_REG_VPA) + len = be16_to_cpu(((struct reg_vpa *)va)->length.hword); + else + len = be32_to_cpu(((struct reg_vpa *)va)->length.word); + kvmppc_unpin_guest_page(kvm, va, vpa, false); + + /* Check length */ + if (len > nb || len < sizeof(struct reg_vpa)) + return H_PARAMETER; + } else { + vpa = 0; + len = 0; + } + + err = H_PARAMETER; + vpap = NULL; + spin_lock(&tvcpu->arch.vpa_update_lock); + + switch (subfunc) { + case H_VPA_REG_VPA: /* register VPA */ + if (len < sizeof(struct lppaca)) + break; + vpap = &tvcpu->arch.vpa; + err = 0; + break; + + case H_VPA_REG_DTL: /* register DTL */ + if (len < sizeof(struct dtl_entry)) + break; + len -= len % sizeof(struct dtl_entry); + + /* Check that they have previously registered a VPA */ + err = H_RESOURCE; + 
if (!vpa_is_registered(&tvcpu->arch.vpa)) + break; + + vpap = &tvcpu->arch.dtl; + err = 0; + break; + + case H_VPA_REG_SLB: /* register SLB shadow buffer */ + /* Check that they have previously registered a VPA */ + err = H_RESOURCE; + if (!vpa_is_registered(&tvcpu->arch.vpa)) + break; + + vpap = &tvcpu->arch.slb_shadow; + err = 0; + break; + + case H_VPA_DEREG_VPA: /* deregister VPA */ + /* Check they don't still have a DTL or SLB buf registered */ + err = H_RESOURCE; + if (vpa_is_registered(&tvcpu->arch.dtl) || + vpa_is_registered(&tvcpu->arch.slb_shadow)) + break; + + vpap = &tvcpu->arch.vpa; + err = 0; + break; + + case H_VPA_DEREG_DTL: /* deregister DTL */ + vpap = &tvcpu->arch.dtl; + err = 0; + break; + + case H_VPA_DEREG_SLB: /* deregister SLB shadow buffer */ + vpap = &tvcpu->arch.slb_shadow; + err = 0; + break; + } + + if (vpap) { + vpap->next_gpa = vpa; + vpap->len = len; + vpap->update_pending = 1; + } + + spin_unlock(&tvcpu->arch.vpa_update_lock); + + return err; +} + +static void kvmppc_update_vpa(struct kvm_vcpu *vcpu, struct kvmppc_vpa *vpap) +{ + struct kvm *kvm = vcpu->kvm; + void *va; + unsigned long nb; + unsigned long gpa; + + /* + * We need to pin the page pointed to by vpap->next_gpa, + * but we can't call kvmppc_pin_guest_page under the lock + * as it does get_user_pages() and down_read(). So we + * have to drop the lock, pin the page, then get the lock + * again and check that a new area didn't get registered + * in the meantime. + */ + for (;;) { + gpa = vpap->next_gpa; + spin_unlock(&vcpu->arch.vpa_update_lock); + va = NULL; + nb = 0; + if (gpa) + va = kvmppc_pin_guest_page(kvm, gpa, &nb); + spin_lock(&vcpu->arch.vpa_update_lock); + if (gpa == vpap->next_gpa) + break; + /* sigh... 
unpin that one and try again */ + if (va) + kvmppc_unpin_guest_page(kvm, va, gpa, false); + } + + vpap->update_pending = 0; + if (va && nb < vpap->len) { + /* + * If it's now too short, it must be that userspace + * has changed the mappings underlying guest memory, + * so unregister the region. + */ + kvmppc_unpin_guest_page(kvm, va, gpa, false); + va = NULL; + } + if (vpap->pinned_addr) + kvmppc_unpin_guest_page(kvm, vpap->pinned_addr, vpap->gpa, + vpap->dirty); + vpap->gpa = gpa; + vpap->pinned_addr = va; + vpap->dirty = false; + if (va) + vpap->pinned_end = va + vpap->len; +} + +static void kvmppc_update_vpas(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.vpa.update_pending || + vcpu->arch.slb_shadow.update_pending || + vcpu->arch.dtl.update_pending)) + return; + + spin_lock(&vcpu->arch.vpa_update_lock); + if (vcpu->arch.vpa.update_pending) { + kvmppc_update_vpa(vcpu, &vcpu->arch.vpa); + if (vcpu->arch.vpa.pinned_addr) + init_vpa(vcpu, vcpu->arch.vpa.pinned_addr); + } + if (vcpu->arch.dtl.update_pending) { + kvmppc_update_vpa(vcpu, &vcpu->arch.dtl); + vcpu->arch.dtl_ptr = vcpu->arch.dtl.pinned_addr; + vcpu->arch.dtl_index = 0; + } + if (vcpu->arch.slb_shadow.update_pending) + kvmppc_update_vpa(vcpu, &vcpu->arch.slb_shadow); + spin_unlock(&vcpu->arch.vpa_update_lock); +} + +/* + * Return the accumulated stolen time for the vcore up until `now'. + * The caller should hold the vcore lock. 
+ */ +static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 now) +{ + u64 p; + unsigned long flags; + + spin_lock_irqsave(&vc->stoltb_lock, flags); + p = vc->stolen_tb; + if (vc->vcore_state != VCORE_INACTIVE && + vc->preempt_tb != TB_NIL) + p += now - vc->preempt_tb; + spin_unlock_irqrestore(&vc->stoltb_lock, flags); + return p; +} + +static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu, + struct kvmppc_vcore *vc) +{ + struct dtl_entry *dt; + struct lppaca *vpa; + unsigned long stolen; + unsigned long core_stolen; + u64 now; + + dt = vcpu->arch.dtl_ptr; + vpa = vcpu->arch.vpa.pinned_addr; + now = mftb(); + core_stolen = vcore_stolen_time(vc, now); + stolen = core_stolen - vcpu->arch.stolen_logged; + vcpu->arch.stolen_logged = core_stolen; + spin_lock_irq(&vcpu->arch.tbacct_lock); + stolen += vcpu->arch.busy_stolen; + vcpu->arch.busy_stolen = 0; + spin_unlock_irq(&vcpu->arch.tbacct_lock); + if (!dt || !vpa) + return; + memset(dt, 0, sizeof(struct dtl_entry)); + dt->dispatch_reason = 7; + dt->processor_id = cpu_to_be16(vc->pcpu + vcpu->arch.ptid); + dt->timebase = cpu_to_be64(now + vc->tb_offset); + dt->enqueue_to_dispatch_time = cpu_to_be32(stolen); + dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu)); + dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr); + ++dt; + if (dt == vcpu->arch.dtl.pinned_end) + dt = vcpu->arch.dtl.pinned_addr; + vcpu->arch.dtl_ptr = dt; + /* order writing *dt vs. 
writing vpa->dtl_idx */ + smp_wmb(); + vpa->dtl_idx = cpu_to_be64(++vcpu->arch.dtl_index); + vcpu->arch.dtl.dirty = true; +} + +static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207) + return true; + if ((!vcpu->arch.vcore->arch_compat) && + cpu_has_feature(CPU_FTR_ARCH_207S)) + return true; + return false; +} + +static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags, + unsigned long resource, unsigned long value1, + unsigned long value2) +{ + switch (resource) { + case H_SET_MODE_RESOURCE_SET_CIABR: + if (!kvmppc_power8_compatible(vcpu)) + return H_P2; + if (value2) + return H_P4; + if (mflags) + return H_UNSUPPORTED_FLAG_START; + /* Guests can't breakpoint the hypervisor */ + if ((value1 & CIABR_PRIV) == CIABR_PRIV_HYPER) + return H_P3; + vcpu->arch.ciabr = value1; + return H_SUCCESS; + case H_SET_MODE_RESOURCE_SET_DAWR: + if (!kvmppc_power8_compatible(vcpu)) + return H_P2; + if (mflags) + return H_UNSUPPORTED_FLAG_START; + if (value2 & DABRX_HYP) + return H_P4; + vcpu->arch.dawr = value1; + vcpu->arch.dawrx = value2; + return H_SUCCESS; + default: + return H_TOO_HARD; + } +} + +static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target) +{ + struct kvmppc_vcore *vcore = target->arch.vcore; + + /* + * We expect to have been called by the real mode handler + * (kvmppc_rm_h_confer()) which would have directly returned + * H_SUCCESS if the source vcore wasn't idle (e.g. if it may + * have useful work to do and should not confer) so we don't + * recheck that here. 
+ */ + + spin_lock(&vcore->lock); + if (target->arch.state == KVMPPC_VCPU_RUNNABLE && + vcore->vcore_state != VCORE_INACTIVE) + target = vcore->runner; + spin_unlock(&vcore->lock); + + return kvm_vcpu_yield_to(target); +} + +static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu) +{ + int yield_count = 0; + struct lppaca *lppaca; + + spin_lock(&vcpu->arch.vpa_update_lock); + lppaca = (struct lppaca *)vcpu->arch.vpa.pinned_addr; + if (lppaca) + yield_count = be32_to_cpu(lppaca->yield_count); + spin_unlock(&vcpu->arch.vpa_update_lock); + return yield_count; +} + +int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) +{ + unsigned long req = kvmppc_get_gpr(vcpu, 3); + unsigned long target, ret = H_SUCCESS; + int yield_count; + struct kvm_vcpu *tvcpu; + int idx, rc; + + if (req <= MAX_HCALL_OPCODE && + !test_bit(req/4, vcpu->kvm->arch.enabled_hcalls)) + return RESUME_HOST; + + switch (req) { + case H_CEDE: + break; + case H_PROD: + target = kvmppc_get_gpr(vcpu, 4); + tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); + if (!tvcpu) { + ret = H_PARAMETER; + break; + } + tvcpu->arch.prodded = 1; + smp_mb(); + /* + * Wake the prodded (target) vcpu rather than the caller; + * the kick helper wakes its wait queue and IPIs its thread. + */ + if (tvcpu->arch.ceded) + kvmppc_fast_vcpu_kick_hv(tvcpu); + break; + case H_CONFER: + target = kvmppc_get_gpr(vcpu, 4); + if (target == -1) + break; + tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); + if (!tvcpu) { + ret = H_PARAMETER; + break; + } + yield_count = kvmppc_get_gpr(vcpu, 5); + if (kvmppc_get_yield_count(tvcpu) != yield_count) + break; + kvm_arch_vcpu_yield_to(tvcpu); + break; + case H_REGISTER_VPA: + ret = do_h_register_vpa(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6)); + break; + case H_RTAS: + if (list_empty(&vcpu->kvm->arch.rtas_tokens)) + return RESUME_HOST; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + rc = kvmppc_rtas_hcall(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (rc == -ENOENT) + return RESUME_HOST; + else if (rc == 0) + 
break; + + /* Send the error out to userspace via KVM_RUN */ + return rc; + case H_LOGICAL_CI_LOAD: + ret = kvmppc_h_logical_ci_load(vcpu); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_LOGICAL_CI_STORE: + ret = kvmppc_h_logical_ci_store(vcpu); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_SET_MODE: + ret = kvmppc_h_set_mode(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6), + kvmppc_get_gpr(vcpu, 7)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_XIRR: + case H_CPPR: + case H_EOI: + case H_IPI: + case H_IPOLL: + case H_XIRR_X: + if (kvmppc_xics_enabled(vcpu)) { + ret = kvmppc_xics_hcall(vcpu, req); + break; + } /* fallthrough */ + default: + return RESUME_HOST; + } + kvmppc_set_gpr(vcpu, 3, ret); + vcpu->arch.hcall_needed = 0; + return RESUME_GUEST; +} + +static int kvmppc_hcall_impl_hv(unsigned long cmd) +{ + switch (cmd) { + case H_CEDE: + case H_PROD: + case H_CONFER: + case H_REGISTER_VPA: + case H_SET_MODE: + case H_LOGICAL_CI_LOAD: + case H_LOGICAL_CI_STORE: +#ifdef CONFIG_KVM_XICS + case H_XIRR: + case H_CPPR: + case H_EOI: + case H_IPI: + case H_IPOLL: + case H_XIRR_X: +#endif + return 1; + } + + /* See if it's in the real-mode table */ + return kvmppc_hcall_impl_hv_realmode(cmd); +} + +static int kvmppc_emulate_debug_inst(struct kvm_run *run, + struct kvm_vcpu *vcpu) +{ + u32 last_inst; + + if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) != + EMULATE_DONE) { + /* + * Fetch failed, so return to guest and + * try executing it again. 
+ */ + return RESUME_GUEST; + } + + if (last_inst == KVMPPC_INST_SW_BREAKPOINT) { + run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.address = kvmppc_get_pc(vcpu); + return RESUME_HOST; + } else { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + return RESUME_GUEST; + } +} + +static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, + struct task_struct *tsk) +{ + int r = RESUME_HOST; + + vcpu->stat.sum_exits++; + + run->exit_reason = KVM_EXIT_UNKNOWN; + run->ready_for_interrupt_injection = 1; + switch (vcpu->arch.trap) { + /* We're good on these - the host merely wanted to get our attention */ + case BOOK3S_INTERRUPT_HV_DECREMENTER: + vcpu->stat.dec_exits++; + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_EXTERNAL: + case BOOK3S_INTERRUPT_H_DOORBELL: + vcpu->stat.ext_intr_exits++; + r = RESUME_GUEST; + break; + /* HMI is hypervisor interrupt and host has handled it. Resume guest.*/ + case BOOK3S_INTERRUPT_HMI: + case BOOK3S_INTERRUPT_PERFMON: + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_MACHINE_CHECK: + /* + * Deliver a machine check interrupt to the guest. + * We have to do this, even if the host has handled the + * machine check, because machine checks use SRR0/1 and + * the interrupt might have trashed guest state in them. + */ + kvmppc_book3s_queue_irqprio(vcpu, + BOOK3S_INTERRUPT_MACHINE_CHECK); + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_PROGRAM: + { + ulong flags; + /* + * Normally program interrupts are delivered directly + * to the guest by the hardware, but we can get here + * as a result of a hypervisor emulation interrupt + * (e40) getting turned into a 700 by BML RTAS. + */ + flags = vcpu->arch.shregs.msr & 0x1f0000ull; + kvmppc_core_queue_program(vcpu, flags); + r = RESUME_GUEST; + break; + } + case BOOK3S_INTERRUPT_SYSCALL: + { + /* hcall - punt to userspace */ + int i; + + /* hypercall with MSR_PR has already been handled in rmode, + * and never reaches here. 
+ */ + + run->papr_hcall.nr = kvmppc_get_gpr(vcpu, 3); + for (i = 0; i < 9; ++i) + run->papr_hcall.args[i] = kvmppc_get_gpr(vcpu, 4 + i); + run->exit_reason = KVM_EXIT_PAPR_HCALL; + vcpu->arch.hcall_needed = 1; + r = RESUME_HOST; + break; + } + /* + * We get these next two if the guest accesses a page which it thinks + * it has mapped but which is not actually present, either because + * it is for an emulated I/O device or because the corresponding + * host page has been paged out. Any other HDSI/HISI interrupts + * have been handled already. + */ + case BOOK3S_INTERRUPT_H_DATA_STORAGE: + r = RESUME_PAGE_FAULT; + break; + case BOOK3S_INTERRUPT_H_INST_STORAGE: + vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); + vcpu->arch.fault_dsisr = 0; + r = RESUME_PAGE_FAULT; + break; + /* + * This occurs if the guest executes an illegal instruction. + * If the guest debug is disabled, generate a program interrupt + * to the guest. If guest debug is enabled, we need to check + * whether the instruction is a software breakpoint instruction. + * Accordingly return to Guest or Host. + */ + case BOOK3S_INTERRUPT_H_EMUL_ASSIST: + if (vcpu->arch.emul_inst != KVM_INST_FETCH_FAILED) + vcpu->arch.last_inst = kvmppc_need_byteswap(vcpu) ? + swab32(vcpu->arch.emul_inst) : + vcpu->arch.emul_inst; + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) { + r = kvmppc_emulate_debug_inst(run, vcpu); + } else { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + r = RESUME_GUEST; + } + break; + /* + * This occurs if the guest (kernel or userspace), does something that + * is prohibited by HFSCR. We just generate a program interrupt to + * the guest. 
+ */ + case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + r = RESUME_GUEST; + break; + default: + kvmppc_dump_regs(vcpu); + printk(KERN_EMERG "trap=0x%x | pc=0x%lx | msr=0x%llx\n", + vcpu->arch.trap, kvmppc_get_pc(vcpu), + vcpu->arch.shregs.msr); + run->hw.hardware_exit_reason = vcpu->arch.trap; + r = RESUME_HOST; + break; + } + + return r; +} + +static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + int i; + + memset(sregs, 0, sizeof(struct kvm_sregs)); + sregs->pvr = vcpu->arch.pvr; + for (i = 0; i < vcpu->arch.slb_max; i++) { + sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige; + sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; + } + + return 0; +} + +static int kvm_arch_vcpu_ioctl_set_sregs_hv(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + int i, j; + + /* Only accept the same PVR as the host's, since we can't spoof it */ + if (sregs->pvr != vcpu->arch.pvr) + return -EINVAL; + + j = 0; + for (i = 0; i < vcpu->arch.slb_nr; i++) { + if (sregs->u.s.ppc64.slb[i].slbe & SLB_ESID_V) { + vcpu->arch.slb[j].orige = sregs->u.s.ppc64.slb[i].slbe; + vcpu->arch.slb[j].origv = sregs->u.s.ppc64.slb[i].slbv; + ++j; + } + } + vcpu->arch.slb_max = j; + + return 0; +} + +static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, + bool preserve_top32) +{ + struct kvm *kvm = vcpu->kvm; + struct kvmppc_vcore *vc = vcpu->arch.vcore; + u64 mask; + + mutex_lock(&kvm->lock); + spin_lock(&vc->lock); + /* + * If ILE (interrupt little-endian) has changed, update the + * MSR_LE bit in the intr_msr for each vcpu in this vcore. 
+ */ + if ((new_lpcr & LPCR_ILE) != (vc->lpcr & LPCR_ILE)) { + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.vcore != vc) + continue; + if (new_lpcr & LPCR_ILE) + vcpu->arch.intr_msr |= MSR_LE; + else + vcpu->arch.intr_msr &= ~MSR_LE; + } + } + + /* + * Userspace can only modify DPFD (default prefetch depth), + * ILE (interrupt little-endian) and TC (translation control). + * On POWER8 userspace can also modify AIL (alt. interrupt loc.) + */ + mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + mask |= LPCR_AIL; + + /* Broken 32-bit version of LPCR must not clear top bits */ + if (preserve_top32) + mask &= 0xFFFFFFFF; + vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask); + spin_unlock(&vc->lock); + mutex_unlock(&kvm->lock); +} + +static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + long int i; + + switch (id) { + case KVM_REG_PPC_DEBUG_INST: + *val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT); + break; + case KVM_REG_PPC_HIOR: + *val = get_reg_val(id, 0); + break; + case KVM_REG_PPC_DABR: + *val = get_reg_val(id, vcpu->arch.dabr); + break; + case KVM_REG_PPC_DABRX: + *val = get_reg_val(id, vcpu->arch.dabrx); + break; + case KVM_REG_PPC_DSCR: + *val = get_reg_val(id, vcpu->arch.dscr); + break; + case KVM_REG_PPC_PURR: + *val = get_reg_val(id, vcpu->arch.purr); + break; + case KVM_REG_PPC_SPURR: + *val = get_reg_val(id, vcpu->arch.spurr); + break; + case KVM_REG_PPC_AMR: + *val = get_reg_val(id, vcpu->arch.amr); + break; + case KVM_REG_PPC_UAMOR: + *val = get_reg_val(id, vcpu->arch.uamor); + break; + case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS: + i = id - KVM_REG_PPC_MMCR0; + *val = get_reg_val(id, vcpu->arch.mmcr[i]); + break; + case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: + i = id - KVM_REG_PPC_PMC1; + *val = get_reg_val(id, vcpu->arch.pmc[i]); + break; + case KVM_REG_PPC_SPMC1 ... 
KVM_REG_PPC_SPMC2: + i = id - KVM_REG_PPC_SPMC1; + *val = get_reg_val(id, vcpu->arch.spmc[i]); + break; + case KVM_REG_PPC_SIAR: + *val = get_reg_val(id, vcpu->arch.siar); + break; + case KVM_REG_PPC_SDAR: + *val = get_reg_val(id, vcpu->arch.sdar); + break; + case KVM_REG_PPC_SIER: + *val = get_reg_val(id, vcpu->arch.sier); + break; + case KVM_REG_PPC_IAMR: + *val = get_reg_val(id, vcpu->arch.iamr); + break; + case KVM_REG_PPC_PSPB: + *val = get_reg_val(id, vcpu->arch.pspb); + break; + case KVM_REG_PPC_DPDES: + *val = get_reg_val(id, vcpu->arch.vcore->dpdes); + break; + case KVM_REG_PPC_DAWR: + *val = get_reg_val(id, vcpu->arch.dawr); + break; + case KVM_REG_PPC_DAWRX: + *val = get_reg_val(id, vcpu->arch.dawrx); + break; + case KVM_REG_PPC_CIABR: + *val = get_reg_val(id, vcpu->arch.ciabr); + break; + case KVM_REG_PPC_CSIGR: + *val = get_reg_val(id, vcpu->arch.csigr); + break; + case KVM_REG_PPC_TACR: + *val = get_reg_val(id, vcpu->arch.tacr); + break; + case KVM_REG_PPC_TCSCR: + *val = get_reg_val(id, vcpu->arch.tcscr); + break; + case KVM_REG_PPC_PID: + *val = get_reg_val(id, vcpu->arch.pid); + break; + case KVM_REG_PPC_ACOP: + *val = get_reg_val(id, vcpu->arch.acop); + break; + case KVM_REG_PPC_WORT: + *val = get_reg_val(id, vcpu->arch.wort); + break; + case KVM_REG_PPC_VPA_ADDR: + spin_lock(&vcpu->arch.vpa_update_lock); + *val = get_reg_val(id, vcpu->arch.vpa.next_gpa); + spin_unlock(&vcpu->arch.vpa_update_lock); + break; + case KVM_REG_PPC_VPA_SLB: + spin_lock(&vcpu->arch.vpa_update_lock); + val->vpaval.addr = vcpu->arch.slb_shadow.next_gpa; + val->vpaval.length = vcpu->arch.slb_shadow.len; + spin_unlock(&vcpu->arch.vpa_update_lock); + break; + case KVM_REG_PPC_VPA_DTL: + spin_lock(&vcpu->arch.vpa_update_lock); + val->vpaval.addr = vcpu->arch.dtl.next_gpa; + val->vpaval.length = vcpu->arch.dtl.len; + spin_unlock(&vcpu->arch.vpa_update_lock); + break; + case KVM_REG_PPC_TB_OFFSET: + *val = get_reg_val(id, vcpu->arch.vcore->tb_offset); + break; + case 
KVM_REG_PPC_LPCR: + case KVM_REG_PPC_LPCR_64: + *val = get_reg_val(id, vcpu->arch.vcore->lpcr); + break; + case KVM_REG_PPC_PPR: + *val = get_reg_val(id, vcpu->arch.ppr); + break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case KVM_REG_PPC_TFHAR: + *val = get_reg_val(id, vcpu->arch.tfhar); + break; + case KVM_REG_PPC_TFIAR: + *val = get_reg_val(id, vcpu->arch.tfiar); + break; + case KVM_REG_PPC_TEXASR: + *val = get_reg_val(id, vcpu->arch.texasr); + break; + case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31: + i = id - KVM_REG_PPC_TM_GPR0; + *val = get_reg_val(id, vcpu->arch.gpr_tm[i]); + break; + case KVM_REG_PPC_TM_VSR0 ... KVM_REG_PPC_TM_VSR63: + { + int j; + i = id - KVM_REG_PPC_TM_VSR0; + if (i < 32) + for (j = 0; j < TS_FPRWIDTH; j++) + val->vsxval[j] = vcpu->arch.fp_tm.fpr[i][j]; + else { + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + val->vval = vcpu->arch.vr_tm.vr[i-32]; + else + r = -ENXIO; + } + break; + } + case KVM_REG_PPC_TM_CR: + *val = get_reg_val(id, vcpu->arch.cr_tm); + break; + case KVM_REG_PPC_TM_LR: + *val = get_reg_val(id, vcpu->arch.lr_tm); + break; + case KVM_REG_PPC_TM_CTR: + *val = get_reg_val(id, vcpu->arch.ctr_tm); + break; + case KVM_REG_PPC_TM_FPSCR: + *val = get_reg_val(id, vcpu->arch.fp_tm.fpscr); + break; + case KVM_REG_PPC_TM_AMR: + *val = get_reg_val(id, vcpu->arch.amr_tm); + break; + case KVM_REG_PPC_TM_PPR: + *val = get_reg_val(id, vcpu->arch.ppr_tm); + break; + case KVM_REG_PPC_TM_VRSAVE: + *val = get_reg_val(id, vcpu->arch.vrsave_tm); + break; + case KVM_REG_PPC_TM_VSCR: + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + *val = get_reg_val(id, vcpu->arch.vr_tm.vscr.u[3]); + else + r = -ENXIO; + break; + case KVM_REG_PPC_TM_DSCR: + *val = get_reg_val(id, vcpu->arch.dscr_tm); + break; + case KVM_REG_PPC_TM_TAR: + *val = get_reg_val(id, vcpu->arch.tar_tm); + break; +#endif + case KVM_REG_PPC_ARCH_COMPAT: + *val = get_reg_val(id, vcpu->arch.vcore->arch_compat); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +static int 
kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + long int i; + unsigned long addr, len; + + switch (id) { + case KVM_REG_PPC_HIOR: + /* Only allow this to be set to zero */ + if (set_reg_val(id, *val)) + r = -EINVAL; + break; + case KVM_REG_PPC_DABR: + vcpu->arch.dabr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_DABRX: + vcpu->arch.dabrx = set_reg_val(id, *val) & ~DABRX_HYP; + break; + case KVM_REG_PPC_DSCR: + vcpu->arch.dscr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PURR: + vcpu->arch.purr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SPURR: + vcpu->arch.spurr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_AMR: + vcpu->arch.amr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_UAMOR: + vcpu->arch.uamor = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MMCR0 ... KVM_REG_PPC_MMCRS: + i = id - KVM_REG_PPC_MMCR0; + vcpu->arch.mmcr[i] = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PMC1 ... KVM_REG_PPC_PMC8: + i = id - KVM_REG_PPC_PMC1; + vcpu->arch.pmc[i] = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SPMC1 ... 
KVM_REG_PPC_SPMC2: + i = id - KVM_REG_PPC_SPMC1; + vcpu->arch.spmc[i] = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SIAR: + vcpu->arch.siar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SDAR: + vcpu->arch.sdar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_SIER: + vcpu->arch.sier = set_reg_val(id, *val); + break; + case KVM_REG_PPC_IAMR: + vcpu->arch.iamr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PSPB: + vcpu->arch.pspb = set_reg_val(id, *val); + break; + case KVM_REG_PPC_DPDES: + vcpu->arch.vcore->dpdes = set_reg_val(id, *val); + break; + case KVM_REG_PPC_DAWR: + vcpu->arch.dawr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_DAWRX: + vcpu->arch.dawrx = set_reg_val(id, *val) & ~DAWRX_HYP; + break; + case KVM_REG_PPC_CIABR: + vcpu->arch.ciabr = set_reg_val(id, *val); + /* Don't allow setting breakpoints in hypervisor code */ + if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER) + vcpu->arch.ciabr &= ~CIABR_PRIV; /* disable */ + break; + case KVM_REG_PPC_CSIGR: + vcpu->arch.csigr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TACR: + vcpu->arch.tacr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TCSCR: + vcpu->arch.tcscr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PID: + vcpu->arch.pid = set_reg_val(id, *val); + break; + case KVM_REG_PPC_ACOP: + vcpu->arch.acop = set_reg_val(id, *val); + break; + case KVM_REG_PPC_WORT: + vcpu->arch.wort = set_reg_val(id, *val); + break; + case KVM_REG_PPC_VPA_ADDR: + addr = set_reg_val(id, *val); + r = -EINVAL; + if (!addr && (vcpu->arch.slb_shadow.next_gpa || + vcpu->arch.dtl.next_gpa)) + break; + r = set_vpa(vcpu, &vcpu->arch.vpa, addr, sizeof(struct lppaca)); + break; + case KVM_REG_PPC_VPA_SLB: + addr = val->vpaval.addr; + len = val->vpaval.length; + r = -EINVAL; + if (addr && !vcpu->arch.vpa.next_gpa) + break; + r = set_vpa(vcpu, &vcpu->arch.slb_shadow, addr, len); + break; + case KVM_REG_PPC_VPA_DTL: + addr = val->vpaval.addr; + len = val->vpaval.length; + r = 
-EINVAL; + if (addr && (len < sizeof(struct dtl_entry) || + !vcpu->arch.vpa.next_gpa)) + break; + len -= len % sizeof(struct dtl_entry); + r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); + break; + case KVM_REG_PPC_TB_OFFSET: + /* round up to multiple of 2^24 */ + vcpu->arch.vcore->tb_offset = + ALIGN(set_reg_val(id, *val), 1UL << 24); + break; + case KVM_REG_PPC_LPCR: + kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), true); + break; + case KVM_REG_PPC_LPCR_64: + kvmppc_set_lpcr(vcpu, set_reg_val(id, *val), false); + break; + case KVM_REG_PPC_PPR: + vcpu->arch.ppr = set_reg_val(id, *val); + break; +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + case KVM_REG_PPC_TFHAR: + vcpu->arch.tfhar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TFIAR: + vcpu->arch.tfiar = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TEXASR: + vcpu->arch.texasr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_GPR0 ... KVM_REG_PPC_TM_GPR31: + i = id - KVM_REG_PPC_TM_GPR0; + vcpu->arch.gpr_tm[i] = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_VSR0 ... 
KVM_REG_PPC_TM_VSR63: + { + int j; + i = id - KVM_REG_PPC_TM_VSR0; + if (i < 32) + for (j = 0; j < TS_FPRWIDTH; j++) + vcpu->arch.fp_tm.fpr[i][j] = val->vsxval[j]; + else + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + vcpu->arch.vr_tm.vr[i-32] = val->vval; + else + r = -ENXIO; + break; + } + case KVM_REG_PPC_TM_CR: + vcpu->arch.cr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_LR: + vcpu->arch.lr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_CTR: + vcpu->arch.ctr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_FPSCR: + vcpu->arch.fp_tm.fpscr = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_AMR: + vcpu->arch.amr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_PPR: + vcpu->arch.ppr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_VRSAVE: + vcpu->arch.vrsave_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_VSCR: + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + vcpu->arch.vr.vscr.u[3] = set_reg_val(id, *val); + else + r = - ENXIO; + break; + case KVM_REG_PPC_TM_DSCR: + vcpu->arch.dscr_tm = set_reg_val(id, *val); + break; + case KVM_REG_PPC_TM_TAR: + vcpu->arch.tar_tm = set_reg_val(id, *val); + break; +#endif + case KVM_REG_PPC_ARCH_COMPAT: + r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val)); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) +{ + struct kvmppc_vcore *vcore; + + vcore = kzalloc(sizeof(struct kvmppc_vcore), GFP_KERNEL); + + if (vcore == NULL) + return NULL; + + INIT_LIST_HEAD(&vcore->runnable_threads); + spin_lock_init(&vcore->lock); + spin_lock_init(&vcore->stoltb_lock); + init_swait_head(&vcore->wq); + vcore->preempt_tb = TB_NIL; + vcore->lpcr = kvm->arch.lpcr; + vcore->first_vcpuid = core * threads_per_subcore; + vcore->kvm = kvm; + + vcore->mpp_buffer_is_valid = false; + + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + vcore->mpp_buffer = (void *)__get_free_pages( + GFP_KERNEL|__GFP_ZERO, + 
MPP_BUFFER_ORDER); + + return vcore; +} + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING +static struct debugfs_timings_element { + const char *name; + size_t offset; +} timings[] = { + {"rm_entry", offsetof(struct kvm_vcpu, arch.rm_entry)}, + {"rm_intr", offsetof(struct kvm_vcpu, arch.rm_intr)}, + {"rm_exit", offsetof(struct kvm_vcpu, arch.rm_exit)}, + {"guest", offsetof(struct kvm_vcpu, arch.guest_time)}, + {"cede", offsetof(struct kvm_vcpu, arch.cede_time)}, +}; + +#define N_TIMINGS (sizeof(timings) / sizeof(timings[0])) + +struct debugfs_timings_state { + struct kvm_vcpu *vcpu; + unsigned int buflen; + char buf[N_TIMINGS * 100]; +}; + +static int debugfs_timings_open(struct inode *inode, struct file *file) +{ + struct kvm_vcpu *vcpu = inode->i_private; + struct debugfs_timings_state *p; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + kvm_get_kvm(vcpu->kvm); + p->vcpu = vcpu; + file->private_data = p; + + return nonseekable_open(inode, file); +} + +static int debugfs_timings_release(struct inode *inode, struct file *file) +{ + struct debugfs_timings_state *p = file->private_data; + + kvm_put_kvm(p->vcpu->kvm); + kfree(p); + return 0; +} + +static ssize_t debugfs_timings_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + struct debugfs_timings_state *p = file->private_data; + struct kvm_vcpu *vcpu = p->vcpu; + char *s, *buf_end; + struct kvmhv_tb_accumulator tb; + u64 count; + loff_t pos; + ssize_t n; + int i, loops; + bool ok; + + if (!p->buflen) { + s = p->buf; + buf_end = s + sizeof(p->buf); + for (i = 0; i < N_TIMINGS; ++i) { + struct kvmhv_tb_accumulator *acc; + + acc = (struct kvmhv_tb_accumulator *) + ((unsigned long)vcpu + timings[i].offset); + ok = false; + for (loops = 0; loops < 1000; ++loops) { + count = acc->seqcount; + if (!(count & 1)) { + smp_rmb(); + tb = *acc; + smp_rmb(); + if (count == acc->seqcount) { + ok = true; + break; + } + } + udelay(1); + } + if (!ok) + snprintf(s, buf_end - s, "%s: stuck\n", 
+ timings[i].name); + else + snprintf(s, buf_end - s, + "%s: %llu %llu %llu %llu\n", + timings[i].name, count / 2, + tb_to_ns(tb.tb_total), + tb_to_ns(tb.tb_min), + tb_to_ns(tb.tb_max)); + s += strlen(s); + } + p->buflen = s - p->buf; + } + + pos = *ppos; + if (pos >= p->buflen) + return 0; + if (len > p->buflen - pos) + len = p->buflen - pos; + n = copy_to_user(buf, p->buf + pos, len); + if (n) { + if (n == len) + return -EFAULT; + len -= n; + } + *ppos = pos + len; + return len; +} + +static ssize_t debugfs_timings_write(struct file *file, const char __user *buf, + size_t len, loff_t *ppos) +{ + return -EACCES; +} + +static const struct file_operations debugfs_timings_ops = { + .owner = THIS_MODULE, + .open = debugfs_timings_open, + .release = debugfs_timings_release, + .read = debugfs_timings_read, + .write = debugfs_timings_write, + .llseek = generic_file_llseek, +}; + +/* Create a debugfs directory for the vcpu */ +static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) +{ + char buf[16]; + struct kvm *kvm = vcpu->kvm; + + snprintf(buf, sizeof(buf), "vcpu%u", id); + if (IS_ERR_OR_NULL(kvm->arch.debugfs_dir)) + return; + vcpu->arch.debugfs_dir = debugfs_create_dir(buf, kvm->arch.debugfs_dir); + if (IS_ERR_OR_NULL(vcpu->arch.debugfs_dir)) + return; + vcpu->arch.debugfs_timings = + debugfs_create_file("timings", 0444, vcpu->arch.debugfs_dir, + vcpu, &debugfs_timings_ops); +} + +#else /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ +static void debugfs_vcpu_init(struct kvm_vcpu *vcpu, unsigned int id) +{ +} +#endif /* CONFIG_KVM_BOOK3S_HV_EXIT_TIMING */ + +static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, + unsigned int id) +{ + struct kvm_vcpu *vcpu; + int err = -EINVAL; + int core; + struct kvmppc_vcore *vcore; + + core = id / threads_per_subcore; + if (core >= KVM_MAX_VCORES) + goto out; + + err = -ENOMEM; + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!vcpu) + goto out; + + err = kvm_vcpu_init(vcpu, kvm, id); + if (err) + 
goto free_vcpu; + + vcpu->arch.shared = &vcpu->arch.shregs; +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + /* + * The shared struct is never shared on HV, + * so we can always use host endianness + */ +#ifdef __BIG_ENDIAN__ + vcpu->arch.shared_big_endian = true; +#else + vcpu->arch.shared_big_endian = false; +#endif +#endif + vcpu->arch.mmcr[0] = MMCR0_FC; + vcpu->arch.ctrl = CTRL_RUNLATCH; + /* default to host PVR, since we can't spoof it */ + kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR)); + spin_lock_init(&vcpu->arch.vpa_update_lock); + spin_lock_init(&vcpu->arch.tbacct_lock); + vcpu->arch.busy_preempt = TB_NIL; + vcpu->arch.intr_msr = MSR_SF | MSR_ME; + + kvmppc_mmu_book3s_hv_init(vcpu); + + vcpu->arch.state = KVMPPC_VCPU_NOTREADY; + + init_waitqueue_head(&vcpu->arch.cpu_run); + + mutex_lock(&kvm->lock); + vcore = kvm->arch.vcores[core]; + if (!vcore) { + vcore = kvmppc_vcore_create(kvm, core); + kvm->arch.vcores[core] = vcore; + kvm->arch.online_vcores++; + } + mutex_unlock(&kvm->lock); + + if (!vcore) + goto free_vcpu; + + spin_lock(&vcore->lock); + ++vcore->num_threads; + spin_unlock(&vcore->lock); + vcpu->arch.vcore = vcore; + vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid; + + vcpu->arch.cpu_type = KVM_CPU_3S_64; + kvmppc_sanity_check(vcpu); + + debugfs_vcpu_init(vcpu, id); + + return vcpu; + +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu); +out: + return ERR_PTR(err); +} + +static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa) +{ + if (vpa->pinned_addr) + kvmppc_unpin_guest_page(kvm, vpa->pinned_addr, vpa->gpa, + vpa->dirty); +} + +static void kvmppc_core_vcpu_free_hv(struct kvm_vcpu *vcpu) +{ + spin_lock(&vcpu->arch.vpa_update_lock); + unpin_vpa(vcpu->kvm, &vcpu->arch.dtl); + unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow); + unpin_vpa(vcpu->kvm, &vcpu->arch.vpa); + spin_unlock(&vcpu->arch.vpa_update_lock); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu); +} + +static int kvmppc_core_check_requests_hv(struct kvm_vcpu *vcpu) +{ + /* 
Indicate we want to get back into the guest */ + return 1; +} + +static void kvmppc_set_timer(struct kvm_vcpu *vcpu) +{ + unsigned long dec_nsec, now; + + now = get_tb(); + if (now > vcpu->arch.dec_expires) { + /* decrementer has already gone negative */ + kvmppc_core_queue_dec(vcpu); + kvmppc_core_prepare_to_enter(vcpu); + return; + } + dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC + / tb_ticks_per_sec; + hrtimer_start(&vcpu->arch.dec_timer, ktime_set(0, dec_nsec), + HRTIMER_MODE_REL); + vcpu->arch.timer_running = 1; +} + +static void kvmppc_end_cede(struct kvm_vcpu *vcpu) +{ + vcpu->arch.ceded = 0; + if (vcpu->arch.timer_running) { + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + vcpu->arch.timer_running = 0; + } +} + +extern void __kvmppc_vcore_entry(void); + +static void kvmppc_remove_runnable(struct kvmppc_vcore *vc, + struct kvm_vcpu *vcpu) +{ + u64 now; + + if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) + return; + spin_lock_irq(&vcpu->arch.tbacct_lock); + now = mftb(); + vcpu->arch.busy_stolen += vcore_stolen_time(vc, now) - + vcpu->arch.stolen_logged; + vcpu->arch.busy_preempt = now; + vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; + spin_unlock_irq(&vcpu->arch.tbacct_lock); + --vc->n_runnable; + list_del(&vcpu->arch.run_list); +} + +static int kvmppc_grab_hwthread(int cpu) +{ + struct paca_struct *tpaca; + long timeout = 10000; + + tpaca = &paca[cpu]; + + /* Ensure the thread won't go into the kernel if it wakes */ + tpaca->kvm_hstate.kvm_vcpu = NULL; + tpaca->kvm_hstate.napping = 0; + smp_wmb(); + tpaca->kvm_hstate.hwthread_req = 1; + + /* + * If the thread is already executing in the kernel (e.g. handling + * a stray interrupt), wait for it to get back to nap mode. 
+ * The smp_mb() is to ensure that our setting of hwthread_req + * is visible before we look at hwthread_state, so if this + * races with the code at system_reset_pSeries and the thread + * misses our setting of hwthread_req, we are sure to see its + * setting of hwthread_state, and vice versa. + */ + smp_mb(); + while (tpaca->kvm_hstate.hwthread_state == KVM_HWTHREAD_IN_KERNEL) { + if (--timeout <= 0) { + pr_err("KVM: couldn't grab cpu %d\n", cpu); + return -EBUSY; + } + udelay(1); + } + return 0; +} + +static void kvmppc_release_hwthread(int cpu) +{ + struct paca_struct *tpaca; + + tpaca = &paca[cpu]; + tpaca->kvm_hstate.hwthread_req = 0; + tpaca->kvm_hstate.kvm_vcpu = NULL; +} + +static void kvmppc_start_thread(struct kvm_vcpu *vcpu) +{ + int cpu; + struct paca_struct *tpaca; + struct kvmppc_vcore *vc = vcpu->arch.vcore; + + if (vcpu->arch.timer_running) { + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + vcpu->arch.timer_running = 0; + } + cpu = vc->pcpu + vcpu->arch.ptid; + tpaca = &paca[cpu]; + tpaca->kvm_hstate.kvm_vcore = vc; + tpaca->kvm_hstate.ptid = vcpu->arch.ptid; + vcpu->cpu = vc->pcpu; + /* Order stores to hstate.kvm_vcore etc. before store to kvm_vcpu */ + smp_wmb(); + tpaca->kvm_hstate.kvm_vcpu = vcpu; + if (cpu != smp_processor_id()) + kvmppc_ipi_thread(cpu); +} + +static void kvmppc_wait_for_nap(void) +{ + int cpu = smp_processor_id(); + int i, loops; + + for (loops = 0; loops < 1000000; ++loops) { + /* + * Check if all threads are finished. + * We set the vcpu pointer when starting a thread + * and the thread clears it when finished, so we look + * for any threads that still have a non-NULL vcpu ptr. 
+ */ + for (i = 1; i < threads_per_subcore; ++i) + if (paca[cpu + i].kvm_hstate.kvm_vcpu) + break; + if (i == threads_per_subcore) { + HMT_medium(); + return; + } + HMT_low(); + } + HMT_medium(); + for (i = 1; i < threads_per_subcore; ++i) + if (paca[cpu + i].kvm_hstate.kvm_vcpu) + pr_err("KVM: CPU %d seems to be stuck\n", cpu + i); +} + +/* + * Check that we are on thread 0 and that any other threads in + * this core are off-line. Then grab the threads so they can't + * enter the kernel. + */ +static int on_primary_thread(void) +{ + int cpu = smp_processor_id(); + int thr; + + /* Are we on a primary subcore? */ + if (cpu_thread_in_subcore(cpu)) + return 0; + + thr = 0; + while (++thr < threads_per_subcore) + if (cpu_online(cpu + thr)) + return 0; + + /* Grab all hw threads so they can't go into the kernel */ + for (thr = 1; thr < threads_per_subcore; ++thr) { + if (kvmppc_grab_hwthread(cpu + thr)) { + /* Couldn't grab one; let the others go */ + do { + kvmppc_release_hwthread(cpu + thr); + } while (--thr > 0); + return 0; + } + } + return 1; +} + +static void kvmppc_start_saving_l2_cache(struct kvmppc_vcore *vc) +{ + phys_addr_t phy_addr, mpp_addr; + + phy_addr = (phys_addr_t)virt_to_phys(vc->mpp_buffer); + mpp_addr = phy_addr & PPC_MPPE_ADDRESS_MASK; + + mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_ABORT); + logmpp(mpp_addr | PPC_LOGMPP_LOG_L2); + + vc->mpp_buffer_is_valid = true; +} + +static void kvmppc_start_restoring_l2_cache(const struct kvmppc_vcore *vc) +{ + phys_addr_t phy_addr, mpp_addr; + + phy_addr = virt_to_phys(vc->mpp_buffer); + mpp_addr = phy_addr & PPC_MPPE_ADDRESS_MASK; + + /* We must abort any in-progress save operations to ensure + * the table is valid so that prefetch engine knows when to + * stop prefetching. 
*/ + logmpp(mpp_addr | PPC_LOGMPP_LOG_ABORT); + mtspr(SPRN_MPPR, mpp_addr | PPC_MPPR_FETCH_WHOLE_TABLE); +} + +static void prepare_threads(struct kvmppc_vcore *vc) +{ + struct kvm_vcpu *vcpu, *vnext; + + list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, + arch.run_list) { + if (signal_pending(vcpu->arch.run_task)) + vcpu->arch.ret = -EINTR; + else if (vcpu->arch.vpa.update_pending || + vcpu->arch.slb_shadow.update_pending || + vcpu->arch.dtl.update_pending) + vcpu->arch.ret = RESUME_GUEST; + else + continue; + kvmppc_remove_runnable(vc, vcpu); + wake_up(&vcpu->arch.cpu_run); + } +} + +static void post_guest_process(struct kvmppc_vcore *vc) +{ + u64 now; + long ret; + struct kvm_vcpu *vcpu, *vnext; + + now = get_tb(); + list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, + arch.run_list) { + /* cancel pending dec exception if dec is positive */ + if (now < vcpu->arch.dec_expires && + kvmppc_core_pending_dec(vcpu)) + kvmppc_core_dequeue_dec(vcpu); + + trace_kvm_guest_exit(vcpu); + + ret = RESUME_GUEST; + if (vcpu->arch.trap) + ret = kvmppc_handle_exit_hv(vcpu->arch.kvm_run, vcpu, + vcpu->arch.run_task); + + vcpu->arch.ret = ret; + vcpu->arch.trap = 0; + + if (vcpu->arch.ceded) { + if (!is_kvmppc_resume_guest(ret)) + kvmppc_end_cede(vcpu); + else + kvmppc_set_timer(vcpu); + } + if (!is_kvmppc_resume_guest(vcpu->arch.ret)) { + kvmppc_remove_runnable(vc, vcpu); + wake_up(&vcpu->arch.cpu_run); + } + } +} + +/* + * Run a set of guest threads on a physical core. + * Called with vc->lock held. + */ +static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) +{ + struct kvm_vcpu *vcpu, *vnext; + int i; + int srcu_idx; + + /* + * Remove from the list any threads that have a signal pending + * or need a VPA update done + */ + prepare_threads(vc); + + /* if the runner is no longer runnable, let the caller pick a new one */ + if (vc->runner->arch.state != KVMPPC_VCPU_RUNNABLE) + return; + + /* + * Initialize *vc. 
+ */ + vc->entry_exit_map = 0; + vc->preempt_tb = TB_NIL; + vc->in_guest = 0; + vc->napping_threads = 0; + vc->conferring_threads = 0; + + /* + * Make sure we are running on primary threads, and that secondary + * threads are offline. Also check if the number of threads in this + * guest are greater than the current system threads per guest. + */ + if ((threads_per_core > 1) && + ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { + list_for_each_entry_safe(vcpu, vnext, &vc->runnable_threads, + arch.run_list) { + vcpu->arch.ret = -EBUSY; + kvmppc_remove_runnable(vc, vcpu); + wake_up(&vcpu->arch.cpu_run); + } + goto out; + } + + + vc->pcpu = smp_processor_id(); + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { + kvmppc_start_thread(vcpu); + kvmppc_create_dtl_entry(vcpu, vc); + trace_kvm_guest_enter(vcpu); + } + + /* Set this explicitly in case thread 0 doesn't have a vcpu */ + get_paca()->kvm_hstate.kvm_vcore = vc; + get_paca()->kvm_hstate.ptid = 0; + + vc->vcore_state = VCORE_RUNNING; + preempt_disable(); + + trace_kvmppc_run_core(vc, 0); + + spin_unlock(&vc->lock); + + kvm_guest_enter(); + + srcu_idx = srcu_read_lock(&vc->kvm->srcu); + + if (vc->mpp_buffer_is_valid) + kvmppc_start_restoring_l2_cache(vc); + + __kvmppc_vcore_entry(); + + spin_lock(&vc->lock); + + if (vc->mpp_buffer) + kvmppc_start_saving_l2_cache(vc); + + /* disable sending of IPIs on virtual external irqs */ + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) + vcpu->cpu = -1; + /* wait for secondary threads to finish writing their state to memory */ + kvmppc_wait_for_nap(); + for (i = 0; i < threads_per_subcore; ++i) + kvmppc_release_hwthread(vc->pcpu + i); + /* prevent other vcpu threads from doing kvmppc_start_thread() now */ + vc->vcore_state = VCORE_EXITING; + spin_unlock(&vc->lock); + + srcu_read_unlock(&vc->kvm->srcu, srcu_idx); + + /* make sure updates to secondary vcpu structs are visible now */ + smp_mb(); + kvm_guest_exit(); + + 
preempt_enable(); + + spin_lock(&vc->lock); + post_guest_process(vc); + + out: + vc->vcore_state = VCORE_INACTIVE; + trace_kvmppc_run_core(vc, 1); +} + +/* + * Wait for some other vcpu thread to execute us, and + * wake us up when we need to handle something in the host. + */ +static void kvmppc_wait_for_exec(struct kvm_vcpu *vcpu, int wait_state) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&vcpu->arch.cpu_run, &wait, wait_state); + if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) + schedule(); + finish_wait(&vcpu->arch.cpu_run, &wait); +} + +/* + * All the vcpus in this vcore are idle, so wait for a decrementer + * or external interrupt to one of the vcpus. vc->lock is held. + */ +static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) +{ + struct kvm_vcpu *vcpu; + int do_sleep = 1; + DEFINE_SWAITER(wait); + + swait_prepare(&vc->wq, &wait, TASK_INTERRUPTIBLE); + + /* + * Check one last time for pending exceptions and ceded state after + * we put ourselves on the wait queue + */ + list_for_each_entry(vcpu, &vc->runnable_threads, arch.run_list) { + if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded) { + do_sleep = 0; + break; + } + } + + if (!do_sleep) { + swait_finish(&vc->wq, &wait); + return; + } + + vc->vcore_state = VCORE_SLEEPING; + trace_kvmppc_vcore_blocked(vc, 0); + spin_unlock(&vc->lock); + schedule(); + swait_finish(&vc->wq, &wait); + spin_lock(&vc->lock); + vc->vcore_state = VCORE_INACTIVE; + trace_kvmppc_vcore_blocked(vc, 1); +} + +static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + int n_ceded; + struct kvmppc_vcore *vc; + struct kvm_vcpu *v, *vn; + + trace_kvmppc_run_vcpu_enter(vcpu); + + kvm_run->exit_reason = 0; + vcpu->arch.ret = RESUME_GUEST; + vcpu->arch.trap = 0; + kvmppc_update_vpas(vcpu); + + /* + * Synchronize with other threads in this virtual core + */ + vc = vcpu->arch.vcore; + spin_lock(&vc->lock); + vcpu->arch.ceded = 0; + vcpu->arch.run_task = current; + vcpu->arch.kvm_run = kvm_run; + 
vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb()); + vcpu->arch.state = KVMPPC_VCPU_RUNNABLE; + vcpu->arch.busy_preempt = TB_NIL; + list_add_tail(&vcpu->arch.run_list, &vc->runnable_threads); + ++vc->n_runnable; + + /* + * This happens the first time this is called for a vcpu. + * If the vcore is already running, we may be able to start + * this thread straight away and have it join in. + */ + if (!signal_pending(current)) { + if (vc->vcore_state == VCORE_RUNNING && !VCORE_IS_EXITING(vc)) { + kvmppc_create_dtl_entry(vcpu, vc); + kvmppc_start_thread(vcpu); + trace_kvm_guest_enter(vcpu); + } else if (vc->vcore_state == VCORE_SLEEPING) { + swait_wake(&vc->wq); + } + + } + + while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && + !signal_pending(current)) { + if (vc->vcore_state != VCORE_INACTIVE) { + spin_unlock(&vc->lock); + kvmppc_wait_for_exec(vcpu, TASK_INTERRUPTIBLE); + spin_lock(&vc->lock); + continue; + } + list_for_each_entry_safe(v, vn, &vc->runnable_threads, + arch.run_list) { + kvmppc_core_prepare_to_enter(v); + if (signal_pending(v->arch.run_task)) { + kvmppc_remove_runnable(vc, v); + v->stat.signal_exits++; + v->arch.kvm_run->exit_reason = KVM_EXIT_INTR; + v->arch.ret = -EINTR; + wake_up(&v->arch.cpu_run); + } + } + if (!vc->n_runnable || vcpu->arch.state != KVMPPC_VCPU_RUNNABLE) + break; + n_ceded = 0; + list_for_each_entry(v, &vc->runnable_threads, arch.run_list) { + if (!v->arch.pending_exceptions) + n_ceded += v->arch.ceded; + else + v->arch.ceded = 0; + } + vc->runner = vcpu; + if (n_ceded == vc->n_runnable) { + kvmppc_vcore_blocked(vc); + } else if (should_resched()) { + vc->vcore_state = VCORE_PREEMPT; + /* Let something else run */ + cond_resched_lock(&vc->lock); + vc->vcore_state = VCORE_INACTIVE; + } else { + kvmppc_run_core(vc); + } + vc->runner = NULL; + } + + while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && + (vc->vcore_state == VCORE_RUNNING || + vc->vcore_state == VCORE_EXITING)) { + spin_unlock(&vc->lock); + 
kvmppc_wait_for_exec(vcpu, TASK_UNINTERRUPTIBLE); + spin_lock(&vc->lock); + } + + if (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE) { + kvmppc_remove_runnable(vc, vcpu); + vcpu->stat.signal_exits++; + kvm_run->exit_reason = KVM_EXIT_INTR; + vcpu->arch.ret = -EINTR; + } + + if (vc->n_runnable && vc->vcore_state == VCORE_INACTIVE) { + /* Wake up some vcpu to run the core */ + v = list_first_entry(&vc->runnable_threads, + struct kvm_vcpu, arch.run_list); + wake_up(&v->arch.cpu_run); + } + + trace_kvmppc_run_vcpu_exit(vcpu, kvm_run); + spin_unlock(&vc->lock); + return vcpu->arch.ret; +} + +static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + int r; + int srcu_idx; + + if (!vcpu->arch.sane) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return -EINVAL; + } + + kvmppc_core_prepare_to_enter(vcpu); + + /* No need to go into the guest when all we'll do is come back out */ + if (signal_pending(current)) { + run->exit_reason = KVM_EXIT_INTR; + return -EINTR; + } + + atomic_inc(&vcpu->kvm->arch.vcpus_running); + /* Order vcpus_running vs. 
hpte_setup_done, see kvmppc_alloc_reset_hpt */ + smp_mb(); + + /* On the first time here, set up HTAB and VRMA */ + if (!vcpu->kvm->arch.hpte_setup_done) { + r = kvmppc_hv_setup_htab_rma(vcpu); + if (r) + goto out; + } + + flush_fp_to_thread(current); + flush_altivec_to_thread(current); + flush_vsx_to_thread(current); + vcpu->arch.wqp = &vcpu->arch.vcore->wq; + vcpu->arch.pgdir = current->mm->pgd; + vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; + + do { + r = kvmppc_run_vcpu(run, vcpu); + + if (run->exit_reason == KVM_EXIT_PAPR_HCALL && + !(vcpu->arch.shregs.msr & MSR_PR)) { + trace_kvm_hcall_enter(vcpu); + r = kvmppc_pseries_do_hcall(vcpu); + trace_kvm_hcall_exit(vcpu, r); + kvmppc_core_prepare_to_enter(vcpu); + } else if (r == RESUME_PAGE_FAULT) { + srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvmppc_book3s_hv_page_fault(run, vcpu, + vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); + srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx); + } + } while (is_kvmppc_resume_guest(r)); + + out: + vcpu->arch.state = KVMPPC_VCPU_NOTREADY; + atomic_dec(&vcpu->kvm->arch.vcpus_running); + return r; +} + +static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps, + int linux_psize) +{ + struct mmu_psize_def *def = &mmu_psize_defs[linux_psize]; + + if (!def->shift) + return; + (*sps)->page_shift = def->shift; + (*sps)->slb_enc = def->sllp; + (*sps)->enc[0].page_shift = def->shift; + (*sps)->enc[0].pte_enc = def->penc[linux_psize]; + /* + * Add 16MB MPSS support if host supports it + */ + if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) { + (*sps)->enc[1].page_shift = 24; + (*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M]; + } + (*sps)++; +} + +static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, + struct kvm_ppc_smmu_info *info) +{ + struct kvm_ppc_one_seg_page_size *sps; + + info->flags = KVM_PPC_PAGE_SIZES_REAL; + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) + info->flags |= KVM_PPC_1T_SEGMENTS; + info->slb_size = mmu_slb_size; + + /* We only 
support these sizes for now, and no multi-size segments */ + sps = &info->sps[0]; + kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K); + kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K); + kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M); + + return 0; +} + +/* + * Get (and clear) the dirty memory log for a memory slot. + */ +static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + struct kvm_memory_slot *memslot; + int r; + unsigned long n; + + mutex_lock(&kvm->slots_lock); + + r = -EINVAL; + if (log->slot >= KVM_USER_MEM_SLOTS) + goto out; + + memslot = id_to_memslot(kvm->memslots, log->slot); + r = -ENOENT; + if (!memslot->dirty_bitmap) + goto out; + + n = kvm_dirty_bitmap_bytes(memslot); + memset(memslot->dirty_bitmap, 0, n); + + r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap); + if (r) + goto out; + + r = -EFAULT; + if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + goto out; + + r = 0; +out: + mutex_unlock(&kvm->slots_lock); + return r; +} + +static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + if (!dont || free->arch.rmap != dont->arch.rmap) { + vfree(free->arch.rmap); + free->arch.rmap = NULL; + } +} + +static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, + unsigned long npages) +{ + slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); + if (!slot->arch.rmap) + return -ENOMEM; + + return 0; +} + +static int kvmppc_core_prepare_memory_region_hv(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) +{ + return 0; +} + +static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) +{ + unsigned long npages = mem->memory_size >> PAGE_SHIFT; + struct kvm_memory_slot *memslot; + + if (npages && old->npages) { + /* + * If modifying a memslot, reset all the rmap dirty bits. 
+ * If this is a new memslot, we don't need to do anything + * since the rmap array starts out as all zeroes, + * i.e. no pages are dirty. + */ + memslot = id_to_memslot(kvm->memslots, mem->slot); + kvmppc_hv_get_dirty_log(kvm, memslot, NULL); + } +} + +/* + * Update LPCR values in kvm->arch and in vcores. + * Caller must hold kvm->lock. + */ +void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask) +{ + long int i; + u32 cores_done = 0; + + if ((kvm->arch.lpcr & mask) == lpcr) + return; + + kvm->arch.lpcr = (kvm->arch.lpcr & ~mask) | lpcr; + + for (i = 0; i < KVM_MAX_VCORES; ++i) { + struct kvmppc_vcore *vc = kvm->arch.vcores[i]; + if (!vc) + continue; + spin_lock(&vc->lock); + vc->lpcr = (vc->lpcr & ~mask) | lpcr; + spin_unlock(&vc->lock); + if (++cores_done >= kvm->arch.online_vcores) + break; + } +} + +static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu) +{ + return; +} + +static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) +{ + int err = 0; + struct kvm *kvm = vcpu->kvm; + unsigned long hva; + struct kvm_memory_slot *memslot; + struct vm_area_struct *vma; + unsigned long lpcr = 0, senc; + unsigned long psize, porder; + int srcu_idx; + + mutex_lock(&kvm->lock); + if (kvm->arch.hpte_setup_done) + goto out; /* another vcpu beat us to it */ + + /* Allocate hashed page table (if not done already) and reset it */ + if (!kvm->arch.hpt_virt) { + err = kvmppc_alloc_hpt(kvm, NULL); + if (err) { + pr_err("KVM: Couldn't alloc HPT\n"); + goto out; + } + } + + /* Look up the memslot for guest physical address 0 */ + srcu_idx = srcu_read_lock(&kvm->srcu); + memslot = gfn_to_memslot(kvm, 0); + + /* We must have some memory at 0 by now */ + err = -EINVAL; + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) + goto out_srcu; + + /* Look up the VMA for the start of this memory slot */ + hva = memslot->userspace_addr; + down_read(&current->mm->mmap_sem); + vma = find_vma(current->mm, hva); + if (!vma || vma->vm_start > hva || (vma->vm_flags & 
VM_IO)) + goto up_out; + + psize = vma_kernel_pagesize(vma); + porder = __ilog2(psize); + + up_read(&current->mm->mmap_sem); + + /* We can handle 4k, 64k or 16M pages in the VRMA */ + err = -EINVAL; + if (!(psize == 0x1000 || psize == 0x10000 || + psize == 0x1000000)) + goto out_srcu; + + /* Update VRMASD field in the LPCR */ + senc = slb_pgsize_encoding(psize); + kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | + (VRMA_VSID << SLB_VSID_SHIFT_1T); + /* the -4 is to account for senc values starting at 0x10 */ + lpcr = senc << (LPCR_VRMASD_SH - 4); + + /* Create HPTEs in the hash page table for the VRMA */ + kvmppc_map_vrma(vcpu, memslot, porder); + + kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); + + /* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */ + smp_wmb(); + kvm->arch.hpte_setup_done = 1; + err = 0; + out_srcu: + srcu_read_unlock(&kvm->srcu, srcu_idx); + out: + mutex_unlock(&kvm->lock); + return err; + + up_out: + up_read(&current->mm->mmap_sem); + goto out_srcu; +} + +static int kvmppc_core_init_vm_hv(struct kvm *kvm) +{ + unsigned long lpcr, lpid; + char buf[32]; + + /* Allocate the guest's logical partition ID */ + + lpid = kvmppc_alloc_lpid(); + if ((long)lpid < 0) + return -ENOMEM; + kvm->arch.lpid = lpid; + + /* + * Since we don't flush the TLB when tearing down a VM, + * and this lpid might have previously been used, + * make sure we flush on each core before running the new VM. 
+ */ + cpumask_setall(&kvm->arch.need_tlb_flush); + + /* Start out with the default set of hcalls enabled */ + memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls, + sizeof(kvm->arch.enabled_hcalls)); + + kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); + + /* Init LPCR for virtual RMA mode */ + kvm->arch.host_lpid = mfspr(SPRN_LPID); + kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR); + lpcr &= LPCR_PECE | LPCR_LPES; + lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE | + LPCR_VPM0 | LPCR_VPM1; + kvm->arch.vrma_slb_v = SLB_VSID_B_1T | + (VRMA_VSID << SLB_VSID_SHIFT_1T); + /* On POWER8 turn on online bit to enable PURR/SPURR */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + lpcr |= LPCR_ONL; + kvm->arch.lpcr = lpcr; + + /* + * Track that we now have a HV mode VM active. This blocks secondary + * CPU threads from coming online. + */ + kvm_hv_vm_activated(); + + /* + * Create a debugfs directory for the VM + */ + snprintf(buf, sizeof(buf), "vm%d", current->pid); + kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir); + if (!IS_ERR_OR_NULL(kvm->arch.debugfs_dir)) + kvmppc_mmu_debugfs_init(kvm); + + return 0; +} + +static void kvmppc_free_vcores(struct kvm *kvm) +{ + long int i; + + for (i = 0; i < KVM_MAX_VCORES; ++i) { + if (kvm->arch.vcores[i] && kvm->arch.vcores[i]->mpp_buffer) { + struct kvmppc_vcore *vc = kvm->arch.vcores[i]; + free_pages((unsigned long)vc->mpp_buffer, + MPP_BUFFER_ORDER); + } + kfree(kvm->arch.vcores[i]); + } + kvm->arch.online_vcores = 0; +} + +static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) +{ + debugfs_remove_recursive(kvm->arch.debugfs_dir); + + kvm_hv_vm_deactivated(); + + kvmppc_free_vcores(kvm); + + kvmppc_free_hpt(kvm); +} + +/* We don't need to emulate any privileged instructions or dcbz */ +static int kvmppc_core_emulate_op_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + return EMULATE_FAIL; +} + +static int kvmppc_core_emulate_mtspr_hv(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val) +{ + 
return EMULATE_FAIL; +} + +static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val) +{ + return EMULATE_FAIL; +} + +static int kvmppc_core_check_processor_compat_hv(void) +{ + if (!cpu_has_feature(CPU_FTR_HVMODE) || + !cpu_has_feature(CPU_FTR_ARCH_206)) + return -EIO; + return 0; +} + +static long kvm_arch_vm_ioctl_hv(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm __maybe_unused = filp->private_data; + void __user *argp = (void __user *)arg; + long r; + + switch (ioctl) { + + case KVM_PPC_ALLOCATE_HTAB: { + u32 htab_order; + + r = -EFAULT; + if (get_user(htab_order, (u32 __user *)argp)) + break; + r = kvmppc_alloc_reset_hpt(kvm, &htab_order); + if (r) + break; + r = -EFAULT; + if (put_user(htab_order, (u32 __user *)argp)) + break; + r = 0; + break; + } + + case KVM_PPC_GET_HTAB_FD: { + struct kvm_get_htab_fd ghf; + + r = -EFAULT; + if (copy_from_user(&ghf, argp, sizeof(ghf))) + break; + r = kvm_vm_ioctl_get_htab_fd(kvm, &ghf); + break; + } + + default: + r = -ENOTTY; + } + + return r; +} + +/* + * List of hcall numbers to enable by default. + * For compatibility with old userspace, we enable by default + * all hcalls that were implemented before the hcall-enabling + * facility was added. Note this list should not include H_RTAS. 
+ */ +static unsigned int default_hcall_list[] = { + H_REMOVE, + H_ENTER, + H_READ, + H_PROTECT, + H_BULK_REMOVE, + H_GET_TCE, + H_PUT_TCE, + H_SET_DABR, + H_SET_XDABR, + H_CEDE, + H_PROD, + H_CONFER, + H_REGISTER_VPA, +#ifdef CONFIG_KVM_XICS + H_EOI, + H_CPPR, + H_IPI, + H_IPOLL, + H_XIRR, + H_XIRR_X, +#endif + 0 +}; + +static void init_default_hcalls(void) +{ + int i; + unsigned int hcall; + + for (i = 0; default_hcall_list[i]; ++i) { + hcall = default_hcall_list[i]; + WARN_ON(!kvmppc_hcall_impl_hv(hcall)); + __set_bit(hcall / 4, default_enabled_hcalls); + } +} + +static struct kvmppc_ops kvm_ops_hv = { + .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, + .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, + .get_one_reg = kvmppc_get_one_reg_hv, + .set_one_reg = kvmppc_set_one_reg_hv, + .vcpu_load = kvmppc_core_vcpu_load_hv, + .vcpu_put = kvmppc_core_vcpu_put_hv, + .set_msr = kvmppc_set_msr_hv, + .vcpu_run = kvmppc_vcpu_run_hv, + .vcpu_create = kvmppc_core_vcpu_create_hv, + .vcpu_free = kvmppc_core_vcpu_free_hv, + .check_requests = kvmppc_core_check_requests_hv, + .get_dirty_log = kvm_vm_ioctl_get_dirty_log_hv, + .flush_memslot = kvmppc_core_flush_memslot_hv, + .prepare_memory_region = kvmppc_core_prepare_memory_region_hv, + .commit_memory_region = kvmppc_core_commit_memory_region_hv, + .unmap_hva = kvm_unmap_hva_hv, + .unmap_hva_range = kvm_unmap_hva_range_hv, + .age_hva = kvm_age_hva_hv, + .test_age_hva = kvm_test_age_hva_hv, + .set_spte_hva = kvm_set_spte_hva_hv, + .mmu_destroy = kvmppc_mmu_destroy_hv, + .free_memslot = kvmppc_core_free_memslot_hv, + .create_memslot = kvmppc_core_create_memslot_hv, + .init_vm = kvmppc_core_init_vm_hv, + .destroy_vm = kvmppc_core_destroy_vm_hv, + .get_smmu_info = kvm_vm_ioctl_get_smmu_info_hv, + .emulate_op = kvmppc_core_emulate_op_hv, + .emulate_mtspr = kvmppc_core_emulate_mtspr_hv, + .emulate_mfspr = kvmppc_core_emulate_mfspr_hv, + .fast_vcpu_kick = kvmppc_fast_vcpu_kick_hv, + .arch_vm_ioctl = kvm_arch_vm_ioctl_hv, + 
.hcall_implemented = kvmppc_hcall_impl_hv, +}; + +static int kvmppc_book3s_init_hv(void) +{ + int r; + /* + * FIXME!! Do we need to check on all cpus ? + */ + r = kvmppc_core_check_processor_compat_hv(); + if (r < 0) + return -ENODEV; + + kvm_ops_hv.owner = THIS_MODULE; + kvmppc_hv_ops = &kvm_ops_hv; + + init_default_hcalls(); + + r = kvmppc_mmu_hv_init(); + return r; +} + +static void kvmppc_book3s_exit_hv(void) +{ + kvmppc_hv_ops = NULL; +} + +module_init(kvmppc_book3s_init_hv); +module_exit(kvmppc_book3s_exit_hv); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); diff --git a/kernel/arch/powerpc/kvm/book3s_hv_builtin.c b/kernel/arch/powerpc/kvm/book3s_hv_builtin.c new file mode 100644 index 000000000..ed2589d45 --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_hv_builtin.c @@ -0,0 +1,261 @@ +/* + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/cpu.h> +#include <linux/kvm_host.h> +#include <linux/preempt.h> +#include <linux/export.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/init.h> +#include <linux/memblock.h> +#include <linux/sizes.h> +#include <linux/cma.h> +#include <linux/bitops.h> + +#include <asm/cputable.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/archrandom.h> +#include <asm/xics.h> +#include <asm/dbell.h> +#include <asm/cputhreads.h> + +#define KVM_CMA_CHUNK_ORDER 18 + +/* + * Hash page table alignment on newer cpus(CPU_FTR_ARCH_206) + * should be power of 2. + */ +#define HPT_ALIGN_PAGES ((1 << 18) >> PAGE_SHIFT) /* 256k */ +/* + * By default we reserve 5% of memory for hash pagetable allocation. 
+ */ +static unsigned long kvm_cma_resv_ratio = 5; + +static struct cma *kvm_cma; + +static int __init early_parse_kvm_cma_resv(char *p) +{ + pr_debug("%s(%s)\n", __func__, p); + if (!p) + return -EINVAL; + return kstrtoul(p, 0, &kvm_cma_resv_ratio); +} +early_param("kvm_cma_resv_ratio", early_parse_kvm_cma_resv); + +struct page *kvm_alloc_hpt(unsigned long nr_pages) +{ + VM_BUG_ON(order_base_2(nr_pages) < KVM_CMA_CHUNK_ORDER - PAGE_SHIFT); + + return cma_alloc(kvm_cma, nr_pages, order_base_2(HPT_ALIGN_PAGES)); +} +EXPORT_SYMBOL_GPL(kvm_alloc_hpt); + +void kvm_release_hpt(struct page *page, unsigned long nr_pages) +{ + cma_release(kvm_cma, page, nr_pages); +} +EXPORT_SYMBOL_GPL(kvm_release_hpt); + +/** + * kvm_cma_reserve() - reserve area for kvm hash pagetable + * + * This function reserves memory from early allocator. It should be + * called by arch specific code once the memblock allocator + * has been activated and all other subsystems have already allocated/reserved + * memory. + */ +void __init kvm_cma_reserve(void) +{ + unsigned long align_size; + struct memblock_region *reg; + phys_addr_t selected_size = 0; + + /* + * We need CMA reservation only when we are in HV mode + */ + if (!cpu_has_feature(CPU_FTR_HVMODE)) + return; + /* + * We cannot use memblock_phys_mem_size() here, because + * memblock_analyze() has not been called yet. + */ + for_each_memblock(memory, reg) + selected_size += memblock_region_memory_end_pfn(reg) - + memblock_region_memory_base_pfn(reg); + + selected_size = (selected_size * kvm_cma_resv_ratio / 100) << PAGE_SHIFT; + if (selected_size) { + pr_debug("%s: reserving %ld MiB for global area\n", __func__, + (unsigned long)selected_size / SZ_1M); + align_size = HPT_ALIGN_PAGES << PAGE_SHIFT; + cma_declare_contiguous(0, selected_size, 0, align_size, + KVM_CMA_CHUNK_ORDER - PAGE_SHIFT, false, &kvm_cma); + } +} + +/* + * Real-mode H_CONFER implementation. 
+ * We check if we are the only vcpu out of this virtual core + * still running in the guest and not ceded. If so, we pop up + * to the virtual-mode implementation; if not, just return to + * the guest. + */ +long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target, + unsigned int yield_count) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + int threads_running; + int threads_ceded; + int threads_conferring; + u64 stop = get_tb() + 10 * tb_ticks_per_usec; + int rv = H_SUCCESS; /* => don't yield */ + + set_bit(vcpu->arch.ptid, &vc->conferring_threads); + while ((get_tb() < stop) && !VCORE_IS_EXITING(vc)) { + threads_running = VCORE_ENTRY_MAP(vc); + threads_ceded = vc->napping_threads; + threads_conferring = vc->conferring_threads; + if ((threads_ceded | threads_conferring) == threads_running) { + rv = H_TOO_HARD; /* => do yield */ + break; + } + } + clear_bit(vcpu->arch.ptid, &vc->conferring_threads); + return rv; +} + +/* + * When running HV mode KVM we need to block certain operations while KVM VMs + * exist in the system. We use a counter of VMs to track this. + * + * One of the operations we need to block is onlining of secondaries, so we + * protect hv_vm_count with get/put_online_cpus(). 
+ */ +static atomic_t hv_vm_count; + +void kvm_hv_vm_activated(void) +{ + get_online_cpus(); + atomic_inc(&hv_vm_count); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(kvm_hv_vm_activated); + +void kvm_hv_vm_deactivated(void) +{ + get_online_cpus(); + atomic_dec(&hv_vm_count); + put_online_cpus(); +} +EXPORT_SYMBOL_GPL(kvm_hv_vm_deactivated); + +bool kvm_hv_mode_active(void) +{ + return atomic_read(&hv_vm_count) != 0; +} + +extern int hcall_real_table[], hcall_real_table_end[]; + +int kvmppc_hcall_impl_hv_realmode(unsigned long cmd) +{ + cmd /= 4; + if (cmd < hcall_real_table_end - hcall_real_table && + hcall_real_table[cmd]) + return 1; + + return 0; +} +EXPORT_SYMBOL_GPL(kvmppc_hcall_impl_hv_realmode); + +int kvmppc_hwrng_present(void) +{ + return powernv_hwrng_present(); +} +EXPORT_SYMBOL_GPL(kvmppc_hwrng_present); + +long kvmppc_h_random(struct kvm_vcpu *vcpu) +{ + if (powernv_get_random_real_mode(&vcpu->arch.gpr[4])) + return H_SUCCESS; + + return H_HARDWARE; +} + +static inline void rm_writeb(unsigned long paddr, u8 val) +{ + __asm__ __volatile__("stbcix %0,0,%1" + : : "r" (val), "r" (paddr) : "memory"); +} + +/* + * Send an interrupt or message to another CPU. + * This can only be called in real mode. + * The caller needs to include any barrier needed to order writes + * to memory vs. the IPI/message. + */ +void kvmhv_rm_send_ipi(int cpu) +{ + unsigned long xics_phys; + + /* On POWER8 for IPIs to threads in the same core, use msgsnd */ + if (cpu_has_feature(CPU_FTR_ARCH_207S) && + cpu_first_thread_sibling(cpu) == + cpu_first_thread_sibling(raw_smp_processor_id())) { + unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); + msg |= cpu_thread_in_core(cpu); + __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); + return; + } + + /* Else poke the target with an IPI */ + xics_phys = paca[cpu].kvm_hstate.xics_phys; + rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); +} + +/* + * The following functions are called from the assembly code + * in book3s_hv_rmhandlers.S. 
+ */ +static void kvmhv_interrupt_vcore(struct kvmppc_vcore *vc, int active) +{ + int cpu = vc->pcpu; + + /* Order setting of exit map vs. msgsnd/IPI */ + smp_mb(); + for (; active; active >>= 1, ++cpu) + if (active & 1) + kvmhv_rm_send_ipi(cpu); +} + +void kvmhv_commence_exit(int trap) +{ + struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore; + int ptid = local_paca->kvm_hstate.ptid; + int me, ee; + + /* Set our bit in the threads-exiting-guest map in the 0xff00 + bits of vcore->entry_exit_map */ + me = 0x100 << ptid; + do { + ee = vc->entry_exit_map; + } while (cmpxchg(&vc->entry_exit_map, ee, ee | me) != ee); + + /* Are we the first here? */ + if ((ee >> 8) != 0) + return; + + /* + * Trigger the other threads in this vcore to exit the guest. + * If this is a hypervisor decrementer interrupt then they + * will be already on their way out of the guest. + */ + if (trap != BOOK3S_INTERRUPT_HV_DECREMENTER) + kvmhv_interrupt_vcore(vc, ee & ~(1 << ptid)); +} diff --git a/kernel/arch/powerpc/kvm/book3s_hv_interrupts.S b/kernel/arch/powerpc/kvm/book3s_hv_interrupts.S new file mode 100644 index 000000000..0fdc4a289 --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -0,0 +1,158 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright 2011 Paul Mackerras, IBM Corp. 
<paulus@au1.ibm.com> + * + * Derived from book3s_interrupts.S, which is: + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/asm-offsets.h> +#include <asm/exception-64s.h> +#include <asm/ppc-opcode.h> + +/***************************************************************************** + * * + * Guest entry / exit code that is in kernel module memory (vmalloc) * + * * + ****************************************************************************/ + +/* Registers: + * none + */ +_GLOBAL(__kvmppc_vcore_entry) + + /* Write correct stack frame */ + mflr r0 + std r0,PPC_LR_STKOFF(r1) + + /* Save host state to the stack */ + stdu r1, -SWITCH_FRAME_SIZE(r1) + + /* Save non-volatile registers (r14 - r31) and CR */ + SAVE_NVGPRS(r1) + mfcr r3 + std r3, _CCR(r1) + + /* Save host DSCR */ + mfspr r3, SPRN_DSCR + std r3, HSTATE_DSCR(r13) + +BEGIN_FTR_SECTION + /* Save host DABR */ + mfspr r3, SPRN_DABR + std r3, HSTATE_DABR(r13) +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + + /* Hard-disable interrupts */ + mfmsr r10 + std r10, HSTATE_HOST_MSR(r13) + rldicl r10,r10,48,1 + rotldi r10,r10,16 + mtmsrd r10,1 + + /* Save host PMU registers */ +BEGIN_FTR_SECTION + /* Work around P8 PMAE bug */ + li r3, -1 + clrrdi r3, r3, 10 + mfspr r8, SPRN_MMCR2 + mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */ + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mfspr r7, SPRN_MMCR0 /* save MMCR0 */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */ + mfspr r6, SPRN_MMCRA + /* Clear MMCRA in order to disable SDAR updates */ + li r5, 0 + mtspr SPRN_MMCRA, r5 + isync + ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? 
*/ + lbz r5, LPPACA_PMCINUSE(r3) + cmpwi r5, 0 + beq 31f /* skip if not */ + mfspr r5, SPRN_MMCR1 + mfspr r9, SPRN_SIAR + mfspr r10, SPRN_SDAR + std r7, HSTATE_MMCR0(r13) + std r5, HSTATE_MMCR1(r13) + std r6, HSTATE_MMCRA(r13) + std r9, HSTATE_SIAR(r13) + std r10, HSTATE_SDAR(r13) +BEGIN_FTR_SECTION + mfspr r9, SPRN_SIER + std r8, HSTATE_MMCR2(r13) + std r9, HSTATE_SIER(r13) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mfspr r3, SPRN_PMC1 + mfspr r5, SPRN_PMC2 + mfspr r6, SPRN_PMC3 + mfspr r7, SPRN_PMC4 + mfspr r8, SPRN_PMC5 + mfspr r9, SPRN_PMC6 + stw r3, HSTATE_PMC1(r13) + stw r5, HSTATE_PMC2(r13) + stw r6, HSTATE_PMC3(r13) + stw r7, HSTATE_PMC4(r13) + stw r8, HSTATE_PMC5(r13) + stw r9, HSTATE_PMC6(r13) +31: + + /* + * Put whatever is in the decrementer into the + * hypervisor decrementer. + */ + mfspr r8,SPRN_DEC + mftb r7 + mtspr SPRN_HDEC,r8 + extsw r8,r8 + add r8,r8,r7 + std r8,HSTATE_DECEXP(r13) + + /* Jump to partition switch code */ + bl kvmppc_hv_entry_trampoline + nop + +/* + * We return here in virtual mode after the guest exits + * with something that we can't handle in real mode. + * Interrupts are enabled again at this point. + */ + + /* + * Register usage at this point: + * + * R1 = host R1 + * R2 = host R2 + * R12 = exit handler id + * R13 = PACA + */ + + /* Restore non-volatile host registers (r14 - r31) and CR */ + REST_NVGPRS(r1) + ld r4, _CCR(r1) + mtcr r4 + + addi r1, r1, SWITCH_FRAME_SIZE + ld r0, PPC_LR_STKOFF(r1) + mtlr r0 + blr diff --git a/kernel/arch/powerpc/kvm/book3s_hv_ras.c b/kernel/arch/powerpc/kvm/book3s_hv_ras.c new file mode 100644 index 000000000..93b5f5c9b --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_hv_ras.c @@ -0,0 +1,142 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * Copyright 2012 Paul Mackerras, IBM Corp. 
<paulus@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/kernel.h> +#include <asm/opal.h> +#include <asm/mce.h> + +/* SRR1 bits for machine check on POWER7 */ +#define SRR1_MC_LDSTERR (1ul << (63-42)) +#define SRR1_MC_IFETCH_SH (63-45) +#define SRR1_MC_IFETCH_MASK 0x7 +#define SRR1_MC_IFETCH_SLBPAR 2 /* SLB parity error */ +#define SRR1_MC_IFETCH_SLBMULTI 3 /* SLB multi-hit */ +#define SRR1_MC_IFETCH_SLBPARMULTI 4 /* SLB parity + multi-hit */ +#define SRR1_MC_IFETCH_TLBMULTI 5 /* I-TLB multi-hit */ + +/* DSISR bits for machine check on POWER7 */ +#define DSISR_MC_DERAT_MULTI 0x800 /* D-ERAT multi-hit */ +#define DSISR_MC_TLB_MULTI 0x400 /* D-TLB multi-hit */ +#define DSISR_MC_SLB_PARITY 0x100 /* SLB parity error */ +#define DSISR_MC_SLB_MULTI 0x080 /* SLB multi-hit */ +#define DSISR_MC_SLB_PARMULTI 0x040 /* SLB parity + multi-hit */ + +/* POWER7 SLB flush and reload */ +static void reload_slb(struct kvm_vcpu *vcpu) +{ + struct slb_shadow *slb; + unsigned long i, n; + + /* First clear out SLB */ + asm volatile("slbmte %0,%0; slbia" : : "r" (0)); + + /* Do they have an SLB shadow buffer registered? */ + slb = vcpu->arch.slb_shadow.pinned_addr; + if (!slb) + return; + + /* Sanity check */ + n = min_t(u32, be32_to_cpu(slb->persistent), SLB_MIN_SIZE); + if ((void *) &slb->save_area[n] > vcpu->arch.slb_shadow.pinned_end) + return; + + /* Load up the SLB from that */ + for (i = 0; i < n; ++i) { + unsigned long rb = be64_to_cpu(slb->save_area[i].esid); + unsigned long rs = be64_to_cpu(slb->save_area[i].vsid); + + rb = (rb & ~0xFFFul) | i; /* insert entry number */ + asm volatile("slbmte %0,%1" : : "r" (rs), "r" (rb)); + } +} + +/* + * On POWER7, see if we can handle a machine check that occurred inside + * the guest in real mode, without switching to the host partition. 
+ * + * Returns: 0 => exit guest, 1 => deliver machine check to guest + */ +static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu) +{ + unsigned long srr1 = vcpu->arch.shregs.msr; + struct machine_check_event mce_evt; + long handled = 1; + + if (srr1 & SRR1_MC_LDSTERR) { + /* error on load/store */ + unsigned long dsisr = vcpu->arch.shregs.dsisr; + + if (dsisr & (DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI | + DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI)) { + /* flush and reload SLB; flushes D-ERAT too */ + reload_slb(vcpu); + dsisr &= ~(DSISR_MC_SLB_PARMULTI | DSISR_MC_SLB_MULTI | + DSISR_MC_SLB_PARITY | DSISR_MC_DERAT_MULTI); + } + if (dsisr & DSISR_MC_TLB_MULTI) { + if (cur_cpu_spec && cur_cpu_spec->flush_tlb) + cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_LPID); + dsisr &= ~DSISR_MC_TLB_MULTI; + } + /* Any other errors we don't understand? */ + if (dsisr & 0xffffffffUL) + handled = 0; + } + + switch ((srr1 >> SRR1_MC_IFETCH_SH) & SRR1_MC_IFETCH_MASK) { + case 0: + break; + case SRR1_MC_IFETCH_SLBPAR: + case SRR1_MC_IFETCH_SLBMULTI: + case SRR1_MC_IFETCH_SLBPARMULTI: + reload_slb(vcpu); + break; + case SRR1_MC_IFETCH_TLBMULTI: + if (cur_cpu_spec && cur_cpu_spec->flush_tlb) + cur_cpu_spec->flush_tlb(TLB_INVAL_SCOPE_LPID); + break; + default: + handled = 0; + } + + /* + * See if we have already handled the condition in the linux host. + * We assume that if the condition is recovered then linux host + * will have generated an error log event that we will pick + * up and log later. + * Don't release mce event now. We will queue up the event so that + * we can log the MCE event info on host console. 
+ */ + if (!get_mce_event(&mce_evt, MCE_EVENT_DONTRELEASE)) + goto out; + + if (mce_evt.version == MCE_V1 && + (mce_evt.severity == MCE_SEV_NO_ERROR || + mce_evt.disposition == MCE_DISPOSITION_RECOVERED)) + handled = 1; + +out: + /* + * We are now going to enter the guest either through a machine check + * interrupt (for unhandled errors) or will continue from the + * current HSRR0 (for handled errors) in the guest. Hence + * queue up the event so that we can log it from host console later. + */ + machine_check_queue_event(); + + return handled; +} + +long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu) +{ + return kvmppc_realmode_mc_power7(vcpu); +} diff --git a/kernel/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/kernel/arch/powerpc/kvm/book3s_hv_rm_mmu.c new file mode 100644 index 000000000..b027a8973 --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -0,0 +1,857 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/hugetlb.h> +#include <linux/module.h> + +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu-hash64.h> +#include <asm/hvcall.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> + +/* Translate address of a vmalloc'd thing to a linear map address */ +static void *real_vmalloc_addr(void *x) +{ + unsigned long addr = (unsigned long) x; + pte_t *p; + /* + * Assume we don't have huge pages in vmalloc space, + * so don't worry about THP collapse/split. Called + * only in real mode, hence won't need irq_save/restore.
+ */ + p = __find_linux_pte_or_hugepte(swapper_pg_dir, addr, NULL); + if (!p || !pte_present(*p)) + return NULL; + addr = (pte_pfn(*p) << PAGE_SHIFT) | (addr & ~PAGE_MASK); + return __va(addr); +} + +/* Return 1 if we need to do a global tlbie, 0 if we can use tlbiel */ +static int global_invalidates(struct kvm *kvm, unsigned long flags) +{ + int global; + + /* + * If there is only one vcore, and it's currently running, + * as indicated by local_paca->kvm_hstate.kvm_vcpu being set, + * we can use tlbiel as long as we mark all other physical + * cores as potentially having stale TLB entries for this lpid. + * Otherwise, don't use tlbiel. + */ + if (kvm->arch.online_vcores == 1 && local_paca->kvm_hstate.kvm_vcpu) + global = 0; + else + global = 1; + + if (!global) { + /* any other core might now have stale TLB entries... */ + smp_wmb(); + cpumask_setall(&kvm->arch.need_tlb_flush); + cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu, + &kvm->arch.need_tlb_flush); + } + + return global; +} + +/* + * Add this HPTE into the chain for the real page. + * Must be called with the chain locked; it unlocks the chain. 
+ */ +void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev, + unsigned long *rmap, long pte_index, int realmode) +{ + struct revmap_entry *head, *tail; + unsigned long i; + + if (*rmap & KVMPPC_RMAP_PRESENT) { + i = *rmap & KVMPPC_RMAP_INDEX; + head = &kvm->arch.revmap[i]; + if (realmode) + head = real_vmalloc_addr(head); + tail = &kvm->arch.revmap[head->back]; + if (realmode) + tail = real_vmalloc_addr(tail); + rev->forw = i; + rev->back = head->back; + tail->forw = pte_index; + head->back = pte_index; + } else { + rev->forw = rev->back = pte_index; + *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | + pte_index | KVMPPC_RMAP_PRESENT; + } + unlock_rmap(rmap); +} +EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain); + +/* Remove this HPTE from the chain for a real page */ +static void remove_revmap_chain(struct kvm *kvm, long pte_index, + struct revmap_entry *rev, + unsigned long hpte_v, unsigned long hpte_r) +{ + struct revmap_entry *next, *prev; + unsigned long gfn, ptel, head; + struct kvm_memory_slot *memslot; + unsigned long *rmap; + unsigned long rcbits; + + rcbits = hpte_r & (HPTE_R_R | HPTE_R_C); + ptel = rev->guest_rpte |= rcbits; + gfn = hpte_rpn(ptel, hpte_page_size(hpte_v, ptel)); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); + if (!memslot) + return; + + rmap = real_vmalloc_addr(&memslot->arch.rmap[gfn - memslot->base_gfn]); + lock_rmap(rmap); + + head = *rmap & KVMPPC_RMAP_INDEX; + next = real_vmalloc_addr(&kvm->arch.revmap[rev->forw]); + prev = real_vmalloc_addr(&kvm->arch.revmap[rev->back]); + next->back = rev->back; + prev->forw = rev->forw; + if (head == pte_index) { + head = rev->forw; + if (head == pte_index) + *rmap &= ~(KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_INDEX); + else + *rmap = (*rmap & ~KVMPPC_RMAP_INDEX) | head; + } + *rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT; + unlock_rmap(rmap); +} + +long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel, + pgd_t *pgdir, bool realmode, 
unsigned long *pte_idx_ret) +{ + unsigned long i, pa, gpa, gfn, psize; + unsigned long slot_fn, hva; + __be64 *hpte; + struct revmap_entry *rev; + unsigned long g_ptel; + struct kvm_memory_slot *memslot; + unsigned hpage_shift; + unsigned long is_io; + unsigned long *rmap; + pte_t *ptep; + unsigned int writing; + unsigned long mmu_seq; + unsigned long rcbits, irq_flags = 0; + + psize = hpte_page_size(pteh, ptel); + if (!psize) + return H_PARAMETER; + writing = hpte_is_writable(ptel); + pteh &= ~(HPTE_V_HVLOCK | HPTE_V_ABSENT | HPTE_V_VALID); + ptel &= ~HPTE_GR_RESERVED; + g_ptel = ptel; + + /* used later to detect if we might have been invalidated */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* Find the memslot (if any) for this address */ + gpa = (ptel & HPTE_R_RPN) & ~(psize - 1); + gfn = gpa >> PAGE_SHIFT; + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); + pa = 0; + is_io = ~0ul; + rmap = NULL; + if (!(memslot && !(memslot->flags & KVM_MEMSLOT_INVALID))) { + /* Emulated MMIO - mark this with key=31 */ + pteh |= HPTE_V_ABSENT; + ptel |= HPTE_R_KEY_HI | HPTE_R_KEY_LO; + goto do_insert; + } + + /* Check if the requested page fits entirely in the memslot. */ + if (!slot_is_aligned(memslot, psize)) + return H_PARAMETER; + slot_fn = gfn - memslot->base_gfn; + rmap = &memslot->arch.rmap[slot_fn]; + + /* Translate to host virtual address */ + hva = __gfn_to_hva_memslot(memslot, gfn); + /* + * If we had a page table change after lookup, we would + * retry via mmu_notifier_retry.
+ */ + if (realmode) + ptep = __find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift); + else { + local_irq_save(irq_flags); + ptep = find_linux_pte_or_hugepte(pgdir, hva, &hpage_shift); + } + if (ptep) { + pte_t pte; + unsigned int host_pte_size; + + if (hpage_shift) + host_pte_size = 1ul << hpage_shift; + else + host_pte_size = PAGE_SIZE; + /* + * We should always find the guest page size + * to <= host page size, if host is using hugepage + */ + if (host_pte_size < psize) { + /* + * Undo local_irq_save() with the saved IRQ state; + * 'flags' holds the guest's H_ENTER flags. + */ + if (!realmode) + local_irq_restore(irq_flags); + return H_PARAMETER; + } + pte = kvmppc_read_update_linux_pte(ptep, writing); + if (pte_present(pte) && !pte_protnone(pte)) { + if (writing && !pte_write(pte)) + /* make the actual HPTE be read-only */ + ptel = hpte_make_readonly(ptel); + is_io = hpte_cache_bits(pte_val(pte)); + pa = pte_pfn(pte) << PAGE_SHIFT; + pa |= hva & (host_pte_size - 1); + pa |= gpa & ~PAGE_MASK; + } + } + if (!realmode) + local_irq_restore(irq_flags); + + ptel &= ~(HPTE_R_PP0 - psize); + ptel |= pa; + + if (pa) + pteh |= HPTE_V_VALID; + else + pteh |= HPTE_V_ABSENT; + + /* Check WIMG */ + if (is_io != ~0ul && !hpte_cache_flags_ok(ptel, is_io)) { + if (is_io) + return H_PARAMETER; + /* + * Allow guest to map emulated device memory as + * uncacheable, but actually make it cacheable. + */ + ptel &= ~(HPTE_R_W|HPTE_R_I|HPTE_R_G); + ptel |= HPTE_R_M; + } + + /* Find and lock the HPTEG slot to use */ + do_insert: + if (pte_index >= kvm->arch.hpt_npte) + return H_PARAMETER; + if (likely((flags & H_EXACT) == 0)) { + pte_index &= ~7UL; + hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + for (i = 0; i < 8; ++i) { + if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0 && + try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | + HPTE_V_ABSENT)) + break; + hpte += 2; + } + if (i == 8) { + /* + * Since try_lock_hpte doesn't retry (not even stdcx. + * failures), it could be that there is a free slot + * but we transiently failed to lock it.
Try again, + * actually locking each slot and checking it. + */ + hpte -= 16; + for (i = 0; i < 8; ++i) { + u64 pte; + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + pte = be64_to_cpu(hpte[0]); + if (!(pte & (HPTE_V_VALID | HPTE_V_ABSENT))) + break; + __unlock_hpte(hpte, pte); + hpte += 2; + } + if (i == 8) + return H_PTEG_FULL; + } + pte_index += i; + } else { + hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + if (!try_lock_hpte(hpte, HPTE_V_HVLOCK | HPTE_V_VALID | + HPTE_V_ABSENT)) { + /* Lock the slot and check again */ + u64 pte; + + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + pte = be64_to_cpu(hpte[0]); + if (pte & (HPTE_V_VALID | HPTE_V_ABSENT)) { + __unlock_hpte(hpte, pte); + return H_PTEG_FULL; + } + } + } + + /* Save away the guest's idea of the second HPTE dword */ + rev = &kvm->arch.revmap[pte_index]; + if (realmode) + rev = real_vmalloc_addr(rev); + if (rev) { + rev->guest_rpte = g_ptel; + note_hpte_modification(kvm, rev); + } + + /* Link HPTE into reverse-map chain */ + if (pteh & HPTE_V_VALID) { + if (realmode) + rmap = real_vmalloc_addr(rmap); + lock_rmap(rmap); + /* Check for pending invalidations under the rmap chain lock */ + if (mmu_notifier_retry(kvm, mmu_seq)) { + /* inval in progress, write a non-present HPTE */ + pteh |= HPTE_V_ABSENT; + pteh &= ~HPTE_V_VALID; + unlock_rmap(rmap); + } else { + kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, + realmode); + /* Only set R/C in real HPTE if already set in *rmap */ + rcbits = *rmap >> KVMPPC_RMAP_RC_SHIFT; + ptel &= rcbits | ~(HPTE_R_R | HPTE_R_C); + } + } + + hpte[1] = cpu_to_be64(ptel); + + /* Write the first HPTE dword, unlocking the HPTE and making it valid */ + eieio(); + __unlock_hpte(hpte, pteh); + asm volatile("ptesync" : : : "memory"); + + *pte_idx_ret = pte_index; + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_do_h_enter); + +long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long 
ptel) +{ + return kvmppc_do_h_enter(vcpu->kvm, flags, pte_index, pteh, ptel, + vcpu->arch.pgdir, true, &vcpu->arch.gpr[4]); +} + +#ifdef __BIG_ENDIAN__ +#define LOCK_TOKEN (*(u32 *)(&get_paca()->lock_token)) +#else +#define LOCK_TOKEN (*(u32 *)(&get_paca()->paca_index)) +#endif + +/* + * Make one attempt to take the tlbie lock word: if *lock reads 0, store + * LOCK_TOKEN into it with a lwarx/stwcx. pair, retrying only when the + * reservation is lost. Returns 1 if the lock was taken, 0 if *lock was + * already non-zero. + */ +static inline int try_lock_tlbie(unsigned int *lock) +{ + unsigned int tmp, old; + unsigned int token = LOCK_TOKEN; + + asm volatile("1:lwarx %1,0,%2\n" + " cmpwi cr0,%1,0\n" + " bne 2f\n" + " stwcx. %3,0,%2\n" + " bne- 1b\n" + " isync\n" + "2:" + : "=&r" (tmp), "=&r" (old) + : "r" (lock), "r" (token) + : "cc", "memory"); + return old == 0; +} + +/* + * Invalidate the npages TLB entries described by rbvalues[]. + * If global is set, use tlbie with kvm->arch.lpid while holding + * kvm->arch.tlbie_lock, followed by eieio; tlbsync; ptesync. + * Otherwise use tlbiel followed by ptesync. + * If need_sync is set, a ptesync is issued before the first invalidation. + */ +static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, + long npages, int global, bool need_sync) +{ + long i; + + if (global) { + while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) + cpu_relax(); + if (need_sync) + asm volatile("ptesync" : : : "memory"); + for (i = 0; i < npages; ++i) + asm volatile(PPC_TLBIE(%1,%0) : : + "r" (rbvalues[i]), "r" (kvm->arch.lpid)); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); + kvm->arch.tlbie_lock = 0; + } else { + if (need_sync) + asm volatile("ptesync" : : : "memory"); + for (i = 0; i < npages; ++i) + asm volatile("tlbiel %0" : : "r" (rbvalues[i])); + asm volatile("ptesync" : : : "memory"); + } +} + +/* + * Remove the HPTE at pte_index from kvm's hashed page table. + * + * Returns H_PARAMETER if pte_index is out of range, H_NOT_FOUND if the + * entry is neither valid nor absent or fails the H_AVPN / H_ANDCOND + * match against avpn, and H_SUCCESS otherwise. On success hpret[0] + * receives the first doubleword (HPTE_V_HVLOCK cleared) and hpret[1] + * the guest view of the second doubleword taken from the revmap entry. + */ +long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, + unsigned long pte_index, unsigned long avpn, + unsigned long *hpret) +{ + __be64 *hpte; + unsigned long v, r, rb; + struct revmap_entry *rev; + u64 pte; + + if (pte_index >= kvm->arch.hpt_npte) + return H_PARAMETER; + hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + pte = be64_to_cpu(hpte[0]); + if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || + ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) || + ((flags & H_ANDCOND) && (pte & avpn) != 0)) { + __unlock_hpte(hpte, pte); + return H_NOT_FOUND; + } + + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + v = pte 
& ~HPTE_V_HVLOCK; + if (v & HPTE_V_VALID) { + hpte[0] &= ~cpu_to_be64(HPTE_V_VALID); + rb = compute_tlbie_rb(v, be64_to_cpu(hpte[1]), pte_index); + do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true); + /* + * The R and C bits in the low word are only final once the + * tlbie sequence has completed, so re-read hpte[1] here + * instead of reusing a value sampled before invalidation + * (kvmppc_h_bulk_remove re-reads it the same way). + */ + remove_revmap_chain(kvm, pte_index, rev, v, + be64_to_cpu(hpte[1])); + } + r = rev->guest_rpte & ~HPTE_GR_RESERVED; + note_hpte_modification(kvm, rev); + unlock_hpte(hpte, 0); + + hpret[0] = v; + hpret[1] = r; + return H_SUCCESS; +} +EXPORT_SYMBOL_GPL(kvmppc_do_h_remove); + +long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index, unsigned long avpn) +{ + return kvmppc_do_h_remove(vcpu->kvm, flags, pte_index, avpn, + &vcpu->arch.gpr[4]); +} + +long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long *args = &vcpu->arch.gpr[4]; + __be64 *hp, *hptes[4]; + unsigned long tlbrb[4]; + long int i, j, k, n, found, indexes[4]; + unsigned long flags, req, pte_index, rcbits; + int global; + long int ret = H_SUCCESS; + struct revmap_entry *rev, *revs[4]; + u64 hp0; + + global = global_invalidates(kvm, 0); + for (i = 0; i < 4 && ret == H_SUCCESS; ) { + n = 0; + for (; i < 4; ++i) { + j = i * 2; + pte_index = args[j]; + flags = pte_index >> 56; + pte_index &= ((1ul << 56) - 1); + req = flags >> 6; + flags &= 3; + if (req == 3) { /* no more requests */ + i = 4; + break; + } + if (req != 1 || flags == 3 || + pte_index >= kvm->arch.hpt_npte) { + /* parameter error */ + args[j] = ((0xa0 | flags) << 56) + pte_index; + ret = H_PARAMETER; + break; + } + hp = (__be64 *) (kvm->arch.hpt_virt + (pte_index << 4)); + /* to avoid deadlock, don't spin except for first */ + if (!try_lock_hpte(hp, HPTE_V_HVLOCK)) { + if (n) + break; + while (!try_lock_hpte(hp, HPTE_V_HVLOCK)) + cpu_relax(); + } + found = 0; + hp0 = be64_to_cpu(hp[0]); + if (hp0 & (HPTE_V_ABSENT | HPTE_V_VALID)) { + switch (flags & 3) { + case 0: /* absolute */ + found = 1; 
+ break; + case 1: /* andcond */ + if (!(hp0 & args[j + 1])) + found = 1; + break; + case 2: /* AVPN */ + if ((hp0 & ~0x7fUL) == args[j + 1]) + found = 1; + break; + } + } + if (!found) { + hp[0] &= ~cpu_to_be64(HPTE_V_HVLOCK); + args[j] = ((0x90 | flags) << 56) + pte_index; + continue; + } + + args[j] = ((0x80 | flags) << 56) + pte_index; + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + note_hpte_modification(kvm, rev); + + if (!(hp0 & HPTE_V_VALID)) { + /* insert R and C bits from PTE */ + rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); + args[j] |= rcbits << (56 - 5); + hp[0] = 0; + continue; + } + + /* leave it locked */ + hp[0] &= ~cpu_to_be64(HPTE_V_VALID); + tlbrb[n] = compute_tlbie_rb(be64_to_cpu(hp[0]), + be64_to_cpu(hp[1]), pte_index); + indexes[n] = j; + hptes[n] = hp; + revs[n] = rev; + ++n; + } + + if (!n) + break; + + /* Now that we've collected a batch, do the tlbies */ + do_tlbies(kvm, tlbrb, n, global, true); + + /* Read PTE low words after tlbie to get final R/C values */ + for (k = 0; k < n; ++k) { + j = indexes[k]; + pte_index = args[j] & ((1ul << 56) - 1); + hp = hptes[k]; + rev = revs[k]; + remove_revmap_chain(kvm, pte_index, rev, + be64_to_cpu(hp[0]), be64_to_cpu(hp[1])); + rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); + args[j] |= rcbits << (56 - 5); + __unlock_hpte(hp, 0); + } + } + + return ret; +} + +long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index, unsigned long avpn, + unsigned long va) +{ + struct kvm *kvm = vcpu->kvm; + __be64 *hpte; + struct revmap_entry *rev; + unsigned long v, r, rb, mask, bits; + u64 pte; + + if (pte_index >= kvm->arch.hpt_npte) + return H_PARAMETER; + + hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) + cpu_relax(); + pte = be64_to_cpu(hpte[0]); + if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || + ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) { + __unlock_hpte(hpte, pte); + return H_NOT_FOUND; + } + + 
v = pte; + bits = (flags << 55) & HPTE_R_PP0; + bits |= (flags << 48) & HPTE_R_KEY_HI; + bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); + + /* Update guest view of 2nd HPTE dword */ + mask = HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | + HPTE_R_KEY_HI | HPTE_R_KEY_LO; + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + if (rev) { + r = (rev->guest_rpte & ~mask) | bits; + rev->guest_rpte = r; + note_hpte_modification(kvm, rev); + } + + /* Update HPTE */ + if (v & HPTE_V_VALID) { + /* + * If the page is valid, don't let it transition from + * readonly to writable. If it should be writable, we'll + * take a trap and let the page fault code sort it out. + */ + pte = be64_to_cpu(hpte[1]); + r = (pte & ~mask) | bits; + if (hpte_is_writable(r) && !hpte_is_writable(pte)) + r = hpte_make_readonly(r); + /* If the PTE is changing, invalidate it first */ + if (r != pte) { + rb = compute_tlbie_rb(v, r, pte_index); + hpte[0] = cpu_to_be64((v & ~HPTE_V_VALID) | + HPTE_V_ABSENT); + do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), + true); + hpte[1] = cpu_to_be64(r); + } + } + unlock_hpte(hpte, v & ~HPTE_V_HVLOCK); + asm volatile("ptesync" : : : "memory"); + return H_SUCCESS; +} + +long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index) +{ + struct kvm *kvm = vcpu->kvm; + __be64 *hpte; + unsigned long v, r; + int i, n = 1; + struct revmap_entry *rev = NULL; + + if (pte_index >= kvm->arch.hpt_npte) + return H_PARAMETER; + if (flags & H_READ_4) { + pte_index &= ~3; + n = 4; + } + rev = real_vmalloc_addr(&kvm->arch.revmap[pte_index]); + for (i = 0; i < n; ++i, ++pte_index) { + hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); + v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; + r = be64_to_cpu(hpte[1]); + if (v & HPTE_V_ABSENT) { + v &= ~HPTE_V_ABSENT; + v |= HPTE_V_VALID; + } + if (v & HPTE_V_VALID) { + r = rev[i].guest_rpte | (r & (HPTE_R_R | HPTE_R_C)); + r &= ~HPTE_GR_RESERVED; + } + vcpu->arch.gpr[4 + i * 2] = v; + vcpu->arch.gpr[5 
+ i * 2] = r; + } + return H_SUCCESS; +} + +/* Clear HPTE_V_VALID in the HPTE at hptep, then invalidate its TLB entry through do_tlbies() with global = 1 and need_sync = true. */ void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, + unsigned long pte_index) +{ + unsigned long rb; + + hptep[0] &= ~cpu_to_be64(HPTE_V_VALID); + rb = compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]), + pte_index); + do_tlbies(kvm, &rb, 1, 1, true); +} +EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte); + +/* Clear the reference bit (HPTE_R_R) of the HPTE at hptep by rewriting only byte 14 of the entry, then invalidate its TLB entry through do_tlbies() with global = 1 and need_sync = false. */ void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep, + unsigned long pte_index) +{ + unsigned long rb; + unsigned char rbyte; + + rb = compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]), + pte_index); + rbyte = (be64_to_cpu(hptep[1]) & ~HPTE_R_R) >> 8; + /* modify only the second-last byte, which contains the ref bit */ + *((char *)hptep + 14) = rbyte; + do_tlbies(kvm, &rb, 1, 1, false); +} +EXPORT_SYMBOL_GPL(kvmppc_clear_ref_hpte); + +/* Base page shift for large-page segments, indexed by (slb_v & SLB_VSID_LP) >> 4 in kvmppc_hv_find_lock_hpte(). */ static int slb_base_page_shift[4] = { + 24, /* 16M */ + 16, /* 64k */ + 34, /* 16G */ + 20, /* 1M, unsupported */ +}; + +/* When called from virtmode, this function should be protected by + * preempt_disable(); otherwise, holding HPTE_V_HVLOCK + * can trigger a deadlock. + */ +long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, + unsigned long valid) +{ + unsigned int i; + unsigned int pshift; + unsigned long somask; + unsigned long vsid, hash; + unsigned long avpn; + __be64 *hpte; + unsigned long mask, val; + unsigned long v, r; + + /* Get page shift, work out hash and AVPN etc. 
*/ + mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY; + val = 0; + pshift = 12; + if (slb_v & SLB_VSID_L) { + mask |= HPTE_V_LARGE; + val |= HPTE_V_LARGE; + pshift = slb_base_page_shift[(slb_v & SLB_VSID_LP) >> 4]; + } + if (slb_v & SLB_VSID_B_1T) { + somask = (1UL << 40) - 1; + vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT_1T; + vsid ^= vsid << 25; + } else { + somask = (1UL << 28) - 1; + vsid = (slb_v & ~SLB_VSID_B) >> SLB_VSID_SHIFT; + } + hash = (vsid ^ ((eaddr & somask) >> pshift)) & kvm->arch.hpt_mask; + avpn = slb_v & ~(somask >> 16); /* also includes B */ + avpn |= (eaddr & somask) >> 16; + + if (pshift >= 24) + avpn &= ~((1UL << (pshift - 16)) - 1); + else + avpn &= ~0x7fUL; + val |= avpn; + + for (;;) { + hpte = (__be64 *)(kvm->arch.hpt_virt + (hash << 7)); + + for (i = 0; i < 16; i += 2) { + /* Read the PTE racily */ + v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK; + + /* Check valid/absent, hash, segment size and AVPN */ + if (!(v & valid) || (v & mask) != val) + continue; + + /* Lock the PTE and read it under the lock */ + while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK)) + cpu_relax(); + v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK; + r = be64_to_cpu(hpte[i+1]); + + /* + * Check the HPTE again, including base page size + */ + if ((v & valid) && (v & mask) == val && + hpte_base_page_size(v, r) == (1ul << pshift)) + /* Return with the HPTE still locked */ + return (hash << 3) + (i >> 1); + + __unlock_hpte(&hpte[i], v); + } + + if (val & HPTE_V_SECONDARY) + break; + val |= HPTE_V_SECONDARY; + hash = hash ^ kvm->arch.hpt_mask; + } + return -1; +} +EXPORT_SYMBOL(kvmppc_hv_find_lock_hpte); + +/* + * Called in real mode to check whether an HPTE not found fault + * is due to accessing a paged-out page or an emulated MMIO page, + * or if a protection fault is due to accessing a page that the + * guest wanted read/write access to but which we made read-only. + * Returns a possibly modified status (DSISR) value if not + * (i.e. 
pass the interrupt to the guest), + * -1 to pass the fault up to host kernel mode code, -2 to do that + * and also load the instruction word (for MMIO emulation), + * or 0 if we should make the guest retry the access. + */ +long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, + unsigned long slb_v, unsigned int status, bool data) +{ + struct kvm *kvm = vcpu->kvm; + long int index; + unsigned long v, r, gr; + __be64 *hpte; + unsigned long valid; + struct revmap_entry *rev; + unsigned long pp, key; + + /* For protection fault, expect to find a valid HPTE */ + valid = HPTE_V_VALID; + if (status & DSISR_NOHPTE) + valid |= HPTE_V_ABSENT; + + index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid); + if (index < 0) { + if (status & DSISR_NOHPTE) + return status; /* there really was no HPTE */ + return 0; /* for prot fault, HPTE disappeared */ + } + hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; + r = be64_to_cpu(hpte[1]); + rev = real_vmalloc_addr(&kvm->arch.revmap[index]); + gr = rev->guest_rpte; + + unlock_hpte(hpte, v); + + /* For not found, if the HPTE is valid by now, retry the instruction */ + if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID)) + return 0; + + /* Check access permissions to the page */ + pp = gr & (HPTE_R_PP0 | HPTE_R_PP); + key = (vcpu->arch.shregs.msr & MSR_PR) ? 
SLB_VSID_KP : SLB_VSID_KS; + status &= ~DSISR_NOHPTE; /* DSISR_NOHPTE == SRR1_ISI_NOPT */ + if (!data) { + if (gr & (HPTE_R_N | HPTE_R_G)) + return status | SRR1_ISI_N_OR_G; + if (!hpte_read_permission(pp, slb_v & key)) + return status | SRR1_ISI_PROT; + } else if (status & DSISR_ISSTORE) { + /* check write permission */ + if (!hpte_write_permission(pp, slb_v & key)) + return status | DSISR_PROTFAULT; + } else { + if (!hpte_read_permission(pp, slb_v & key)) + return status | DSISR_PROTFAULT; + } + + /* Check storage key, if applicable */ + if (data && (vcpu->arch.shregs.msr & MSR_DR)) { + unsigned int perm = hpte_get_skey_perm(gr, vcpu->arch.amr); + if (status & DSISR_ISSTORE) + perm >>= 1; + if (perm & 1) + return status | DSISR_KEYFAULT; + } + + /* Save HPTE info for virtual-mode handler */ + vcpu->arch.pgfault_addr = addr; + vcpu->arch.pgfault_index = index; + vcpu->arch.pgfault_hpte[0] = v; + vcpu->arch.pgfault_hpte[1] = r; + + /* Check the storage key to see if it is possibly emulated MMIO */ + if (data && (vcpu->arch.shregs.msr & MSR_IR) && + (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == + (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) + return -2; /* MMIO emulation - load instr word */ + + return -1; /* send fault up to host kernel mode */ +} diff --git a/kernel/arch/powerpc/kvm/book3s_hv_rm_xics.c b/kernel/arch/powerpc/kvm/book3s_hv_rm_xics.c new file mode 100644 index 000000000..00e45b6d4 --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -0,0 +1,627 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/err.h> + +#include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> +#include <asm/hvcall.h> +#include <asm/xics.h> +#include <asm/debug.h> +#include <asm/synch.h> +#include <asm/ppc-opcode.h> + +#include "book3s_xics.h" + +#define DEBUG_PASSUP + +static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u32 new_irq); + +/* -- ICS routines -- */ +/* Scan all KVMPPC_XICS_IRQ_PER_ICS sources of ics and call icp_rm_deliver_irq() for each one whose resend flag is set; ics->lock is dropped around every delivery call. */ static void ics_rm_check_resend(struct kvmppc_xics *xics, + struct kvmppc_ics *ics, struct kvmppc_icp *icp) +{ + int i; + + arch_spin_lock(&ics->lock); + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + struct ics_irq_state *state = &ics->irq_state[i]; + + if (!state->resend) + continue; + + arch_spin_unlock(&ics->lock); + icp_rm_deliver_irq(xics, icp, state->number); + arch_spin_lock(&ics->lock); + } + + arch_spin_unlock(&ics->lock); +} + +/* -- ICP routines -- */ + +/* Flag an external interrupt as pending on vcpu. If vcpu is the caller (this_vcpu) only LPCR_MER is set; if vcpu->cpu is not a valid CPU number, XICS_RM_KICK_VCPU is recorded in the caller's ICP rm_action instead; otherwise the target thread is sent an IPI. */ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, + struct kvm_vcpu *this_vcpu) +{ + struct kvmppc_icp *this_icp = this_vcpu->arch.icp; + int cpu; + + /* Mark the target VCPU as having an interrupt pending */ + vcpu->stat.queue_intr++; + set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); + + /* Kick self ? Just set MER and return */ + if (vcpu == this_vcpu) { + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_MER); + return; + } + + /* Check if the core is loaded, if not, too hard */ + cpu = vcpu->cpu; + if (cpu < 0 || cpu >= nr_cpu_ids) { + this_icp->rm_action |= XICS_RM_KICK_VCPU; + this_icp->rm_kick_target = vcpu; + return; + } + /* In SMT cpu will always point to thread 0, we adjust it */ + cpu += vcpu->arch.ptid; + + smp_mb(); + kvmhv_rm_send_ipi(cpu); +} + +/* Clear the pending external-interrupt bit of vcpu and clear LPCR_MER in LPCR. */ static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu) +{ + /* Note: Only called on self ! 
*/ + clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, + &vcpu->arch.pending_exceptions); + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER); +} + +/* Recompute new.out_ee and try to replace icp->state with new using a 64-bit cmpxchg against old. Returns true if the state was replaced (after raising the target vcpu's interrupt when out_ee is set) and false if the state changed underneath; callers loop until it returns true. */ static inline bool icp_rm_try_update(struct kvmppc_icp *icp, + union kvmppc_icp_state old, + union kvmppc_icp_state new) +{ + struct kvm_vcpu *this_vcpu = local_paca->kvm_hstate.kvm_vcpu; + bool success; + + /* Calculate new output value */ + new.out_ee = (new.xisr && (new.pending_pri < new.cppr)); + + /* Attempt atomic update */ + success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw; + if (!success) + goto bail; + + /* + * Check for output state update + * + * Note that this is racy since another processor could be updating + * the state already. This is why we never clear the interrupt output + * here, we only ever set it. The clear only happens prior to doing + * an update and only by the processor itself. Currently we do it + * in Accept (H_XIRR) and Up_Cppr (H_CPPR). + * + * We also do not try to figure out whether the EE state has changed, + * we unconditionally set it if the new state calls for it. The reason + * for that is that we opportunistically remove the pending interrupt + * flag when raising CPPR, so we need to set it back here if an + * interrupt is still pending. + */ + if (new.out_ee) + icp_rm_set_vcpu_irq(icp->vcpu, this_vcpu); + + /* Expose the state change for debug purposes */ + this_vcpu->arch.icp->rm_dbgstate = new; + this_vcpu->arch.icp->rm_dbgtgt = icp->vcpu; + + bail: + return success; +} + +/* Returns H_TOO_HARD if xics->real_mode_dbg is set or icp->rm_action is non-zero, else H_SUCCESS. */ static inline int check_too_hard(struct kvmppc_xics *xics, + struct kvmppc_icp *icp) +{ + return (xics->real_mode_dbg || icp->rm_action) ? 
H_TOO_HARD : H_SUCCESS; +} + +/* For every ICS whose bit is set in icp->resend_map, atomically clear that bit and, if the ICS exists, rescan it with ics_rm_check_resend(). */ static void icp_rm_check_resend(struct kvmppc_xics *xics, + struct kvmppc_icp *icp) +{ + u32 icsid; + + /* Order this load with the test for need_resend in the caller */ + smp_rmb(); + for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) { + struct kvmppc_ics *ics = xics->ics[icsid]; + + if (!test_and_clear_bit(icsid, icp->resend_map)) + continue; + if (!ics) + continue; + ics_rm_check_resend(xics, ics, icp); + } +} + +/* Try to present irq at the given priority to icp. Delivery succeeds only when priority is numerically lower than the ICP's cppr, mfrr and pending_pri. On success the XISR that was pending before (possibly 0) is returned through *reject; on failure need_resend is set in the ICP state and *reject is 0. Returns whether the delivery succeeded. */ static bool icp_rm_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, + u32 *reject) +{ + union kvmppc_icp_state old_state, new_state; + bool success; + + do { + old_state = new_state = READ_ONCE(icp->state); + + *reject = 0; + + /* See if we can deliver */ + success = new_state.cppr > priority && + new_state.mfrr > priority && + new_state.pending_pri > priority; + + /* + * If we can, check for a rejection and perform the + * delivery + */ + if (success) { + *reject = new_state.xisr; + new_state.xisr = irq; + new_state.pending_pri = priority; + } else { + /* + * If we failed to deliver we set need_resend + * so a subsequent CPPR state change causes us + * to try a new delivery. + */ + new_state.need_resend = true; + } + + } while (!icp_rm_try_update(icp, old_state, new_state)); + + return success; +} + +static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u32 new_irq) +{ + struct ics_irq_state *state; + struct kvmppc_ics *ics; + u32 reject; + u16 src; + + /* + * This is used both for initial delivery of an interrupt and + * for subsequent rejection. + * + * Rejection can be racy vs. resends. We have evaluated the + * rejection in an atomic ICP transaction which is now complete, + * so potentially the ICP can already accept the interrupt again. + * + * So we need to retry the delivery. Essentially the reject path + * boils down to a failed delivery. Always. 
+ * + * Now the interrupt could also have moved to a different target, + * thus we may need to re-do the ICP lookup as well + */ + + again: + /* Get the ICS state and lock it */ + ics = kvmppc_xics_find_ics(xics, new_irq, &src); + if (!ics) { + /* Unsafe increment, but this does not need to be accurate */ + xics->err_noics++; + return; + } + state = &ics->irq_state[src]; + + /* Get a lock on the ICS */ + arch_spin_lock(&ics->lock); + + /* Get our server */ + if (!icp || state->server != icp->server_num) { + icp = kvmppc_xics_find_server(xics->kvm, state->server); + if (!icp) { + /* Unsafe increment again*/ + xics->err_noicp++; + goto out; + } + } + + /* Clear the resend bit of that interrupt */ + state->resend = 0; + + /* + * If masked, bail out + * + * Note: PAPR doesn't mention anything about masked pending + * when doing a resend, only when doing a delivery. + * + * However that would have the effect of losing a masked + * interrupt that was rejected and isn't consistent with + * the whole masked_pending business which is about not + * losing interrupts that occur while masked. + * + * I don't differentiate normal deliveries and resends, this + * implementation will differ from PAPR and not lose such + * interrupts. + */ + if (state->priority == MASKED) { + state->masked_pending = 1; + goto out; + } + + /* + * Try the delivery, this will set the need_resend flag + * in the ICP as part of the atomic transaction if the + * delivery is not possible. + * + * Note that if successful, the new delivery might have itself + * rejected an interrupt that was "delivered" before we took the + * ics spin lock. + * + * In this case we do the whole sequence all over again for the + * new guy. 
We cannot assume that the rejected interrupt is less + * favored than the new one, and thus doesn't need to be delivered, + * because by the time we exit icp_rm_try_to_deliver() the target + * processor may well have already consumed & completed it, and thus + * the rejected interrupt might actually be already acceptable. + */ + if (icp_rm_try_to_deliver(icp, new_irq, state->priority, &reject)) { + /* + * Delivery was successful, did we reject somebody else ? + */ + if (reject && reject != XICS_IPI) { + arch_spin_unlock(&ics->lock); + new_irq = reject; + goto again; + } + } else { + /* + * We failed to deliver the interrupt we need to set the + * resend map bit and mark the ICS state as needing a resend + */ + set_bit(ics->icsid, icp->resend_map); + state->resend = 1; + + /* + * If the need_resend flag got cleared in the ICP some time + * between icp_rm_try_to_deliver() atomic update and now, then + * we know it might have missed the resend_map bit. So we + * retry + */ + smp_mb(); + if (!icp->state.need_resend) { + arch_spin_unlock(&ics->lock); + goto again; + } + } + out: + arch_spin_unlock(&ics->lock); +} + +static void icp_rm_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u8 new_cppr) +{ + union kvmppc_icp_state old_state, new_state; + bool resend; + + /* + * This handles several related states in one operation: + * + * ICP State: Down_CPPR + * + * Load CPPR with new value and if the XISR is 0 + * then check for resends: + * + * ICP State: Resend + * + * If MFRR is more favored than CPPR, check for IPIs + * and notify ICS of a potential resend. This is done + * asynchronously (when used in real mode, we will have + * to exit here). + * + * We do not handle the complete Check_IPI as documented + * here. In the PAPR, this state will be used for both + * Set_MFRR and Down_CPPR. 
However, we know that we aren't + * changing the MFRR state here so we don't need to handle + * the case of an MFRR causing a reject of a pending irq, + * this will have been handled when the MFRR was set in the + * first place. + * + * Thus we don't have to handle rejects, only resends. + * + * When implementing real mode for HV KVM, resend will lead to + * a H_TOO_HARD return and the whole transaction will be handled + * in virtual mode. + */ + do { + old_state = new_state = READ_ONCE(icp->state); + + /* Down_CPPR */ + new_state.cppr = new_cppr; + + /* + * Cut down Resend / Check_IPI / IPI + * + * The logic is that we cannot have a pending interrupt + * trumped by an IPI at this point (see above), so we + * know that either the pending interrupt is already an + * IPI (in which case we don't care to override it) or + * it's either more favored than us or non existent + */ + if (new_state.mfrr < new_cppr && + new_state.mfrr <= new_state.pending_pri) { + new_state.pending_pri = new_state.mfrr; + new_state.xisr = XICS_IPI; + } + + /* Latch/clear resend bit */ + resend = new_state.need_resend; + new_state.need_resend = 0; + + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* + * Now handle resend checks. Those are asynchronous to the ICP + * state update in HW (ie bus transactions) so we can handle them + * separately here as well. 
+ */ + if (resend) { + icp->n_check_resend++; + icp_rm_check_resend(xics, icp); + } +} + + +/* Real-mode H_XIRR handler. Returns H_TOO_HARD when there is no XICS or real mode is not enabled for it; otherwise stores xisr | (cppr << 24) in GPR4, accepts the pending interrupt if there is one, and returns the result of check_too_hard(). */ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 xirr; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* First clear the interrupt */ + icp_rm_clr_vcpu_irq(icp->vcpu); + + /* + * ICP State: Accept_Interrupt + * + * Return the pending interrupt (if any) along with the + * current CPPR, then clear the XISR & set CPPR to the + * pending priority + */ + do { + old_state = new_state = READ_ONCE(icp->state); + + xirr = old_state.xisr | (((u32)old_state.cppr) << 24); + if (!old_state.xisr) + break; + new_state.cppr = new_state.pending_pri; + new_state.pending_pri = 0xff; + new_state.xisr = 0; + + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* Return the result in GPR4 */ + vcpu->arch.gpr[4] = xirr; + + return check_too_hard(xics, icp); +} + +/* Real-mode H_IPI handler: set the MFRR of the ICP identified by server to mfrr. Returns H_TOO_HARD when there is no XICS or real mode is not enabled for it, H_PARAMETER when no ICP matches server, and otherwise the result of check_too_hard() on the calling vcpu's ICP. */ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp, *this_icp = vcpu->arch.icp; + u32 reject; + bool resend; + bool local; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + local = this_icp->server_num == server; + if (local) + icp = this_icp; + else + icp = kvmppc_xics_find_server(vcpu->kvm, server); + if (!icp) + return H_PARAMETER; + + /* + * ICP state: Set_MFRR + * + * If the CPPR is more favored than the new MFRR, then + * nothing needs to be done as there can be no XISR to + * reject. + * + * ICP state: Check_IPI + * + * If the CPPR is less favored, then we might be replacing + * an interrupt, and thus need to possibly reject it. + * + * ICP State: IPI + * + * Besides rejecting any pending interrupts, we also + * update XISR and pending_pri to mark IPI as pending. 
+ * + * PAPR does not describe this state, but if the MFRR is being + * made less favored than its earlier value, there might be + * a previously-rejected interrupt needing to be resent. + * Ideally, we would want to resend only if + * prio(pending_interrupt) < mfrr && + * prio(pending_interrupt) < cppr + * where pending interrupt is the one that was rejected. But + * we don't have that state, so we simply trigger a resend + * whenever the MFRR is made less favored. + */ + do { + old_state = new_state = READ_ONCE(icp->state); + + /* Set_MFRR */ + new_state.mfrr = mfrr; + + /* Check_IPI */ + reject = 0; + resend = false; + if (mfrr < new_state.cppr) { + /* Reject a pending interrupt if not an IPI */ + if (mfrr <= new_state.pending_pri) { + reject = new_state.xisr; + new_state.pending_pri = mfrr; + new_state.xisr = XICS_IPI; + } + } + + if (mfrr > old_state.mfrr) { + resend = new_state.need_resend; + new_state.need_resend = 0; + } + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* Handle reject in real mode */ + if (reject && reject != XICS_IPI) { + this_icp->n_reject++; + icp_rm_deliver_irq(xics, icp, reject); + } + + /* Handle resends in real mode */ + if (resend) { + this_icp->n_check_resend++; + icp_rm_check_resend(xics, icp); + } + + return check_too_hard(xics, this_icp); +} + +int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 reject; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* + * ICP State: Set_CPPR + * + * We can safely compare the new value with the current + * value outside of the transaction as the CPPR is only + * ever changed by the processor on itself + */ + if (cppr > icp->state.cppr) { + icp_rm_down_cppr(xics, icp, cppr); + goto bail; + } else if (cppr == icp->state.cppr) + return H_SUCCESS; + + /* + * ICP State: Up_CPPR + * + * The processor is raising its 
priority, this can result + * in a rejection of a pending interrupt: + * + * ICP State: Reject_Current + * + * We can remove EE from the current processor, the update + * transaction will set it again if needed + */ + icp_rm_clr_vcpu_irq(icp->vcpu); + + do { + old_state = new_state = READ_ONCE(icp->state); + + reject = 0; + new_state.cppr = cppr; + + if (cppr <= new_state.pending_pri) { + reject = new_state.xisr; + new_state.xisr = 0; + new_state.pending_pri = 0xff; + } + + } while (!icp_rm_try_update(icp, old_state, new_state)); + + /* + * Check for rejects. They are handled by doing a new delivery + * attempt (see comments in icp_rm_deliver_irq). + */ + if (reject && reject != XICS_IPI) { + icp->n_reject++; + icp_rm_deliver_irq(xics, icp, reject); + } + bail: + return check_too_hard(xics, icp); +} + +/* Real-mode H_EOI handler. xirr carries the CPPR to restore in its top byte and the interrupt source number in its low 24 bits. Returns H_TOO_HARD when there is no XICS or real mode is not enabled for it, and otherwise the result of check_too_hard(). */ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u32 irq = xirr & 0x00ffffff; + u16 src; + + if (!xics || !xics->real_mode) + return H_TOO_HARD; + + /* + * ICP State: EOI + * + * Note: If EOI is incorrectly used by SW to lower the CPPR + * value (ie more favored), we do not check for rejection of + * a pending interrupt, this is a SW error and PAPR specifies + * that we don't have to deal with it. + * + * The sending of an EOI to the ICS is handled after the + * CPPR update + * + * ICP State: Down_CPPR which we handle + * in a separate function as it's shared with H_CPPR. + */ + icp_rm_down_cppr(xics, icp, xirr >> 24); + + /* IPIs have no EOI */ + if (irq == XICS_IPI) + goto bail; + /* + * EOI handling: If the interrupt is still asserted, we need to + * resend it. We can take a lockless "peek" at the ICS state here. 
+ * + * "Message" interrupts will never have "asserted" set + */ + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + goto bail; + state = &ics->irq_state[src]; + + /* Still asserted, resend it */ + if (state->asserted) { + icp->n_reject++; + icp_rm_deliver_irq(xics, icp, irq); + } + + if (!hlist_empty(&vcpu->kvm->irq_ack_notifier_list)) { + icp->rm_action |= XICS_RM_NOTIFY_EOI; + icp->rm_eoied_irq = irq; + } + bail: + return check_too_hard(xics, icp); +} diff --git a/kernel/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/kernel/arch/powerpc/kvm/book3s_hv_rmhandlers.S new file mode 100644 index 000000000..4d70df26c --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -0,0 +1,2601 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + * Derived from book3s_rmhandlers.S and other files, which are: + * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/mmu.h> +#include <asm/page.h> +#include <asm/ptrace.h> +#include <asm/hvcall.h> +#include <asm/asm-offsets.h> +#include <asm/exception-64s.h> +#include <asm/kvm_book3s_asm.h> +#include <asm/mmu-hash64.h> +#include <asm/tm.h> + +#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) + +/* Values in HSTATE_NAPPING(r13) */ +#define NAPPING_CEDE 1 +#define NAPPING_NOVCPU 2 + +/* + * Call kvmppc_hv_entry in real mode. + * Must be called with interrupts hard-disabled. 
+ * + * Input Registers: + * + * LR = return address to continue at after eventually re-enabling MMU + */ +_GLOBAL_TOC(kvmppc_hv_entry_trampoline) + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -112(r1) + mfmsr r10 + LOAD_REG_ADDR(r5, kvmppc_call_hv_entry) + li r0,MSR_RI + andc r0,r10,r0 + li r6,MSR_IR | MSR_DR + andc r6,r10,r6 + mtmsrd r0,1 /* clear RI in MSR */ + mtsrr0 r5 + mtsrr1 r6 + RFI + +kvmppc_call_hv_entry: + ld r4, HSTATE_KVM_VCPU(r13) + bl kvmppc_hv_entry + + /* Back from guest - restore host state and return to caller */ + +BEGIN_FTR_SECTION + /* Restore host DABR and DABRX */ + ld r5,HSTATE_DABR(r13) + li r6,7 + mtspr SPRN_DABR,r5 + mtspr SPRN_DABRX,r6 +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + + /* Restore SPRG3 */ + ld r3,PACA_SPRG_VDSO(r13) + mtspr SPRN_SPRG_VDSO_WRITE,r3 + + /* Reload the host's PMU registers */ + ld r3, PACALPPACAPTR(r13) /* is the host using the PMU? */ + lbz r4, LPPACA_PMCINUSE(r3) + cmpwi r4, 0 + beq 23f /* skip if not */ +BEGIN_FTR_SECTION + ld r3, HSTATE_MMCR0(r13) + andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO + cmpwi r4, MMCR0_PMAO + beql kvmppc_fix_pmao +END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) + lwz r3, HSTATE_PMC1(r13) + lwz r4, HSTATE_PMC2(r13) + lwz r5, HSTATE_PMC3(r13) + lwz r6, HSTATE_PMC4(r13) + lwz r8, HSTATE_PMC5(r13) + lwz r9, HSTATE_PMC6(r13) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r4 + mtspr SPRN_PMC3, r5 + mtspr SPRN_PMC4, r6 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 + ld r3, HSTATE_MMCR0(r13) + ld r4, HSTATE_MMCR1(r13) + ld r5, HSTATE_MMCRA(r13) + ld r6, HSTATE_SIAR(r13) + ld r7, HSTATE_SDAR(r13) + mtspr SPRN_MMCR1, r4 + mtspr SPRN_MMCRA, r5 + mtspr SPRN_SIAR, r6 + mtspr SPRN_SDAR, r7 +BEGIN_FTR_SECTION + ld r8, HSTATE_MMCR2(r13) + ld r9, HSTATE_SIER(r13) + mtspr SPRN_MMCR2, r8 + mtspr SPRN_SIER, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mtspr SPRN_MMCR0, r3 + isync +23: + + /* + * Reload DEC. HDEC interrupts were disabled when + * we reloaded the host's LPCR value. 
+ */ + ld r3, HSTATE_DECEXP(r13) + mftb r4 + subf r4, r4, r3 + mtspr SPRN_DEC, r4 + + /* + * For external and machine check interrupts, we need + * to call the Linux handler to process the interrupt. + * We do that by jumping to absolute address 0x500 for + * external interrupts, or the machine_check_fwnmi label + * for machine checks (since firmware might have patched + * the vector area at 0x200). The [h]rfid at the end of the + * handler will return to the book3s_hv_interrupts.S code. + * For other interrupts we do the rfid to get back + * to the book3s_hv_interrupts.S code here. + */ + ld r8, 112+PPC_LR_STKOFF(r1) + addi r1, r1, 112 + ld r7, HSTATE_HOST_MSR(r13) + + cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL + beq 11f + cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI + beq cr2, 14f /* HMI check */ + + /* RFI into the highmem handler, or branch to interrupt handler */ + mfmsr r6 + li r0, MSR_RI + andc r6, r6, r0 + mtmsrd r6, 1 /* Clear RI in MSR */ + mtsrr0 r8 + mtsrr1 r7 + beq cr1, 13f /* machine check */ + RFI + + /* On POWER7, we have external interrupts set to use HSRR0/1 */ +11: mtspr SPRN_HSRR0, r8 + mtspr SPRN_HSRR1, r7 + ba 0x500 + +13: b machine_check_fwnmi + +14: mtspr SPRN_HSRR0, r8 + mtspr SPRN_HSRR1, r7 + b hmi_exception_after_realmode + +kvmppc_primary_no_guest: + /* We handle this much like a ceded vcpu */ + /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ + mfspr r3, SPRN_HDEC + mtspr SPRN_DEC, r3 + /* + * Make sure the primary has finished the MMU switch. + * We should never get here on a secondary thread, but + * check it for robustness' sake. + */ + ld r5, HSTATE_KVM_VCORE(r13) +65: lbz r0, VCORE_IN_GUEST(r5) + cmpwi r0, 0 + beq 65b + /* Set LPCR. 
*/ + ld r8,VCORE_LPCR(r5) + mtspr SPRN_LPCR,r8 + isync + /* set our bit in napping_threads */ + ld r5, HSTATE_KVM_VCORE(r13) + lbz r7, HSTATE_PTID(r13) + li r0, 1 + sld r0, r0, r7 + addi r6, r5, VCORE_NAPPING_THREADS +1: lwarx r3, 0, r6 + or r3, r3, r0 + stwcx. r3, 0, r6 + bne 1b + /* order napping_threads update vs testing entry_exit_map */ + isync + li r12, 0 + lwz r7, VCORE_ENTRY_EXIT(r5) + cmpwi r7, 0x100 + bge kvm_novcpu_exit /* another thread already exiting */ + li r3, NAPPING_NOVCPU + stb r3, HSTATE_NAPPING(r13) + + li r3, 0 /* Don't wake on privileged (OS) doorbell */ + b kvm_do_nap + +kvm_novcpu_wakeup: + ld r1, HSTATE_HOST_R1(r13) + ld r5, HSTATE_KVM_VCORE(r13) + li r0, 0 + stb r0, HSTATE_NAPPING(r13) + stb r0, HSTATE_HWTHREAD_REQ(r13) + + /* check the wake reason */ + bl kvmppc_check_wake_reason + + /* see if any other thread is already exiting */ + lwz r0, VCORE_ENTRY_EXIT(r5) + cmpwi r0, 0x100 + bge kvm_novcpu_exit + + /* clear our bit in napping_threads */ + lbz r7, HSTATE_PTID(r13) + li r0, 1 + sld r0, r0, r7 + addi r6, r5, VCORE_NAPPING_THREADS +4: lwarx r7, 0, r6 + andc r7, r7, r0 + stwcx. 
r7, 0, r6 + bne 4b + + /* See if the wake reason means we need to exit */ + cmpdi r3, 0 + bge kvm_novcpu_exit + + /* See if our timeslice has expired (HDEC is negative) */ + mfspr r0, SPRN_HDEC + li r12, BOOK3S_INTERRUPT_HV_DECREMENTER + cmpwi r0, 0 + blt kvm_novcpu_exit + + /* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */ + ld r4, HSTATE_KVM_VCPU(r13) + cmpdi r4, 0 + beq kvmppc_primary_no_guest + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r4, VCPU_TB_RMENTRY + bl kvmhv_start_timing +#endif + b kvmppc_got_guest + +kvm_novcpu_exit: +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + ld r4, HSTATE_KVM_VCPU(r13) + cmpdi r4, 0 + beq 13f + addi r3, r4, VCPU_TB_RMEXIT + bl kvmhv_accumulate_time +#endif +13: mr r3, r12 + stw r12, 112-4(r1) + bl kvmhv_commence_exit + nop + lwz r12, 112-4(r1) + b kvmhv_switch_to_host + +/* + * We come in here when wakened from nap mode. + * Relocation is off and most register values are lost. + * r13 points to the PACA. + */ + .globl kvm_start_guest +kvm_start_guest: + + /* Set runlatch bit the minute you wake up from nap */ + mfspr r0, SPRN_CTRLF + ori r0, r0, 1 + mtspr SPRN_CTRLT, r0 + + ld r2,PACATOC(r13) + + li r0,KVM_HWTHREAD_IN_KVM + stb r0,HSTATE_HWTHREAD_STATE(r13) + + /* NV GPR values from power7_idle() will no longer be valid */ + li r0,1 + stb r0,PACA_NAPSTATELOST(r13) + + /* were we napping due to cede? */ + lbz r0,HSTATE_NAPPING(r13) + cmpwi r0,NAPPING_CEDE + beq kvm_end_cede + cmpwi r0,NAPPING_NOVCPU + beq kvm_novcpu_wakeup + + ld r1,PACAEMERGSP(r13) + subi r1,r1,STACK_FRAME_OVERHEAD + + /* + * We weren't napping due to cede, so this must be a secondary + * thread being woken up to run a guest, or being woken up due + * to a stray IPI. (Or due to some machine check or hypervisor + * maintenance interrupt while the core is in KVM.) 
+ */ + + /* Check the wake reason in SRR1 to see why we got here */ + bl kvmppc_check_wake_reason + cmpdi r3, 0 + bge kvm_no_guest + + /* get vcpu pointer, NULL if we have no vcpu to run */ + ld r4,HSTATE_KVM_VCPU(r13) + cmpdi r4,0 + /* if we have no vcpu to run, go back to sleep */ + beq kvm_no_guest + +kvm_secondary_got_guest: + + /* Set HSTATE_DSCR(r13) to something sensible */ + ld r6, PACA_DSCR(r13) + std r6, HSTATE_DSCR(r13) + + /* Order load of vcore, ptid etc. after load of vcpu */ + lwsync + bl kvmppc_hv_entry + + /* Back from the guest, go back to nap */ + /* Clear our vcpu pointer so we don't come back in early */ + li r0, 0 + /* + * Once we clear HSTATE_KVM_VCPU(r13), the code in + * kvmppc_run_core() is going to assume that all our vcpu + * state is visible in memory. This lwsync makes sure + * that that is true. + */ + lwsync + std r0, HSTATE_KVM_VCPU(r13) + +/* + * At this point we have finished executing in the guest. + * We need to wait for hwthread_req to become zero, since + * we may not turn on the MMU while hwthread_req is non-zero. + * While waiting we also need to check if we get given a vcpu to run. + */ +kvm_no_guest: + lbz r3, HSTATE_HWTHREAD_REQ(r13) + cmpwi r3, 0 + bne 53f + HMT_MEDIUM + li r0, KVM_HWTHREAD_IN_KERNEL + stb r0, HSTATE_HWTHREAD_STATE(r13) + /* need to recheck hwthread_req after a barrier, to avoid race */ + sync + lbz r3, HSTATE_HWTHREAD_REQ(r13) + cmpwi r3, 0 + bne 54f +/* + * We jump to power7_wakeup_loss, which will return to the caller + * of power7_nap in the powernv cpu offline loop. The value we + * put in r3 becomes the return value for power7_nap. 
+ */ + li r3, LPCR_PECE0 + mfspr r4, SPRN_LPCR + rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 + mtspr SPRN_LPCR, r4 + li r3, 0 + b power7_wakeup_loss + +53: HMT_LOW + ld r4, HSTATE_KVM_VCPU(r13) + cmpdi r4, 0 + beq kvm_no_guest + HMT_MEDIUM + b kvm_secondary_got_guest + +54: li r0, KVM_HWTHREAD_IN_KVM + stb r0, HSTATE_HWTHREAD_STATE(r13) + b kvm_no_guest + +/****************************************************************************** + * * + * Entry code * + * * + *****************************************************************************/ + +.global kvmppc_hv_entry +kvmppc_hv_entry: + + /* Required state: + * + * R4 = vcpu pointer (or NULL) + * MSR = ~IR|DR + * R13 = PACA + * R1 = host R1 + * R2 = TOC + * all other volatile GPRS = free + */ + mflr r0 + std r0, PPC_LR_STKOFF(r1) + stdu r1, -112(r1) + + /* Save R1 in the PACA */ + std r1, HSTATE_HOST_R1(r13) + + li r6, KVM_GUEST_MODE_HOST_HV + stb r6, HSTATE_IN_GUEST(r13) + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + /* Store initial timestamp */ + cmpdi r4, 0 + beq 1f + addi r3, r4, VCPU_TB_RMENTRY + bl kvmhv_start_timing +1: +#endif + /* Clear out SLB */ + li r6,0 + slbmte r6,r6 + slbia + ptesync + + /* + * POWER7/POWER8 host -> guest partition switch code. + * We don't have to lock against concurrent tlbies, + * but we do have to coordinate across hardware threads. + */ + /* Set bit in entry map iff exit map is zero. */ + ld r5, HSTATE_KVM_VCORE(r13) + li r7, 1 + lbz r6, HSTATE_PTID(r13) + sld r7, r7, r6 + addi r9, r5, VCORE_ENTRY_EXIT +21: lwarx r3, 0, r9 + cmpwi r3, 0x100 /* any threads starting to exit? */ + bge secondary_too_late /* if so we're too late to the party */ + or r3, r3, r7 + stwcx. r3, 0, r9 + bne 21b + + /* Primary thread switches to guest partition. 
*/ + ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ + cmpwi r6,0 + bne 10f + ld r6,KVM_SDR1(r9) + lwz r7,KVM_LPID(r9) + li r0,LPID_RSVD /* switch to reserved LPID */ + mtspr SPRN_LPID,r0 + ptesync + mtspr SPRN_SDR1,r6 /* switch to partition page table */ + mtspr SPRN_LPID,r7 + isync + + /* See if we need to flush the TLB */ + lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */ + clrldi r7,r6,64-6 /* extract bit number (6 bits) */ + srdi r6,r6,6 /* doubleword number */ + sldi r6,r6,3 /* address offset */ + add r6,r6,r9 + addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */ + li r0,1 + sld r0,r0,r7 + ld r7,0(r6) + and. r7,r7,r0 + beq 22f +23: ldarx r7,0,r6 /* if set, clear the bit */ + andc r7,r7,r0 + stdcx. r7,0,r6 + bne 23b + /* Flush the TLB of any entries for this LPID */ + /* use arch 2.07S as a proxy for POWER8 */ +BEGIN_FTR_SECTION + li r6,512 /* POWER8 has 512 sets */ +FTR_SECTION_ELSE + li r6,128 /* POWER7 has 128 sets */ +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S) + mtctr r6 + li r7,0x800 /* IS field = 0b10 */ + ptesync +28: tlbiel r7 + addi r7,r7,0x1000 + bdnz 28b + ptesync + + /* Add timebase offset onto timebase */ +22: ld r8,VCORE_TB_OFFSET(r5) + cmpdi r8,0 + beq 37f + mftb r6 /* current host timebase */ + add r8,r8,r6 + mtspr SPRN_TBU40,r8 /* update upper 40 bits */ + mftb r7 /* check if lower 24 bits overflowed */ + clrldi r6,r6,40 + clrldi r7,r7,40 + cmpld r7,r6 + bge 37f + addis r8,r8,0x100 /* if so, increment upper 40 bits */ + mtspr SPRN_TBU40,r8 + + /* Load guest PCR value to select appropriate compat mode */ +37: ld r7, VCORE_PCR(r5) + cmpdi r7, 0 + beq 38f + mtspr SPRN_PCR, r7 +38: + +BEGIN_FTR_SECTION + /* DPDES is shared between threads */ + ld r8, VCORE_DPDES(r5) + mtspr SPRN_DPDES, r8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + + li r0,1 + stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */ + + /* Do we have a guest vcpu to run? 
*/ +10: cmpdi r4, 0 + beq kvmppc_primary_no_guest +kvmppc_got_guest: + + /* Load up guest SLB entries */ + lwz r5,VCPU_SLB_MAX(r4) + cmpwi r5,0 + beq 9f + mtctr r5 + addi r6,r4,VCPU_SLB +1: ld r8,VCPU_SLB_E(r6) + ld r9,VCPU_SLB_V(r6) + slbmte r9,r8 + addi r6,r6,VCPU_SLB_SIZE + bdnz 1b +9: + /* Increment yield count if they have a VPA */ + ld r3, VCPU_VPA(r4) + cmpdi r3, 0 + beq 25f + li r6, LPPACA_YIELDCOUNT + LWZX_BE r5, r3, r6 + addi r5, r5, 1 + STWX_BE r5, r3, r6 + li r6, 1 + stb r6, VCPU_VPA_DIRTY(r4) +25: + + /* Save purr/spurr */ + mfspr r5,SPRN_PURR + mfspr r6,SPRN_SPURR + std r5,HSTATE_PURR(r13) + std r6,HSTATE_SPURR(r13) + ld r7,VCPU_PURR(r4) + ld r8,VCPU_SPURR(r4) + mtspr SPRN_PURR,r7 + mtspr SPRN_SPURR,r8 + +BEGIN_FTR_SECTION + /* Set partition DABR */ + /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */ + lwz r5,VCPU_DABRX(r4) + ld r6,VCPU_DABR(r4) + mtspr SPRN_DABRX,r5 + mtspr SPRN_DABR,r6 + isync +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + b skip_tm +END_FTR_SECTION_IFCLR(CPU_FTR_TM) + + /* Turn on TM/FP/VSX/VMX so we can restore them. */ + mfmsr r5 + li r6, MSR_TM >> 32 + sldi r6, r6, 32 + or r5, r5, r6 + ori r5, r5, MSR_FP + oris r5, r5, (MSR_VEC | MSR_VSX)@h + mtmsrd r5 + + /* + * The user may change these outside of a transaction, so they must + * always be context switched. + */ + ld r5, VCPU_TFHAR(r4) + ld r6, VCPU_TFIAR(r4) + ld r7, VCPU_TEXASR(r4) + mtspr SPRN_TFHAR, r5 + mtspr SPRN_TFIAR, r6 + mtspr SPRN_TEXASR, r7 + + ld r5, VCPU_MSR(r4) + rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 + beq skip_tm /* TM not active in guest */ + + /* Make sure the failure summary is set, otherwise we'll program check + * when we trechkpt. It's possible that this might have been not set + * on a kvmppc_set_one_reg() call but we shouldn't let this crash the + * host. + */ + oris r7, r7, (TEXASR_FS)@h + mtspr SPRN_TEXASR, r7 + + /* + * We need to load up the checkpointed state for the guest. 
+ * We need to do this early as it will blow away any GPRs, VSRs and + * some SPRs. + */ + + mr r31, r4 + addi r3, r31, VCPU_FPRS_TM + bl load_fp_state + addi r3, r31, VCPU_VRS_TM + bl load_vr_state + mr r4, r31 + lwz r7, VCPU_VRSAVE_TM(r4) + mtspr SPRN_VRSAVE, r7 + + ld r5, VCPU_LR_TM(r4) + lwz r6, VCPU_CR_TM(r4) + ld r7, VCPU_CTR_TM(r4) + ld r8, VCPU_AMR_TM(r4) + ld r9, VCPU_TAR_TM(r4) + mtlr r5 + mtcr r6 + mtctr r7 + mtspr SPRN_AMR, r8 + mtspr SPRN_TAR, r9 + + /* + * Load up PPR and DSCR values but don't put them in the actual SPRs + * till the last moment to avoid running with userspace PPR and DSCR for + * too long. + */ + ld r29, VCPU_DSCR_TM(r4) + ld r30, VCPU_PPR_TM(r4) + + std r2, PACATMSCRATCH(r13) /* Save TOC */ + + /* Clear the MSR RI since r1, r13 are all going to be foobar. */ + li r5, 0 + mtmsrd r5, 1 + + /* Load GPRs r0-r28 */ + reg = 0 + .rept 29 + ld reg, VCPU_GPRS_TM(reg)(r31) + reg = reg + 1 + .endr + + mtspr SPRN_DSCR, r29 + mtspr SPRN_PPR, r30 + + /* Load final GPRs */ + ld 29, VCPU_GPRS_TM(29)(r31) + ld 30, VCPU_GPRS_TM(30)(r31) + ld 31, VCPU_GPRS_TM(31)(r31) + + /* TM checkpointed state is now setup. All GPRs are now volatile. */ + TRECHKPT + + /* Now let's get back the state we need. */ + HMT_MEDIUM + GET_PACA(r13) + ld r29, HSTATE_DSCR(r13) + mtspr SPRN_DSCR, r29 + ld r4, HSTATE_KVM_VCPU(r13) + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATMSCRATCH(r13) + + /* Set the MSR RI since we have our registers back. */ + li r5, MSR_RI + mtmsrd r5, 1 +skip_tm: +#endif + + /* Load guest PMU registers */ + /* R4 is live here (vcpu pointer) */ + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ + isync +BEGIN_FTR_SECTION + ld r3, VCPU_MMCR(r4) + andi. 
r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO + cmpwi r5, MMCR0_PMAO + beql kvmppc_fix_pmao +END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) + lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */ + lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */ + lwz r6, VCPU_PMC + 8(r4) + lwz r7, VCPU_PMC + 12(r4) + lwz r8, VCPU_PMC + 16(r4) + lwz r9, VCPU_PMC + 20(r4) + mtspr SPRN_PMC1, r3 + mtspr SPRN_PMC2, r5 + mtspr SPRN_PMC3, r6 + mtspr SPRN_PMC4, r7 + mtspr SPRN_PMC5, r8 + mtspr SPRN_PMC6, r9 + ld r3, VCPU_MMCR(r4) + ld r5, VCPU_MMCR + 8(r4) + ld r6, VCPU_MMCR + 16(r4) + ld r7, VCPU_SIAR(r4) + ld r8, VCPU_SDAR(r4) + mtspr SPRN_MMCR1, r5 + mtspr SPRN_MMCRA, r6 + mtspr SPRN_SIAR, r7 + mtspr SPRN_SDAR, r8 +BEGIN_FTR_SECTION + ld r5, VCPU_MMCR + 24(r4) + ld r6, VCPU_SIER(r4) + lwz r7, VCPU_PMC + 24(r4) + lwz r8, VCPU_PMC + 28(r4) + ld r9, VCPU_MMCR + 32(r4) + mtspr SPRN_MMCR2, r5 + mtspr SPRN_SIER, r6 + mtspr SPRN_SPMC1, r7 + mtspr SPRN_SPMC2, r8 + mtspr SPRN_MMCRS, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mtspr SPRN_MMCR0, r3 + isync + + /* Load up FP, VMX and VSX registers */ + bl kvmppc_load_fp + + ld r14, VCPU_GPR(R14)(r4) + ld r15, VCPU_GPR(R15)(r4) + ld r16, VCPU_GPR(R16)(r4) + ld r17, VCPU_GPR(R17)(r4) + ld r18, VCPU_GPR(R18)(r4) + ld r19, VCPU_GPR(R19)(r4) + ld r20, VCPU_GPR(R20)(r4) + ld r21, VCPU_GPR(R21)(r4) + ld r22, VCPU_GPR(R22)(r4) + ld r23, VCPU_GPR(R23)(r4) + ld r24, VCPU_GPR(R24)(r4) + ld r25, VCPU_GPR(R25)(r4) + ld r26, VCPU_GPR(R26)(r4) + ld r27, VCPU_GPR(R27)(r4) + ld r28, VCPU_GPR(R28)(r4) + ld r29, VCPU_GPR(R29)(r4) + ld r30, VCPU_GPR(R30)(r4) + ld r31, VCPU_GPR(R31)(r4) + + /* Switch DSCR to guest value */ + ld r5, VCPU_DSCR(r4) + mtspr SPRN_DSCR, r5 + +BEGIN_FTR_SECTION + /* Skip next section on POWER7 */ + b 8f +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */ + mfmsr r8 + li r0, 1 + rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG + mtmsrd r8 + + /* Load up POWER8-specific registers */ + ld r5, 
VCPU_IAMR(r4) + lwz r6, VCPU_PSPB(r4) + ld r7, VCPU_FSCR(r4) + mtspr SPRN_IAMR, r5 + mtspr SPRN_PSPB, r6 + mtspr SPRN_FSCR, r7 + ld r5, VCPU_DAWR(r4) + ld r6, VCPU_DAWRX(r4) + ld r7, VCPU_CIABR(r4) + ld r8, VCPU_TAR(r4) + mtspr SPRN_DAWR, r5 + mtspr SPRN_DAWRX, r6 + mtspr SPRN_CIABR, r7 + mtspr SPRN_TAR, r8 + ld r5, VCPU_IC(r4) + ld r6, VCPU_VTB(r4) + mtspr SPRN_IC, r5 + mtspr SPRN_VTB, r6 + ld r8, VCPU_EBBHR(r4) + mtspr SPRN_EBBHR, r8 + ld r5, VCPU_EBBRR(r4) + ld r6, VCPU_BESCR(r4) + ld r7, VCPU_CSIGR(r4) + ld r8, VCPU_TACR(r4) + mtspr SPRN_EBBRR, r5 + mtspr SPRN_BESCR, r6 + mtspr SPRN_CSIGR, r7 + mtspr SPRN_TACR, r8 + ld r5, VCPU_TCSCR(r4) + ld r6, VCPU_ACOP(r4) + lwz r7, VCPU_GUEST_PID(r4) + ld r8, VCPU_WORT(r4) + mtspr SPRN_TCSCR, r5 + mtspr SPRN_ACOP, r6 + mtspr SPRN_PID, r7 + mtspr SPRN_WORT, r8 +8: + + /* + * Set the decrementer to the guest decrementer. + */ + ld r8,VCPU_DEC_EXPIRES(r4) + /* r8 is a host timebase value here, convert to guest TB */ + ld r5,HSTATE_KVM_VCORE(r13) + ld r6,VCORE_TB_OFFSET(r5) + add r8,r8,r6 + mftb r7 + subf r3,r7,r8 + mtspr SPRN_DEC,r3 + stw r3,VCPU_DEC(r4) + + ld r5, VCPU_SPRG0(r4) + ld r6, VCPU_SPRG1(r4) + ld r7, VCPU_SPRG2(r4) + ld r8, VCPU_SPRG3(r4) + mtspr SPRN_SPRG0, r5 + mtspr SPRN_SPRG1, r6 + mtspr SPRN_SPRG2, r7 + mtspr SPRN_SPRG3, r8 + + /* Load up DAR and DSISR */ + ld r5, VCPU_DAR(r4) + lwz r6, VCPU_DSISR(r4) + mtspr SPRN_DAR, r5 + mtspr SPRN_DSISR, r6 + + /* Restore AMR and UAMOR, set AMOR to all 1s */ + ld r5,VCPU_AMR(r4) + ld r6,VCPU_UAMOR(r4) + li r7,-1 + mtspr SPRN_AMR,r5 + mtspr SPRN_UAMOR,r6 + mtspr SPRN_AMOR,r7 + + /* Restore state of CTRL run bit; assume 1 on entry */ + lwz r5,VCPU_CTRL(r4) + andi. 
r5,r5,1 + bne 4f + mfspr r6,SPRN_CTRLF + clrrdi r6,r6,1 + mtspr SPRN_CTRLT,r6 +4: + /* Secondary threads wait for primary to have done partition switch */ + ld r5, HSTATE_KVM_VCORE(r13) + lbz r6, HSTATE_PTID(r13) + cmpwi r6, 0 + beq 21f + lbz r0, VCORE_IN_GUEST(r5) + cmpwi r0, 0 + bne 21f + HMT_LOW +20: lbz r0, VCORE_IN_GUEST(r5) + cmpwi r0, 0 + beq 20b + HMT_MEDIUM +21: + /* Set LPCR. */ + ld r8,VCORE_LPCR(r5) + mtspr SPRN_LPCR,r8 + isync + + /* Check if HDEC expires soon */ + mfspr r3, SPRN_HDEC + cmpwi r3, 512 /* 1 microsecond */ + blt hdec_soon + + ld r6, VCPU_CTR(r4) + lwz r7, VCPU_XER(r4) + + mtctr r6 + mtxer r7 + +kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ + ld r10, VCPU_PC(r4) + ld r11, VCPU_MSR(r4) + ld r6, VCPU_SRR0(r4) + ld r7, VCPU_SRR1(r4) + mtspr SPRN_SRR0, r6 + mtspr SPRN_SRR1, r7 + +deliver_guest_interrupt: + /* r11 = vcpu->arch.msr & ~MSR_HV */ + rldicl r11, r11, 63 - MSR_HV_LG, 1 + rotldi r11, r11, 1 + MSR_HV_LG + ori r11, r11, MSR_ME + + /* Check if we can deliver an external or decrementer interrupt now */ + ld r0, VCPU_PENDING_EXC(r4) + rldicl r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63 + cmpdi cr1, r0, 0 + andi. 
r8, r11, MSR_EE + mfspr r8, SPRN_LPCR + /* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */ + rldimi r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH + mtspr SPRN_LPCR, r8 + isync + beq 5f + li r0, BOOK3S_INTERRUPT_EXTERNAL + bne cr1, 12f + mfspr r0, SPRN_DEC + cmpwi r0, 0 + li r0, BOOK3S_INTERRUPT_DECREMENTER + bge 5f + +12: mtspr SPRN_SRR0, r10 + mr r10,r0 + mtspr SPRN_SRR1, r11 + mr r9, r4 + bl kvmppc_msr_interrupt +5: + +/* + * Required state: + * R4 = vcpu + * R10: value for HSRR0 + * R11: value for HSRR1 + * R13 = PACA + */ +fast_guest_return: + li r0,0 + stb r0,VCPU_CEDED(r4) /* cancel cede */ + mtspr SPRN_HSRR0,r10 + mtspr SPRN_HSRR1,r11 + + /* Activate guest mode, so faults get handled by KVM */ + li r9, KVM_GUEST_MODE_GUEST_HV + stb r9, HSTATE_IN_GUEST(r13) + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + /* Accumulate timing */ + addi r3, r4, VCPU_TB_GUEST + bl kvmhv_accumulate_time +#endif + + /* Enter guest */ + +BEGIN_FTR_SECTION + ld r5, VCPU_CFAR(r4) + mtspr SPRN_CFAR, r5 +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) +BEGIN_FTR_SECTION + ld r0, VCPU_PPR(r4) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + + ld r5, VCPU_LR(r4) + lwz r6, VCPU_CR(r4) + mtlr r5 + mtcr r6 + + ld r1, VCPU_GPR(R1)(r4) + ld r2, VCPU_GPR(R2)(r4) + ld r3, VCPU_GPR(R3)(r4) + ld r5, VCPU_GPR(R5)(r4) + ld r6, VCPU_GPR(R6)(r4) + ld r7, VCPU_GPR(R7)(r4) + ld r8, VCPU_GPR(R8)(r4) + ld r9, VCPU_GPR(R9)(r4) + ld r10, VCPU_GPR(R10)(r4) + ld r11, VCPU_GPR(R11)(r4) + ld r12, VCPU_GPR(R12)(r4) + ld r13, VCPU_GPR(R13)(r4) + +BEGIN_FTR_SECTION + mtspr SPRN_PPR, r0 +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + ld r0, VCPU_GPR(R0)(r4) + ld r4, VCPU_GPR(R4)(r4) + + hrfid + b . 
+ +secondary_too_late: + li r12, 0 + cmpdi r4, 0 + beq 11f + stw r12, VCPU_TRAP(r4) +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r4, VCPU_TB_RMEXIT + bl kvmhv_accumulate_time +#endif +11: b kvmhv_switch_to_host + +hdec_soon: + li r12, BOOK3S_INTERRUPT_HV_DECREMENTER + stw r12, VCPU_TRAP(r4) + mr r9, r4 +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r4, VCPU_TB_RMEXIT + bl kvmhv_accumulate_time +#endif + b guest_exit_cont + +/****************************************************************************** + * * + * Exit code * + * * + *****************************************************************************/ + +/* + * We come here from the first-level interrupt handlers. + */ + .globl kvmppc_interrupt_hv +kvmppc_interrupt_hv: + /* + * Register contents: + * R12 = interrupt vector + * R13 = PACA + * guest CR, R12 saved in shadow VCPU SCRATCH1/0 + * guest R13 saved in SPRN_SCRATCH0 + */ + std r9, HSTATE_SCRATCH2(r13) + + lbz r9, HSTATE_IN_GUEST(r13) + cmpwi r9, KVM_GUEST_MODE_HOST_HV + beq kvmppc_bad_host_intr +#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE + cmpwi r9, KVM_GUEST_MODE_GUEST + ld r9, HSTATE_SCRATCH2(r13) + beq kvmppc_interrupt_pr +#endif + /* We're now back in the host but in guest MMU context */ + li r9, KVM_GUEST_MODE_HOST_HV + stb r9, HSTATE_IN_GUEST(r13) + + ld r9, HSTATE_KVM_VCPU(r13) + + /* Save registers */ + + std r0, VCPU_GPR(R0)(r9) + std r1, VCPU_GPR(R1)(r9) + std r2, VCPU_GPR(R2)(r9) + std r3, VCPU_GPR(R3)(r9) + std r4, VCPU_GPR(R4)(r9) + std r5, VCPU_GPR(R5)(r9) + std r6, VCPU_GPR(R6)(r9) + std r7, VCPU_GPR(R7)(r9) + std r8, VCPU_GPR(R8)(r9) + ld r0, HSTATE_SCRATCH2(r13) + std r0, VCPU_GPR(R9)(r9) + std r10, VCPU_GPR(R10)(r9) + std r11, VCPU_GPR(R11)(r9) + ld r3, HSTATE_SCRATCH0(r13) + lwz r4, HSTATE_SCRATCH1(r13) + std r3, VCPU_GPR(R12)(r9) + stw r4, VCPU_CR(r9) +BEGIN_FTR_SECTION + ld r3, HSTATE_CFAR(r13) + std r3, VCPU_CFAR(r9) +END_FTR_SECTION_IFSET(CPU_FTR_CFAR) +BEGIN_FTR_SECTION + ld r4, HSTATE_PPR(r13) + std r4, VCPU_PPR(r9) 
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + + /* Restore R1/R2 so we can handle faults */ + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATOC(r13) + + mfspr r10, SPRN_SRR0 + mfspr r11, SPRN_SRR1 + std r10, VCPU_SRR0(r9) + std r11, VCPU_SRR1(r9) + andi. r0, r12, 2 /* need to read HSRR0/1? */ + beq 1f + mfspr r10, SPRN_HSRR0 + mfspr r11, SPRN_HSRR1 + clrrdi r12, r12, 2 +1: std r10, VCPU_PC(r9) + std r11, VCPU_MSR(r9) + + GET_SCRATCH0(r3) + mflr r4 + std r3, VCPU_GPR(R13)(r9) + std r4, VCPU_LR(r9) + + stw r12,VCPU_TRAP(r9) + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r9, VCPU_TB_RMINTR + mr r4, r9 + bl kvmhv_accumulate_time + ld r5, VCPU_GPR(R5)(r9) + ld r6, VCPU_GPR(R6)(r9) + ld r7, VCPU_GPR(R7)(r9) + ld r8, VCPU_GPR(R8)(r9) +#endif + + /* Save HEIR (HV emulation assist reg) in emul_inst + if this is an HEI (HV emulation interrupt, e40) */ + li r3,KVM_INST_FETCH_FAILED + stw r3,VCPU_LAST_INST(r9) + cmpwi r12,BOOK3S_INTERRUPT_H_EMUL_ASSIST + bne 11f + mfspr r3,SPRN_HEIR +11: stw r3,VCPU_HEIR(r9) + + /* these are volatile across C function calls */ + mfctr r3 + mfxer r4 + std r3, VCPU_CTR(r9) + stw r4, VCPU_XER(r9) + + /* If this is a page table miss then see if it's theirs or ours */ + cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + beq kvmppc_hdsi + cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE + beq kvmppc_hisi + + /* See if this is a leftover HDEC interrupt */ + cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER + bne 2f + mfspr r3,SPRN_HDEC + cmpwi r3,0 + mr r4,r9 + bge fast_guest_return +2: + /* See if this is an hcall we can handle in real mode */ + cmpwi r12,BOOK3S_INTERRUPT_SYSCALL + beq hcall_try_real_mode + + /* + * Hypervisor doorbell - exit only if host IPI flag set. + * The lbz below does not update CR0, so the flag has to be + * compared explicitly; otherwise the beq would re-test the + * (always equal) result of the H_DOORBELL compare above and + * the exit to the host would never be taken. + */ + cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL + bne 3f + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 /* host IPI flag clear? */ + beq 4f /* yes: stay in guest unless a thread is exiting */ + b guest_exit_cont /* no: host wants us out */ +3: + /* External interrupt ? */ + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL + bne+ guest_exit_cont + + /* External interrupt, first check for host_ipi.
If this is + * set, we know the host wants us out so let's do it now + */ + bl kvmppc_read_intr + cmpdi r3, 0 + bgt guest_exit_cont + + /* Check if any CPU is heading out to the host, if so head out too */ +4: ld r5, HSTATE_KVM_VCORE(r13) + lwz r0, VCORE_ENTRY_EXIT(r5) + cmpwi r0, 0x100 + mr r4, r9 + blt deliver_guest_interrupt + +guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ + /* Save more register state */ + mfdar r6 + mfdsisr r7 + std r6, VCPU_DAR(r9) + stw r7, VCPU_DSISR(r9) + /* don't overwrite fault_dar/fault_dsisr if HDSI */ + cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE + beq mc_cont + std r6, VCPU_FAULT_DAR(r9) + stw r7, VCPU_FAULT_DSISR(r9) + + /* See if it is a machine check */ + cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK + beq machine_check_realmode +mc_cont: +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r9, VCPU_TB_RMEXIT + mr r4, r9 + bl kvmhv_accumulate_time +#endif + + /* Increment exit count, poke other threads to exit */ + bl kvmhv_commence_exit + nop + ld r9, HSTATE_KVM_VCPU(r13) + lwz r12, VCPU_TRAP(r9) + + /* Save guest CTRL register, set runlatch to 1 */ + mfspr r6,SPRN_CTRLF + stw r6,VCPU_CTRL(r9) + andi. r0,r6,1 + bne 4f + ori r6,r6,1 + mtspr SPRN_CTRLT,r6 +4: + /* Read the guest SLB and save it away */ + lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */ + mtctr r0 + li r6,0 + addi r7,r9,VCPU_SLB + li r5,0 +1: slbmfee r8,r6 + andis. r0,r8,SLB_ESID_V@h + beq 2f + add r8,r8,r6 /* put index in */ + slbmfev r3,r6 + std r8,VCPU_SLB_E(r7) + std r3,VCPU_SLB_V(r7) + addi r7,r7,VCPU_SLB_SIZE + addi r5,r5,1 +2: addi r6,r6,1 + bdnz 1b + stw r5,VCPU_SLB_MAX(r9) + + /* + * Save the guest PURR/SPURR + */ + mfspr r5,SPRN_PURR + mfspr r6,SPRN_SPURR + ld r7,VCPU_PURR(r9) + ld r8,VCPU_SPURR(r9) + std r5,VCPU_PURR(r9) + std r6,VCPU_SPURR(r9) + subf r5,r7,r5 + subf r6,r8,r6 + + /* + * Restore host PURR/SPURR and add guest times + * so that the time in the guest gets accounted. 
+ */ + ld r3,HSTATE_PURR(r13) + ld r4,HSTATE_SPURR(r13) + add r3,r3,r5 + add r4,r4,r6 + mtspr SPRN_PURR,r3 + mtspr SPRN_SPURR,r4 + + /* Save DEC */ + mfspr r5,SPRN_DEC + mftb r6 + extsw r5,r5 + add r5,r5,r6 + /* r5 is a guest timebase value here, convert to host TB */ + ld r3,HSTATE_KVM_VCORE(r13) + ld r4,VCORE_TB_OFFSET(r3) + subf r5,r4,r5 + std r5,VCPU_DEC_EXPIRES(r9) + +BEGIN_FTR_SECTION + b 8f +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) + /* Save POWER8-specific registers */ + mfspr r5, SPRN_IAMR + mfspr r6, SPRN_PSPB + mfspr r7, SPRN_FSCR + std r5, VCPU_IAMR(r9) + stw r6, VCPU_PSPB(r9) + std r7, VCPU_FSCR(r9) + mfspr r5, SPRN_IC + mfspr r6, SPRN_VTB + mfspr r7, SPRN_TAR + std r5, VCPU_IC(r9) + std r6, VCPU_VTB(r9) + std r7, VCPU_TAR(r9) + mfspr r8, SPRN_EBBHR + std r8, VCPU_EBBHR(r9) + mfspr r5, SPRN_EBBRR + mfspr r6, SPRN_BESCR + mfspr r7, SPRN_CSIGR + mfspr r8, SPRN_TACR + std r5, VCPU_EBBRR(r9) + std r6, VCPU_BESCR(r9) + std r7, VCPU_CSIGR(r9) + std r8, VCPU_TACR(r9) + mfspr r5, SPRN_TCSCR + mfspr r6, SPRN_ACOP + mfspr r7, SPRN_PID + mfspr r8, SPRN_WORT + std r5, VCPU_TCSCR(r9) + std r6, VCPU_ACOP(r9) + stw r7, VCPU_GUEST_PID(r9) + std r8, VCPU_WORT(r9) +8: + + /* Save and reset AMR and UAMOR before turning on the MMU */ + mfspr r5,SPRN_AMR + mfspr r6,SPRN_UAMOR + std r5,VCPU_AMR(r9) + std r6,VCPU_UAMOR(r9) + li r6,0 + mtspr SPRN_AMR,r6 + + /* Switch DSCR back to host value */ + mfspr r8, SPRN_DSCR + ld r7, HSTATE_DSCR(r13) + std r8, VCPU_DSCR(r9) + mtspr SPRN_DSCR, r7 + + /* Save non-volatile GPRs */ + std r14, VCPU_GPR(R14)(r9) + std r15, VCPU_GPR(R15)(r9) + std r16, VCPU_GPR(R16)(r9) + std r17, VCPU_GPR(R17)(r9) + std r18, VCPU_GPR(R18)(r9) + std r19, VCPU_GPR(R19)(r9) + std r20, VCPU_GPR(R20)(r9) + std r21, VCPU_GPR(R21)(r9) + std r22, VCPU_GPR(R22)(r9) + std r23, VCPU_GPR(R23)(r9) + std r24, VCPU_GPR(R24)(r9) + std r25, VCPU_GPR(R25)(r9) + std r26, VCPU_GPR(R26)(r9) + std r27, VCPU_GPR(R27)(r9) + std r28, VCPU_GPR(R28)(r9) + std r29, VCPU_GPR(R29)(r9) + 
std r30, VCPU_GPR(R30)(r9) + std r31, VCPU_GPR(R31)(r9) + + /* Save SPRGs */ + mfspr r3, SPRN_SPRG0 + mfspr r4, SPRN_SPRG1 + mfspr r5, SPRN_SPRG2 + mfspr r6, SPRN_SPRG3 + std r3, VCPU_SPRG0(r9) + std r4, VCPU_SPRG1(r9) + std r5, VCPU_SPRG2(r9) + std r6, VCPU_SPRG3(r9) + + /* save FP state */ + mr r3, r9 + bl kvmppc_save_fp + +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM +BEGIN_FTR_SECTION + b 2f +END_FTR_SECTION_IFCLR(CPU_FTR_TM) + /* Turn on TM. */ + mfmsr r8 + li r0, 1 + rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG + mtmsrd r8 + + ld r5, VCPU_MSR(r9) + rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 + beq 1f /* TM not active in guest. */ + + li r3, TM_CAUSE_KVM_RESCHED + + /* Clear the MSR RI since r1, r13 are all going to be foobar. */ + li r5, 0 + mtmsrd r5, 1 + + /* All GPRs are volatile at this point. */ + TRECLAIM(R3) + + /* Temporarily store r13 and r9 so we have some regs to play with */ + SET_SCRATCH0(r13) + GET_PACA(r13) + std r9, PACATMSCRATCH(r13) + ld r9, HSTATE_KVM_VCPU(r13) + + /* Get a few more GPRs free. */ + std r29, VCPU_GPRS_TM(29)(r9) + std r30, VCPU_GPRS_TM(30)(r9) + std r31, VCPU_GPRS_TM(31)(r9) + + /* Save away PPR and DSCR soon so don't run with user values. */ + mfspr r31, SPRN_PPR + HMT_MEDIUM + mfspr r30, SPRN_DSCR + ld r29, HSTATE_DSCR(r13) + mtspr SPRN_DSCR, r29 + + /* Save all but r9, r13 & r29-r31 */ + reg = 0 + .rept 29 + .if (reg != 9) && (reg != 13) + std reg, VCPU_GPRS_TM(reg)(r9) + .endif + reg = reg + 1 + .endr + /* ... now save r13 */ + GET_SCRATCH0(r4) + std r4, VCPU_GPRS_TM(13)(r9) + /* ... and save r9 */ + ld r4, PACATMSCRATCH(r13) + std r4, VCPU_GPRS_TM(9)(r9) + + /* Reload stack pointer and TOC. */ + ld r1, HSTATE_HOST_R1(r13) + ld r2, PACATOC(r13) + + /* Set MSR RI now we have r1 and r13 back. */ + li r5, MSR_RI + mtmsrd r5, 1 + + /* Save away checkpointed SPRs.
*/ + std r31, VCPU_PPR_TM(r9) + std r30, VCPU_DSCR_TM(r9) + mflr r5 + mfcr r6 + mfctr r7 + mfspr r8, SPRN_AMR + mfspr r10, SPRN_TAR + std r5, VCPU_LR_TM(r9) + stw r6, VCPU_CR_TM(r9) + std r7, VCPU_CTR_TM(r9) + std r8, VCPU_AMR_TM(r9) + std r10, VCPU_TAR_TM(r9) + + /* Restore r12 as trap number. */ + lwz r12, VCPU_TRAP(r9) + + /* Save FP/VSX. */ + addi r3, r9, VCPU_FPRS_TM + bl store_fp_state + addi r3, r9, VCPU_VRS_TM + bl store_vr_state + mfspr r6, SPRN_VRSAVE + stw r6, VCPU_VRSAVE_TM(r9) +1: + /* + * We need to save these SPRs after the treclaim so that the software + * error code is recorded correctly in the TEXASR. Also the user may + * change these outside of a transaction, so they must always be + * context switched. + */ + mfspr r5, SPRN_TFHAR + mfspr r6, SPRN_TFIAR + mfspr r7, SPRN_TEXASR + std r5, VCPU_TFHAR(r9) + std r6, VCPU_TFIAR(r9) + std r7, VCPU_TEXASR(r9) +2: +#endif + + /* Increment yield count if they have a VPA */ + ld r8, VCPU_VPA(r9) /* do they have a VPA? */ + cmpdi r8, 0 + beq 25f + li r4, LPPACA_YIELDCOUNT + LWZX_BE r3, r8, r4 + addi r3, r3, 1 + STWX_BE r3, r8, r4 + li r3, 1 + stb r3, VCPU_VPA_DIRTY(r9) +25: + /* Save PMU registers if requested */ + /* r8 and cr0.eq are live here */ +BEGIN_FTR_SECTION + /* + * POWER8 seems to have a hardware bug where setting + * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE] + * when some counters are already negative doesn't seem + * to cause a performance monitor alert (and hence interrupt). + * The effect of this is that when saving the PMU state, + * if there is no PMU alert pending when we read MMCR0 + * before freezing the counters, but one becomes pending + * before we read the counters, we lose it. + * To work around this, we need a way to freeze the counters + * before reading MMCR0. Normally, freezing the counters + * is done by writing MMCR0 (to set MMCR0[FC]) which + * unavoidably writes MMCR0[PMAO] as well.
On POWER8, + * we can also freeze the counters using MMCR2, by writing + * 1s to all the counter freeze condition bits (there are + * 9 bits each for 6 counters). + */ + li r3, -1 /* set all freeze bits */ + clrrdi r3, r3, 10 + mfspr r10, SPRN_MMCR2 + mtspr SPRN_MMCR2, r3 + isync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + li r3, 1 + sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */ + mfspr r4, SPRN_MMCR0 /* save MMCR0 */ + mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */ + mfspr r6, SPRN_MMCRA + /* Clear MMCRA in order to disable SDAR updates */ + li r7, 0 + mtspr SPRN_MMCRA, r7 + isync + beq 21f /* if no VPA, save PMU stuff anyway */ + lbz r7, LPPACA_PMCINUSE(r8) + cmpwi r7, 0 /* did they ask for PMU stuff to be saved? */ + bne 21f + std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */ + b 22f +21: mfspr r5, SPRN_MMCR1 + mfspr r7, SPRN_SIAR + mfspr r8, SPRN_SDAR + std r4, VCPU_MMCR(r9) + std r5, VCPU_MMCR + 8(r9) + std r6, VCPU_MMCR + 16(r9) +BEGIN_FTR_SECTION + std r10, VCPU_MMCR + 24(r9) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + std r7, VCPU_SIAR(r9) + std r8, VCPU_SDAR(r9) + mfspr r3, SPRN_PMC1 + mfspr r4, SPRN_PMC2 + mfspr r5, SPRN_PMC3 + mfspr r6, SPRN_PMC4 + mfspr r7, SPRN_PMC5 + mfspr r8, SPRN_PMC6 + stw r3, VCPU_PMC(r9) + stw r4, VCPU_PMC + 4(r9) + stw r5, VCPU_PMC + 8(r9) + stw r6, VCPU_PMC + 12(r9) + stw r7, VCPU_PMC + 16(r9) + stw r8, VCPU_PMC + 20(r9) +BEGIN_FTR_SECTION + mfspr r5, SPRN_SIER + mfspr r6, SPRN_SPMC1 + mfspr r7, SPRN_SPMC2 + mfspr r8, SPRN_MMCRS + std r5, VCPU_SIER(r9) + stw r6, VCPU_PMC + 24(r9) + stw r7, VCPU_PMC + 28(r9) + std r8, VCPU_MMCR + 32(r9) + lis r4, 0x8000 + mtspr SPRN_MMCRS, r4 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +22: + /* Clear out SLB */ + li r5,0 + slbmte r5,r5 + slbia + ptesync + + /* + * POWER7/POWER8 guest -> host partition switch code. + * We don't have to lock against tlbies but we do + * have to coordinate the hardware threads. 
+ */ +kvmhv_switch_to_host: + /* Secondary threads wait for primary to do partition switch */ + ld r5,HSTATE_KVM_VCORE(r13) + ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ + lbz r3,HSTATE_PTID(r13) + cmpwi r3,0 + beq 15f + HMT_LOW +13: lbz r3,VCORE_IN_GUEST(r5) + cmpwi r3,0 + bne 13b + HMT_MEDIUM + b 16f + + /* Primary thread waits for all the secondaries to exit guest */ +15: lwz r3,VCORE_ENTRY_EXIT(r5) + srwi r0,r3,8 + clrldi r3,r3,56 + cmpw r3,r0 + bne 15b + isync + + /* Primary thread switches back to host partition */ + ld r6,KVM_HOST_SDR1(r4) + lwz r7,KVM_HOST_LPID(r4) + li r8,LPID_RSVD /* switch to reserved LPID */ + mtspr SPRN_LPID,r8 + ptesync + mtspr SPRN_SDR1,r6 /* switch to partition page table */ + mtspr SPRN_LPID,r7 + isync + +BEGIN_FTR_SECTION + /* DPDES is shared between threads */ + mfspr r7, SPRN_DPDES + std r7, VCORE_DPDES(r5) + /* clear DPDES so we don't get guest doorbells in the host */ + li r8, 0 + mtspr SPRN_DPDES, r8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + + /* Subtract timebase offset from timebase */ + ld r8,VCORE_TB_OFFSET(r5) + cmpdi r8,0 + beq 17f + mftb r6 /* current guest timebase */ + subf r8,r8,r6 + mtspr SPRN_TBU40,r8 /* update upper 40 bits */ + mftb r7 /* check if lower 24 bits overflowed */ + clrldi r6,r6,40 + clrldi r7,r7,40 + cmpld r7,r6 + bge 17f + addis r8,r8,0x100 /* if so, increment upper 40 bits */ + mtspr SPRN_TBU40,r8 + + /* Reset PCR */ +17: ld r0, VCORE_PCR(r5) + cmpdi r0, 0 + beq 18f + li r0, 0 + mtspr SPRN_PCR, r0 +18: + /* Signal secondary CPUs to continue */ + stb r0,VCORE_IN_GUEST(r5) + lis r8,0x7fff /* MAX_INT@h */ + mtspr SPRN_HDEC,r8 + +16: ld r8,KVM_HOST_LPCR(r4) + mtspr SPRN_LPCR,r8 + isync + + /* load host SLB entries */ + ld r8,PACA_SLBSHADOWPTR(r13) + + .rept SLB_NUM_BOLTED + li r3, SLBSHADOW_SAVEAREA + LDX_BE r5, r8, r3 + addi r3, r3, 8 + LDX_BE r6, r8, r3 + andis. 
r7,r5,SLB_ESID_V@h + beq 1f + slbmte r6,r5 +1: addi r8,r8,16 + .endr + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + /* Finish timing, if we have a vcpu */ + ld r4, HSTATE_KVM_VCPU(r13) + cmpdi r4, 0 + li r3, 0 + beq 2f + bl kvmhv_accumulate_time +2: +#endif + /* Unset guest mode */ + li r0, KVM_GUEST_MODE_NONE + stb r0, HSTATE_IN_GUEST(r13) + + ld r0, 112+PPC_LR_STKOFF(r1) + addi r1, r1, 112 + mtlr r0 + blr + +/* + * Check whether an HDSI is an HPTE not found fault or something else. + * If it is an HPTE not found fault that is due to the guest accessing + * a page that they have mapped but which we have paged out, then + * we continue on with the guest exit path. In all other cases, + * reflect the HDSI to the guest as a DSI. + */ +kvmppc_hdsi: + mfspr r4, SPRN_HDAR + mfspr r6, SPRN_HDSISR + /* HPTE not found fault or protection fault? */ + andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h + beq 1f /* if not, send it to the guest */ + andi. r0, r11, MSR_DR /* data relocation enabled? */ + beq 3f + clrrdi r0, r4, 28 + PPC_SLBFEE_DOT(R5, R0) /* if so, look up SLB */ + bne 1f /* if no SLB entry found */ +4: std r4, VCPU_FAULT_DAR(r9) + stw r6, VCPU_FAULT_DSISR(r9) + + /* Search the hash table. 
*/ + mr r3, r9 /* vcpu pointer */ + li r7, 1 /* data fault */ + bl kvmppc_hpte_hv_fault + ld r9, HSTATE_KVM_VCPU(r13) + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + li r12, BOOK3S_INTERRUPT_H_DATA_STORAGE + cmpdi r3, 0 /* retry the instruction */ + beq 6f + cmpdi r3, -1 /* handle in kernel mode */ + beq guest_exit_cont + cmpdi r3, -2 /* MMIO emulation; need instr word */ + beq 2f + + /* Synthesize a DSI for the guest */ + ld r4, VCPU_FAULT_DAR(r9) + mr r6, r3 +1: mtspr SPRN_DAR, r4 + mtspr SPRN_DSISR, r6 + mtspr SPRN_SRR0, r10 + mtspr SPRN_SRR1, r11 + li r10, BOOK3S_INTERRUPT_DATA_STORAGE + bl kvmppc_msr_interrupt +fast_interrupt_c_return: +6: ld r7, VCPU_CTR(r9) + lwz r8, VCPU_XER(r9) + mtctr r7 + mtxer r8 + mr r4, r9 + b fast_guest_return + +3: ld r5, VCPU_KVM(r9) /* not relocated, use VRMA */ + ld r5, KVM_VRMA_SLB_V(r5) + b 4b + + /* If this is for emulated MMIO, load the instruction word */ +2: li r8, KVM_INST_FETCH_FAILED /* In case lwz faults */ + + /* Set guest mode to 'jump over instruction' so if lwz faults + * we'll just continue at the next IP. */ + li r0, KVM_GUEST_MODE_SKIP + stb r0, HSTATE_IN_GUEST(r13) + + /* Do the access with MSR:DR enabled */ + mfmsr r3 + ori r4, r3, MSR_DR /* Enable paging for data */ + mtmsrd r4 + lwz r8, 0(r10) + mtmsrd r3 + + /* Store the result */ + stw r8, VCPU_LAST_INST(r9) + + /* Unset guest mode. */ + li r0, KVM_GUEST_MODE_HOST_HV + stb r0, HSTATE_IN_GUEST(r13) + b guest_exit_cont + +/* + * Similarly for an HISI, reflect it to the guest as an ISI unless + * it is an HPTE not found fault for a page that we have paged out. + */ +kvmppc_hisi: + andis. r0, r11, SRR1_ISI_NOPT@h + beq 1f + andi. r0, r11, MSR_IR /* instruction relocation enabled? */ + beq 3f + clrrdi r0, r10, 28 + PPC_SLBFEE_DOT(R5, R0) /* if so, look up SLB */ + bne 1f /* if no SLB entry found */ +4: + /* Search the hash table. 
*/ + mr r3, r9 /* vcpu pointer */ + mr r4, r10 + mr r6, r11 + li r7, 0 /* instruction fault */ + bl kvmppc_hpte_hv_fault + ld r9, HSTATE_KVM_VCPU(r13) + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + li r12, BOOK3S_INTERRUPT_H_INST_STORAGE + cmpdi r3, 0 /* retry the instruction */ + beq fast_interrupt_c_return + cmpdi r3, -1 /* handle in kernel mode */ + beq guest_exit_cont + + /* Synthesize an ISI for the guest */ + mr r11, r3 +1: mtspr SPRN_SRR0, r10 + mtspr SPRN_SRR1, r11 + li r10, BOOK3S_INTERRUPT_INST_STORAGE + bl kvmppc_msr_interrupt + b fast_interrupt_c_return + +3: ld r6, VCPU_KVM(r9) /* not relocated, use VRMA */ + ld r5, KVM_VRMA_SLB_V(r6) + b 4b + +/* + * Try to handle an hcall in real mode. + * Returns to the guest if we handle it, or continues on up to + * the kernel if we can't (i.e. if we don't have a handler for + * it, or if the handler returns H_TOO_HARD). + * + * r5 - r8 contain hcall args, + * r9 = vcpu, r10 = pc, r11 = msr, r12 = trap, r13 = paca + */ +hcall_try_real_mode: + ld r3,VCPU_GPR(R3)(r9) + andi. r0,r11,MSR_PR + /* sc 1 from userspace - reflect to guest syscall */ + bne sc_1_fast_return + clrrdi r3,r3,2 + cmpldi r3,hcall_real_table_end - hcall_real_table + bge guest_exit_cont + /* See if this hcall is enabled for in-kernel handling */ + ld r4, VCPU_KVM(r9) + srdi r0, r3, 8 /* r0 = (r3 / 4) >> 6 */ + sldi r0, r0, 3 /* index into kvm->arch.enabled_hcalls[] */ + add r4, r4, r0 + ld r0, KVM_ENABLED_HCALLS(r4) + rlwinm r4, r3, 32-2, 0x3f /* r4 = (r3 / 4) & 0x3f */ + srd r0, r0, r4 + andi. 
r0, r0, 1 + beq guest_exit_cont + /* Get pointer to handler, if any, and call it */ + LOAD_REG_ADDR(r4, hcall_real_table) + lwax r3,r3,r4 + cmpwi r3,0 + beq guest_exit_cont + add r12,r3,r4 + mtctr r12 + mr r3,r9 /* get vcpu pointer */ + ld r4,VCPU_GPR(R4)(r9) + bctrl + cmpdi r3,H_TOO_HARD + beq hcall_real_fallback + ld r4,HSTATE_KVM_VCPU(r13) + std r3,VCPU_GPR(R3)(r4) + ld r10,VCPU_PC(r4) + ld r11,VCPU_MSR(r4) + b fast_guest_return + +sc_1_fast_return: + mtspr SPRN_SRR0,r10 + mtspr SPRN_SRR1,r11 + li r10, BOOK3S_INTERRUPT_SYSCALL + bl kvmppc_msr_interrupt + mr r4,r9 + b fast_guest_return + + /* We've attempted a real mode hcall, but it's punted it back + * to userspace. We need to restore some clobbered volatiles + * before resuming the pass-it-to-qemu path */ +hcall_real_fallback: + li r12,BOOK3S_INTERRUPT_SYSCALL + ld r9, HSTATE_KVM_VCPU(r13) + + b guest_exit_cont + + .globl hcall_real_table +hcall_real_table: + .long 0 /* 0 - unused */ + .long DOTSYM(kvmppc_h_remove) - hcall_real_table + .long DOTSYM(kvmppc_h_enter) - hcall_real_table + .long DOTSYM(kvmppc_h_read) - hcall_real_table + .long 0 /* 0x10 - H_CLEAR_MOD */ + .long 0 /* 0x14 - H_CLEAR_REF */ + .long DOTSYM(kvmppc_h_protect) - hcall_real_table + .long DOTSYM(kvmppc_h_get_tce) - hcall_real_table + .long DOTSYM(kvmppc_h_put_tce) - hcall_real_table + .long 0 /* 0x24 - H_SET_SPRG0 */ + .long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table + .long 0 /* 0x2c */ + .long 0 /* 0x30 */ + .long 0 /* 0x34 */ + .long 0 /* 0x38 */ + .long 0 /* 0x3c */ + .long 0 /* 0x40 */ + .long 0 /* 0x44 */ + .long 0 /* 0x48 */ + .long 0 /* 0x4c */ + .long 0 /* 0x50 */ + .long 0 /* 0x54 */ + .long 0 /* 0x58 */ + .long 0 /* 0x5c */ + .long 0 /* 0x60 */ +#ifdef CONFIG_KVM_XICS + .long DOTSYM(kvmppc_rm_h_eoi) - hcall_real_table + .long DOTSYM(kvmppc_rm_h_cppr) - hcall_real_table + .long DOTSYM(kvmppc_rm_h_ipi) - hcall_real_table + .long 0 /* 0x70 - H_IPOLL */ + .long DOTSYM(kvmppc_rm_h_xirr) - hcall_real_table +#else + .long 0 /* 0x64 - 
H_EOI */ + .long 0 /* 0x68 - H_CPPR */ + .long 0 /* 0x6c - H_IPI */ + .long 0 /* 0x70 - H_IPOLL */ + .long 0 /* 0x74 - H_XIRR */ +#endif + .long 0 /* 0x78 */ + .long 0 /* 0x7c */ + .long 0 /* 0x80 */ + .long 0 /* 0x84 */ + .long 0 /* 0x88 */ + .long 0 /* 0x8c */ + .long 0 /* 0x90 */ + .long 0 /* 0x94 */ + .long 0 /* 0x98 */ + .long 0 /* 0x9c */ + .long 0 /* 0xa0 */ + .long 0 /* 0xa4 */ + .long 0 /* 0xa8 */ + .long 0 /* 0xac */ + .long 0 /* 0xb0 */ + .long 0 /* 0xb4 */ + .long 0 /* 0xb8 */ + .long 0 /* 0xbc */ + .long 0 /* 0xc0 */ + .long 0 /* 0xc4 */ + .long 0 /* 0xc8 */ + .long 0 /* 0xcc */ + .long 0 /* 0xd0 */ + .long 0 /* 0xd4 */ + .long 0 /* 0xd8 */ + .long 0 /* 0xdc */ + .long DOTSYM(kvmppc_h_cede) - hcall_real_table + .long DOTSYM(kvmppc_rm_h_confer) - hcall_real_table + .long 0 /* 0xe8 */ + .long 0 /* 0xec */ + .long 0 /* 0xf0 */ + .long 0 /* 0xf4 */ + .long 0 /* 0xf8 */ + .long 0 /* 0xfc */ + .long 0 /* 0x100 */ + .long 0 /* 0x104 */ + .long 0 /* 0x108 */ + .long 0 /* 0x10c */ + .long 0 /* 0x110 */ + .long 0 /* 0x114 */ + .long 0 /* 0x118 */ + .long 0 /* 0x11c */ + .long 0 /* 0x120 */ + .long DOTSYM(kvmppc_h_bulk_remove) - hcall_real_table + .long 0 /* 0x128 */ + .long 0 /* 0x12c */ + .long 0 /* 0x130 */ + .long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table + .long 0 /* 0x138 */ + .long 0 /* 0x13c */ + .long 0 /* 0x140 */ + .long 0 /* 0x144 */ + .long 0 /* 0x148 */ + .long 0 /* 0x14c */ + .long 0 /* 0x150 */ + .long 0 /* 0x154 */ + .long 0 /* 0x158 */ + .long 0 /* 0x15c */ + .long 0 /* 0x160 */ + .long 0 /* 0x164 */ + .long 0 /* 0x168 */ + .long 0 /* 0x16c */ + .long 0 /* 0x170 */ + .long 0 /* 0x174 */ + .long 0 /* 0x178 */ + .long 0 /* 0x17c */ + .long 0 /* 0x180 */ + .long 0 /* 0x184 */ + .long 0 /* 0x188 */ + .long 0 /* 0x18c */ + .long 0 /* 0x190 */ + .long 0 /* 0x194 */ + .long 0 /* 0x198 */ + .long 0 /* 0x19c */ + .long 0 /* 0x1a0 */ + .long 0 /* 0x1a4 */ + .long 0 /* 0x1a8 */ + .long 0 /* 0x1ac */ + .long 0 /* 0x1b0 */ + .long 0 /* 0x1b4 */ + .long 0 
/* 0x1b8 */ + .long 0 /* 0x1bc */ + .long 0 /* 0x1c0 */ + .long 0 /* 0x1c4 */ + .long 0 /* 0x1c8 */ + .long 0 /* 0x1cc */ + .long 0 /* 0x1d0 */ + .long 0 /* 0x1d4 */ + .long 0 /* 0x1d8 */ + .long 0 /* 0x1dc */ + .long 0 /* 0x1e0 */ + .long 0 /* 0x1e4 */ + .long 0 /* 0x1e8 */ + .long 0 /* 0x1ec */ + .long 0 /* 0x1f0 */ + .long 0 /* 0x1f4 */ + .long 0 /* 0x1f8 */ + .long 0 /* 0x1fc */ + .long 0 /* 0x200 */ + .long 0 /* 0x204 */ + .long 0 /* 0x208 */ + .long 0 /* 0x20c */ + .long 0 /* 0x210 */ + .long 0 /* 0x214 */ + .long 0 /* 0x218 */ + .long 0 /* 0x21c */ + .long 0 /* 0x220 */ + .long 0 /* 0x224 */ + .long 0 /* 0x228 */ + .long 0 /* 0x22c */ + .long 0 /* 0x230 */ + .long 0 /* 0x234 */ + .long 0 /* 0x238 */ + .long 0 /* 0x23c */ + .long 0 /* 0x240 */ + .long 0 /* 0x244 */ + .long 0 /* 0x248 */ + .long 0 /* 0x24c */ + .long 0 /* 0x250 */ + .long 0 /* 0x254 */ + .long 0 /* 0x258 */ + .long 0 /* 0x25c */ + .long 0 /* 0x260 */ + .long 0 /* 0x264 */ + .long 0 /* 0x268 */ + .long 0 /* 0x26c */ + .long 0 /* 0x270 */ + .long 0 /* 0x274 */ + .long 0 /* 0x278 */ + .long 0 /* 0x27c */ + .long 0 /* 0x280 */ + .long 0 /* 0x284 */ + .long 0 /* 0x288 */ + .long 0 /* 0x28c */ + .long 0 /* 0x290 */ + .long 0 /* 0x294 */ + .long 0 /* 0x298 */ + .long 0 /* 0x29c */ + .long 0 /* 0x2a0 */ + .long 0 /* 0x2a4 */ + .long 0 /* 0x2a8 */ + .long 0 /* 0x2ac */ + .long 0 /* 0x2b0 */ + .long 0 /* 0x2b4 */ + .long 0 /* 0x2b8 */ + .long 0 /* 0x2bc */ + .long 0 /* 0x2c0 */ + .long 0 /* 0x2c4 */ + .long 0 /* 0x2c8 */ + .long 0 /* 0x2cc */ + .long 0 /* 0x2d0 */ + .long 0 /* 0x2d4 */ + .long 0 /* 0x2d8 */ + .long 0 /* 0x2dc */ + .long 0 /* 0x2e0 */ + .long 0 /* 0x2e4 */ + .long 0 /* 0x2e8 */ + .long 0 /* 0x2ec */ + .long 0 /* 0x2f0 */ + .long 0 /* 0x2f4 */ + .long 0 /* 0x2f8 */ + .long 0 /* 0x2fc */ + .long DOTSYM(kvmppc_h_random) - hcall_real_table + .globl hcall_real_table_end +hcall_real_table_end: + +_GLOBAL(kvmppc_h_set_xdabr) + andi. 
r0, r5, DABRX_USER | DABRX_KERNEL + beq 6f + li r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI + andc. r0, r5, r0 + beq 3f +6: li r3, H_PARAMETER + blr + +_GLOBAL(kvmppc_h_set_dabr) + li r5, DABRX_USER | DABRX_KERNEL +3: +BEGIN_FTR_SECTION + b 2f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + std r4,VCPU_DABR(r3) + stw r5, VCPU_DABRX(r3) + mtspr SPRN_DABRX, r5 + /* Work around P7 bug where DABR can get corrupted on mtspr */ +1: mtspr SPRN_DABR,r4 + mfspr r5, SPRN_DABR + cmpd r4, r5 + bne 1b + isync + li r3,0 + blr + + /* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */ +2: rlwimi r5, r4, 5, DAWRX_DR | DAWRX_DW + rlwimi r5, r4, 1, DAWRX_WT + clrrdi r4, r4, 3 + std r4, VCPU_DAWR(r3) + std r5, VCPU_DAWRX(r3) + mtspr SPRN_DAWR, r4 + mtspr SPRN_DAWRX, r5 + li r3, 0 + blr + +_GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */ + ori r11,r11,MSR_EE + std r11,VCPU_MSR(r3) + li r0,1 + stb r0,VCPU_CEDED(r3) + sync /* order setting ceded vs. testing prodded */ + lbz r5,VCPU_PRODDED(r3) + cmpwi r5,0 + bne kvm_cede_prodded + li r12,0 /* set trap to 0 to say hcall is handled */ + stw r12,VCPU_TRAP(r3) + li r0,H_SUCCESS + std r0,VCPU_GPR(R3)(r3) + + /* + * Set our bit in the bitmask of napping threads unless all the + * other threads are already napping, in which case we send this + * up to the host. + */ + ld r5,HSTATE_KVM_VCORE(r13) + lbz r6,HSTATE_PTID(r13) + lwz r8,VCORE_ENTRY_EXIT(r5) + clrldi r8,r8,56 + li r0,1 + sld r0,r0,r6 + addi r6,r5,VCORE_NAPPING_THREADS +31: lwarx r4,0,r6 + or r4,r4,r0 + cmpw r4,r8 + beq kvm_cede_exit + stwcx. 
r4,0,r6 + bne 31b + /* order napping_threads update vs testing entry_exit_map */ + isync + li r0,NAPPING_CEDE + stb r0,HSTATE_NAPPING(r13) + lwz r7,VCORE_ENTRY_EXIT(r5) + cmpwi r7,0x100 + bge 33f /* another thread already exiting */ + +/* + * Although not specifically required by the architecture, POWER7 + * preserves the following registers in nap mode, even if an SMT mode + * switch occurs: SLB entries, PURR, SPURR, AMOR, UAMOR, AMR, SPRG0-3, + * DAR, DSISR, DABR, DABRX, DSCR, PMCx, MMCRx, SIAR, SDAR. + */ + /* Save non-volatile GPRs */ + std r14, VCPU_GPR(R14)(r3) + std r15, VCPU_GPR(R15)(r3) + std r16, VCPU_GPR(R16)(r3) + std r17, VCPU_GPR(R17)(r3) + std r18, VCPU_GPR(R18)(r3) + std r19, VCPU_GPR(R19)(r3) + std r20, VCPU_GPR(R20)(r3) + std r21, VCPU_GPR(R21)(r3) + std r22, VCPU_GPR(R22)(r3) + std r23, VCPU_GPR(R23)(r3) + std r24, VCPU_GPR(R24)(r3) + std r25, VCPU_GPR(R25)(r3) + std r26, VCPU_GPR(R26)(r3) + std r27, VCPU_GPR(R27)(r3) + std r28, VCPU_GPR(R28)(r3) + std r29, VCPU_GPR(R29)(r3) + std r30, VCPU_GPR(R30)(r3) + std r31, VCPU_GPR(R31)(r3) + + /* save FP state */ + bl kvmppc_save_fp + + /* + * Set DEC to the smaller of DEC and HDEC, so that we wake + * no later than the end of our timeslice (HDEC interrupts + * don't wake us from nap). + */ + mfspr r3, SPRN_DEC + mfspr r4, SPRN_HDEC + mftb r5 + cmpw r3, r4 + ble 67f + mtspr SPRN_DEC, r4 +67: + /* save expiry time of guest decrementer */ + extsw r3, r3 + add r3, r3, r5 + ld r4, HSTATE_KVM_VCPU(r13) + ld r5, HSTATE_KVM_VCORE(r13) + ld r6, VCORE_TB_OFFSET(r5) + subf r3, r6, r3 /* convert to host TB value */ + std r3, VCPU_DEC_EXPIRES(r4) + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + ld r4, HSTATE_KVM_VCPU(r13) + addi r3, r4, VCPU_TB_CEDE + bl kvmhv_accumulate_time +#endif + + lis r3, LPCR_PECEDP@h /* Do wake on privileged doorbell */ + + /* + * Take a nap until a decrementer or external or doorbell interrupt + * occurs, with PECE1 and PECE0 set in LPCR.
+ * On POWER8, set PECEDH, and if we are ceding, also set PECEDP. + * Also clear the runlatch bit before napping. + */ +kvm_do_nap: + mfspr r0, SPRN_CTRLF + clrrdi r0, r0, 1 + mtspr SPRN_CTRLT, r0 + + li r0,1 + stb r0,HSTATE_HWTHREAD_REQ(r13) + mfspr r5,SPRN_LPCR + ori r5,r5,LPCR_PECE0 | LPCR_PECE1 +BEGIN_FTR_SECTION + ori r5, r5, LPCR_PECEDH + rlwimi r5, r3, 0, LPCR_PECEDP +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + mtspr SPRN_LPCR,r5 + isync + li r0, 0 + std r0, HSTATE_SCRATCH0(r13) + ptesync + ld r0, HSTATE_SCRATCH0(r13) +1: cmpd r0, r0 + bne 1b + nap + b . + +33: mr r4, r3 + li r3, 0 + li r12, 0 + b 34f + +kvm_end_cede: + /* get vcpu pointer */ + ld r4, HSTATE_KVM_VCPU(r13) + + /* Woken by external or decrementer interrupt */ + ld r1, HSTATE_HOST_R1(r13) + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING + addi r3, r4, VCPU_TB_RMINTR + bl kvmhv_accumulate_time +#endif + + /* load up FP state */ + bl kvmppc_load_fp + + /* Restore guest decrementer */ + ld r3, VCPU_DEC_EXPIRES(r4) + ld r5, HSTATE_KVM_VCORE(r13) + ld r6, VCORE_TB_OFFSET(r5) + add r3, r3, r6 /* convert host TB to guest TB value */ + mftb r7 + subf r3, r7, r3 + mtspr SPRN_DEC, r3 + + /* Load NV GPRS */ + ld r14, VCPU_GPR(R14)(r4) + ld r15, VCPU_GPR(R15)(r4) + ld r16, VCPU_GPR(R16)(r4) + ld r17, VCPU_GPR(R17)(r4) + ld r18, VCPU_GPR(R18)(r4) + ld r19, VCPU_GPR(R19)(r4) + ld r20, VCPU_GPR(R20)(r4) + ld r21, VCPU_GPR(R21)(r4) + ld r22, VCPU_GPR(R22)(r4) + ld r23, VCPU_GPR(R23)(r4) + ld r24, VCPU_GPR(R24)(r4) + ld r25, VCPU_GPR(R25)(r4) + ld r26, VCPU_GPR(R26)(r4) + ld r27, VCPU_GPR(R27)(r4) + ld r28, VCPU_GPR(R28)(r4) + ld r29, VCPU_GPR(R29)(r4) + ld r30, VCPU_GPR(R30)(r4) + ld r31, VCPU_GPR(R31)(r4) + + /* Check the wake reason in SRR1 to see why we got here */ + bl kvmppc_check_wake_reason + + /* clear our bit in vcore->napping_threads */ +34: ld r5,HSTATE_KVM_VCORE(r13) + lbz r7,HSTATE_PTID(r13) + li r0,1 + sld r0,r0,r7 + addi r6,r5,VCORE_NAPPING_THREADS +32: lwarx r7,0,r6 + andc r7,r7,r0 + stwcx. 
r7,0,r6 + bne 32b + li r0,0 + stb r0,HSTATE_NAPPING(r13) + + /* See if the wake reason means we need to exit */ + stw r12, VCPU_TRAP(r4) + mr r9, r4 + cmpdi r3, 0 + bgt guest_exit_cont + + /* see if any other thread is already exiting */ + lwz r0,VCORE_ENTRY_EXIT(r5) + cmpwi r0,0x100 + bge guest_exit_cont + + b kvmppc_cede_reentry /* if not go back to guest */ + + /* cede when already previously prodded case */ +kvm_cede_prodded: + li r0,0 + stb r0,VCPU_PRODDED(r3) + sync /* order testing prodded vs. clearing ceded */ + stb r0,VCPU_CEDED(r3) + li r3,H_SUCCESS + blr + + /* we've ceded but we want to give control to the host */ +kvm_cede_exit: + ld r9, HSTATE_KVM_VCPU(r13) + b guest_exit_cont + + /* Try to handle a machine check in real mode */ +machine_check_realmode: + mr r3, r9 /* get vcpu pointer */ + bl kvmppc_realmode_machine_check + nop + cmpdi r3, 0 /* Did we handle MCE? */ + ld r9, HSTATE_KVM_VCPU(r13) + li r12, BOOK3S_INTERRUPT_MACHINE_CHECK + /* + * Deliver unhandled/fatal (e.g. UE) MCE errors to guest through + * machine check interrupt (set HSRR0 to 0x200). And for handled + * errors (non-fatal), just go back to guest execution with current + * HSRR0 instead of exiting guest. This new approach will inject + * machine check to guest for fatal error causing guest to crash. + * + * The old code used to return to host for unhandled errors which + * was causing guest to hang with soft lockups inside guest and + * makes it difficult to recover guest instance. + */ + ld r10, VCPU_PC(r9) + ld r11, VCPU_MSR(r9) + bne 2f /* Continue guest execution. */ + /* If not, deliver a machine check. SRR0/1 are already set */ + li r10, BOOK3S_INTERRUPT_MACHINE_CHECK + ld r11, VCPU_MSR(r9) + bl kvmppc_msr_interrupt +2: b fast_interrupt_c_return + +/* + * Check the reason we woke from nap, and take appropriate action.
+ * Returns (in r3): + * 0 if nothing needs to be done + * 1 if something happened that needs to be handled by the host + * -1 if there was a guest wakeup (IPI or msgsnd) + * + * Also sets r12 to the interrupt vector for any interrupt that needs + * to be handled now by the host (0x500 for external interrupt), or zero. + * Modifies r0, r6, r7, r8. + */ +kvmppc_check_wake_reason: + mfspr r6, SPRN_SRR1 +BEGIN_FTR_SECTION + rlwinm r6, r6, 45-31, 0xf /* extract wake reason field (P8) */ +FTR_SECTION_ELSE + rlwinm r6, r6, 45-31, 0xe /* P7 wake reason field is 3 bits */ +ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S) + cmpwi r6, 8 /* was it an external interrupt? */ + li r12, BOOK3S_INTERRUPT_EXTERNAL + beq kvmppc_read_intr /* if so, see what it was */ + li r3, 0 + li r12, 0 + cmpwi r6, 6 /* was it the decrementer? */ + beq 0f +BEGIN_FTR_SECTION + cmpwi r6, 5 /* privileged doorbell? */ + beq 0f + cmpwi r6, 3 /* hypervisor doorbell? */ + beq 3f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + li r3, 1 /* anything else, return 1 */ +0: blr + + /* hypervisor doorbell */ +3: li r12, BOOK3S_INTERRUPT_H_DOORBELL + /* see if it's a host IPI */ + li r3, 1 + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 + bnelr + /* if not, clear it and return -1 */ + lis r6, (PPC_DBELL_SERVER << (63-36))@h + PPC_MSGCLR(6) + li r3, -1 + blr + +/* + * Determine what sort of external interrupt is pending (if any). + * Returns: + * 0 if no interrupt is pending + * 1 if an interrupt is pending that needs to be handled by the host + * -1 if there was a guest wakeup IPI (which has now been cleared) + * Modifies r0, r6, r7, r8, returns value in r3. + */ +kvmppc_read_intr: + /* see if a host IPI is pending */ + li r3, 1 + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 + bne 1f + + /* Now read the interrupt from the ICP */ + ld r6, HSTATE_XICS_PHYS(r13) + li r7, XICS_XIRR + cmpdi r6, 0 + beq- 1f + lwzcix r0, r6, r7 + /* + * Save XIRR for later. 
Since we get it in reverse endian on LE + * systems, save it byte reversed and fetch it back in host endian. + */ + li r3, HSTATE_SAVED_XIRR + STWX_BE r0, r3, r13 +#ifdef __LITTLE_ENDIAN__ + lwz r3, HSTATE_SAVED_XIRR(r13) +#else + mr r3, r0 +#endif + rlwinm. r3, r3, 0, 0xffffff + sync + beq 1f /* if nothing pending in the ICP */ + + /* We found something in the ICP... + * + * If it's not an IPI, stash it in the PACA and return to + * the host, we don't (yet) handle directing real external + * interrupts directly to the guest + */ + cmpwi r3, XICS_IPI /* if there is, is it an IPI? */ + bne 42f + + /* It's an IPI, clear the MFRR and EOI it */ + li r3, 0xff + li r8, XICS_MFRR + stbcix r3, r6, r8 /* clear the IPI */ + stwcix r0, r6, r7 /* EOI it */ + sync + + /* We need to re-check host IPI now in case it got set in the + * meantime. If it's clear, we bounce the interrupt to the + * guest + */ + lbz r0, HSTATE_HOST_IPI(r13) + cmpwi r0, 0 + bne- 43f + + /* OK, it's an IPI for us */ + li r12, 0 + li r3, -1 +1: blr + +42: /* It's not an IPI and it's for the host. We saved a copy of XIRR in + * the PACA earlier, it will be picked up by the host ICP driver + */ + li r3, 1 + b 1b + +43: /* We raced with the host, we need to resend that IPI, bummer */ + li r0, IPI_PRIORITY + stbcix r0, r6, r8 /* set the IPI */ + sync + li r3, 1 + b 1b + +/* + * Save away FP, VMX and VSX registers. + * r3 = vcpu pointer + * N.B. r30 and r31 are volatile across this function, + * thus it is not callable from C.
+ */ +kvmppc_save_fp: + mflr r30 + mr r31,r3 + mfmsr r5 + ori r8,r5,MSR_FP +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + oris r8,r8,MSR_VEC@h +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + oris r8,r8,MSR_VSX@h +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif + mtmsrd r8 + addi r3,r3,VCPU_FPRS + bl store_fp_state +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + addi r3,r31,VCPU_VRS + bl store_vr_state +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif + mfspr r6,SPRN_VRSAVE + stw r6,VCPU_VRSAVE(r31) + mtlr r30 + blr + +/* + * Load up FP, VMX and VSX registers + * r4 = vcpu pointer + * N.B. r30 and r31 are volatile across this function, + * thus it is not callable from C. + */ +kvmppc_load_fp: + mflr r30 + mr r31,r4 + mfmsr r9 + ori r8,r9,MSR_FP +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + oris r8,r8,MSR_VEC@h +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif +#ifdef CONFIG_VSX +BEGIN_FTR_SECTION + oris r8,r8,MSR_VSX@h +END_FTR_SECTION_IFSET(CPU_FTR_VSX) +#endif + mtmsrd r8 + addi r3,r4,VCPU_FPRS + bl load_fp_state +#ifdef CONFIG_ALTIVEC +BEGIN_FTR_SECTION + addi r3,r31,VCPU_VRS + bl load_vr_state +END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) +#endif + lwz r7,VCPU_VRSAVE(r31) + mtspr SPRN_VRSAVE,r7 + mtlr r30 + mr r4,r31 + blr + +/* + * We come here if we get any exception or interrupt while we are + * executing host real mode code while in guest MMU context. + * For now just spin, but we should do something better. + */ +kvmppc_bad_host_intr: + b . + +/* + * This mimics the MSR transition on IRQ delivery. The new guest MSR is taken + * from VCPU_INTR_MSR and is modified based on the required TM state changes. + * r11 has the guest MSR value (in/out) + * r9 has a vcpu pointer (in) + * r0 is used as a scratch register + */ +kvmppc_msr_interrupt: + rldicl r0, r11, 64 - MSR_TS_S_LG, 62 + cmpwi r0, 2 /* Check if we are in transactional state.. */ + ld r11, VCPU_INTR_MSR(r9) + bne 1f + /* ... 
if transactional, change to suspended */ + li r0, 1 +1: rldimi r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG + blr + +/* + * This works around a hardware bug on POWER8E processors, where + * writing a 1 to the MMCR0[PMAO] bit doesn't generate a + * performance monitor interrupt. Instead, when we need to have + * an interrupt pending, we have to arrange for a counter to overflow. + */ +kvmppc_fix_pmao: + li r3, 0 + mtspr SPRN_MMCR2, r3 + lis r3, (MMCR0_PMXE | MMCR0_FCECE)@h + ori r3, r3, MMCR0_PMCjCE | MMCR0_C56RUN + mtspr SPRN_MMCR0, r3 + lis r3, 0x7fff + ori r3, r3, 0xffff + mtspr SPRN_PMC6, r3 + isync + blr + +#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING +/* + * Start timing an activity + * r3 = pointer to time accumulation struct, r4 = vcpu + */ +kvmhv_start_timing: + ld r5, HSTATE_KVM_VCORE(r13) + lbz r6, VCORE_IN_GUEST(r5) + cmpwi r6, 0 + beq 5f /* if in guest, need to */ + ld r6, VCORE_TB_OFFSET(r5) /* subtract timebase offset */ +5: mftb r5 + subf r5, r6, r5 + std r3, VCPU_CUR_ACTIVITY(r4) + std r5, VCPU_ACTIVITY_START(r4) + blr + +/* + * Accumulate time to one activity and start another. 
+ * r3 = pointer to new time accumulation struct, r4 = vcpu + */ +kvmhv_accumulate_time: + ld r5, HSTATE_KVM_VCORE(r13) + lbz r8, VCORE_IN_GUEST(r5) + cmpwi r8, 0 + beq 4f /* if in guest, need to */ + ld r8, VCORE_TB_OFFSET(r5) /* subtract timebase offset */ +4: ld r5, VCPU_CUR_ACTIVITY(r4) + ld r6, VCPU_ACTIVITY_START(r4) + std r3, VCPU_CUR_ACTIVITY(r4) + mftb r7 + subf r7, r8, r7 + std r7, VCPU_ACTIVITY_START(r4) + cmpdi r5, 0 + beqlr + subf r3, r6, r7 + ld r8, TAS_SEQCOUNT(r5) + cmpdi r8, 0 + addi r8, r8, 1 + std r8, TAS_SEQCOUNT(r5) + lwsync + ld r7, TAS_TOTAL(r5) + add r7, r7, r3 + std r7, TAS_TOTAL(r5) + ld r6, TAS_MIN(r5) + ld r7, TAS_MAX(r5) + beq 3f + cmpd r3, r6 + bge 1f +3: std r3, TAS_MIN(r5) +1: cmpd r3, r7 + ble 2f + std r3, TAS_MAX(r5) +2: lwsync + addi r8, r8, 1 + std r8, TAS_SEQCOUNT(r5) + blr +#endif diff --git a/kernel/arch/powerpc/kvm/book3s_interrupts.S b/kernel/arch/powerpc/kvm/book3s_interrupts.S new file mode 100644 index 000000000..d044b8b7c --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_interrupts.S @@ -0,0 +1,253 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/asm-offsets.h> +#include <asm/exception-64s.h> + +#if defined(CONFIG_PPC_BOOK3S_64) +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define FUNC(name) name +#else +#define FUNC(name) GLUE(.,name) +#endif +#define GET_SHADOW_VCPU(reg) addi reg, r13, PACA_SVCPU + +#elif defined(CONFIG_PPC_BOOK3S_32) +#define FUNC(name) name +#define GET_SHADOW_VCPU(reg) lwz reg, (THREAD + THREAD_KVM_SVCPU)(r2) + +#endif /* CONFIG_PPC_BOOK3S_XX */ + +#define VCPU_LOAD_NVGPRS(vcpu) \ + PPC_LL r14, VCPU_GPR(R14)(vcpu); \ + PPC_LL r15, VCPU_GPR(R15)(vcpu); \ + PPC_LL r16, VCPU_GPR(R16)(vcpu); \ + PPC_LL r17, VCPU_GPR(R17)(vcpu); \ + PPC_LL r18, VCPU_GPR(R18)(vcpu); \ + PPC_LL r19, VCPU_GPR(R19)(vcpu); \ + PPC_LL r20, VCPU_GPR(R20)(vcpu); \ + PPC_LL r21, VCPU_GPR(R21)(vcpu); \ + PPC_LL r22, VCPU_GPR(R22)(vcpu); \ + PPC_LL r23, VCPU_GPR(R23)(vcpu); \ + PPC_LL r24, VCPU_GPR(R24)(vcpu); \ + PPC_LL r25, VCPU_GPR(R25)(vcpu); \ + PPC_LL r26, VCPU_GPR(R26)(vcpu); \ + PPC_LL r27, VCPU_GPR(R27)(vcpu); \ + PPC_LL r28, VCPU_GPR(R28)(vcpu); \ + PPC_LL r29, VCPU_GPR(R29)(vcpu); \ + PPC_LL r30, VCPU_GPR(R30)(vcpu); \ + PPC_LL r31, VCPU_GPR(R31)(vcpu); \ + +/***************************************************************************** + * * + * Guest entry / exit code that is in kernel module memory (highmem) * + * * + ****************************************************************************/ + +/* Registers: + * r3: kvm_run pointer + * r4: vcpu pointer + */ +_GLOBAL(__kvmppc_vcpu_run) + +kvm_start_entry: + /* Write correct stack frame */ + mflr r0 + PPC_STL r0,PPC_LR_STKOFF(r1) + + /* Save host state to the stack */ + PPC_STLU r1, -SWITCH_FRAME_SIZE(r1) + + /* Save r3 (kvm_run) and r4 (vcpu) */ + SAVE_2GPRS(3, r1) + + /* Save non-volatile registers (r14 - r31) */ + SAVE_NVGPRS(r1) + + /* Save 
CR */ + mfcr r14 + stw r14, _CCR(r1) + + /* Save LR */ + PPC_STL r0, _LINK(r1) + + /* Load non-volatile guest state from the vcpu */ + VCPU_LOAD_NVGPRS(r4) + +kvm_start_lightweight: + /* Copy registers into shadow vcpu so we can access them in real mode */ + GET_SHADOW_VCPU(r3) + bl FUNC(kvmppc_copy_to_svcpu) + nop + REST_GPR(4, r1) + +#ifdef CONFIG_PPC_BOOK3S_64 + /* Get the dcbz32 flag */ + PPC_LL r3, VCPU_HFLAGS(r4) + rldicl r3, r3, 0, 63 /* r3 &= 1 */ + stb r3, HSTATE_RESTORE_HID5(r13) + + /* Load up guest SPRG3 value, since it's user readable */ + lwz r3, VCPU_SHAREDBE(r4) + cmpwi r3, 0 + ld r5, VCPU_SHARED(r4) + beq sprg3_little_endian +sprg3_big_endian: +#ifdef __BIG_ENDIAN__ + ld r3, VCPU_SHARED_SPRG3(r5) +#else + addi r5, r5, VCPU_SHARED_SPRG3 + ldbrx r3, 0, r5 +#endif + b after_sprg3_load +sprg3_little_endian: +#ifdef __LITTLE_ENDIAN__ + ld r3, VCPU_SHARED_SPRG3(r5) +#else + addi r5, r5, VCPU_SHARED_SPRG3 + ldbrx r3, 0, r5 +#endif + +after_sprg3_load: + mtspr SPRN_SPRG3, r3 +#endif /* CONFIG_PPC_BOOK3S_64 */ + + PPC_LL r4, VCPU_SHADOW_MSR(r4) /* get shadow_msr */ + + /* Jump to segment patching handler and into our guest */ + bl FUNC(kvmppc_entry_trampoline) + nop + +/* + * This is the handler in module memory. It gets jumped at from the + * lowmem trampoline code, so it's basically the guest exit code. + * + */ + + /* + * Register usage at this point: + * + * R1 = host R1 + * R2 = host R2 + * R12 = exit handler id + * R13 = PACA + * SVCPU.* = guest * + * MSR.EE = 1 + * + */ + + PPC_LL r3, GPR4(r1) /* vcpu pointer */ + + /* + * kvmppc_copy_from_svcpu can clobber volatile registers, save + * the exit handler id to the vcpu and restore it from there later. + */ + stw r12, VCPU_TRAP(r3) + + /* Transfer reg values from shadow vcpu back to vcpu struct */ + /* On 64-bit, interrupts are still off at this point */ + + GET_SHADOW_VCPU(r4) + bl FUNC(kvmppc_copy_from_svcpu) + nop + +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * Reload kernel SPRG3 value. 
+ * No need to save guest value as usermode can't modify SPRG3. + */ + ld r3, PACA_SPRG_VDSO(r13) + mtspr SPRN_SPRG_VDSO_WRITE, r3 +#endif /* CONFIG_PPC_BOOK3S_64 */ + + /* R7 = vcpu */ + PPC_LL r7, GPR4(r1) + + PPC_STL r14, VCPU_GPR(R14)(r7) + PPC_STL r15, VCPU_GPR(R15)(r7) + PPC_STL r16, VCPU_GPR(R16)(r7) + PPC_STL r17, VCPU_GPR(R17)(r7) + PPC_STL r18, VCPU_GPR(R18)(r7) + PPC_STL r19, VCPU_GPR(R19)(r7) + PPC_STL r20, VCPU_GPR(R20)(r7) + PPC_STL r21, VCPU_GPR(R21)(r7) + PPC_STL r22, VCPU_GPR(R22)(r7) + PPC_STL r23, VCPU_GPR(R23)(r7) + PPC_STL r24, VCPU_GPR(R24)(r7) + PPC_STL r25, VCPU_GPR(R25)(r7) + PPC_STL r26, VCPU_GPR(R26)(r7) + PPC_STL r27, VCPU_GPR(R27)(r7) + PPC_STL r28, VCPU_GPR(R28)(r7) + PPC_STL r29, VCPU_GPR(R29)(r7) + PPC_STL r30, VCPU_GPR(R30)(r7) + PPC_STL r31, VCPU_GPR(R31)(r7) + + /* Pass the exit number as 3rd argument to kvmppc_handle_exit_pr */ + lwz r5, VCPU_TRAP(r7) + + /* Restore r3 (kvm_run) and r4 (vcpu) */ + REST_2GPRS(3, r1) + bl FUNC(kvmppc_handle_exit_pr) + + /* If RESUME_GUEST, get back in the loop */ + cmpwi r3, RESUME_GUEST + beq kvm_loop_lightweight + + cmpwi r3, RESUME_GUEST_NV + beq kvm_loop_heavyweight + +kvm_exit_loop: + + PPC_LL r4, _LINK(r1) + mtlr r4 + + lwz r14, _CCR(r1) + mtcr r14 + + /* Restore non-volatile host registers (r14 - r31) */ + REST_NVGPRS(r1) + + addi r1, r1, SWITCH_FRAME_SIZE + blr + +kvm_loop_heavyweight: + + PPC_LL r4, _LINK(r1) + PPC_STL r4, (PPC_LR_STKOFF + SWITCH_FRAME_SIZE)(r1) + + /* Load kvm_run (r3) and vcpu (r4) */ + REST_2GPRS(3, r1) + + /* Load non-volatile guest state from the vcpu */ + VCPU_LOAD_NVGPRS(r4) + + /* Jump back into the beginning of this function */ + b kvm_start_lightweight + +kvm_loop_lightweight: + + /* We'll need the vcpu pointer */ + REST_GPR(4, r1) + + /* Jump back into the beginning of this function */ + b kvm_start_lightweight diff --git a/kernel/arch/powerpc/kvm/book3s_mmu_hpte.c b/kernel/arch/powerpc/kvm/book3s_mmu_hpte.c new file mode 100644 index 000000000..5a1ab1250 --- /dev/null +++
b/kernel/arch/powerpc/kvm/book3s_mmu_hpte.c @@ -0,0 +1,397 @@ +/* + * Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved. + * + * Authors: + * Alexander Graf <agraf@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + */ + +#include <linux/kvm_host.h> +#include <linux/hash.h> +#include <linux/slab.h> + +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/machdep.h> +#include <asm/mmu_context.h> +#include <asm/hw_irq.h> + +#include "trace_pr.h" + +#define PTE_SIZE 12 + +static struct kmem_cache *hpte_cache; + +static inline u64 kvmppc_mmu_hash_pte(u64 eaddr) +{ + return hash_64(eaddr >> PTE_SIZE, HPTEG_HASH_BITS_PTE); +} + +static inline u64 kvmppc_mmu_hash_pte_long(u64 eaddr) +{ + return hash_64((eaddr & 0x0ffff000) >> PTE_SIZE, + HPTEG_HASH_BITS_PTE_LONG); +} + +static inline u64 kvmppc_mmu_hash_vpte(u64 vpage) +{ + return hash_64(vpage & 0xfffffffffULL, HPTEG_HASH_BITS_VPTE); +} + +static inline u64 kvmppc_mmu_hash_vpte_long(u64 vpage) +{ + return hash_64((vpage & 0xffffff000ULL) >> 12, + HPTEG_HASH_BITS_VPTE_LONG); +} + +#ifdef CONFIG_PPC_BOOK3S_64 +static inline u64 kvmppc_mmu_hash_vpte_64k(u64 vpage) +{ + return hash_64((vpage & 0xffffffff0ULL) >> 4, + HPTEG_HASH_BITS_VPTE_64K); +} +#endif + +void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte) +{ + u64 index; + struct kvmppc_vcpu_book3s *vcpu3s 
= to_book3s(vcpu); + + trace_kvm_book3s_mmu_map(pte); + + spin_lock(&vcpu3s->mmu_lock); + + /* Add to ePTE list */ + index = kvmppc_mmu_hash_pte(pte->pte.eaddr); + hlist_add_head_rcu(&pte->list_pte, &vcpu3s->hpte_hash_pte[index]); + + /* Add to ePTE_long list */ + index = kvmppc_mmu_hash_pte_long(pte->pte.eaddr); + hlist_add_head_rcu(&pte->list_pte_long, + &vcpu3s->hpte_hash_pte_long[index]); + + /* Add to vPTE list */ + index = kvmppc_mmu_hash_vpte(pte->pte.vpage); + hlist_add_head_rcu(&pte->list_vpte, &vcpu3s->hpte_hash_vpte[index]); + + /* Add to vPTE_long list */ + index = kvmppc_mmu_hash_vpte_long(pte->pte.vpage); + hlist_add_head_rcu(&pte->list_vpte_long, + &vcpu3s->hpte_hash_vpte_long[index]); + +#ifdef CONFIG_PPC_BOOK3S_64 + /* Add to vPTE_64k list */ + index = kvmppc_mmu_hash_vpte_64k(pte->pte.vpage); + hlist_add_head_rcu(&pte->list_vpte_64k, + &vcpu3s->hpte_hash_vpte_64k[index]); +#endif + + vcpu3s->hpte_cache_count++; + + spin_unlock(&vcpu3s->mmu_lock); +} + +static void free_pte_rcu(struct rcu_head *head) +{ + struct hpte_cache *pte = container_of(head, struct hpte_cache, rcu_head); + kmem_cache_free(hpte_cache, pte); +} + +static void invalidate_pte(struct kvm_vcpu *vcpu, struct hpte_cache *pte) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + + trace_kvm_book3s_mmu_invalidate(pte); + + /* Different for 32 and 64 bit */ + kvmppc_mmu_invalidate_pte(vcpu, pte); + + spin_lock(&vcpu3s->mmu_lock); + + /* pte already invalidated in between? 
*/ + if (hlist_unhashed(&pte->list_pte)) { + spin_unlock(&vcpu3s->mmu_lock); + return; + } + + hlist_del_init_rcu(&pte->list_pte); + hlist_del_init_rcu(&pte->list_pte_long); + hlist_del_init_rcu(&pte->list_vpte); + hlist_del_init_rcu(&pte->list_vpte_long); +#ifdef CONFIG_PPC_BOOK3S_64 + hlist_del_init_rcu(&pte->list_vpte_64k); +#endif + vcpu3s->hpte_cache_count--; + + spin_unlock(&vcpu3s->mmu_lock); + + call_rcu(&pte->rcu_head, free_pte_rcu); +} + +static void kvmppc_mmu_pte_flush_all(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + struct hpte_cache *pte; + int i; + + rcu_read_lock(); + + for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { + struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i]; + + hlist_for_each_entry_rcu(pte, list, list_vpte_long) + invalidate_pte(vcpu, pte); + } + + rcu_read_unlock(); +} + +static void kvmppc_mmu_pte_flush_page(struct kvm_vcpu *vcpu, ulong guest_ea) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + struct hlist_head *list; + struct hpte_cache *pte; + + /* Find the list of entries in the map */ + list = &vcpu3s->hpte_hash_pte[kvmppc_mmu_hash_pte(guest_ea)]; + + rcu_read_lock(); + + /* Check the list for matching entries and invalidate */ + hlist_for_each_entry_rcu(pte, list, list_pte) + if ((pte->pte.eaddr & ~0xfffUL) == guest_ea) + invalidate_pte(vcpu, pte); + + rcu_read_unlock(); +} + +static void kvmppc_mmu_pte_flush_long(struct kvm_vcpu *vcpu, ulong guest_ea) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + struct hlist_head *list; + struct hpte_cache *pte; + + /* Find the list of entries in the map */ + list = &vcpu3s->hpte_hash_pte_long[ + kvmppc_mmu_hash_pte_long(guest_ea)]; + + rcu_read_lock(); + + /* Check the list for matching entries and invalidate */ + hlist_for_each_entry_rcu(pte, list, list_pte_long) + if ((pte->pte.eaddr & 0x0ffff000UL) == guest_ea) + invalidate_pte(vcpu, pte); + + rcu_read_unlock(); +} + +void kvmppc_mmu_pte_flush(struct kvm_vcpu *vcpu, 
ulong guest_ea, ulong ea_mask) +{ + trace_kvm_book3s_mmu_flush("", vcpu, guest_ea, ea_mask); + guest_ea &= ea_mask; + + switch (ea_mask) { + case ~0xfffUL: + kvmppc_mmu_pte_flush_page(vcpu, guest_ea); + break; + case 0x0ffff000: + kvmppc_mmu_pte_flush_long(vcpu, guest_ea); + break; + case 0: + /* Doing a complete flush -> start from scratch */ + kvmppc_mmu_pte_flush_all(vcpu); + break; + default: + WARN_ON(1); + break; + } +} + +/* Flush with mask 0xfffffffff */ +static void kvmppc_mmu_pte_vflush_short(struct kvm_vcpu *vcpu, u64 guest_vp) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + struct hlist_head *list; + struct hpte_cache *pte; + u64 vp_mask = 0xfffffffffULL; + + list = &vcpu3s->hpte_hash_vpte[kvmppc_mmu_hash_vpte(guest_vp)]; + + rcu_read_lock(); + + /* Check the list for matching entries and invalidate */ + hlist_for_each_entry_rcu(pte, list, list_vpte) + if ((pte->pte.vpage & vp_mask) == guest_vp) + invalidate_pte(vcpu, pte); + + rcu_read_unlock(); +} + +#ifdef CONFIG_PPC_BOOK3S_64 +/* Flush with mask 0xffffffff0 */ +static void kvmppc_mmu_pte_vflush_64k(struct kvm_vcpu *vcpu, u64 guest_vp) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + struct hlist_head *list; + struct hpte_cache *pte; + u64 vp_mask = 0xffffffff0ULL; + + list = &vcpu3s->hpte_hash_vpte_64k[ + kvmppc_mmu_hash_vpte_64k(guest_vp)]; + + rcu_read_lock(); + + /* Check the list for matching entries and invalidate */ + hlist_for_each_entry_rcu(pte, list, list_vpte_64k) + if ((pte->pte.vpage & vp_mask) == guest_vp) + invalidate_pte(vcpu, pte); + + rcu_read_unlock(); +} +#endif + +/* Flush with mask 0xffffff000 */ +static void kvmppc_mmu_pte_vflush_long(struct kvm_vcpu *vcpu, u64 guest_vp) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + struct hlist_head *list; + struct hpte_cache *pte; + u64 vp_mask = 0xffffff000ULL; + + list = &vcpu3s->hpte_hash_vpte_long[ + kvmppc_mmu_hash_vpte_long(guest_vp)]; + + rcu_read_lock(); + + /* Check the list for matching entries 
and invalidate */ + hlist_for_each_entry_rcu(pte, list, list_vpte_long) + if ((pte->pte.vpage & vp_mask) == guest_vp) + invalidate_pte(vcpu, pte); + + rcu_read_unlock(); +} + +void kvmppc_mmu_pte_vflush(struct kvm_vcpu *vcpu, u64 guest_vp, u64 vp_mask) +{ + trace_kvm_book3s_mmu_flush("v", vcpu, guest_vp, vp_mask); + guest_vp &= vp_mask; + + switch(vp_mask) { + case 0xfffffffffULL: + kvmppc_mmu_pte_vflush_short(vcpu, guest_vp); + break; +#ifdef CONFIG_PPC_BOOK3S_64 + case 0xffffffff0ULL: + kvmppc_mmu_pte_vflush_64k(vcpu, guest_vp); + break; +#endif + case 0xffffff000ULL: + kvmppc_mmu_pte_vflush_long(vcpu, guest_vp); + break; + default: + WARN_ON(1); + return; + } +} + +void kvmppc_mmu_pte_pflush(struct kvm_vcpu *vcpu, ulong pa_start, ulong pa_end) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + struct hpte_cache *pte; + int i; + + trace_kvm_book3s_mmu_flush("p", vcpu, pa_start, pa_end); + + rcu_read_lock(); + + for (i = 0; i < HPTEG_HASH_NUM_VPTE_LONG; i++) { + struct hlist_head *list = &vcpu3s->hpte_hash_vpte_long[i]; + + hlist_for_each_entry_rcu(pte, list, list_vpte_long) + if ((pte->pte.raddr >= pa_start) && + (pte->pte.raddr < pa_end)) + invalidate_pte(vcpu, pte); + } + + rcu_read_unlock(); +} + +struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + struct hpte_cache *pte; + + if (vcpu3s->hpte_cache_count == HPTEG_CACHE_NUM) + kvmppc_mmu_pte_flush_all(vcpu); + + pte = kmem_cache_zalloc(hpte_cache, GFP_KERNEL); + + return pte; +} + +void kvmppc_mmu_hpte_cache_free(struct hpte_cache *pte) +{ + kmem_cache_free(hpte_cache, pte); +} + +void kvmppc_mmu_hpte_destroy(struct kvm_vcpu *vcpu) +{ + kvmppc_mmu_pte_flush(vcpu, 0, 0); +} + +static void kvmppc_mmu_hpte_init_hash(struct hlist_head *hash_list, int len) +{ + int i; + + for (i = 0; i < len; i++) + INIT_HLIST_HEAD(&hash_list[i]); +} + +int kvmppc_mmu_hpte_init(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_book3s *vcpu3s = 
to_book3s(vcpu); + + /* init hpte lookup hashes */ + kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte, + ARRAY_SIZE(vcpu3s->hpte_hash_pte)); + kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_pte_long, + ARRAY_SIZE(vcpu3s->hpte_hash_pte_long)); + kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte, + ARRAY_SIZE(vcpu3s->hpte_hash_vpte)); + kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_long, + ARRAY_SIZE(vcpu3s->hpte_hash_vpte_long)); +#ifdef CONFIG_PPC_BOOK3S_64 + kvmppc_mmu_hpte_init_hash(vcpu3s->hpte_hash_vpte_64k, + ARRAY_SIZE(vcpu3s->hpte_hash_vpte_64k)); +#endif + + spin_lock_init(&vcpu3s->mmu_lock); + + return 0; +} + +int kvmppc_mmu_hpte_sysinit(void) +{ + /* init hpte slab cache */ + hpte_cache = kmem_cache_create("kvm-spt", sizeof(struct hpte_cache), + sizeof(struct hpte_cache), 0, NULL); + + return 0; +} + +void kvmppc_mmu_hpte_sysexit(void) +{ + kmem_cache_destroy(hpte_cache); +} diff --git a/kernel/arch/powerpc/kvm/book3s_paired_singles.c b/kernel/arch/powerpc/kvm/book3s_paired_singles.c new file mode 100644 index 000000000..bd6ab1672 --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_paired_singles.c @@ -0,0 +1,1271 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
/* #define DEBUG */

/*
 * Debug tracing: dprintk() is printk() when DEBUG is defined and expands
 * to an empty statement otherwise.  The no-op form deliberately has no
 * trailing semicolon, so "dprintk(...);" is exactly one statement and
 * remains safe inside an unbraced if/else.
 */
#ifdef DEBUG
#define dprintk printk
#else
#define dprintk(...) do { } while (0)
#endif
OP_4A_PS_SUM1 11 +#define OP_4A_PS_MULS0 12 +#define OP_4A_PS_MULS1 13 +#define OP_4A_PS_MADDS0 14 +#define OP_4A_PS_MADDS1 15 +#define OP_4A_PS_DIV 18 +#define OP_4A_PS_SUB 20 +#define OP_4A_PS_ADD 21 +#define OP_4A_PS_SEL 23 +#define OP_4A_PS_RES 24 +#define OP_4A_PS_MUL 25 +#define OP_4A_PS_RSQRTE 26 +#define OP_4A_PS_MSUB 28 +#define OP_4A_PS_MADD 29 +#define OP_4A_PS_NMSUB 30 +#define OP_4A_PS_NMADD 31 +#define OP_4X_PS_CMPO0 32 +#define OP_4X_PSQ_LUX 38 +#define OP_4XW_PSQ_STUX 39 +#define OP_4X_PS_NEG 40 +#define OP_4X_PS_CMPU1 64 +#define OP_4X_PS_MR 72 +#define OP_4X_PS_CMPO1 96 +#define OP_4X_PS_NABS 136 +#define OP_4X_PS_ABS 264 +#define OP_4X_PS_MERGE00 528 +#define OP_4X_PS_MERGE01 560 +#define OP_4X_PS_MERGE10 592 +#define OP_4X_PS_MERGE11 624 + +#define SCALAR_NONE 0 +#define SCALAR_HIGH (1 << 0) +#define SCALAR_LOW (1 << 1) +#define SCALAR_NO_PS0 (1 << 2) +#define SCALAR_NO_PS1 (1 << 3) + +#define GQR_ST_TYPE_MASK 0x00000007 +#define GQR_ST_TYPE_SHIFT 0 +#define GQR_ST_SCALE_MASK 0x00003f00 +#define GQR_ST_SCALE_SHIFT 8 +#define GQR_LD_TYPE_MASK 0x00070000 +#define GQR_LD_TYPE_SHIFT 16 +#define GQR_LD_SCALE_MASK 0x3f000000 +#define GQR_LD_SCALE_SHIFT 24 + +#define GQR_QUANTIZE_FLOAT 0 +#define GQR_QUANTIZE_U8 4 +#define GQR_QUANTIZE_U16 5 +#define GQR_QUANTIZE_S8 6 +#define GQR_QUANTIZE_S16 7 + +#define FPU_LS_SINGLE 0 +#define FPU_LS_DOUBLE 1 +#define FPU_LS_SINGLE_LOW 2 + +static inline void kvmppc_sync_qpr(struct kvm_vcpu *vcpu, int rt) +{ + kvm_cvt_df(&VCPU_FPR(vcpu, rt), &vcpu->arch.qpr[rt]); +} + +static void kvmppc_inject_pf(struct kvm_vcpu *vcpu, ulong eaddr, bool is_store) +{ + u32 dsisr; + u64 msr = kvmppc_get_msr(vcpu); + + msr = kvmppc_set_field(msr, 33, 36, 0); + msr = kvmppc_set_field(msr, 42, 47, 0); + kvmppc_set_msr(vcpu, msr); + kvmppc_set_dar(vcpu, eaddr); + /* Page Fault */ + dsisr = kvmppc_set_field(0, 33, 33, 1); + if (is_store) + dsisr = kvmppc_set_field(dsisr, 38, 38, 1); + kvmppc_set_dsisr(vcpu, dsisr); + 
kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); +} + +static int kvmppc_emulate_fpr_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + int rs, ulong addr, int ls_type) +{ + int emulated = EMULATE_FAIL; + int r; + char tmp[8]; + int len = sizeof(u32); + + if (ls_type == FPU_LS_DOUBLE) + len = sizeof(u64); + + /* read from memory */ + r = kvmppc_ld(vcpu, &addr, len, tmp, true); + vcpu->arch.paddr_accessed = addr; + + if (r < 0) { + kvmppc_inject_pf(vcpu, addr, false); + goto done_load; + } else if (r == EMULATE_DO_MMIO) { + emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs, + len, 1); + goto done_load; + } + + emulated = EMULATE_DONE; + + /* put in registers */ + switch (ls_type) { + case FPU_LS_SINGLE: + kvm_cvt_fd((u32*)tmp, &VCPU_FPR(vcpu, rs)); + vcpu->arch.qpr[rs] = *((u32*)tmp); + break; + case FPU_LS_DOUBLE: + VCPU_FPR(vcpu, rs) = *((u64*)tmp); + break; + } + + dprintk(KERN_INFO "KVM: FPR_LD [0x%llx] at 0x%lx (%d)\n", *(u64*)tmp, + addr, len); + +done_load: + return emulated; +} + +static int kvmppc_emulate_fpr_store(struct kvm_run *run, struct kvm_vcpu *vcpu, + int rs, ulong addr, int ls_type) +{ + int emulated = EMULATE_FAIL; + int r; + char tmp[8]; + u64 val; + int len; + + switch (ls_type) { + case FPU_LS_SINGLE: + kvm_cvt_df(&VCPU_FPR(vcpu, rs), (u32*)tmp); + val = *((u32*)tmp); + len = sizeof(u32); + break; + case FPU_LS_SINGLE_LOW: + *((u32*)tmp) = VCPU_FPR(vcpu, rs); + val = VCPU_FPR(vcpu, rs) & 0xffffffff; + len = sizeof(u32); + break; + case FPU_LS_DOUBLE: + *((u64*)tmp) = VCPU_FPR(vcpu, rs); + val = VCPU_FPR(vcpu, rs); + len = sizeof(u64); + break; + default: + val = 0; + len = 0; + } + + r = kvmppc_st(vcpu, &addr, len, tmp, true); + vcpu->arch.paddr_accessed = addr; + if (r < 0) { + kvmppc_inject_pf(vcpu, addr, true); + } else if (r == EMULATE_DO_MMIO) { + emulated = kvmppc_handle_store(run, vcpu, val, len, 1); + } else { + emulated = EMULATE_DONE; + } + + dprintk(KERN_INFO "KVM: FPR_ST [0x%llx] at 0x%lx (%d)\n", + 
val, addr, len); + + return emulated; +} + +static int kvmppc_emulate_psq_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + int rs, ulong addr, bool w, int i) +{ + int emulated = EMULATE_FAIL; + int r; + float one = 1.0; + u32 tmp[2]; + + /* read from memory */ + if (w) { + r = kvmppc_ld(vcpu, &addr, sizeof(u32), tmp, true); + memcpy(&tmp[1], &one, sizeof(u32)); + } else { + r = kvmppc_ld(vcpu, &addr, sizeof(u32) * 2, tmp, true); + } + vcpu->arch.paddr_accessed = addr; + if (r < 0) { + kvmppc_inject_pf(vcpu, addr, false); + goto done_load; + } else if ((r == EMULATE_DO_MMIO) && w) { + emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FPR | rs, + 4, 1); + vcpu->arch.qpr[rs] = tmp[1]; + goto done_load; + } else if (r == EMULATE_DO_MMIO) { + emulated = kvmppc_handle_load(run, vcpu, KVM_MMIO_REG_FQPR | rs, + 8, 1); + goto done_load; + } + + emulated = EMULATE_DONE; + + /* put in registers */ + kvm_cvt_fd(&tmp[0], &VCPU_FPR(vcpu, rs)); + vcpu->arch.qpr[rs] = tmp[1]; + + dprintk(KERN_INFO "KVM: PSQ_LD [0x%x, 0x%x] at 0x%lx (%d)\n", tmp[0], + tmp[1], addr, w ? 4 : 8); + +done_load: + return emulated; +} + +static int kvmppc_emulate_psq_store(struct kvm_run *run, struct kvm_vcpu *vcpu, + int rs, ulong addr, bool w, int i) +{ + int emulated = EMULATE_FAIL; + int r; + u32 tmp[2]; + int len = w ? 
sizeof(u32) : sizeof(u64); + + kvm_cvt_df(&VCPU_FPR(vcpu, rs), &tmp[0]); + tmp[1] = vcpu->arch.qpr[rs]; + + r = kvmppc_st(vcpu, &addr, len, tmp, true); + vcpu->arch.paddr_accessed = addr; + if (r < 0) { + kvmppc_inject_pf(vcpu, addr, true); + } else if ((r == EMULATE_DO_MMIO) && w) { + emulated = kvmppc_handle_store(run, vcpu, tmp[0], 4, 1); + } else if (r == EMULATE_DO_MMIO) { + u64 val = ((u64)tmp[0] << 32) | tmp[1]; + emulated = kvmppc_handle_store(run, vcpu, val, 8, 1); + } else { + emulated = EMULATE_DONE; + } + + dprintk(KERN_INFO "KVM: PSQ_ST [0x%x, 0x%x] at 0x%lx (%d)\n", + tmp[0], tmp[1], addr, len); + + return emulated; +} + +/* + * Cuts out inst bits with ordering according to spec. + * That means the leftmost bit is zero. All given bits are included. + */ +static inline u32 inst_get_field(u32 inst, int msb, int lsb) +{ + return kvmppc_get_field(inst, msb + 32, lsb + 32); +} + +bool kvmppc_inst_is_paired_single(struct kvm_vcpu *vcpu, u32 inst) +{ + if (!(vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE)) + return false; + + switch (get_op(inst)) { + case OP_PSQ_L: + case OP_PSQ_LU: + case OP_PSQ_ST: + case OP_PSQ_STU: + case OP_LFS: + case OP_LFSU: + case OP_LFD: + case OP_LFDU: + case OP_STFS: + case OP_STFSU: + case OP_STFD: + case OP_STFDU: + return true; + case 4: + /* X form */ + switch (inst_get_field(inst, 21, 30)) { + case OP_4X_PS_CMPU0: + case OP_4X_PSQ_LX: + case OP_4X_PS_CMPO0: + case OP_4X_PSQ_LUX: + case OP_4X_PS_NEG: + case OP_4X_PS_CMPU1: + case OP_4X_PS_MR: + case OP_4X_PS_CMPO1: + case OP_4X_PS_NABS: + case OP_4X_PS_ABS: + case OP_4X_PS_MERGE00: + case OP_4X_PS_MERGE01: + case OP_4X_PS_MERGE10: + case OP_4X_PS_MERGE11: + return true; + } + /* XW form */ + switch (inst_get_field(inst, 25, 30)) { + case OP_4XW_PSQ_STX: + case OP_4XW_PSQ_STUX: + return true; + } + /* A form */ + switch (inst_get_field(inst, 26, 30)) { + case OP_4A_PS_SUM1: + case OP_4A_PS_SUM0: + case OP_4A_PS_MULS0: + case OP_4A_PS_MULS1: + case OP_4A_PS_MADDS0: + case 
/*
 * Return the displacement held in the low 12 bits of @inst, sign-extended
 * as a two's-complement value (range -2048..2047).  All higher bits of
 * @inst are ignored.
 */
static int get_d_signext(u32 inst)
{
	int d = inst & 0xfff;

	/* Bit 11 is the sign bit of the 12-bit field */
	if (d & 0x800)
		d -= 0x1000;

	return d;
}
ps1_in2, ps1_in3; + + /* RC */ + WARN_ON(rc); + + /* PS0 */ + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in1), &ps0_in1); + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in2), &ps0_in2); + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in3), &ps0_in3); + + if (scalar & SCALAR_LOW) + ps0_in2 = qpr[reg_in2]; + + func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in1, &ps0_in2, &ps0_in3); + + dprintk(KERN_INFO "PS3 ps0 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n", + ps0_in1, ps0_in2, ps0_in3, ps0_out); + + if (!(scalar & SCALAR_NO_PS0)) + kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out)); + + /* PS1 */ + ps1_in1 = qpr[reg_in1]; + ps1_in2 = qpr[reg_in2]; + ps1_in3 = qpr[reg_in3]; + + if (scalar & SCALAR_HIGH) + ps1_in2 = ps0_in2; + + if (!(scalar & SCALAR_NO_PS1)) + func(&vcpu->arch.fp.fpscr, &qpr[reg_out], &ps1_in1, &ps1_in2, &ps1_in3); + + dprintk(KERN_INFO "PS3 ps1 -> f(0x%x, 0x%x, 0x%x) = 0x%x\n", + ps1_in1, ps1_in2, ps1_in3, qpr[reg_out]); + + return EMULATE_DONE; +} + +static int kvmppc_ps_two_in(struct kvm_vcpu *vcpu, bool rc, + int reg_out, int reg_in1, int reg_in2, + int scalar, + void (*func)(u64 *fpscr, + u32 *dst, u32 *src1, + u32 *src2)) +{ + u32 *qpr = vcpu->arch.qpr; + u32 ps0_out; + u32 ps0_in1, ps0_in2; + u32 ps1_out; + u32 ps1_in1, ps1_in2; + + /* RC */ + WARN_ON(rc); + + /* PS0 */ + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in1), &ps0_in1); + + if (scalar & SCALAR_LOW) + ps0_in2 = qpr[reg_in2]; + else + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in2), &ps0_in2); + + func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in1, &ps0_in2); + + if (!(scalar & SCALAR_NO_PS0)) { + dprintk(KERN_INFO "PS2 ps0 -> f(0x%x, 0x%x) = 0x%x\n", + ps0_in1, ps0_in2, ps0_out); + + kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out)); + } + + /* PS1 */ + ps1_in1 = qpr[reg_in1]; + ps1_in2 = qpr[reg_in2]; + + if (scalar & SCALAR_HIGH) + ps1_in2 = ps0_in2; + + func(&vcpu->arch.fp.fpscr, &ps1_out, &ps1_in1, &ps1_in2); + + if (!(scalar & SCALAR_NO_PS1)) { + qpr[reg_out] = ps1_out; + + dprintk(KERN_INFO "PS2 ps1 -> f(0x%x, 0x%x) = 0x%x\n", + ps1_in1, ps1_in2, qpr[reg_out]); + } + + 
return EMULATE_DONE; +} + +static int kvmppc_ps_one_in(struct kvm_vcpu *vcpu, bool rc, + int reg_out, int reg_in, + void (*func)(u64 *t, + u32 *dst, u32 *src1)) +{ + u32 *qpr = vcpu->arch.qpr; + u32 ps0_out, ps0_in; + u32 ps1_in; + + /* RC */ + WARN_ON(rc); + + /* PS0 */ + kvm_cvt_df(&VCPU_FPR(vcpu, reg_in), &ps0_in); + func(&vcpu->arch.fp.fpscr, &ps0_out, &ps0_in); + + dprintk(KERN_INFO "PS1 ps0 -> f(0x%x) = 0x%x\n", + ps0_in, ps0_out); + + kvm_cvt_fd(&ps0_out, &VCPU_FPR(vcpu, reg_out)); + + /* PS1 */ + ps1_in = qpr[reg_in]; + func(&vcpu->arch.fp.fpscr, &qpr[reg_out], &ps1_in); + + dprintk(KERN_INFO "PS1 ps1 -> f(0x%x) = 0x%x\n", + ps1_in, qpr[reg_out]); + + return EMULATE_DONE; +} + +int kvmppc_emulate_paired_single(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + u32 inst; + enum emulation_result emulated = EMULATE_DONE; + int ax_rd, ax_ra, ax_rb, ax_rc; + short full_d; + u64 *fpr_d, *fpr_a, *fpr_b, *fpr_c; + + bool rcomp; + u32 cr; +#ifdef DEBUG + int i; +#endif + + emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst); + if (emulated != EMULATE_DONE) + return emulated; + + ax_rd = inst_get_field(inst, 6, 10); + ax_ra = inst_get_field(inst, 11, 15); + ax_rb = inst_get_field(inst, 16, 20); + ax_rc = inst_get_field(inst, 21, 25); + full_d = inst_get_field(inst, 16, 31); + + fpr_d = &VCPU_FPR(vcpu, ax_rd); + fpr_a = &VCPU_FPR(vcpu, ax_ra); + fpr_b = &VCPU_FPR(vcpu, ax_rb); + fpr_c = &VCPU_FPR(vcpu, ax_rc); + + rcomp = (inst & 1) ? true : false; + cr = kvmppc_get_cr(vcpu); + + if (!kvmppc_inst_is_paired_single(vcpu, inst)) + return EMULATE_FAIL; + + if (!(kvmppc_get_msr(vcpu) & MSR_FP)) { + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL); + return EMULATE_AGAIN; + } + + kvmppc_giveup_ext(vcpu, MSR_FP); + preempt_disable(); + enable_kernel_fp(); + /* Do we need to clear FE0 / FE1 here? Don't think so. 
*/ + +#ifdef DEBUG + for (i = 0; i < ARRAY_SIZE(vcpu->arch.fp.fpr); i++) { + u32 f; + kvm_cvt_df(&VCPU_FPR(vcpu, i), &f); + dprintk(KERN_INFO "FPR[%d] = 0x%x / 0x%llx QPR[%d] = 0x%x\n", + i, f, VCPU_FPR(vcpu, i), i, vcpu->arch.qpr[i]); + } +#endif + + switch (get_op(inst)) { + case OP_PSQ_L: + { + ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0; + bool w = inst_get_field(inst, 16, 16) ? true : false; + int i = inst_get_field(inst, 17, 19); + + addr += get_d_signext(inst); + emulated = kvmppc_emulate_psq_load(run, vcpu, ax_rd, addr, w, i); + break; + } + case OP_PSQ_LU: + { + ulong addr = kvmppc_get_gpr(vcpu, ax_ra); + bool w = inst_get_field(inst, 16, 16) ? true : false; + int i = inst_get_field(inst, 17, 19); + + addr += get_d_signext(inst); + emulated = kvmppc_emulate_psq_load(run, vcpu, ax_rd, addr, w, i); + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, ax_ra, addr); + break; + } + case OP_PSQ_ST: + { + ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0; + bool w = inst_get_field(inst, 16, 16) ? true : false; + int i = inst_get_field(inst, 17, 19); + + addr += get_d_signext(inst); + emulated = kvmppc_emulate_psq_store(run, vcpu, ax_rd, addr, w, i); + break; + } + case OP_PSQ_STU: + { + ulong addr = kvmppc_get_gpr(vcpu, ax_ra); + bool w = inst_get_field(inst, 16, 16) ? true : false; + int i = inst_get_field(inst, 17, 19); + + addr += get_d_signext(inst); + emulated = kvmppc_emulate_psq_store(run, vcpu, ax_rd, addr, w, i); + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, ax_ra, addr); + break; + } + case 4: + /* X form */ + switch (inst_get_field(inst, 21, 30)) { + case OP_4X_PS_CMPU0: + /* XXX */ + emulated = EMULATE_FAIL; + break; + case OP_4X_PSQ_LX: + { + ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0; + bool w = inst_get_field(inst, 21, 21) ? 
true : false; + int i = inst_get_field(inst, 22, 24); + + addr += kvmppc_get_gpr(vcpu, ax_rb); + emulated = kvmppc_emulate_psq_load(run, vcpu, ax_rd, addr, w, i); + break; + } + case OP_4X_PS_CMPO0: + /* XXX */ + emulated = EMULATE_FAIL; + break; + case OP_4X_PSQ_LUX: + { + ulong addr = kvmppc_get_gpr(vcpu, ax_ra); + bool w = inst_get_field(inst, 21, 21) ? true : false; + int i = inst_get_field(inst, 22, 24); + + addr += kvmppc_get_gpr(vcpu, ax_rb); + emulated = kvmppc_emulate_psq_load(run, vcpu, ax_rd, addr, w, i); + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, ax_ra, addr); + break; + } + case OP_4X_PS_NEG: + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb); + VCPU_FPR(vcpu, ax_rd) ^= 0x8000000000000000ULL; + vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; + vcpu->arch.qpr[ax_rd] ^= 0x80000000; + break; + case OP_4X_PS_CMPU1: + /* XXX */ + emulated = EMULATE_FAIL; + break; + case OP_4X_PS_MR: + WARN_ON(rcomp); + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb); + vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; + break; + case OP_4X_PS_CMPO1: + /* XXX */ + emulated = EMULATE_FAIL; + break; + case OP_4X_PS_NABS: + WARN_ON(rcomp); + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb); + VCPU_FPR(vcpu, ax_rd) |= 0x8000000000000000ULL; + vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; + vcpu->arch.qpr[ax_rd] |= 0x80000000; + break; + case OP_4X_PS_ABS: + WARN_ON(rcomp); + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rb); + VCPU_FPR(vcpu, ax_rd) &= ~0x8000000000000000ULL; + vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; + vcpu->arch.qpr[ax_rd] &= ~0x80000000; + break; + case OP_4X_PS_MERGE00: + WARN_ON(rcomp); + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_ra); + /* vcpu->arch.qpr[ax_rd] = VCPU_FPR(vcpu, ax_rb); */ + kvm_cvt_df(&VCPU_FPR(vcpu, ax_rb), + &vcpu->arch.qpr[ax_rd]); + break; + case OP_4X_PS_MERGE01: + WARN_ON(rcomp); + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_ra); + vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; + break; + case OP_4X_PS_MERGE10: + WARN_ON(rcomp); + 
/* VCPU_FPR(vcpu, ax_rd) = vcpu->arch.qpr[ax_ra]; */ + kvm_cvt_fd(&vcpu->arch.qpr[ax_ra], + &VCPU_FPR(vcpu, ax_rd)); + /* vcpu->arch.qpr[ax_rd] = VCPU_FPR(vcpu, ax_rb); */ + kvm_cvt_df(&VCPU_FPR(vcpu, ax_rb), + &vcpu->arch.qpr[ax_rd]); + break; + case OP_4X_PS_MERGE11: + WARN_ON(rcomp); + /* VCPU_FPR(vcpu, ax_rd) = vcpu->arch.qpr[ax_ra]; */ + kvm_cvt_fd(&vcpu->arch.qpr[ax_ra], + &VCPU_FPR(vcpu, ax_rd)); + vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rb]; + break; + } + /* XW form */ + switch (inst_get_field(inst, 25, 30)) { + case OP_4XW_PSQ_STX: + { + ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0; + bool w = inst_get_field(inst, 21, 21) ? true : false; + int i = inst_get_field(inst, 22, 24); + + addr += kvmppc_get_gpr(vcpu, ax_rb); + emulated = kvmppc_emulate_psq_store(run, vcpu, ax_rd, addr, w, i); + break; + } + case OP_4XW_PSQ_STUX: + { + ulong addr = kvmppc_get_gpr(vcpu, ax_ra); + bool w = inst_get_field(inst, 21, 21) ? true : false; + int i = inst_get_field(inst, 22, 24); + + addr += kvmppc_get_gpr(vcpu, ax_rb); + emulated = kvmppc_emulate_psq_store(run, vcpu, ax_rd, addr, w, i); + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, ax_ra, addr); + break; + } + } + /* A form */ + switch (inst_get_field(inst, 26, 30)) { + case OP_4A_PS_SUM1: + emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd, + ax_rb, ax_ra, SCALAR_NO_PS0 | SCALAR_HIGH, fps_fadds); + VCPU_FPR(vcpu, ax_rd) = VCPU_FPR(vcpu, ax_rc); + break; + case OP_4A_PS_SUM0: + emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd, + ax_ra, ax_rb, SCALAR_NO_PS1 | SCALAR_LOW, fps_fadds); + vcpu->arch.qpr[ax_rd] = vcpu->arch.qpr[ax_rc]; + break; + case OP_4A_PS_MULS0: + emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd, + ax_ra, ax_rc, SCALAR_HIGH, fps_fmuls); + break; + case OP_4A_PS_MULS1: + emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd, + ax_ra, ax_rc, SCALAR_LOW, fps_fmuls); + break; + case OP_4A_PS_MADDS0: + emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd, + ax_ra, ax_rc, ax_rb, SCALAR_HIGH, fps_fmadds); + 
break; + case OP_4A_PS_MADDS1: + emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd, + ax_ra, ax_rc, ax_rb, SCALAR_LOW, fps_fmadds); + break; + case OP_4A_PS_DIV: + emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd, + ax_ra, ax_rb, SCALAR_NONE, fps_fdivs); + break; + case OP_4A_PS_SUB: + emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd, + ax_ra, ax_rb, SCALAR_NONE, fps_fsubs); + break; + case OP_4A_PS_ADD: + emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd, + ax_ra, ax_rb, SCALAR_NONE, fps_fadds); + break; + case OP_4A_PS_SEL: + emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd, + ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fsel); + break; + case OP_4A_PS_RES: + emulated = kvmppc_ps_one_in(vcpu, rcomp, ax_rd, + ax_rb, fps_fres); + break; + case OP_4A_PS_MUL: + emulated = kvmppc_ps_two_in(vcpu, rcomp, ax_rd, + ax_ra, ax_rc, SCALAR_NONE, fps_fmuls); + break; + case OP_4A_PS_RSQRTE: + emulated = kvmppc_ps_one_in(vcpu, rcomp, ax_rd, + ax_rb, fps_frsqrte); + break; + case OP_4A_PS_MSUB: + emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd, + ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fmsubs); + break; + case OP_4A_PS_MADD: + emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd, + ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fmadds); + break; + case OP_4A_PS_NMSUB: + emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd, + ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fnmsubs); + break; + case OP_4A_PS_NMADD: + emulated = kvmppc_ps_three_in(vcpu, rcomp, ax_rd, + ax_ra, ax_rc, ax_rb, SCALAR_NONE, fps_fnmadds); + break; + } + break; + + /* Real FPU operations */ + + case OP_LFS: + { + ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d; + + emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, addr, + FPU_LS_SINGLE); + break; + } + case OP_LFSU: + { + ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d; + + emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, addr, + FPU_LS_SINGLE); + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, ax_ra, addr); + break; + } + case OP_LFD: + { + ulong addr = (ax_ra ? 
kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d; + + emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, addr, + FPU_LS_DOUBLE); + break; + } + case OP_LFDU: + { + ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d; + + emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, addr, + FPU_LS_DOUBLE); + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, ax_ra, addr); + break; + } + case OP_STFS: + { + ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d; + + emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, addr, + FPU_LS_SINGLE); + break; + } + case OP_STFSU: + { + ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d; + + emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, addr, + FPU_LS_SINGLE); + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, ax_ra, addr); + break; + } + case OP_STFD: + { + ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + full_d; + + emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, addr, + FPU_LS_DOUBLE); + break; + } + case OP_STFDU: + { + ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + full_d; + + emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, addr, + FPU_LS_DOUBLE); + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, ax_ra, addr); + break; + } + case 31: + switch (inst_get_field(inst, 21, 30)) { + case OP_31_LFSX: + { + ulong addr = ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0; + + addr += kvmppc_get_gpr(vcpu, ax_rb); + emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, + addr, FPU_LS_SINGLE); + break; + } + case OP_31_LFSUX: + { + ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + + kvmppc_get_gpr(vcpu, ax_rb); + + emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, + addr, FPU_LS_SINGLE); + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, ax_ra, addr); + break; + } + case OP_31_LFDX: + { + ulong addr = (ax_ra ? 
kvmppc_get_gpr(vcpu, ax_ra) : 0) + + kvmppc_get_gpr(vcpu, ax_rb); + + emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, + addr, FPU_LS_DOUBLE); + break; + } + case OP_31_LFDUX: + { + ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + + kvmppc_get_gpr(vcpu, ax_rb); + + emulated = kvmppc_emulate_fpr_load(run, vcpu, ax_rd, + addr, FPU_LS_DOUBLE); + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, ax_ra, addr); + break; + } + case OP_31_STFSX: + { + ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + + kvmppc_get_gpr(vcpu, ax_rb); + + emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, + addr, FPU_LS_SINGLE); + break; + } + case OP_31_STFSUX: + { + ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + + kvmppc_get_gpr(vcpu, ax_rb); + + emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, + addr, FPU_LS_SINGLE); + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, ax_ra, addr); + break; + } + case OP_31_STFX: + { + ulong addr = (ax_ra ? kvmppc_get_gpr(vcpu, ax_ra) : 0) + + kvmppc_get_gpr(vcpu, ax_rb); + + emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, + addr, FPU_LS_DOUBLE); + break; + } + case OP_31_STFUX: + { + ulong addr = kvmppc_get_gpr(vcpu, ax_ra) + + kvmppc_get_gpr(vcpu, ax_rb); + + emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, + addr, FPU_LS_DOUBLE); + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, ax_ra, addr); + break; + } + case OP_31_STFIWX: + { + ulong addr = (ax_ra ? 
kvmppc_get_gpr(vcpu, ax_ra) : 0) + + kvmppc_get_gpr(vcpu, ax_rb); + + emulated = kvmppc_emulate_fpr_store(run, vcpu, ax_rd, + addr, + FPU_LS_SINGLE_LOW); + break; + } + break; + } + break; + case 59: + switch (inst_get_field(inst, 21, 30)) { + case OP_59_FADDS: + fpd_fadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); + kvmppc_sync_qpr(vcpu, ax_rd); + break; + case OP_59_FSUBS: + fpd_fsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); + kvmppc_sync_qpr(vcpu, ax_rd); + break; + case OP_59_FDIVS: + fpd_fdivs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); + kvmppc_sync_qpr(vcpu, ax_rd); + break; + case OP_59_FRES: + fpd_fres(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); + kvmppc_sync_qpr(vcpu, ax_rd); + break; + case OP_59_FRSQRTES: + fpd_frsqrtes(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); + kvmppc_sync_qpr(vcpu, ax_rd); + break; + } + switch (inst_get_field(inst, 26, 30)) { + case OP_59_FMULS: + fpd_fmuls(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c); + kvmppc_sync_qpr(vcpu, ax_rd); + break; + case OP_59_FMSUBS: + fpd_fmsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + kvmppc_sync_qpr(vcpu, ax_rd); + break; + case OP_59_FMADDS: + fpd_fmadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + kvmppc_sync_qpr(vcpu, ax_rd); + break; + case OP_59_FNMSUBS: + fpd_fnmsubs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + kvmppc_sync_qpr(vcpu, ax_rd); + break; + case OP_59_FNMADDS: + fpd_fnmadds(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + kvmppc_sync_qpr(vcpu, ax_rd); + break; + } + break; + case 63: + switch (inst_get_field(inst, 21, 30)) { + case OP_63_MTFSB0: + case OP_63_MTFSB1: + case OP_63_MCRFS: + case OP_63_MTFSFI: + /* XXX need to implement */ + break; + case OP_63_MFFS: + /* XXX missing CR */ + *fpr_d = vcpu->arch.fp.fpscr; + break; + case OP_63_MTFSF: + /* XXX missing fm bits */ + /* XXX missing CR */ + vcpu->arch.fp.fpscr = *fpr_b; + break; + case OP_63_FCMPU: + { + u32 tmp_cr; + u32 cr0_mask = 0xf0000000; + u32 
cr_shift = inst_get_field(inst, 6, 8) * 4; + + fpd_fcmpu(&vcpu->arch.fp.fpscr, &tmp_cr, fpr_a, fpr_b); + cr &= ~(cr0_mask >> cr_shift); + cr |= (cr & cr0_mask) >> cr_shift; + break; + } + case OP_63_FCMPO: + { + u32 tmp_cr; + u32 cr0_mask = 0xf0000000; + u32 cr_shift = inst_get_field(inst, 6, 8) * 4; + + fpd_fcmpo(&vcpu->arch.fp.fpscr, &tmp_cr, fpr_a, fpr_b); + cr &= ~(cr0_mask >> cr_shift); + cr |= (cr & cr0_mask) >> cr_shift; + break; + } + case OP_63_FNEG: + fpd_fneg(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); + break; + case OP_63_FMR: + *fpr_d = *fpr_b; + break; + case OP_63_FABS: + fpd_fabs(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); + break; + case OP_63_FCPSGN: + fpd_fcpsgn(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); + break; + case OP_63_FDIV: + fpd_fdiv(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); + break; + case OP_63_FADD: + fpd_fadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); + break; + case OP_63_FSUB: + fpd_fsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_b); + break; + case OP_63_FCTIW: + fpd_fctiw(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); + break; + case OP_63_FCTIWZ: + fpd_fctiwz(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); + break; + case OP_63_FRSP: + fpd_frsp(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); + kvmppc_sync_qpr(vcpu, ax_rd); + break; + case OP_63_FRSQRTE: + { + double one = 1.0f; + + /* fD = sqrt(fB) */ + fpd_fsqrt(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_b); + /* fD = 1.0f / fD */ + fpd_fdiv(&vcpu->arch.fp.fpscr, &cr, fpr_d, (u64*)&one, fpr_d); + break; + } + } + switch (inst_get_field(inst, 26, 30)) { + case OP_63_FMUL: + fpd_fmul(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c); + break; + case OP_63_FSEL: + fpd_fsel(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + break; + case OP_63_FMSUB: + fpd_fmsub(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + break; + case OP_63_FMADD: + fpd_fmadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + break; + case OP_63_FNMSUB: + fpd_fnmsub(&vcpu->arch.fp.fpscr, &cr, 
fpr_d, fpr_a, fpr_c, fpr_b); + break; + case OP_63_FNMADD: + fpd_fnmadd(&vcpu->arch.fp.fpscr, &cr, fpr_d, fpr_a, fpr_c, fpr_b); + break; + } + break; + } + +#ifdef DEBUG + for (i = 0; i < ARRAY_SIZE(vcpu->arch.fp.fpr); i++) { + u32 f; + kvm_cvt_df(&VCPU_FPR(vcpu, i), &f); + dprintk(KERN_INFO "FPR[%d] = 0x%x\n", i, f); + } +#endif + + if (rcomp) + kvmppc_set_cr(vcpu, cr); + + preempt_enable(); + + return emulated; +} diff --git a/kernel/arch/powerpc/kvm/book3s_pr.c b/kernel/arch/powerpc/kvm/book3s_pr.c new file mode 100644 index 000000000..f57383941 --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_pr.c @@ -0,0 +1,1772 @@ +/* + * Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved. + * + * Authors: + * Alexander Graf <agraf@suse.de> + * Kevin Wolf <mail@kevin-wolf.de> + * Paul Mackerras <paulus@samba.org> + * + * Description: + * Functions relating to running KVM on Book 3S processors where + * we don't have access to hypervisor mode, and we run the guest + * in problem state (user mode). + * + * This file is derived from arch/powerpc/kvm/44x.c, + * by Hollis Blanchard <hollisb@us.ibm.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kvm_host.h> +#include <linux/export.h> +#include <linux/err.h> +#include <linux/slab.h> + +#include <asm/reg.h> +#include <asm/cputable.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> +#include <asm/uaccess.h> +#include <asm/io.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/mmu_context.h> +#include <asm/switch_to.h> +#include <asm/firmware.h> +#include <asm/hvcall.h> +#include <linux/gfp.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> +#include <linux/highmem.h> +#include <linux/module.h> +#include <linux/miscdevice.h> + +#include "book3s.h" + +#define CREATE_TRACE_POINTS +#include "trace_pr.h" + +/* #define EXIT_DEBUG */ +/* #define DEBUG_EXT */ + +static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, + ulong msr); +static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac); + +/* Some compatibility defines */ +#ifdef CONFIG_PPC_BOOK3S_32 +#define MSR_USER32 MSR_USER +#define MSR_USER64 MSR_USER +#define HW_PAGE_SIZE PAGE_SIZE +#endif + +static bool kvmppc_is_split_real(struct kvm_vcpu *vcpu) +{ + ulong msr = kvmppc_get_msr(vcpu); + return (msr & (MSR_IR|MSR_DR)) == MSR_DR; +} + +static void kvmppc_fixup_split_real(struct kvm_vcpu *vcpu) +{ + ulong msr = kvmppc_get_msr(vcpu); + ulong pc = kvmppc_get_pc(vcpu); + + /* We are in DR only split real mode */ + if ((msr & (MSR_IR|MSR_DR)) != MSR_DR) + return; + + /* We have not fixed up the guest already */ + if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) + return; + + /* The code is in fixupable address space */ + if (pc & SPLIT_HACK_MASK) + return; + + vcpu->arch.hflags |= BOOK3S_HFLAG_SPLIT_HACK; + kvmppc_set_pc(vcpu, pc | SPLIT_HACK_OFFS); +} + +void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu); + +static void kvmppc_core_vcpu_load_pr(struct kvm_vcpu *vcpu, int cpu) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + memcpy(svcpu->slb, to_book3s(vcpu)->slb_shadow, 
sizeof(svcpu->slb)); + svcpu->slb_max = to_book3s(vcpu)->slb_shadow_max; + svcpu->in_use = 0; + svcpu_put(svcpu); +#endif + + /* Disable AIL if supported */ + if (cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_ARCH_207S)) + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_AIL); + + vcpu->cpu = smp_processor_id(); +#ifdef CONFIG_PPC_BOOK3S_32 + current->thread.kvm_shadow_vcpu = vcpu->arch.shadow_vcpu; +#endif + + if (kvmppc_is_split_real(vcpu)) + kvmppc_fixup_split_real(vcpu); +} + +static void kvmppc_core_vcpu_put_pr(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + struct kvmppc_book3s_shadow_vcpu *svcpu = svcpu_get(vcpu); + if (svcpu->in_use) { + kvmppc_copy_from_svcpu(vcpu, svcpu); + } + memcpy(to_book3s(vcpu)->slb_shadow, svcpu->slb, sizeof(svcpu->slb)); + to_book3s(vcpu)->slb_shadow_max = svcpu->slb_max; + svcpu_put(svcpu); +#endif + + if (kvmppc_is_split_real(vcpu)) + kvmppc_unfixup_split_real(vcpu); + + kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); + kvmppc_giveup_fac(vcpu, FSCR_TAR_LG); + + /* Enable AIL if supported */ + if (cpu_has_feature(CPU_FTR_HVMODE) && + cpu_has_feature(CPU_FTR_ARCH_207S)) + mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_AIL_3); + + vcpu->cpu = -1; +} + +/* Copy data needed by real-mode code from vcpu to shadow vcpu */ +void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, + struct kvm_vcpu *vcpu) +{ + svcpu->gpr[0] = vcpu->arch.gpr[0]; + svcpu->gpr[1] = vcpu->arch.gpr[1]; + svcpu->gpr[2] = vcpu->arch.gpr[2]; + svcpu->gpr[3] = vcpu->arch.gpr[3]; + svcpu->gpr[4] = vcpu->arch.gpr[4]; + svcpu->gpr[5] = vcpu->arch.gpr[5]; + svcpu->gpr[6] = vcpu->arch.gpr[6]; + svcpu->gpr[7] = vcpu->arch.gpr[7]; + svcpu->gpr[8] = vcpu->arch.gpr[8]; + svcpu->gpr[9] = vcpu->arch.gpr[9]; + svcpu->gpr[10] = vcpu->arch.gpr[10]; + svcpu->gpr[11] = vcpu->arch.gpr[11]; + svcpu->gpr[12] = vcpu->arch.gpr[12]; + svcpu->gpr[13] = vcpu->arch.gpr[13]; + svcpu->cr = vcpu->arch.cr; + svcpu->xer = vcpu->arch.xer; + svcpu->ctr = vcpu->arch.ctr; 
+ svcpu->lr = vcpu->arch.lr; + svcpu->pc = vcpu->arch.pc; +#ifdef CONFIG_PPC_BOOK3S_64 + svcpu->shadow_fscr = vcpu->arch.shadow_fscr; +#endif + /* + * Now also save the current time base value. We use this + * to find the guest purr and spurr value. + */ + vcpu->arch.entry_tb = get_tb(); + vcpu->arch.entry_vtb = get_vtb(); + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + vcpu->arch.entry_ic = mfspr(SPRN_IC); + svcpu->in_use = true; +} + +/* Copy data touched by real-mode code from shadow vcpu back to vcpu */ +void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, + struct kvmppc_book3s_shadow_vcpu *svcpu) +{ + /* + * vcpu_put would just call us again because in_use hasn't + * been updated yet. + */ + preempt_disable(); + + /* + * Maybe we were already preempted and synced the svcpu from + * our preempt notifiers. Don't bother touching this svcpu then. + */ + if (!svcpu->in_use) + goto out; + + vcpu->arch.gpr[0] = svcpu->gpr[0]; + vcpu->arch.gpr[1] = svcpu->gpr[1]; + vcpu->arch.gpr[2] = svcpu->gpr[2]; + vcpu->arch.gpr[3] = svcpu->gpr[3]; + vcpu->arch.gpr[4] = svcpu->gpr[4]; + vcpu->arch.gpr[5] = svcpu->gpr[5]; + vcpu->arch.gpr[6] = svcpu->gpr[6]; + vcpu->arch.gpr[7] = svcpu->gpr[7]; + vcpu->arch.gpr[8] = svcpu->gpr[8]; + vcpu->arch.gpr[9] = svcpu->gpr[9]; + vcpu->arch.gpr[10] = svcpu->gpr[10]; + vcpu->arch.gpr[11] = svcpu->gpr[11]; + vcpu->arch.gpr[12] = svcpu->gpr[12]; + vcpu->arch.gpr[13] = svcpu->gpr[13]; + vcpu->arch.cr = svcpu->cr; + vcpu->arch.xer = svcpu->xer; + vcpu->arch.ctr = svcpu->ctr; + vcpu->arch.lr = svcpu->lr; + vcpu->arch.pc = svcpu->pc; + vcpu->arch.shadow_srr1 = svcpu->shadow_srr1; + vcpu->arch.fault_dar = svcpu->fault_dar; + vcpu->arch.fault_dsisr = svcpu->fault_dsisr; + vcpu->arch.last_inst = svcpu->last_inst; +#ifdef CONFIG_PPC_BOOK3S_64 + vcpu->arch.shadow_fscr = svcpu->shadow_fscr; +#endif + /* + * Update purr and spurr using time base on exit. 
+ */ + vcpu->arch.purr += get_tb() - vcpu->arch.entry_tb; + vcpu->arch.spurr += get_tb() - vcpu->arch.entry_tb; + vcpu->arch.vtb += get_vtb() - vcpu->arch.entry_vtb; + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + vcpu->arch.ic += mfspr(SPRN_IC) - vcpu->arch.entry_ic; + svcpu->in_use = false; + +out: + preempt_enable(); +} + +static int kvmppc_core_check_requests_pr(struct kvm_vcpu *vcpu) +{ + int r = 1; /* Indicate we want to get back into the guest */ + + /* We misuse TLB_FLUSH to indicate that we want to clear + all shadow cache entries */ + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + kvmppc_mmu_pte_flush(vcpu, 0, 0); + + return r; +} + +/************* MMU Notifiers *************/ +static void do_kvm_unmap_hva(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + long i; + struct kvm_vcpu *vcpu; + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + + slots = kvm_memslots(kvm); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn, gfn_end; + + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; + /* + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn, gfn+1, ..., gfn_end-1}. 
+ */ + gfn = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + kvm_for_each_vcpu(i, vcpu, kvm) + kvmppc_mmu_pte_pflush(vcpu, gfn << PAGE_SHIFT, + gfn_end << PAGE_SHIFT); + } +} + +static int kvm_unmap_hva_pr(struct kvm *kvm, unsigned long hva) +{ + trace_kvm_unmap_hva(hva); + + do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); + + return 0; +} + +static int kvm_unmap_hva_range_pr(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + do_kvm_unmap_hva(kvm, start, end); + + return 0; +} + +static int kvm_age_hva_pr(struct kvm *kvm, unsigned long start, + unsigned long end) +{ + /* XXX could be more clever ;) */ + return 0; +} + +static int kvm_test_age_hva_pr(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; +} + +static void kvm_set_spte_hva_pr(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + /* The page will get remapped properly on its next fault */ + do_kvm_unmap_hva(kvm, hva, hva + PAGE_SIZE); +} + +/*****************************************/ + +static void kvmppc_recalc_shadow_msr(struct kvm_vcpu *vcpu) +{ + ulong guest_msr = kvmppc_get_msr(vcpu); + ulong smsr = guest_msr; + + /* Guest MSR values */ + smsr &= MSR_FE0 | MSR_FE1 | MSR_SF | MSR_SE | MSR_BE | MSR_LE; + /* Process MSR values */ + smsr |= MSR_ME | MSR_RI | MSR_IR | MSR_DR | MSR_PR | MSR_EE; + /* External providers the guest reserved */ + smsr |= (guest_msr & vcpu->arch.guest_owned_ext); + /* 64-bit Process MSR values */ +#ifdef CONFIG_PPC_BOOK3S_64 + smsr |= MSR_ISF | MSR_HV; +#endif + vcpu->arch.shadow_msr = smsr; +} + +static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) +{ + ulong old_msr = kvmppc_get_msr(vcpu); + +#ifdef EXIT_DEBUG + printk(KERN_INFO "KVM: Set MSR to 0x%llx\n", msr); +#endif + + msr &= to_book3s(vcpu)->msr_mask; + kvmppc_set_msr_fast(vcpu, msr); + kvmppc_recalc_shadow_msr(vcpu); + + if (msr & MSR_POW) { + if (!vcpu->arch.pending_exceptions) { + kvm_vcpu_block(vcpu); + 
clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + vcpu->stat.halt_wakeup++; + + /* Unset POW bit after we woke up */ + msr &= ~MSR_POW; + kvmppc_set_msr_fast(vcpu, msr); + } + } + + if (kvmppc_is_split_real(vcpu)) + kvmppc_fixup_split_real(vcpu); + else + kvmppc_unfixup_split_real(vcpu); + + if ((kvmppc_get_msr(vcpu) & (MSR_PR|MSR_IR|MSR_DR)) != + (old_msr & (MSR_PR|MSR_IR|MSR_DR))) { + kvmppc_mmu_flush_segments(vcpu); + kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); + + /* Preload magic page segment when in kernel mode */ + if (!(msr & MSR_PR) && vcpu->arch.magic_page_pa) { + struct kvm_vcpu_arch *a = &vcpu->arch; + + if (msr & MSR_DR) + kvmppc_mmu_map_segment(vcpu, a->magic_page_ea); + else + kvmppc_mmu_map_segment(vcpu, a->magic_page_pa); + } + } + + /* + * When switching from 32 to 64-bit, we may have a stale 32-bit + * magic page around, we need to flush it. Typically 32-bit magic + * page will be instantiated when calling into RTAS. Note: We + * assume that such transition only happens while in kernel mode, + * ie, we never transition from user 32-bit to kernel 64-bit with + * a 32-bit magic page around. 
+ */ + if (vcpu->arch.magic_page_pa && + !(old_msr & MSR_PR) && !(old_msr & MSR_SF) && (msr & MSR_SF)) { + /* going from RTAS to normal kernel code */ + kvmppc_mmu_pte_flush(vcpu, (uint32_t)vcpu->arch.magic_page_pa, + ~0xFFFUL); + } + + /* Preload FPU if it's enabled */ + if (kvmppc_get_msr(vcpu) & MSR_FP) + kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); +} + +void kvmppc_set_pvr_pr(struct kvm_vcpu *vcpu, u32 pvr) +{ + u32 host_pvr; + + vcpu->arch.hflags &= ~BOOK3S_HFLAG_SLB; + vcpu->arch.pvr = pvr; +#ifdef CONFIG_PPC_BOOK3S_64 + if ((pvr >= 0x330000) && (pvr < 0x70330000)) { + kvmppc_mmu_book3s_64_init(vcpu); + if (!to_book3s(vcpu)->hior_explicit) + to_book3s(vcpu)->hior = 0xfff00000; + to_book3s(vcpu)->msr_mask = 0xffffffffffffffffULL; + vcpu->arch.cpu_type = KVM_CPU_3S_64; + } else +#endif + { + kvmppc_mmu_book3s_32_init(vcpu); + if (!to_book3s(vcpu)->hior_explicit) + to_book3s(vcpu)->hior = 0; + to_book3s(vcpu)->msr_mask = 0xffffffffULL; + vcpu->arch.cpu_type = KVM_CPU_3S_32; + } + + kvmppc_sanity_check(vcpu); + + /* If we are in hypervisor level on 970, we can tell the CPU to + * treat DCBZ as 32 bytes store */ + vcpu->arch.hflags &= ~BOOK3S_HFLAG_DCBZ32; + if (vcpu->arch.mmu.is_dcbz32(vcpu) && (mfmsr() & MSR_HV) && + !strcmp(cur_cpu_spec->platform, "ppc970")) + vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; + + /* Cell performs badly if MSR_FEx are set. So let's hope nobody + really needs them in a VM on Cell and force disable them. */ + if (!strcmp(cur_cpu_spec->platform, "ppc-cell-be")) + to_book3s(vcpu)->msr_mask &= ~(MSR_FE0 | MSR_FE1); + + /* + * If they're asking for POWER6 or later, set the flag + * indicating that we can do multiple large page sizes + * and 1TB segments. + * Also set the flag that indicates that tlbie has the large + * page bit in the RB operand instead of the instruction. 
+ */ + switch (PVR_VER(pvr)) { + case PVR_POWER6: + case PVR_POWER7: + case PVR_POWER7p: + case PVR_POWER8: + vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE | + BOOK3S_HFLAG_NEW_TLBIE; + break; + } + +#ifdef CONFIG_PPC_BOOK3S_32 + /* 32 bit Book3S always has 32 byte dcbz */ + vcpu->arch.hflags |= BOOK3S_HFLAG_DCBZ32; +#endif + + /* On some CPUs we can execute paired single operations natively */ + asm ( "mfpvr %0" : "=r"(host_pvr)); + switch (host_pvr) { + case 0x00080200: /* lonestar 2.0 */ + case 0x00088202: /* lonestar 2.2 */ + case 0x70000100: /* gekko 1.0 */ + case 0x00080100: /* gekko 2.0 */ + case 0x00083203: /* gekko 2.3a */ + case 0x00083213: /* gekko 2.3b */ + case 0x00083204: /* gekko 2.4 */ + case 0x00083214: /* gekko 2.4e (8SE) - retail HW2 */ + case 0x00087200: /* broadway */ + vcpu->arch.hflags |= BOOK3S_HFLAG_NATIVE_PS; + /* Enable HID2.PSE - in case we need it later */ + mtspr(SPRN_HID2_GEKKO, mfspr(SPRN_HID2_GEKKO) | (1 << 29)); + } +} + +/* Book3s_32 CPUs always have 32 bytes cache line size, which Linux assumes. To + * make Book3s_32 Linux work on Book3s_64, we have to make sure we trap dcbz to + * emulate 32 bytes dcbz length. + * + * The Book3s_64 inventors also realized this case and implemented a special bit + * in the HID5 register, which is a hypervisor resource. Thus we can't use it. + * + * My approach here is to patch the dcbz instruction on executing pages. 
+ */ +static void kvmppc_patch_dcbz(struct kvm_vcpu *vcpu, struct kvmppc_pte *pte) +{ + struct page *hpage; + u64 hpage_offset; + u32 *page; + int i; + + hpage = gfn_to_page(vcpu->kvm, pte->raddr >> PAGE_SHIFT); + if (is_error_page(hpage)) + return; + + hpage_offset = pte->raddr & ~PAGE_MASK; + hpage_offset &= ~0xFFFULL; + hpage_offset /= 4; + + get_page(hpage); + page = kmap_atomic(hpage); + + /* patch dcbz into reserved instruction, so we trap */ + for (i=hpage_offset; i < hpage_offset + (HW_PAGE_SIZE / 4); i++) + if ((be32_to_cpu(page[i]) & 0xff0007ff) == INS_DCBZ) + page[i] &= cpu_to_be32(0xfffffff7); + + kunmap_atomic(page); + put_page(hpage); +} + +static int kvmppc_visible_gpa(struct kvm_vcpu *vcpu, gpa_t gpa) +{ + ulong mp_pa = vcpu->arch.magic_page_pa; + + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) + mp_pa = (uint32_t)mp_pa; + + gpa &= ~0xFFFULL; + if (unlikely(mp_pa) && unlikely((mp_pa & KVM_PAM) == (gpa & KVM_PAM))) { + return 1; + } + + return kvm_is_visible_gfn(vcpu->kvm, gpa >> PAGE_SHIFT); +} + +int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, + ulong eaddr, int vec) +{ + bool data = (vec == BOOK3S_INTERRUPT_DATA_STORAGE); + bool iswrite = false; + int r = RESUME_GUEST; + int relocated; + int page_found = 0; + struct kvmppc_pte pte; + bool is_mmio = false; + bool dr = (kvmppc_get_msr(vcpu) & MSR_DR) ? true : false; + bool ir = (kvmppc_get_msr(vcpu) & MSR_IR) ? true : false; + u64 vsid; + + relocated = data ? 
dr : ir; + if (data && (vcpu->arch.fault_dsisr & DSISR_ISSTORE)) + iswrite = true; + + /* Resolve real address if translation turned on */ + if (relocated) { + page_found = vcpu->arch.mmu.xlate(vcpu, eaddr, &pte, data, iswrite); + } else { + pte.may_execute = true; + pte.may_read = true; + pte.may_write = true; + pte.raddr = eaddr & KVM_PAM; + pte.eaddr = eaddr; + pte.vpage = eaddr >> 12; + pte.page_size = MMU_PAGE_64K; + } + + switch (kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) { + case 0: + pte.vpage |= ((u64)VSID_REAL << (SID_SHIFT - 12)); + break; + case MSR_DR: + if (!data && + (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) && + ((pte.raddr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)) + pte.raddr &= ~SPLIT_HACK_MASK; + /* fall through */ + case MSR_IR: + vcpu->arch.mmu.esid_to_vsid(vcpu, eaddr >> SID_SHIFT, &vsid); + + if ((kvmppc_get_msr(vcpu) & (MSR_DR|MSR_IR)) == MSR_DR) + pte.vpage |= ((u64)VSID_REAL_DR << (SID_SHIFT - 12)); + else + pte.vpage |= ((u64)VSID_REAL_IR << (SID_SHIFT - 12)); + pte.vpage |= vsid; + + if (vsid == -1) + page_found = -EINVAL; + break; + } + + if (vcpu->arch.mmu.is_dcbz32(vcpu) && + (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { + /* + * If we do the dcbz hack, we have to NX on every execution, + * so we can patch the executing code. This renders our guest + * NX-less. 
+ */ + pte.may_execute = !data; + } + + if (page_found == -ENOENT) { + /* Page not found in guest PTE entries */ + u64 ssrr1 = vcpu->arch.shadow_srr1; + u64 msr = kvmppc_get_msr(vcpu); + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); + kvmppc_set_dsisr(vcpu, vcpu->arch.fault_dsisr); + kvmppc_set_msr_fast(vcpu, msr | (ssrr1 & 0xf8000000ULL)); + kvmppc_book3s_queue_irqprio(vcpu, vec); + } else if (page_found == -EPERM) { + /* Storage protection */ + u32 dsisr = vcpu->arch.fault_dsisr; + u64 ssrr1 = vcpu->arch.shadow_srr1; + u64 msr = kvmppc_get_msr(vcpu); + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); + dsisr = (dsisr & ~DSISR_NOHPTE) | DSISR_PROTFAULT; + kvmppc_set_dsisr(vcpu, dsisr); + kvmppc_set_msr_fast(vcpu, msr | (ssrr1 & 0xf8000000ULL)); + kvmppc_book3s_queue_irqprio(vcpu, vec); + } else if (page_found == -EINVAL) { + /* Page not found in guest SLB */ + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); + kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); + } else if (!is_mmio && + kvmppc_visible_gpa(vcpu, pte.raddr)) { + if (data && !(vcpu->arch.fault_dsisr & DSISR_NOHPTE)) { + /* + * There is already a host HPTE there, presumably + * a read-only one for a page the guest thinks + * is writable, so get rid of it first. + */ + kvmppc_mmu_unmap_page(vcpu, &pte); + } + /* The guest's PTE is not mapped yet. 
Map on the host */ + kvmppc_mmu_map_page(vcpu, &pte, iswrite); + if (data) + vcpu->stat.sp_storage++; + else if (vcpu->arch.mmu.is_dcbz32(vcpu) && + (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) + kvmppc_patch_dcbz(vcpu, &pte); + } else { + /* MMIO */ + vcpu->stat.mmio_exits++; + vcpu->arch.paddr_accessed = pte.raddr; + vcpu->arch.vaddr_accessed = pte.eaddr; + r = kvmppc_emulate_mmio(run, vcpu); + if ( r == RESUME_HOST_NV ) + r = RESUME_HOST; + } + + return r; +} + +/* Give up external provider (FPU, Altivec, VSX) */ +void kvmppc_giveup_ext(struct kvm_vcpu *vcpu, ulong msr) +{ + struct thread_struct *t = &current->thread; + + /* + * VSX instructions can access FP and vector registers, so if + * we are giving up VSX, make sure we give up FP and VMX as well. + */ + if (msr & MSR_VSX) + msr |= MSR_FP | MSR_VEC; + + msr &= vcpu->arch.guest_owned_ext; + if (!msr) + return; + +#ifdef DEBUG_EXT + printk(KERN_INFO "Giving up ext 0x%lx\n", msr); +#endif + + if (msr & MSR_FP) { + /* + * Note that on CPUs with VSX, giveup_fpu stores + * both the traditional FP registers and the added VSX + * registers into thread.fp_state.fpr[]. 
+ */ + if (t->regs->msr & MSR_FP) + giveup_fpu(current); + t->fp_save_area = NULL; + } + +#ifdef CONFIG_ALTIVEC + if (msr & MSR_VEC) { + if (current->thread.regs->msr & MSR_VEC) + giveup_altivec(current); + t->vr_save_area = NULL; + } +#endif + + vcpu->arch.guest_owned_ext &= ~(msr | MSR_VSX); + kvmppc_recalc_shadow_msr(vcpu); +} + +/* Give up facility (TAR / EBB / DSCR) */ +static void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (!(vcpu->arch.shadow_fscr & (1ULL << fac))) { + /* Facility not available to the guest, ignore giveup request*/ + return; + } + + switch (fac) { + case FSCR_TAR_LG: + vcpu->arch.tar = mfspr(SPRN_TAR); + mtspr(SPRN_TAR, current->thread.tar); + vcpu->arch.shadow_fscr &= ~FSCR_TAR; + break; + } +#endif +} + +/* Handle external providers (FPU, Altivec, VSX) */ +static int kvmppc_handle_ext(struct kvm_vcpu *vcpu, unsigned int exit_nr, + ulong msr) +{ + struct thread_struct *t = &current->thread; + + /* When we have paired singles, we emulate in software */ + if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE) + return RESUME_GUEST; + + if (!(kvmppc_get_msr(vcpu) & msr)) { + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + return RESUME_GUEST; + } + + if (msr == MSR_VSX) { + /* No VSX? Give an illegal instruction interrupt */ +#ifdef CONFIG_VSX + if (!cpu_has_feature(CPU_FTR_VSX)) +#endif + { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + return RESUME_GUEST; + } + + /* + * We have to load up all the FP and VMX registers before + * we can let the guest use VSX instructions. 
+ */ + msr = MSR_FP | MSR_VEC | MSR_VSX; + } + + /* See if we already own all the ext(s) needed */ + msr &= ~vcpu->arch.guest_owned_ext; + if (!msr) + return RESUME_GUEST; + +#ifdef DEBUG_EXT + printk(KERN_INFO "Loading up ext 0x%lx\n", msr); +#endif + + if (msr & MSR_FP) { + preempt_disable(); + enable_kernel_fp(); + load_fp_state(&vcpu->arch.fp); + t->fp_save_area = &vcpu->arch.fp; + preempt_enable(); + } + + if (msr & MSR_VEC) { +#ifdef CONFIG_ALTIVEC + preempt_disable(); + enable_kernel_altivec(); + load_vr_state(&vcpu->arch.vr); + t->vr_save_area = &vcpu->arch.vr; + preempt_enable(); +#endif + } + + t->regs->msr |= msr; + vcpu->arch.guest_owned_ext |= msr; + kvmppc_recalc_shadow_msr(vcpu); + + return RESUME_GUEST; +} + +/* + * Kernel code using FP or VMX could have flushed guest state to + * the thread_struct; if so, get it back now. + */ +static void kvmppc_handle_lost_ext(struct kvm_vcpu *vcpu) +{ + unsigned long lost_ext; + + lost_ext = vcpu->arch.guest_owned_ext & ~current->thread.regs->msr; + if (!lost_ext) + return; + + if (lost_ext & MSR_FP) { + preempt_disable(); + enable_kernel_fp(); + load_fp_state(&vcpu->arch.fp); + preempt_enable(); + } +#ifdef CONFIG_ALTIVEC + if (lost_ext & MSR_VEC) { + preempt_disable(); + enable_kernel_altivec(); + load_vr_state(&vcpu->arch.vr); + preempt_enable(); + } +#endif + current->thread.regs->msr |= lost_ext; +} + +#ifdef CONFIG_PPC_BOOK3S_64 + +static void kvmppc_trigger_fac_interrupt(struct kvm_vcpu *vcpu, ulong fac) +{ + /* Inject the Interrupt Cause field and trigger a guest interrupt */ + vcpu->arch.fscr &= ~(0xffULL << 56); + vcpu->arch.fscr |= (fac << 56); + kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_FAC_UNAVAIL); +} + +static void kvmppc_emulate_fac(struct kvm_vcpu *vcpu, ulong fac) +{ + enum emulation_result er = EMULATE_FAIL; + + if (!(kvmppc_get_msr(vcpu) & MSR_PR)) + er = kvmppc_emulate_instruction(vcpu->run, vcpu); + + if ((er != EMULATE_DONE) && (er != EMULATE_AGAIN)) { + /* Couldn't emulate, 
trigger interrupt in guest */ + kvmppc_trigger_fac_interrupt(vcpu, fac); + } +} + +/* Enable facilities (TAR, EBB, DSCR) for the guest */ +static int kvmppc_handle_fac(struct kvm_vcpu *vcpu, ulong fac) +{ + bool guest_fac_enabled; + BUG_ON(!cpu_has_feature(CPU_FTR_ARCH_207S)); + + /* + * Not every facility is enabled by FSCR bits, check whether the + * guest has this facility enabled at all. + */ + switch (fac) { + case FSCR_TAR_LG: + case FSCR_EBB_LG: + guest_fac_enabled = (vcpu->arch.fscr & (1ULL << fac)); + break; + case FSCR_TM_LG: + guest_fac_enabled = kvmppc_get_msr(vcpu) & MSR_TM; + break; + default: + guest_fac_enabled = false; + break; + } + + if (!guest_fac_enabled) { + /* Facility not enabled by the guest */ + kvmppc_trigger_fac_interrupt(vcpu, fac); + return RESUME_GUEST; + } + + switch (fac) { + case FSCR_TAR_LG: + /* TAR switching isn't lazy in Linux yet */ + current->thread.tar = mfspr(SPRN_TAR); + mtspr(SPRN_TAR, vcpu->arch.tar); + vcpu->arch.shadow_fscr |= FSCR_TAR; + break; + default: + kvmppc_emulate_fac(vcpu, fac); + break; + } + + return RESUME_GUEST; +} + +void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr) +{ + if ((vcpu->arch.fscr & FSCR_TAR) && !(fscr & FSCR_TAR)) { + /* TAR got dropped, drop it in shadow too */ + kvmppc_giveup_fac(vcpu, FSCR_TAR_LG); + } + vcpu->arch.fscr = fscr; +} +#endif + +int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int exit_nr) +{ + int r = RESUME_HOST; + int s; + + vcpu->stat.sum_exits++; + + run->exit_reason = KVM_EXIT_UNKNOWN; + run->ready_for_interrupt_injection = 1; + + /* We get here with MSR.EE=1 */ + + trace_kvm_exit(exit_nr, vcpu); + kvm_guest_exit(); + + switch (exit_nr) { + case BOOK3S_INTERRUPT_INST_STORAGE: + { + ulong shadow_srr1 = vcpu->arch.shadow_srr1; + vcpu->stat.pf_instruc++; + + if (kvmppc_is_split_real(vcpu)) + kvmppc_fixup_split_real(vcpu); + +#ifdef CONFIG_PPC_BOOK3S_32 + /* We set segments as unused segments when invalidating them. 
So + * treat the respective fault as segment fault. */ + { + struct kvmppc_book3s_shadow_vcpu *svcpu; + u32 sr; + + svcpu = svcpu_get(vcpu); + sr = svcpu->sr[kvmppc_get_pc(vcpu) >> SID_SHIFT]; + svcpu_put(svcpu); + if (sr == SR_INVALID) { + kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)); + r = RESUME_GUEST; + break; + } + } +#endif + + /* only care about PTEG not found errors, but leave NX alone */ + if (shadow_srr1 & 0x40000000) { + int idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvmppc_handle_pagefault(run, vcpu, kvmppc_get_pc(vcpu), exit_nr); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + vcpu->stat.sp_instruc++; + } else if (vcpu->arch.mmu.is_dcbz32(vcpu) && + (!(vcpu->arch.hflags & BOOK3S_HFLAG_DCBZ32))) { + /* + * XXX If we do the dcbz hack we use the NX bit to flush&patch the page, + * so we can't use the NX bit inside the guest. Let's cross our fingers, + * that no guest that needs the dcbz hack does NX. + */ + kvmppc_mmu_pte_flush(vcpu, kvmppc_get_pc(vcpu), ~0xFFFUL); + r = RESUME_GUEST; + } else { + u64 msr = kvmppc_get_msr(vcpu); + msr |= shadow_srr1 & 0x58000000; + kvmppc_set_msr_fast(vcpu, msr); + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + r = RESUME_GUEST; + } + break; + } + case BOOK3S_INTERRUPT_DATA_STORAGE: + { + ulong dar = kvmppc_get_fault_dar(vcpu); + u32 fault_dsisr = vcpu->arch.fault_dsisr; + vcpu->stat.pf_storage++; + +#ifdef CONFIG_PPC_BOOK3S_32 + /* We set segments as unused segments when invalidating them. So + * treat the respective fault as segment fault. */ + { + struct kvmppc_book3s_shadow_vcpu *svcpu; + u32 sr; + + svcpu = svcpu_get(vcpu); + sr = svcpu->sr[dar >> SID_SHIFT]; + svcpu_put(svcpu); + if (sr == SR_INVALID) { + kvmppc_mmu_map_segment(vcpu, dar); + r = RESUME_GUEST; + break; + } + } +#endif + + /* + * We need to handle missing shadow PTEs, and + * protection faults due to us mapping a page read-only + * when the guest thinks it is writable. 
+ */ + if (fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT)) { + int idx = srcu_read_lock(&vcpu->kvm->srcu); + r = kvmppc_handle_pagefault(run, vcpu, dar, exit_nr); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } else { + kvmppc_set_dar(vcpu, dar); + kvmppc_set_dsisr(vcpu, fault_dsisr); + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + r = RESUME_GUEST; + } + break; + } + case BOOK3S_INTERRUPT_DATA_SEGMENT: + if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_fault_dar(vcpu)) < 0) { + kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); + kvmppc_book3s_queue_irqprio(vcpu, + BOOK3S_INTERRUPT_DATA_SEGMENT); + } + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_INST_SEGMENT: + if (kvmppc_mmu_map_segment(vcpu, kvmppc_get_pc(vcpu)) < 0) { + kvmppc_book3s_queue_irqprio(vcpu, + BOOK3S_INTERRUPT_INST_SEGMENT); + } + r = RESUME_GUEST; + break; + /* We're good on these - the host merely wanted to get our attention */ + case BOOK3S_INTERRUPT_DECREMENTER: + case BOOK3S_INTERRUPT_HV_DECREMENTER: + case BOOK3S_INTERRUPT_DOORBELL: + case BOOK3S_INTERRUPT_H_DOORBELL: + vcpu->stat.dec_exits++; + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_EXTERNAL: + case BOOK3S_INTERRUPT_EXTERNAL_LEVEL: + case BOOK3S_INTERRUPT_EXTERNAL_HV: + vcpu->stat.ext_intr_exits++; + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_PERFMON: + r = RESUME_GUEST; + break; + case BOOK3S_INTERRUPT_PROGRAM: + case BOOK3S_INTERRUPT_H_EMUL_ASSIST: + { + enum emulation_result er; + ulong flags; + u32 last_inst; + int emul; + +program_interrupt: + flags = vcpu->arch.shadow_srr1 & 0x1f0000ull; + + emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); + if (emul != EMULATE_DONE) { + r = RESUME_GUEST; + break; + } + + if (kvmppc_get_msr(vcpu) & MSR_PR) { +#ifdef EXIT_DEBUG + pr_info("Userspace triggered 0x700 exception at\n 0x%lx (0x%x)\n", + kvmppc_get_pc(vcpu), last_inst); +#endif + if ((last_inst & 0xff0007ff) != + (INS_DCBZ & 0xfffffff7)) { + kvmppc_core_queue_program(vcpu, flags); + r = RESUME_GUEST; + break; + } 
+ } + + vcpu->stat.emulated_inst_exits++; + er = kvmppc_emulate_instruction(run, vcpu); + switch (er) { + case EMULATE_DONE: + r = RESUME_GUEST_NV; + break; + case EMULATE_AGAIN: + r = RESUME_GUEST; + break; + case EMULATE_FAIL: + printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", + __func__, kvmppc_get_pc(vcpu), last_inst); + kvmppc_core_queue_program(vcpu, flags); + r = RESUME_GUEST; + break; + case EMULATE_DO_MMIO: + run->exit_reason = KVM_EXIT_MMIO; + r = RESUME_HOST_NV; + break; + case EMULATE_EXIT_USER: + r = RESUME_HOST_NV; + break; + default: + BUG(); + } + break; + } + case BOOK3S_INTERRUPT_SYSCALL: + { + u32 last_sc; + int emul; + + /* Get last sc for papr */ + if (vcpu->arch.papr_enabled) { + /* The sc instruction points SRR0 to the next inst */ + emul = kvmppc_get_last_inst(vcpu, INST_SC, &last_sc); + if (emul != EMULATE_DONE) { + kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) - 4); + r = RESUME_GUEST; + break; + } + } + + if (vcpu->arch.papr_enabled && + (last_sc == 0x44000022) && + !(kvmppc_get_msr(vcpu) & MSR_PR)) { + /* SC 1 papr hypercalls */ + ulong cmd = kvmppc_get_gpr(vcpu, 3); + int i; + +#ifdef CONFIG_PPC_BOOK3S_64 + if (kvmppc_h_pr(vcpu, cmd) == EMULATE_DONE) { + r = RESUME_GUEST; + break; + } +#endif + + run->papr_hcall.nr = cmd; + for (i = 0; i < 9; ++i) { + ulong gpr = kvmppc_get_gpr(vcpu, 4 + i); + run->papr_hcall.args[i] = gpr; + } + run->exit_reason = KVM_EXIT_PAPR_HCALL; + vcpu->arch.hcall_needed = 1; + r = RESUME_HOST; + } else if (vcpu->arch.osi_enabled && + (((u32)kvmppc_get_gpr(vcpu, 3)) == OSI_SC_MAGIC_R3) && + (((u32)kvmppc_get_gpr(vcpu, 4)) == OSI_SC_MAGIC_R4)) { + /* MOL hypercalls */ + u64 *gprs = run->osi.gprs; + int i; + + run->exit_reason = KVM_EXIT_OSI; + for (i = 0; i < 32; i++) + gprs[i] = kvmppc_get_gpr(vcpu, i); + vcpu->arch.osi_needed = 1; + r = RESUME_HOST_NV; + } else if (!(kvmppc_get_msr(vcpu) & MSR_PR) && + (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { + /* KVM PV hypercalls */ + kvmppc_set_gpr(vcpu, 3,
kvmppc_kvm_pv(vcpu)); + r = RESUME_GUEST; + } else { + /* Guest syscalls */ + vcpu->stat.syscall_exits++; + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + r = RESUME_GUEST; + } + break; + } + case BOOK3S_INTERRUPT_FP_UNAVAIL: + case BOOK3S_INTERRUPT_ALTIVEC: + case BOOK3S_INTERRUPT_VSX: + { + int ext_msr = 0; + int emul; + u32 last_inst; + + if (vcpu->arch.hflags & BOOK3S_HFLAG_PAIRED_SINGLE) { + /* Do paired single instruction emulation */ + emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, + &last_inst); + if (emul == EMULATE_DONE) + goto program_interrupt; + else + r = RESUME_GUEST; + + break; + } + + /* Enable external provider */ + switch (exit_nr) { + case BOOK3S_INTERRUPT_FP_UNAVAIL: + ext_msr = MSR_FP; + break; + + case BOOK3S_INTERRUPT_ALTIVEC: + ext_msr = MSR_VEC; + break; + + case BOOK3S_INTERRUPT_VSX: + ext_msr = MSR_VSX; + break; + } + + r = kvmppc_handle_ext(vcpu, exit_nr, ext_msr); + break; + } + case BOOK3S_INTERRUPT_ALIGNMENT: + { + u32 last_inst; + int emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); + + if (emul == EMULATE_DONE) { + u32 dsisr; + u64 dar; + + dsisr = kvmppc_alignment_dsisr(vcpu, last_inst); + dar = kvmppc_alignment_dar(vcpu, last_inst); + + kvmppc_set_dsisr(vcpu, dsisr); + kvmppc_set_dar(vcpu, dar); + + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + } + r = RESUME_GUEST; + break; + } +#ifdef CONFIG_PPC_BOOK3S_64 + case BOOK3S_INTERRUPT_FAC_UNAVAIL: + kvmppc_handle_fac(vcpu, vcpu->arch.shadow_fscr >> 56); + r = RESUME_GUEST; + break; +#endif + case BOOK3S_INTERRUPT_MACHINE_CHECK: + case BOOK3S_INTERRUPT_TRACE: + kvmppc_book3s_queue_irqprio(vcpu, exit_nr); + r = RESUME_GUEST; + break; + default: + { + ulong shadow_srr1 = vcpu->arch.shadow_srr1; + /* Ugh - bork here! What did we get? 
*/ + printk(KERN_EMERG "exit_nr=0x%x | pc=0x%lx | msr=0x%lx\n", + exit_nr, kvmppc_get_pc(vcpu), shadow_srr1); + r = RESUME_HOST; + BUG(); + break; + } + } + + if (!(r & RESUME_HOST)) { + /* To avoid clobbering exit_reason, only check for signals if + * we aren't already exiting to userspace for some other + * reason. */ + + /* + * Interrupts could be timers for the guest which we have to + * inject again, so let's postpone them until we're in the guest + * and if we really did time things so badly, then we just exit + * again due to a host external interrupt. + */ + s = kvmppc_prepare_to_enter(vcpu); + if (s <= 0) + r = s; + else { + /* interrupts now hard-disabled */ + kvmppc_fix_ee_before_entry(); + } + + kvmppc_handle_lost_ext(vcpu); + } + + trace_kvm_book3s_reenter(r, vcpu); + + return r; +} + +static int kvm_arch_vcpu_ioctl_get_sregs_pr(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + int i; + + sregs->pvr = vcpu->arch.pvr; + + sregs->u.s.sdr1 = to_book3s(vcpu)->sdr1; + if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { + for (i = 0; i < 64; i++) { + sregs->u.s.ppc64.slb[i].slbe = vcpu->arch.slb[i].orige | i; + sregs->u.s.ppc64.slb[i].slbv = vcpu->arch.slb[i].origv; + } + } else { + for (i = 0; i < 16; i++) + sregs->u.s.ppc32.sr[i] = kvmppc_get_sr(vcpu, i); + + for (i = 0; i < 8; i++) { + sregs->u.s.ppc32.ibat[i] = vcpu3s->ibat[i].raw; + sregs->u.s.ppc32.dbat[i] = vcpu3s->dbat[i].raw; + } + } + + return 0; +} + +static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + struct kvmppc_vcpu_book3s *vcpu3s = to_book3s(vcpu); + int i; + + kvmppc_set_pvr_pr(vcpu, sregs->pvr); + + vcpu3s->sdr1 = sregs->u.s.sdr1; + if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) { + for (i = 0; i < 64; i++) { + vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv, + sregs->u.s.ppc64.slb[i].slbe); + } + } else { + for (i = 0; i < 16; i++) { + vcpu->arch.mmu.mtsrin(vcpu, i, 
sregs->u.s.ppc32.sr[i]); + } + for (i = 0; i < 8; i++) { + kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), false, + (u32)sregs->u.s.ppc32.ibat[i]); + kvmppc_set_bat(vcpu, &(vcpu3s->ibat[i]), true, + (u32)(sregs->u.s.ppc32.ibat[i] >> 32)); + kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), false, + (u32)sregs->u.s.ppc32.dbat[i]); + kvmppc_set_bat(vcpu, &(vcpu3s->dbat[i]), true, + (u32)(sregs->u.s.ppc32.dbat[i] >> 32)); + } + } + + /* Flush the MMU after messing with the segments */ + kvmppc_mmu_pte_flush(vcpu, 0, 0); + + return 0; +} + +static int kvmppc_get_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + + switch (id) { + case KVM_REG_PPC_DEBUG_INST: + *val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT); + break; + case KVM_REG_PPC_HIOR: + *val = get_reg_val(id, to_book3s(vcpu)->hior); + break; + case KVM_REG_PPC_LPCR: + case KVM_REG_PPC_LPCR_64: + /* + * We are only interested in the LPCR_ILE bit + */ + if (vcpu->arch.intr_msr & MSR_LE) + *val = get_reg_val(id, LPCR_ILE); + else + *val = get_reg_val(id, 0); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +static void kvmppc_set_lpcr_pr(struct kvm_vcpu *vcpu, u64 new_lpcr) +{ + if (new_lpcr & LPCR_ILE) + vcpu->arch.intr_msr |= MSR_LE; + else + vcpu->arch.intr_msr &= ~MSR_LE; +} + +static int kvmppc_set_one_reg_pr(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + + switch (id) { + case KVM_REG_PPC_HIOR: + to_book3s(vcpu)->hior = set_reg_val(id, *val); + to_book3s(vcpu)->hior_explicit = true; + break; + case KVM_REG_PPC_LPCR: + case KVM_REG_PPC_LPCR_64: + kvmppc_set_lpcr_pr(vcpu, set_reg_val(id, *val)); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +static struct kvm_vcpu *kvmppc_core_vcpu_create_pr(struct kvm *kvm, + unsigned int id) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s; + struct kvm_vcpu *vcpu; + int err = -ENOMEM; + unsigned long p; + + vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!vcpu) + goto out; + 
+ vcpu_book3s = vzalloc(sizeof(struct kvmppc_vcpu_book3s)); + if (!vcpu_book3s) + goto free_vcpu; + vcpu->arch.book3s = vcpu_book3s; + +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + vcpu->arch.shadow_vcpu = + kzalloc(sizeof(*vcpu->arch.shadow_vcpu), GFP_KERNEL); + if (!vcpu->arch.shadow_vcpu) + goto free_vcpu3s; +#endif + + err = kvm_vcpu_init(vcpu, kvm, id); + if (err) + goto free_shadow_vcpu; + + err = -ENOMEM; + p = __get_free_page(GFP_KERNEL|__GFP_ZERO); + if (!p) + goto uninit_vcpu; + vcpu->arch.shared = (void *)p; +#ifdef CONFIG_PPC_BOOK3S_64 + /* Always start the shared struct in native endian mode */ +#ifdef __BIG_ENDIAN__ + vcpu->arch.shared_big_endian = true; +#else + vcpu->arch.shared_big_endian = false; +#endif + + /* + * Default to the same as the host if we're on sufficiently + * recent machine that we have 1TB segments; + * otherwise default to PPC970FX. + */ + vcpu->arch.pvr = 0x3C0301; + if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) + vcpu->arch.pvr = mfspr(SPRN_PVR); + vcpu->arch.intr_msr = MSR_SF; +#else + /* default to book3s_32 (750) */ + vcpu->arch.pvr = 0x84202; +#endif + kvmppc_set_pvr_pr(vcpu, vcpu->arch.pvr); + vcpu->arch.slb_nr = 64; + + vcpu->arch.shadow_msr = MSR_USER64 & ~MSR_LE; + + err = kvmppc_mmu_init(vcpu); + if (err < 0) + goto uninit_vcpu; + + return vcpu; + +uninit_vcpu: + kvm_vcpu_uninit(vcpu); +free_shadow_vcpu: +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + kfree(vcpu->arch.shadow_vcpu); +free_vcpu3s: +#endif + vfree(vcpu_book3s); +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu); +out: + return ERR_PTR(err); +} + +static void kvmppc_core_vcpu_free_pr(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + + free_page((unsigned long)vcpu->arch.shared & PAGE_MASK); + kvm_vcpu_uninit(vcpu); +#ifdef CONFIG_KVM_BOOK3S_32_HANDLER + kfree(vcpu->arch.shadow_vcpu); +#endif + vfree(vcpu_book3s); + kmem_cache_free(kvm_vcpu_cache, vcpu); +} + +static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ 
+ int ret; +#ifdef CONFIG_ALTIVEC + unsigned long uninitialized_var(vrsave); +#endif + + /* Check if we can run the vcpu at all */ + if (!vcpu->arch.sane) { + kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + ret = -EINVAL; + goto out; + } + + /* + * Interrupts could be timers for the guest which we have to inject + * again, so let's postpone them until we're in the guest and if we + * really did time things so badly, then we just exit again due to + * a host external interrupt. + */ + ret = kvmppc_prepare_to_enter(vcpu); + if (ret <= 0) + goto out; + /* interrupts now hard-disabled */ + + /* Save FPU state in thread_struct */ + if (current->thread.regs->msr & MSR_FP) + giveup_fpu(current); + +#ifdef CONFIG_ALTIVEC + /* Save Altivec state in thread_struct */ + if (current->thread.regs->msr & MSR_VEC) + giveup_altivec(current); +#endif + +#ifdef CONFIG_VSX + /* Save VSX state in thread_struct */ + if (current->thread.regs->msr & MSR_VSX) + __giveup_vsx(current); +#endif + + /* Preload FPU if it's enabled */ + if (kvmppc_get_msr(vcpu) & MSR_FP) + kvmppc_handle_ext(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, MSR_FP); + + kvmppc_fix_ee_before_entry(); + + ret = __kvmppc_vcpu_run(kvm_run, vcpu); + + /* No need for kvm_guest_exit. It's done in handle_exit. + We also get here with interrupts enabled. */ + + /* Make sure we save the guest FPU/Altivec/VSX state */ + kvmppc_giveup_ext(vcpu, MSR_FP | MSR_VEC | MSR_VSX); + + /* Make sure we save the guest TAR/EBB/DSCR state */ + kvmppc_giveup_fac(vcpu, FSCR_TAR_LG); + +out: + vcpu->mode = OUTSIDE_GUEST_MODE; + return ret; +} + +/* + * Get (and clear) the dirty memory log for a memory slot. 
+ */ +static int kvm_vm_ioctl_get_dirty_log_pr(struct kvm *kvm, + struct kvm_dirty_log *log) +{ + struct kvm_memory_slot *memslot; + struct kvm_vcpu *vcpu; + ulong ga, ga_end; + int is_dirty = 0; + int r; + unsigned long n; + + mutex_lock(&kvm->slots_lock); + + r = kvm_get_dirty_log(kvm, log, &is_dirty); + if (r) + goto out; + + /* If nothing is dirty, don't bother messing with page tables. */ + if (is_dirty) { + memslot = id_to_memslot(kvm->memslots, log->slot); + + ga = memslot->base_gfn << PAGE_SHIFT; + ga_end = ga + (memslot->npages << PAGE_SHIFT); + + kvm_for_each_vcpu(n, vcpu, kvm) + kvmppc_mmu_pte_pflush(vcpu, ga, ga_end); + + n = kvm_dirty_bitmap_bytes(memslot); + memset(memslot->dirty_bitmap, 0, n); + } + + r = 0; +out: + mutex_unlock(&kvm->slots_lock); + return r; +} + +static void kvmppc_core_flush_memslot_pr(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + return; +} + +static int kvmppc_core_prepare_memory_region_pr(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) +{ + return 0; +} + +static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) +{ + return; +} + +static void kvmppc_core_free_memslot_pr(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + return; +} + +static int kvmppc_core_create_memslot_pr(struct kvm_memory_slot *slot, + unsigned long npages) +{ + return 0; +} + + +#ifdef CONFIG_PPC64 +static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, + struct kvm_ppc_smmu_info *info) +{ + long int i; + struct kvm_vcpu *vcpu; + + info->flags = 0; + + /* SLB is always 64 entries */ + info->slb_size = 64; + + /* Standard 4k base page size segment */ + info->sps[0].page_shift = 12; + info->sps[0].slb_enc = 0; + info->sps[0].enc[0].page_shift = 12; + info->sps[0].enc[0].pte_enc = 0; + + /* + * 64k large page size. 
+ * We only want to put this in if the CPUs we're emulating + * support it, but unfortunately we don't have a vcpu easily + * to hand here to test. Just pick the first vcpu, and if + * that doesn't exist yet, report the minimum capability, + * i.e., no 64k pages. + * 1T segment support goes along with 64k pages. + */ + i = 1; + vcpu = kvm_get_vcpu(kvm, 0); + if (vcpu && (vcpu->arch.hflags & BOOK3S_HFLAG_MULTI_PGSIZE)) { + info->flags = KVM_PPC_1T_SEGMENTS; + info->sps[i].page_shift = 16; + info->sps[i].slb_enc = SLB_VSID_L | SLB_VSID_LP_01; + info->sps[i].enc[0].page_shift = 16; + info->sps[i].enc[0].pte_enc = 1; + ++i; + } + + /* Standard 16M large page size segment */ + info->sps[i].page_shift = 24; + info->sps[i].slb_enc = SLB_VSID_L; + info->sps[i].enc[0].page_shift = 24; + info->sps[i].enc[0].pte_enc = 0; + + return 0; +} +#else +static int kvm_vm_ioctl_get_smmu_info_pr(struct kvm *kvm, + struct kvm_ppc_smmu_info *info) +{ + /* We should not get called */ + BUG(); +} +#endif /* CONFIG_PPC64 */ + +static unsigned int kvm_global_user_count = 0; +static DEFINE_SPINLOCK(kvm_global_user_count_lock); + +static int kvmppc_core_init_vm_pr(struct kvm *kvm) +{ + mutex_init(&kvm->arch.hpt_mutex); + +#ifdef CONFIG_PPC_BOOK3S_64 + /* Start out with the default set of hcalls enabled */ + kvmppc_pr_init_default_hcalls(kvm); +#endif + + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + spin_lock(&kvm_global_user_count_lock); + if (++kvm_global_user_count == 1) + pSeries_disable_reloc_on_exc(); + spin_unlock(&kvm_global_user_count_lock); + } + return 0; +} + +static void kvmppc_core_destroy_vm_pr(struct kvm *kvm) +{ +#ifdef CONFIG_PPC64 + WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables)); +#endif + + if (firmware_has_feature(FW_FEATURE_SET_MODE)) { + spin_lock(&kvm_global_user_count_lock); + BUG_ON(kvm_global_user_count == 0); + if (--kvm_global_user_count == 0) + pSeries_enable_reloc_on_exc(); + spin_unlock(&kvm_global_user_count_lock); + } +} + +static int 
kvmppc_core_check_processor_compat_pr(void) +{ + /* we are always compatible */ + return 0; +} + +static long kvm_arch_vm_ioctl_pr(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return -ENOTTY; +} + +static struct kvmppc_ops kvm_ops_pr = { + .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_pr, + .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_pr, + .get_one_reg = kvmppc_get_one_reg_pr, + .set_one_reg = kvmppc_set_one_reg_pr, + .vcpu_load = kvmppc_core_vcpu_load_pr, + .vcpu_put = kvmppc_core_vcpu_put_pr, + .set_msr = kvmppc_set_msr_pr, + .vcpu_run = kvmppc_vcpu_run_pr, + .vcpu_create = kvmppc_core_vcpu_create_pr, + .vcpu_free = kvmppc_core_vcpu_free_pr, + .check_requests = kvmppc_core_check_requests_pr, + .get_dirty_log = kvm_vm_ioctl_get_dirty_log_pr, + .flush_memslot = kvmppc_core_flush_memslot_pr, + .prepare_memory_region = kvmppc_core_prepare_memory_region_pr, + .commit_memory_region = kvmppc_core_commit_memory_region_pr, + .unmap_hva = kvm_unmap_hva_pr, + .unmap_hva_range = kvm_unmap_hva_range_pr, + .age_hva = kvm_age_hva_pr, + .test_age_hva = kvm_test_age_hva_pr, + .set_spte_hva = kvm_set_spte_hva_pr, + .mmu_destroy = kvmppc_mmu_destroy_pr, + .free_memslot = kvmppc_core_free_memslot_pr, + .create_memslot = kvmppc_core_create_memslot_pr, + .init_vm = kvmppc_core_init_vm_pr, + .destroy_vm = kvmppc_core_destroy_vm_pr, + .get_smmu_info = kvm_vm_ioctl_get_smmu_info_pr, + .emulate_op = kvmppc_core_emulate_op_pr, + .emulate_mtspr = kvmppc_core_emulate_mtspr_pr, + .emulate_mfspr = kvmppc_core_emulate_mfspr_pr, + .fast_vcpu_kick = kvm_vcpu_kick, + .arch_vm_ioctl = kvm_arch_vm_ioctl_pr, +#ifdef CONFIG_PPC_BOOK3S_64 + .hcall_implemented = kvmppc_hcall_impl_pr, +#endif +}; + + +int kvmppc_book3s_init_pr(void) +{ + int r; + + r = kvmppc_core_check_processor_compat_pr(); + if (r < 0) + return r; + + kvm_ops_pr.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_pr; + + r = kvmppc_mmu_hpte_sysinit(); + return r; +} + +void kvmppc_book3s_exit_pr(void) +{ + kvmppc_pr_ops = 
NULL; + kvmppc_mmu_hpte_sysexit(); +} + +/* + * We only support separate modules for book3s 64 + */ +#ifdef CONFIG_PPC_BOOK3S_64 + +module_init(kvmppc_book3s_init_pr); +module_exit(kvmppc_book3s_exit_pr); + +MODULE_LICENSE("GPL"); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); +#endif diff --git a/kernel/arch/powerpc/kvm/book3s_pr_papr.c b/kernel/arch/powerpc/kvm/book3s_pr_papr.c new file mode 100644 index 000000000..f2c75a1e0 --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_pr_papr.c @@ -0,0 +1,401 @@ +/* + * Copyright (C) 2011. Freescale Inc. All rights reserved. + * + * Authors: + * Alexander Graf <agraf@suse.de> + * Paul Mackerras <paulus@samba.org> + * + * Description: + * + * Hypercall handling for running PAPR guests in PR KVM on Book 3S + * processors. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/anon_inodes.h> + +#include <asm/uaccess.h> +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> + +#define HPTE_SIZE 16 /* bytes per HPT entry */ + +static unsigned long get_pteg_addr(struct kvm_vcpu *vcpu, long pte_index) +{ + struct kvmppc_vcpu_book3s *vcpu_book3s = to_book3s(vcpu); + unsigned long pteg_addr; + + pte_index <<= 4; + pte_index &= ((1 << ((vcpu_book3s->sdr1 & 0x1f) + 11)) - 1) << 7 | 0x70; + pteg_addr = vcpu_book3s->sdr1 & 0xfffffffffffc0000ULL; + pteg_addr |= pte_index; + + return pteg_addr; +} + +static int kvmppc_h_pr_enter(struct kvm_vcpu *vcpu) +{ + long flags = kvmppc_get_gpr(vcpu, 4); + long pte_index = kvmppc_get_gpr(vcpu, 5); + __be64 pteg[2 * 8]; + __be64 *hpte; + unsigned long pteg_addr, i; + long int ret; + + i = pte_index & 7; + pte_index &= ~7UL; + pteg_addr = get_pteg_addr(vcpu, pte_index); + + mutex_lock(&vcpu->kvm->arch.hpt_mutex); + copy_from_user(pteg, (void __user *)pteg_addr, sizeof(pteg)); + hpte = pteg; + + ret = H_PTEG_FULL; + if 
(likely((flags & H_EXACT) == 0)) { + for (i = 0; ; ++i) { + if (i == 8) + goto done; + if ((be64_to_cpu(*hpte) & HPTE_V_VALID) == 0) + break; + hpte += 2; + } + } else { + hpte += i * 2; + if (*hpte & HPTE_V_VALID) + goto done; + } + + hpte[0] = cpu_to_be64(kvmppc_get_gpr(vcpu, 6)); + hpte[1] = cpu_to_be64(kvmppc_get_gpr(vcpu, 7)); + pteg_addr += i * HPTE_SIZE; + copy_to_user((void __user *)pteg_addr, hpte, HPTE_SIZE); + kvmppc_set_gpr(vcpu, 4, pte_index | i); + ret = H_SUCCESS; + + done: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); + + return EMULATE_DONE; +} + +static int kvmppc_h_pr_remove(struct kvm_vcpu *vcpu) +{ + unsigned long flags= kvmppc_get_gpr(vcpu, 4); + unsigned long pte_index = kvmppc_get_gpr(vcpu, 5); + unsigned long avpn = kvmppc_get_gpr(vcpu, 6); + unsigned long v = 0, pteg, rb; + unsigned long pte[2]; + long int ret; + + pteg = get_pteg_addr(vcpu, pte_index); + mutex_lock(&vcpu->kvm->arch.hpt_mutex); + copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + pte[0] = be64_to_cpu((__force __be64)pte[0]); + pte[1] = be64_to_cpu((__force __be64)pte[1]); + + ret = H_NOT_FOUND; + if ((pte[0] & HPTE_V_VALID) == 0 || + ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn) || + ((flags & H_ANDCOND) && (pte[0] & avpn) != 0)) + goto done; + + copy_to_user((void __user *)pteg, &v, sizeof(v)); + + rb = compute_tlbie_rb(pte[0], pte[1], pte_index); + vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? 
true : false); + + ret = H_SUCCESS; + kvmppc_set_gpr(vcpu, 4, pte[0]); + kvmppc_set_gpr(vcpu, 5, pte[1]); + + done: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); + + return EMULATE_DONE; +} + +/* Request defs for kvmppc_h_pr_bulk_remove() */ +#define H_BULK_REMOVE_TYPE 0xc000000000000000ULL +#define H_BULK_REMOVE_REQUEST 0x4000000000000000ULL +#define H_BULK_REMOVE_RESPONSE 0x8000000000000000ULL +#define H_BULK_REMOVE_END 0xc000000000000000ULL +#define H_BULK_REMOVE_CODE 0x3000000000000000ULL +#define H_BULK_REMOVE_SUCCESS 0x0000000000000000ULL +#define H_BULK_REMOVE_NOT_FOUND 0x1000000000000000ULL +#define H_BULK_REMOVE_PARM 0x2000000000000000ULL +#define H_BULK_REMOVE_HW 0x3000000000000000ULL +#define H_BULK_REMOVE_RC 0x0c00000000000000ULL +#define H_BULK_REMOVE_FLAGS 0x0300000000000000ULL +#define H_BULK_REMOVE_ABSOLUTE 0x0000000000000000ULL +#define H_BULK_REMOVE_ANDCOND 0x0100000000000000ULL +#define H_BULK_REMOVE_AVPN 0x0200000000000000ULL +#define H_BULK_REMOVE_PTEX 0x00ffffffffffffffULL +#define H_BULK_REMOVE_MAX_BATCH 4 + +static int kvmppc_h_pr_bulk_remove(struct kvm_vcpu *vcpu) +{ + int i; + int paramnr = 4; + int ret = H_SUCCESS; + + mutex_lock(&vcpu->kvm->arch.hpt_mutex); + for (i = 0; i < H_BULK_REMOVE_MAX_BATCH; i++) { + unsigned long tsh = kvmppc_get_gpr(vcpu, paramnr+(2*i)); + unsigned long tsl = kvmppc_get_gpr(vcpu, paramnr+(2*i)+1); + unsigned long pteg, rb, flags; + unsigned long pte[2]; + unsigned long v = 0; + + if ((tsh & H_BULK_REMOVE_TYPE) == H_BULK_REMOVE_END) { + break; /* Exit success */ + } else if ((tsh & H_BULK_REMOVE_TYPE) != + H_BULK_REMOVE_REQUEST) { + ret = H_PARAMETER; + break; /* Exit fail */ + } + + tsh &= H_BULK_REMOVE_PTEX | H_BULK_REMOVE_FLAGS; + tsh |= H_BULK_REMOVE_RESPONSE; + + if ((tsh & H_BULK_REMOVE_ANDCOND) && + (tsh & H_BULK_REMOVE_AVPN)) { + tsh |= H_BULK_REMOVE_PARM; + kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh); + ret = H_PARAMETER; + break; /* Exit fail */ + } + + pteg = 
get_pteg_addr(vcpu, tsh & H_BULK_REMOVE_PTEX); + copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + pte[0] = be64_to_cpu((__force __be64)pte[0]); + pte[1] = be64_to_cpu((__force __be64)pte[1]); + + /* tsl = AVPN */ + flags = (tsh & H_BULK_REMOVE_FLAGS) >> 26; + + if ((pte[0] & HPTE_V_VALID) == 0 || + ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != tsl) || + ((flags & H_ANDCOND) && (pte[0] & tsl) != 0)) { + tsh |= H_BULK_REMOVE_NOT_FOUND; + } else { + /* Splat the pteg in (userland) hpt */ + copy_to_user((void __user *)pteg, &v, sizeof(v)); + + rb = compute_tlbie_rb(pte[0], pte[1], + tsh & H_BULK_REMOVE_PTEX); + vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? true : false); + tsh |= H_BULK_REMOVE_SUCCESS; + tsh |= (pte[1] & (HPTE_R_C | HPTE_R_R)) << 43; + } + kvmppc_set_gpr(vcpu, paramnr+(2*i), tsh); + } + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); + + return EMULATE_DONE; +} + +static int kvmppc_h_pr_protect(struct kvm_vcpu *vcpu) +{ + unsigned long flags = kvmppc_get_gpr(vcpu, 4); + unsigned long pte_index = kvmppc_get_gpr(vcpu, 5); + unsigned long avpn = kvmppc_get_gpr(vcpu, 6); + unsigned long rb, pteg, r, v; + unsigned long pte[2]; + long int ret; + + pteg = get_pteg_addr(vcpu, pte_index); + mutex_lock(&vcpu->kvm->arch.hpt_mutex); + copy_from_user(pte, (void __user *)pteg, sizeof(pte)); + pte[0] = be64_to_cpu((__force __be64)pte[0]); + pte[1] = be64_to_cpu((__force __be64)pte[1]); + + ret = H_NOT_FOUND; + if ((pte[0] & HPTE_V_VALID) == 0 || + ((flags & H_AVPN) && (pte[0] & ~0x7fUL) != avpn)) + goto done; + + v = pte[0]; + r = pte[1]; + r &= ~(HPTE_R_PP0 | HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_HI | + HPTE_R_KEY_LO); + r |= (flags << 55) & HPTE_R_PP0; + r |= (flags << 48) & HPTE_R_KEY_HI; + r |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); + + pte[1] = r; + + rb = compute_tlbie_rb(v, r, pte_index); + vcpu->arch.mmu.tlbie(vcpu, rb, rb & 1 ? 
true : false); + pte[0] = (__force u64)cpu_to_be64(pte[0]); + pte[1] = (__force u64)cpu_to_be64(pte[1]); + copy_to_user((void __user *)pteg, pte, sizeof(pte)); + ret = H_SUCCESS; + + done: + mutex_unlock(&vcpu->kvm->arch.hpt_mutex); + kvmppc_set_gpr(vcpu, 3, ret); + + return EMULATE_DONE; +} + +static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu) +{ + unsigned long liobn = kvmppc_get_gpr(vcpu, 4); + unsigned long ioba = kvmppc_get_gpr(vcpu, 5); + unsigned long tce = kvmppc_get_gpr(vcpu, 6); + long rc; + + rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + +static int kvmppc_h_pr_logical_ci_load(struct kvm_vcpu *vcpu) +{ + long rc; + + rc = kvmppc_h_logical_ci_load(vcpu); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + +static int kvmppc_h_pr_logical_ci_store(struct kvm_vcpu *vcpu) +{ + long rc; + + rc = kvmppc_h_logical_ci_store(vcpu); + if (rc == H_TOO_HARD) + return EMULATE_FAIL; + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + +static int kvmppc_h_pr_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) +{ + long rc = kvmppc_xics_hcall(vcpu, cmd); + kvmppc_set_gpr(vcpu, 3, rc); + return EMULATE_DONE; +} + +int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) +{ + int rc, idx; + + if (cmd <= MAX_HCALL_OPCODE && + !test_bit(cmd/4, vcpu->kvm->arch.enabled_hcalls)) + return EMULATE_FAIL; + + switch (cmd) { + case H_ENTER: + return kvmppc_h_pr_enter(vcpu); + case H_REMOVE: + return kvmppc_h_pr_remove(vcpu); + case H_PROTECT: + return kvmppc_h_pr_protect(vcpu); + case H_BULK_REMOVE: + return kvmppc_h_pr_bulk_remove(vcpu); + case H_PUT_TCE: + return kvmppc_h_pr_put_tce(vcpu); + case H_CEDE: + kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE); + kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + vcpu->stat.halt_wakeup++; + return EMULATE_DONE; + case H_LOGICAL_CI_LOAD: + return 
kvmppc_h_pr_logical_ci_load(vcpu); + case H_LOGICAL_CI_STORE: + return kvmppc_h_pr_logical_ci_store(vcpu); + case H_XIRR: + case H_CPPR: + case H_EOI: + case H_IPI: + case H_IPOLL: + case H_XIRR_X: + if (kvmppc_xics_enabled(vcpu)) + return kvmppc_h_pr_xics_hcall(vcpu, cmd); + break; + case H_RTAS: + if (list_empty(&vcpu->kvm->arch.rtas_tokens)) + break; + idx = srcu_read_lock(&vcpu->kvm->srcu); + rc = kvmppc_rtas_hcall(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + if (rc) + break; + kvmppc_set_gpr(vcpu, 3, 0); + return EMULATE_DONE; + } + + return EMULATE_FAIL; +} + +int kvmppc_hcall_impl_pr(unsigned long cmd) +{ + switch (cmd) { + case H_ENTER: + case H_REMOVE: + case H_PROTECT: + case H_BULK_REMOVE: + case H_PUT_TCE: + case H_CEDE: + case H_LOGICAL_CI_LOAD: + case H_LOGICAL_CI_STORE: +#ifdef CONFIG_KVM_XICS + case H_XIRR: + case H_CPPR: + case H_EOI: + case H_IPI: + case H_IPOLL: + case H_XIRR_X: +#endif + return 1; + } + return 0; +} + +/* + * List of hcall numbers to enable by default. + * For compatibility with old userspace, we enable by default + * all hcalls that were implemented before the hcall-enabling + * facility was added. Note this list should not include H_RTAS. 
+ */ +static unsigned int default_hcall_list[] = { + H_ENTER, + H_REMOVE, + H_PROTECT, + H_BULK_REMOVE, + H_PUT_TCE, + H_CEDE, +#ifdef CONFIG_KVM_XICS + H_XIRR, + H_CPPR, + H_EOI, + H_IPI, + H_IPOLL, + H_XIRR_X, +#endif + 0 +}; + +void kvmppc_pr_init_default_hcalls(struct kvm *kvm) +{ + int i; + unsigned int hcall; + + for (i = 0; default_hcall_list[i]; ++i) { + hcall = default_hcall_list[i]; + WARN_ON(!kvmppc_hcall_impl_pr(hcall)); + __set_bit(hcall / 4, kvm->arch.enabled_hcalls); + } +} diff --git a/kernel/arch/powerpc/kvm/book3s_rmhandlers.S b/kernel/arch/powerpc/kvm/book3s_rmhandlers.S new file mode 100644 index 000000000..16c4d88ba --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_rmhandlers.S @@ -0,0 +1,169 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * Copyright SUSE Linux Products GmbH 2009 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/mmu.h> +#include <asm/page.h> +#include <asm/asm-offsets.h> + +#ifdef CONFIG_PPC_BOOK3S_64 +#include <asm/exception-64s.h> +#endif + +/***************************************************************************** + * * + * Real Mode handlers that need to be in low physical memory * + * * + ****************************************************************************/ + +#if defined(CONFIG_PPC_BOOK3S_64) + +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define FUNC(name) name +#else +#define FUNC(name) GLUE(.,name) +#endif + +#elif defined(CONFIG_PPC_BOOK3S_32) + +#define FUNC(name) name + +.macro INTERRUPT_TRAMPOLINE intno + +.global kvmppc_trampoline_\intno +kvmppc_trampoline_\intno: + + mtspr SPRN_SPRG_SCRATCH0, r13 /* Save r13 */ + + /* + * First thing to do is to find out if we're coming + * from a KVM guest or a Linux process. + * + * To distinguish, we check a magic byte in the PACA/current + */ + mfspr r13, SPRN_SPRG_THREAD + lwz r13, THREAD_KVM_SVCPU(r13) + /* PPC32 can have a NULL pointer - let's check for that */ + mtspr SPRN_SPRG_SCRATCH1, r12 /* Save r12 */ + mfcr r12 + cmpwi r13, 0 + bne 1f +2: mtcr r12 + mfspr r12, SPRN_SPRG_SCRATCH1 + mfspr r13, SPRN_SPRG_SCRATCH0 /* r13 = original r13 */ + b kvmppc_resume_\intno /* Get back original handler */ + +1: tophys(r13, r13) + stw r12, HSTATE_SCRATCH1(r13) + mfspr r12, SPRN_SPRG_SCRATCH1 + stw r12, HSTATE_SCRATCH0(r13) + lbz r12, HSTATE_IN_GUEST(r13) + cmpwi r12, KVM_GUEST_MODE_NONE + bne ..kvmppc_handler_hasmagic_\intno + /* No KVM guest? Then jump back to the Linux handler! */ + lwz r12, HSTATE_SCRATCH1(r13) + b 2b + + /* Now we know we're handling a KVM guest */ +..kvmppc_handler_hasmagic_\intno: + + /* Should we just skip the faulting instruction? 
*/ + cmpwi r12, KVM_GUEST_MODE_SKIP + beq kvmppc_handler_skip_ins + + /* Let's store which interrupt we're handling */ + li r12, \intno + + /* Jump into the SLB exit code that goes to the highmem handler */ + b kvmppc_handler_trampoline_exit + +.endm + +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_SYSTEM_RESET +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_MACHINE_CHECK +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DATA_STORAGE +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_INST_STORAGE +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_EXTERNAL +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALIGNMENT +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PROGRAM +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_FP_UNAVAIL +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_DECREMENTER +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_SYSCALL +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_TRACE +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_PERFMON +INTERRUPT_TRAMPOLINE BOOK3S_INTERRUPT_ALTIVEC + +/* + * Bring us back to the faulting code, but skip the + * faulting instruction. + * + * This is a generic exit path from the interrupt + * trampolines above. + * + * Input Registers: + * + * R12 = free + * R13 = Shadow VCPU (PACA) + * HSTATE.SCRATCH0 = guest R12 + * HSTATE.SCRATCH1 = guest CR + * SPRG_SCRATCH0 = guest R13 + * + */ +kvmppc_handler_skip_ins: + + /* Patch the IP to the next instruction */ + mfsrr0 r12 + addi r12, r12, 4 + mtsrr0 r12 + + /* Clean up all state */ + lwz r12, HSTATE_SCRATCH1(r13) + mtcr r12 + PPC_LL r12, HSTATE_SCRATCH0(r13) + GET_SCRATCH0(r13) + + /* And get back into the code */ + RFI +#endif + +/* + * Call kvmppc_handler_trampoline_enter in real mode + * + * On entry, r4 contains the guest shadow MSR + * MSR.EE has to be 0 when calling this function + */ +_GLOBAL_TOC(kvmppc_entry_trampoline) + mfmsr r5 + LOAD_REG_ADDR(r7, kvmppc_handler_trampoline_enter) + toreal(r7) + + li r6, MSR_IR | MSR_DR + andc r6, r5, r6 /* Clear DR and IR in MSR value */ + /* + * Set EE in HOST_MSR so that it's enabled when we get into our + * C exit handler function. 
+ */ + ori r5, r5, MSR_EE + mtsrr0 r7 + mtsrr1 r6 + RFI + +#include "book3s_segment.S" diff --git a/kernel/arch/powerpc/kvm/book3s_rtas.c b/kernel/arch/powerpc/kvm/book3s_rtas.c new file mode 100644 index 000000000..ef27fbd5d --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_rtas.c @@ -0,0 +1,278 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/kvm.h> +#include <linux/err.h> + +#include <asm/uaccess.h> +#include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> +#include <asm/hvcall.h> +#include <asm/rtas.h> + +#ifdef CONFIG_KVM_XICS +static void kvm_rtas_set_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq, server, priority; + int rc; + + if (be32_to_cpu(args->nargs) != 3 || be32_to_cpu(args->nret) != 1) { + rc = -3; + goto out; + } + + irq = be32_to_cpu(args->args[0]); + server = be32_to_cpu(args->args[1]); + priority = be32_to_cpu(args->args[2]); + + rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority); + if (rc) + rc = -3; +out: + args->rets[0] = cpu_to_be32(rc); +} + +static void kvm_rtas_get_xive(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq, server, priority; + int rc; + + if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 3) { + rc = -3; + goto out; + } + + irq = be32_to_cpu(args->args[0]); + + server = priority = 0; + rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority); + if (rc) { + rc = -3; + goto out; + } + + args->rets[1] = cpu_to_be32(server); + args->rets[2] = cpu_to_be32(priority); +out: + args->rets[0] = cpu_to_be32(rc); +} + +static void kvm_rtas_int_off(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq; + int rc; + + if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 1) { + rc = -3; + 
goto out; + } + + irq = be32_to_cpu(args->args[0]); + + rc = kvmppc_xics_int_off(vcpu->kvm, irq); + if (rc) + rc = -3; +out: + args->rets[0] = cpu_to_be32(rc); +} + +static void kvm_rtas_int_on(struct kvm_vcpu *vcpu, struct rtas_args *args) +{ + u32 irq; + int rc; + + if (be32_to_cpu(args->nargs) != 1 || be32_to_cpu(args->nret) != 1) { + rc = -3; + goto out; + } + + irq = be32_to_cpu(args->args[0]); + + rc = kvmppc_xics_int_on(vcpu->kvm, irq); + if (rc) + rc = -3; +out: + args->rets[0] = cpu_to_be32(rc); +} +#endif /* CONFIG_KVM_XICS */ + +struct rtas_handler { + void (*handler)(struct kvm_vcpu *vcpu, struct rtas_args *args); + char *name; +}; + +static struct rtas_handler rtas_handlers[] = { +#ifdef CONFIG_KVM_XICS + { .name = "ibm,set-xive", .handler = kvm_rtas_set_xive }, + { .name = "ibm,get-xive", .handler = kvm_rtas_get_xive }, + { .name = "ibm,int-off", .handler = kvm_rtas_int_off }, + { .name = "ibm,int-on", .handler = kvm_rtas_int_on }, +#endif +}; + +struct rtas_token_definition { + struct list_head list; + struct rtas_handler *handler; + u64 token; +}; + +static int rtas_name_matches(char *s1, char *s2) +{ + struct kvm_rtas_token_args args; + return !strncmp(s1, s2, sizeof(args.name)); +} + +static int rtas_token_undefine(struct kvm *kvm, char *name) +{ + struct rtas_token_definition *d, *tmp; + + lockdep_assert_held(&kvm->lock); + + list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) { + if (rtas_name_matches(d->handler->name, name)) { + list_del(&d->list); + kfree(d); + return 0; + } + } + + /* It's not an error to undefine an undefined token */ + return 0; +} + +static int rtas_token_define(struct kvm *kvm, char *name, u64 token) +{ + struct rtas_token_definition *d; + struct rtas_handler *h = NULL; + bool found; + int i; + + lockdep_assert_held(&kvm->lock); + + list_for_each_entry(d, &kvm->arch.rtas_tokens, list) { + if (d->token == token) + return -EEXIST; + } + + found = false; + for (i = 0; i < ARRAY_SIZE(rtas_handlers); i++) { + h = 
&rtas_handlers[i]; + if (rtas_name_matches(h->name, name)) { + found = true; + break; + } + } + + if (!found) + return -ENOENT; + + d = kzalloc(sizeof(*d), GFP_KERNEL); + if (!d) + return -ENOMEM; + + d->handler = h; + d->token = token; + + list_add_tail(&d->list, &kvm->arch.rtas_tokens); + + return 0; +} + +int kvm_vm_ioctl_rtas_define_token(struct kvm *kvm, void __user *argp) +{ + struct kvm_rtas_token_args args; + int rc; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + + mutex_lock(&kvm->lock); + + if (args.token) + rc = rtas_token_define(kvm, args.name, args.token); + else + rc = rtas_token_undefine(kvm, args.name); + + mutex_unlock(&kvm->lock); + + return rc; +} + +int kvmppc_rtas_hcall(struct kvm_vcpu *vcpu) +{ + struct rtas_token_definition *d; + struct rtas_args args; + rtas_arg_t *orig_rets; + gpa_t args_phys; + int rc; + + /* + * r4 contains the guest physical address of the RTAS args + * Mask off the top 4 bits since this is a guest real address + */ + args_phys = kvmppc_get_gpr(vcpu, 4) & KVM_PAM; + + rc = kvm_read_guest(vcpu->kvm, args_phys, &args, sizeof(args)); + if (rc) + goto fail; + + /* + * args->rets is a pointer into args->args. Now that we've + * copied args we need to fix it up to point into our copy, + * not the guest args. We also need to save the original + * value so we can restore it on the way out. + */ + orig_rets = args.rets; + args.rets = &args.args[be32_to_cpu(args.nargs)]; + + mutex_lock(&vcpu->kvm->lock); + + rc = -ENOENT; + list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) { + if (d->token == be32_to_cpu(args.token)) { + d->handler->handler(vcpu, &args); + rc = 0; + break; + } + } + + mutex_unlock(&vcpu->kvm->lock); + + if (rc == 0) { + args.rets = orig_rets; + rc = kvm_write_guest(vcpu->kvm, args_phys, &args, sizeof(args)); + if (rc) + goto fail; + } + + return rc; + +fail: + /* + * We only get here if the guest has called RTAS with a bogus + * args pointer. 
That means we can't get to the args, and so we + * can't fail the RTAS call. So fail right out to userspace, + * which should kill the guest. + */ + return rc; +} +EXPORT_SYMBOL_GPL(kvmppc_rtas_hcall); + +void kvmppc_rtas_tokens_free(struct kvm *kvm) +{ + struct rtas_token_definition *d, *tmp; + + lockdep_assert_held(&kvm->lock); + + list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) { + list_del(&d->list); + kfree(d); + } +} diff --git a/kernel/arch/powerpc/kvm/book3s_segment.S b/kernel/arch/powerpc/kvm/book3s_segment.S new file mode 100644 index 000000000..acee37cde --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_segment.S @@ -0,0 +1,393 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * + * Copyright SUSE Linux Products GmbH 2010 + * + * Authors: Alexander Graf <agraf@suse.de> + */ + +/* Real mode helpers */ + +#if defined(CONFIG_PPC_BOOK3S_64) + +#define GET_SHADOW_VCPU(reg) \ + mr reg, r13 + +#elif defined(CONFIG_PPC_BOOK3S_32) + +#define GET_SHADOW_VCPU(reg) \ + tophys(reg, r2); \ + lwz reg, (THREAD + THREAD_KVM_SVCPU)(reg); \ + tophys(reg, reg) + +#endif + +/* Disable for nested KVM */ +#define USE_QUICK_LAST_INST + + +/* Get helper functions for subarch specific functionality */ + +#if defined(CONFIG_PPC_BOOK3S_64) +#include "book3s_64_slb.S" +#elif defined(CONFIG_PPC_BOOK3S_32) +#include "book3s_32_sr.S" +#endif + +/****************************************************************************** + * * + * Entry code * + * * + *****************************************************************************/ + +.global kvmppc_handler_trampoline_enter +kvmppc_handler_trampoline_enter: + + /* Required state: + * + * MSR = ~IR|DR + * R1 = host R1 + * R2 = host R2 + * R4 = guest shadow MSR + * R5 = normal host MSR + * R6 = current host MSR (EE, IR, DR off) + * LR = highmem guest exit code + * all other volatile GPRS = free + * SVCPU[CR] = guest CR + * SVCPU[XER] = guest XER + * SVCPU[CTR] = guest CTR + * SVCPU[LR] = guest LR + */ + + /* r3 = shadow vcpu */ + GET_SHADOW_VCPU(r3) + + /* Save guest exit handler address and MSR */ + mflr r0 + PPC_STL r0, HSTATE_VMHANDLER(r3) + PPC_STL r5, HSTATE_HOST_MSR(r3) + + /* Save R1/R2 in the PACA (64-bit) or shadow_vcpu (32-bit) */ + PPC_STL r1, HSTATE_HOST_R1(r3) + PPC_STL r2, HSTATE_HOST_R2(r3) + + /* Activate guest mode, so faults get handled by KVM */ + li r11, KVM_GUEST_MODE_GUEST + stb r11, HSTATE_IN_GUEST(r3) + + /* Switch to guest segment. This is subarch specific. 
*/ + LOAD_GUEST_SEGMENTS + +#ifdef CONFIG_PPC_BOOK3S_64 +BEGIN_FTR_SECTION + /* Save host FSCR */ + mfspr r8, SPRN_FSCR + std r8, HSTATE_HOST_FSCR(r13) + /* Set FSCR during guest execution */ + ld r9, SVCPU_SHADOW_FSCR(r13) + mtspr SPRN_FSCR, r9 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + + /* Some guests may need to have dcbz set to 32 byte length. + * + * Usually we ensure that by patching the guest's instructions + * to trap on dcbz and emulate it in the hypervisor. + * + * If we can, we should tell the CPU to use 32 byte dcbz though, + * because that's a lot faster. + */ + lbz r0, HSTATE_RESTORE_HID5(r3) + cmpwi r0, 0 + beq no_dcbz32_on + + mfspr r0,SPRN_HID5 + ori r0, r0, 0x80 /* XXX HID5_dcbz32 = 0x80 */ + mtspr SPRN_HID5,r0 +no_dcbz32_on: + +#endif /* CONFIG_PPC_BOOK3S_64 */ + + /* Enter guest */ + + PPC_LL r8, SVCPU_CTR(r3) + PPC_LL r9, SVCPU_LR(r3) + lwz r10, SVCPU_CR(r3) + lwz r11, SVCPU_XER(r3) + + mtctr r8 + mtlr r9 + mtcr r10 + mtxer r11 + + /* Move SRR0 and SRR1 into the respective regs */ + PPC_LL r9, SVCPU_PC(r3) + /* First clear RI in our current MSR value */ + li r0, MSR_RI + andc r6, r6, r0 + + PPC_LL r0, SVCPU_R0(r3) + PPC_LL r1, SVCPU_R1(r3) + PPC_LL r2, SVCPU_R2(r3) + PPC_LL r5, SVCPU_R5(r3) + PPC_LL r7, SVCPU_R7(r3) + PPC_LL r8, SVCPU_R8(r3) + PPC_LL r10, SVCPU_R10(r3) + PPC_LL r11, SVCPU_R11(r3) + PPC_LL r12, SVCPU_R12(r3) + PPC_LL r13, SVCPU_R13(r3) + + MTMSR_EERI(r6) + mtsrr0 r9 + mtsrr1 r4 + + PPC_LL r4, SVCPU_R4(r3) + PPC_LL r6, SVCPU_R6(r3) + PPC_LL r9, SVCPU_R9(r3) + PPC_LL r3, (SVCPU_R3)(r3) + + RFI +kvmppc_handler_trampoline_enter_end: + + + +/****************************************************************************** + * * + * Exit code * + * * + *****************************************************************************/ + +.global kvmppc_handler_trampoline_exit +kvmppc_handler_trampoline_exit: + +.global kvmppc_interrupt_pr +kvmppc_interrupt_pr: + + /* Register usage at this point: + * + * SPRG_SCRATCH0 = guest R13 + * R12 = 
exit handler id + * R13 = shadow vcpu (32-bit) or PACA (64-bit) + * HSTATE.SCRATCH0 = guest R12 + * HSTATE.SCRATCH1 = guest CR + * + */ + + /* Save registers */ + + PPC_STL r0, SVCPU_R0(r13) + PPC_STL r1, SVCPU_R1(r13) + PPC_STL r2, SVCPU_R2(r13) + PPC_STL r3, SVCPU_R3(r13) + PPC_STL r4, SVCPU_R4(r13) + PPC_STL r5, SVCPU_R5(r13) + PPC_STL r6, SVCPU_R6(r13) + PPC_STL r7, SVCPU_R7(r13) + PPC_STL r8, SVCPU_R8(r13) + PPC_STL r9, SVCPU_R9(r13) + PPC_STL r10, SVCPU_R10(r13) + PPC_STL r11, SVCPU_R11(r13) + + /* Restore R1/R2 so we can handle faults */ + PPC_LL r1, HSTATE_HOST_R1(r13) + PPC_LL r2, HSTATE_HOST_R2(r13) + + /* Save guest PC and MSR */ +#ifdef CONFIG_PPC64 +BEGIN_FTR_SECTION + andi. r0, r12, 0x2 + cmpwi cr1, r0, 0 + beq 1f + mfspr r3,SPRN_HSRR0 + mfspr r4,SPRN_HSRR1 + andi. r12,r12,0x3ffd + b 2f +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) +#endif +1: mfsrr0 r3 + mfsrr1 r4 +2: + PPC_STL r3, SVCPU_PC(r13) + PPC_STL r4, SVCPU_SHADOW_SRR1(r13) + + /* Get scratch'ed off registers */ + GET_SCRATCH0(r9) + PPC_LL r8, HSTATE_SCRATCH0(r13) + lwz r7, HSTATE_SCRATCH1(r13) + + PPC_STL r9, SVCPU_R13(r13) + PPC_STL r8, SVCPU_R12(r13) + stw r7, SVCPU_CR(r13) + + /* Save more register state */ + + mfxer r5 + mfdar r6 + mfdsisr r7 + mfctr r8 + mflr r9 + + stw r5, SVCPU_XER(r13) + PPC_STL r6, SVCPU_FAULT_DAR(r13) + stw r7, SVCPU_FAULT_DSISR(r13) + PPC_STL r8, SVCPU_CTR(r13) + PPC_STL r9, SVCPU_LR(r13) + + /* + * In order for us to easily get the last instruction, + * we got the #vmexit at, we exploit the fact that the + * virtual layout is still the same here, so we can just + * ld from the guest's PC address + */ + + /* We only load the last instruction when it's safe */ + cmpwi r12, BOOK3S_INTERRUPT_DATA_STORAGE + beq ld_last_inst + cmpwi r12, BOOK3S_INTERRUPT_PROGRAM + beq ld_last_inst + cmpwi r12, BOOK3S_INTERRUPT_SYSCALL + beq ld_last_prev_inst + cmpwi r12, BOOK3S_INTERRUPT_ALIGNMENT + beq- ld_last_inst +#ifdef CONFIG_PPC64 +BEGIN_FTR_SECTION + cmpwi r12, 
BOOK3S_INTERRUPT_H_EMUL_ASSIST + beq- ld_last_inst +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) +BEGIN_FTR_SECTION + cmpwi r12, BOOK3S_INTERRUPT_FAC_UNAVAIL + beq- ld_last_inst +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +#endif + + b no_ld_last_inst + +ld_last_prev_inst: + addi r3, r3, -4 + +ld_last_inst: + /* Save off the guest instruction we're at */ + + /* In case lwz faults */ + li r0, KVM_INST_FETCH_FAILED + +#ifdef USE_QUICK_LAST_INST + + /* Set guest mode to 'jump over instruction' so if lwz faults + * we'll just continue at the next IP. */ + li r9, KVM_GUEST_MODE_SKIP + stb r9, HSTATE_IN_GUEST(r13) + + /* 1) enable paging for data */ + mfmsr r9 + ori r11, r9, MSR_DR /* Enable paging for data */ + mtmsr r11 + sync + /* 2) fetch the instruction */ + lwz r0, 0(r3) + /* 3) disable paging again */ + mtmsr r9 + sync + +#endif + stw r0, SVCPU_LAST_INST(r13) + +no_ld_last_inst: + + /* Unset guest mode */ + li r9, KVM_GUEST_MODE_NONE + stb r9, HSTATE_IN_GUEST(r13) + + /* Switch back to host MMU */ + LOAD_HOST_SEGMENTS + +#ifdef CONFIG_PPC_BOOK3S_64 + + lbz r5, HSTATE_RESTORE_HID5(r13) + cmpwi r5, 0 + beq no_dcbz32_off + + li r4, 0 + mfspr r5,SPRN_HID5 + rldimi r5,r4,6,56 + mtspr SPRN_HID5,r5 + +no_dcbz32_off: + +BEGIN_FTR_SECTION + /* Save guest FSCR on a FAC_UNAVAIL interrupt */ + cmpwi r12, BOOK3S_INTERRUPT_FAC_UNAVAIL + bne+ no_fscr_save + mfspr r7, SPRN_FSCR + std r7, SVCPU_SHADOW_FSCR(r13) +no_fscr_save: + /* Restore host FSCR */ + ld r8, HSTATE_HOST_FSCR(r13) + mtspr SPRN_FSCR, r8 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + +#endif /* CONFIG_PPC_BOOK3S_64 */ + + /* + * For some interrupts, we need to call the real Linux + * handler, so it can do work for us. This has to happen + * as if the interrupt arrived from the kernel though, + * so let's fake it here where most state is restored. 
+ * + * Having set up SRR0/1 with the address where we want + * to continue with relocation on (potentially in module + * space), we either just go straight there with rfi[d], + * or we jump to an interrupt handler if there is an + * interrupt to be handled first. In the latter case, + * the rfi[d] at the end of the interrupt handler will + * get us back to where we want to continue. + */ + + /* Register usage at this point: + * + * R1 = host R1 + * R2 = host R2 + * R10 = raw exit handler id + * R12 = exit handler id + * R13 = shadow vcpu (32-bit) or PACA (64-bit) + * SVCPU.* = guest * + * + */ + + PPC_LL r6, HSTATE_HOST_MSR(r13) + PPC_LL r8, HSTATE_VMHANDLER(r13) + +#ifdef CONFIG_PPC64 +BEGIN_FTR_SECTION + beq cr1, 1f + mtspr SPRN_HSRR1, r6 + mtspr SPRN_HSRR0, r8 +END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) +#endif +1: /* Restore host msr -> SRR1 */ + mtsrr1 r6 + /* Load highmem handler address */ + mtsrr0 r8 + + /* RFI into the highmem handler, or jump to interrupt handler */ + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL + beqa BOOK3S_INTERRUPT_EXTERNAL + cmpwi r12, BOOK3S_INTERRUPT_DECREMENTER + beqa BOOK3S_INTERRUPT_DECREMENTER + cmpwi r12, BOOK3S_INTERRUPT_PERFMON + beqa BOOK3S_INTERRUPT_PERFMON + cmpwi r12, BOOK3S_INTERRUPT_DOORBELL + beqa BOOK3S_INTERRUPT_DOORBELL + + RFI +kvmppc_handler_trampoline_exit_end: diff --git a/kernel/arch/powerpc/kvm/book3s_xics.c b/kernel/arch/powerpc/kvm/book3s_xics.c new file mode 100644 index 000000000..c6ca7db64 --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_xics.c @@ -0,0 +1,1411 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/err.h> +#include <linux/gfp.h> +#include <linux/anon_inodes.h> +#include <linux/spinlock.h> + +#include <asm/uaccess.h> +#include <asm/kvm_book3s.h> +#include <asm/kvm_ppc.h> +#include <asm/hvcall.h> +#include <asm/xics.h> +#include <asm/debug.h> +#include <asm/time.h> + +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include "book3s_xics.h" + +#if 1 +#define XICS_DBG(fmt...) do { } while (0) +#else +#define XICS_DBG(fmt...) trace_printk(fmt) +#endif + +#define ENABLE_REALMODE true +#define DEBUG_REALMODE false + +/* + * LOCKING + * ======= + * + * Each ICS has a spin lock protecting the information about the IRQ + * sources and avoiding simultaneous deliveries of the same interrupt. + * + * ICP operations are done via a single compare & swap transaction + * (most ICP state fits in the union kvmppc_icp_state) + */ + +/* + * TODO + * ==== + * + * - To speed up resends, keep a bitmap of "resend" set bits in the + * ICS + * + * - Speed up server# -> ICP lookup (array ? hash table ?) + * + * - Make ICS lockless as well, or at least a per-interrupt lock or hashed + * locks array to improve scalability + */ + +/* -- ICS routines -- */ + +static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u32 new_irq); + +/* + * Return value ideally indicates how the interrupt was handled, but no + * callers look at it (given that we don't implement KVM_IRQ_LINE_STATUS), + * so just return 0. + */ +static int ics_deliver_irq(struct kvmppc_xics *xics, u32 irq, u32 level) +{ + struct ics_irq_state *state; + struct kvmppc_ics *ics; + u16 src; + + XICS_DBG("ics deliver %#x (level: %d)\n", irq, level); + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) { + XICS_DBG("ics_deliver_irq: IRQ 0x%06x not found !\n", irq); + return -EINVAL; + } + state = &ics->irq_state[src]; + if (!state->exists) + return -EINVAL; + + /* + * We set state->asserted locklessly.
This should be fine as + * we are the only setter, thus concurrent access is undefined + * to begin with. + */ + if (level == 1 || level == KVM_INTERRUPT_SET_LEVEL) + state->asserted = 1; + else if (level == 0 || level == KVM_INTERRUPT_UNSET) { + state->asserted = 0; + return 0; + } + + /* Attempt delivery */ + icp_deliver_irq(xics, NULL, irq); + + return 0; +} + +static void ics_check_resend(struct kvmppc_xics *xics, struct kvmppc_ics *ics, + struct kvmppc_icp *icp) +{ + int i; + + unsigned long flags; + + local_irq_save(flags); + arch_spin_lock(&ics->lock); + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + struct ics_irq_state *state = &ics->irq_state[i]; + + if (!state->resend) + continue; + + XICS_DBG("resend %#x prio %#x\n", state->number, + state->priority); + + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); + icp_deliver_irq(xics, icp, state->number); + local_irq_save(flags); + arch_spin_lock(&ics->lock); + } + + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); +} + +static bool write_xive(struct kvmppc_xics *xics, struct kvmppc_ics *ics, + struct ics_irq_state *state, + u32 server, u32 priority, u32 saved_priority) +{ + bool deliver; + unsigned long flags; + + local_irq_save(flags); + arch_spin_lock(&ics->lock); + + state->server = server; + state->priority = priority; + state->saved_priority = saved_priority; + deliver = false; + if ((state->masked_pending || state->resend) && priority != MASKED) { + state->masked_pending = 0; + deliver = true; + } + + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); + + return deliver; +} + +int kvmppc_xics_set_xive(struct kvm *kvm, u32 irq, u32 server, u32 priority) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_icp *icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u16 src; + + if (!xics) + return -ENODEV; + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return -EINVAL; + state = &ics->irq_state[src]; + + icp = kvmppc_xics_find_server(kvm, 
server); + if (!icp) + return -EINVAL; + + XICS_DBG("set_xive %#x server %#x prio %#x MP:%d RS:%d\n", + irq, server, priority, + state->masked_pending, state->resend); + + if (write_xive(xics, ics, state, server, priority, priority)) + icp_deliver_irq(xics, icp, irq); + + return 0; +} + +int kvmppc_xics_get_xive(struct kvm *kvm, u32 irq, u32 *server, u32 *priority) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u16 src; + unsigned long flags; + + if (!xics) + return -ENODEV; + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return -EINVAL; + state = &ics->irq_state[src]; + + local_irq_save(flags); + arch_spin_lock(&ics->lock); + *server = state->server; + *priority = state->priority; + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); + + return 0; +} + +int kvmppc_xics_int_on(struct kvm *kvm, u32 irq) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_icp *icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u16 src; + + if (!xics) + return -ENODEV; + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return -EINVAL; + state = &ics->irq_state[src]; + + icp = kvmppc_xics_find_server(kvm, state->server); + if (!icp) + return -EINVAL; + + if (write_xive(xics, ics, state, state->server, state->saved_priority, + state->saved_priority)) + icp_deliver_irq(xics, icp, irq); + + return 0; +} + +int kvmppc_xics_int_off(struct kvm *kvm, u32 irq) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u16 src; + + if (!xics) + return -ENODEV; + + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) + return -EINVAL; + state = &ics->irq_state[src]; + + write_xive(xics, ics, state, state->server, MASKED, state->priority); + + return 0; +} + +/* -- ICP routines, including hcalls -- */ + +static inline bool icp_try_update(struct kvmppc_icp *icp, + union kvmppc_icp_state old, + union kvmppc_icp_state new, + bool 
change_self) +{ + bool success; + + /* Calculate new output value */ + new.out_ee = (new.xisr && (new.pending_pri < new.cppr)); + + /* Attempt atomic update */ + success = cmpxchg64(&icp->state.raw, old.raw, new.raw) == old.raw; + if (!success) + goto bail; + + XICS_DBG("UPD [%04x] - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n", + icp->server_num, + old.cppr, old.mfrr, old.pending_pri, old.xisr, + old.need_resend, old.out_ee); + XICS_DBG("UPD - C:%02x M:%02x PP: %02x PI:%06x R:%d O:%d\n", + new.cppr, new.mfrr, new.pending_pri, new.xisr, + new.need_resend, new.out_ee); + /* + * Check for output state update + * + * Note that this is racy since another processor could be updating + * the state already. This is why we never clear the interrupt output + * here, we only ever set it. The clear only happens prior to doing + * an update and only by the processor itself. Currently we do it + * in Accept (H_XIRR) and Up_Cppr (H_CPPR). + * + * We also do not try to figure out whether the EE state has changed, + * we unconditionally set it if the new state calls for it. The reason + * for that is that we opportunistically remove the pending interrupt + * flag when raising CPPR, so we need to set it back here if an + * interrupt is still pending.
+ */ + if (new.out_ee) { + kvmppc_book3s_queue_irqprio(icp->vcpu, + BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + if (!change_self) + kvmppc_fast_vcpu_kick(icp->vcpu); + } + bail: + return success; +} + +static void icp_check_resend(struct kvmppc_xics *xics, + struct kvmppc_icp *icp) +{ + u32 icsid; + + /* Order this load with the test for need_resend in the caller */ + smp_rmb(); + for_each_set_bit(icsid, icp->resend_map, xics->max_icsid + 1) { + struct kvmppc_ics *ics = xics->ics[icsid]; + + if (!test_and_clear_bit(icsid, icp->resend_map)) + continue; + if (!ics) + continue; + ics_check_resend(xics, ics, icp); + } +} + +static bool icp_try_to_deliver(struct kvmppc_icp *icp, u32 irq, u8 priority, + u32 *reject) +{ + union kvmppc_icp_state old_state, new_state; + bool success; + + XICS_DBG("try deliver %#x(P:%#x) to server %#x\n", irq, priority, + icp->server_num); + + do { + old_state = new_state = READ_ONCE(icp->state); + + *reject = 0; + + /* See if we can deliver */ + success = new_state.cppr > priority && + new_state.mfrr > priority && + new_state.pending_pri > priority; + + /* + * If we can, check for a rejection and perform the + * delivery + */ + if (success) { + *reject = new_state.xisr; + new_state.xisr = irq; + new_state.pending_pri = priority; + } else { + /* + * If we failed to deliver we set need_resend + * so a subsequent CPPR state change causes us + * to try a new delivery. + */ + new_state.need_resend = true; + } + + } while (!icp_try_update(icp, old_state, new_state, false)); + + return success; +} + +static void icp_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u32 new_irq) +{ + struct ics_irq_state *state; + struct kvmppc_ics *ics; + u32 reject; + u16 src; + unsigned long flags; + + /* + * This is used both for initial delivery of an interrupt and + * for subsequent rejection. + * + * Rejection can be racy vs. resends. 
We have evaluated the + * rejection in an atomic ICP transaction which is now complete, + * so potentially the ICP can already accept the interrupt again. + * + * So we need to retry the delivery. Essentially the reject path + * boils down to a failed delivery. Always. + * + * Now the interrupt could also have moved to a different target, + * thus we may need to re-do the ICP lookup as well + */ + + again: + /* Get the ICS state and lock it */ + ics = kvmppc_xics_find_ics(xics, new_irq, &src); + if (!ics) { + XICS_DBG("icp_deliver_irq: IRQ 0x%06x not found !\n", new_irq); + return; + } + state = &ics->irq_state[src]; + + /* Get a lock on the ICS */ + local_irq_save(flags); + arch_spin_lock(&ics->lock); + + /* Get our server */ + if (!icp || state->server != icp->server_num) { + icp = kvmppc_xics_find_server(xics->kvm, state->server); + if (!icp) { + pr_warn("icp_deliver_irq: IRQ 0x%06x server 0x%x not found !\n", + new_irq, state->server); + goto out; + } + } + + /* Clear the resend bit of that interrupt */ + state->resend = 0; + + /* + * If masked, bail out + * + * Note: PAPR doesn't mention anything about masked pending + * when doing a resend, only when doing a delivery. + * + * However that would have the effect of losing a masked + * interrupt that was rejected and isn't consistent with + * the whole masked_pending business which is about not + * losing interrupts that occur while masked. + * + * I don't differentiate normal deliveries and resends, this + * implementation will differ from PAPR and not lose such + * interrupts. + */ + if (state->priority == MASKED) { + XICS_DBG("irq %#x masked pending\n", new_irq); + state->masked_pending = 1; + goto out; + } + + /* + * Try the delivery, this will set the need_resend flag + * in the ICP as part of the atomic transaction if the + * delivery is not possible. + * + * Note that if successful, the new delivery might have itself + * rejected an interrupt that was "delivered" before we took the + * ics spin lock. 
+ * + * In this case we do the whole sequence all over again for the + * new guy. We cannot assume that the rejected interrupt is less + * favored than the new one, and thus doesn't need to be delivered, + * because by the time we exit icp_try_to_deliver() the target + * processor may well have already consumed & completed it, and thus + * the rejected interrupt might actually be already acceptable. + */ + if (icp_try_to_deliver(icp, new_irq, state->priority, &reject)) { + /* + * Delivery was successful, did we reject somebody else ? + */ + if (reject && reject != XICS_IPI) { + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); + new_irq = reject; + goto again; + } + } else { + /* + * We failed to deliver the interrupt we need to set the + * resend map bit and mark the ICS state as needing a resend + */ + set_bit(ics->icsid, icp->resend_map); + state->resend = 1; + + /* + * If the need_resend flag got cleared in the ICP some time + * between icp_try_to_deliver() atomic update and now, then + * we know it might have missed the resend_map bit. So we + * retry + */ + smp_mb(); + if (!icp->state.need_resend) { + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); + goto again; + } + } + out: + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); +} + +static void icp_down_cppr(struct kvmppc_xics *xics, struct kvmppc_icp *icp, + u8 new_cppr) +{ + union kvmppc_icp_state old_state, new_state; + bool resend; + + /* + * This handles several related states in one operation: + * + * ICP State: Down_CPPR + * + * Load CPPR with new value and if the XISR is 0 + * then check for resends: + * + * ICP State: Resend + * + * If MFRR is more favored than CPPR, check for IPIs + * and notify ICS of a potential resend. This is done + * asynchronously (when used in real mode, we will have + * to exit here). + * + * We do not handle the complete Check_IPI as documented + * here. In the PAPR, this state will be used for both + * Set_MFRR and Down_CPPR. 
However, we know that we aren't + * changing the MFRR state here so we don't need to handle + * the case of an MFRR causing a reject of a pending irq, + * this will have been handled when the MFRR was set in the + * first place. + * + * Thus we don't have to handle rejects, only resends. + * + * When implementing real mode for HV KVM, resend will lead to + * a H_TOO_HARD return and the whole transaction will be handled + * in virtual mode. + */ + do { + old_state = new_state = READ_ONCE(icp->state); + + /* Down_CPPR */ + new_state.cppr = new_cppr; + + /* + * Cut down Resend / Check_IPI / IPI + * + * The logic is that we cannot have a pending interrupt + * trumped by an IPI at this point (see above), so we + * know that either the pending interrupt is already an + * IPI (in which case we don't care to override it) or + * it's either more favored than us or non existent + */ + if (new_state.mfrr < new_cppr && + new_state.mfrr <= new_state.pending_pri) { + WARN_ON(new_state.xisr != XICS_IPI && + new_state.xisr != 0); + new_state.pending_pri = new_state.mfrr; + new_state.xisr = XICS_IPI; + } + + /* Latch/clear resend bit */ + resend = new_state.need_resend; + new_state.need_resend = 0; + + } while (!icp_try_update(icp, old_state, new_state, true)); + + /* + * Now handle resend checks. 
Those are asynchronous to the ICP + * state update in HW (ie bus transactions) so we can handle them + * separately here too + */ + if (resend) + icp_check_resend(xics, icp); +} + +static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 xirr; + + /* First, remove EE from the processor */ + kvmppc_book3s_dequeue_irqprio(icp->vcpu, + BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + + /* + * ICP State: Accept_Interrupt + * + * Return the pending interrupt (if any) along with the + * current CPPR, then clear the XISR & set CPPR to the + * pending priority + */ + do { + old_state = new_state = READ_ONCE(icp->state); + + xirr = old_state.xisr | (((u32)old_state.cppr) << 24); + if (!old_state.xisr) + break; + new_state.cppr = new_state.pending_pri; + new_state.pending_pri = 0xff; + new_state.xisr = 0; + + } while (!icp_try_update(icp, old_state, new_state, true)); + + XICS_DBG("h_xirr vcpu %d xirr %#x\n", vcpu->vcpu_id, xirr); + + return xirr; +} + +static noinline int kvmppc_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp; + u32 reject; + bool resend; + bool local; + + XICS_DBG("h_ipi vcpu %d to server %lu mfrr %#lx\n", + vcpu->vcpu_id, server, mfrr); + + icp = vcpu->arch.icp; + local = icp->server_num == server; + if (!local) { + icp = kvmppc_xics_find_server(vcpu->kvm, server); + if (!icp) + return H_PARAMETER; + } + + /* + * ICP state: Set_MFRR + * + * If the CPPR is more favored than the new MFRR, then + * nothing needs to be rejected as there can be no XISR to + * reject. If the MFRR is being made less favored then + * there might be a previously-rejected interrupt needing + * to be resent. 
+ * + * ICP state: Check_IPI + * + * If the CPPR is less favored, then we might be replacing + * an interrupt, and thus need to possibly reject it. + * + * ICP State: IPI + * + * Besides rejecting any pending interrupts, we also + * update XISR and pending_pri to mark IPI as pending. + * + * PAPR does not describe this state, but if the MFRR is being + * made less favored than its earlier value, there might be + * a previously-rejected interrupt needing to be resent. + * Ideally, we would want to resend only if + * prio(pending_interrupt) < mfrr && + * prio(pending_interrupt) < cppr + * where pending interrupt is the one that was rejected. But + * we don't have that state, so we simply trigger a resend + * whenever the MFRR is made less favored. + */ + do { + old_state = new_state = READ_ONCE(icp->state); + + /* Set_MFRR */ + new_state.mfrr = mfrr; + + /* Check_IPI */ + reject = 0; + resend = false; + if (mfrr < new_state.cppr) { + /* Reject a pending interrupt if not an IPI */ + if (mfrr <= new_state.pending_pri) { + reject = new_state.xisr; + new_state.pending_pri = mfrr; + new_state.xisr = XICS_IPI; + } + } + + if (mfrr > old_state.mfrr) { + resend = new_state.need_resend; + new_state.need_resend = 0; + } + } while (!icp_try_update(icp, old_state, new_state, local)); + + /* Handle reject */ + if (reject && reject != XICS_IPI) + icp_deliver_irq(xics, icp, reject); + + /* Handle resend */ + if (resend) + icp_check_resend(xics, icp); + + return H_SUCCESS; +} + +static int kvmppc_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server) +{ + union kvmppc_icp_state state; + struct kvmppc_icp *icp; + + icp = vcpu->arch.icp; + if (icp->server_num != server) { + icp = kvmppc_xics_find_server(vcpu->kvm, server); + if (!icp) + return H_PARAMETER; + } + state = READ_ONCE(icp->state); + kvmppc_set_gpr(vcpu, 4, ((u32)state.cppr << 24) | state.xisr); + kvmppc_set_gpr(vcpu, 5, state.mfrr); + return H_SUCCESS; +} + +static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned 
long cppr) +{ + union kvmppc_icp_state old_state, new_state; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + u32 reject; + + XICS_DBG("h_cppr vcpu %d cppr %#lx\n", vcpu->vcpu_id, cppr); + + /* + * ICP State: Set_CPPR + * + * We can safely compare the new value with the current + * value outside of the transaction as the CPPR is only + * ever changed by the processor on itself + */ + if (cppr > icp->state.cppr) + icp_down_cppr(xics, icp, cppr); + else if (cppr == icp->state.cppr) + return; + + /* + * ICP State: Up_CPPR + * + * The processor is raising its priority, this can result + * in a rejection of a pending interrupt: + * + * ICP State: Reject_Current + * + * We can remove EE from the current processor, the update + * transaction will set it again if needed + */ + kvmppc_book3s_dequeue_irqprio(icp->vcpu, + BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + + do { + old_state = new_state = READ_ONCE(icp->state); + + reject = 0; + new_state.cppr = cppr; + + if (cppr <= new_state.pending_pri) { + reject = new_state.xisr; + new_state.xisr = 0; + new_state.pending_pri = 0xff; + } + + } while (!icp_try_update(icp, old_state, new_state, true)); + + /* + * Check for rejects. They are handled by doing a new delivery + * attempt (see comments in icp_deliver_irq). + */ + if (reject && reject != XICS_IPI) + icp_deliver_irq(xics, icp, reject); +} + +static noinline int kvmppc_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + struct kvmppc_ics *ics; + struct ics_irq_state *state; + u32 irq = xirr & 0x00ffffff; + u16 src; + + XICS_DBG("h_eoi vcpu %d eoi %#lx\n", vcpu->vcpu_id, xirr); + + /* + * ICP State: EOI + * + * Note: If EOI is incorrectly used by SW to lower the CPPR + * value (ie more favored), we do not check for rejection of + * a pending interrupt, this is a SW error and PAPR specifies + * that we don't have to deal with it. 
+ * + * The sending of an EOI to the ICS is handled after the + * CPPR update + * + * ICP State: Down_CPPR which we handle + * in a separate function as it's shared with H_CPPR. + */ + icp_down_cppr(xics, icp, xirr >> 24); + + /* IPIs have no EOI */ + if (irq == XICS_IPI) + return H_SUCCESS; + /* + * EOI handling: If the interrupt is still asserted, we need to + * resend it. We can take a lockless "peek" at the ICS state here. + * + * "Message" interrupts will never have "asserted" set + */ + ics = kvmppc_xics_find_ics(xics, irq, &src); + if (!ics) { + XICS_DBG("h_eoi: IRQ 0x%06x not found !\n", irq); + return H_PARAMETER; + } + state = &ics->irq_state[src]; + + /* Still asserted, resend it */ + if (state->asserted) + icp_deliver_irq(xics, icp, irq); + + kvm_notify_acked_irq(vcpu->kvm, 0, irq); + + return H_SUCCESS; +} + +static noinline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + struct kvmppc_icp *icp = vcpu->arch.icp; + + XICS_DBG("XICS_RM: H_%x completing, act: %x state: %lx tgt: %p\n", + hcall, icp->rm_action, icp->rm_dbgstate.raw, icp->rm_dbgtgt); + + if (icp->rm_action & XICS_RM_KICK_VCPU) { + icp->n_rm_kick_vcpu++; + kvmppc_fast_vcpu_kick(icp->rm_kick_target); + } + if (icp->rm_action & XICS_RM_CHECK_RESEND) { + icp->n_rm_check_resend++; + icp_check_resend(xics, icp->rm_resend_icp); + } + if (icp->rm_action & XICS_RM_REJECT) { + icp->n_rm_reject++; + icp_deliver_irq(xics, icp, icp->rm_reject); + } + if (icp->rm_action & XICS_RM_NOTIFY_EOI) { + icp->n_rm_notify_eoi++; + kvm_notify_acked_irq(vcpu->kvm, 0, icp->rm_eoied_irq); + } + + icp->rm_action = 0; + + return H_SUCCESS; +} + +int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 req) +{ + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + unsigned long res; + int rc = H_SUCCESS; + + /* Check if we have an ICP */ + if (!xics || !vcpu->arch.icp) + return H_HARDWARE; + + /* These requests don't have real-mode implementations at present */ + 
switch (req) { + case H_XIRR_X: + res = kvmppc_h_xirr(vcpu); + kvmppc_set_gpr(vcpu, 4, res); + kvmppc_set_gpr(vcpu, 5, get_tb()); + return rc; + case H_IPOLL: + rc = kvmppc_h_ipoll(vcpu, kvmppc_get_gpr(vcpu, 4)); + return rc; + } + + /* Check for real mode returning too hard */ + if (xics->real_mode && is_kvmppc_hv_enabled(vcpu->kvm)) + return kvmppc_xics_rm_complete(vcpu, req); + + switch (req) { + case H_XIRR: + res = kvmppc_h_xirr(vcpu); + kvmppc_set_gpr(vcpu, 4, res); + break; + case H_CPPR: + kvmppc_h_cppr(vcpu, kvmppc_get_gpr(vcpu, 4)); + break; + case H_EOI: + rc = kvmppc_h_eoi(vcpu, kvmppc_get_gpr(vcpu, 4)); + break; + case H_IPI: + rc = kvmppc_h_ipi(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5)); + break; + } + + return rc; +} +EXPORT_SYMBOL_GPL(kvmppc_xics_hcall); + + +/* -- Initialisation code etc. -- */ + +static int xics_debug_show(struct seq_file *m, void *private) +{ + struct kvmppc_xics *xics = m->private; + struct kvm *kvm = xics->kvm; + struct kvm_vcpu *vcpu; + int icsid, i; + unsigned long flags; + unsigned long t_rm_kick_vcpu, t_rm_check_resend; + unsigned long t_rm_reject, t_rm_notify_eoi; + unsigned long t_reject, t_check_resend; + + if (!kvm) + return 0; + + t_rm_kick_vcpu = 0; + t_rm_notify_eoi = 0; + t_rm_check_resend = 0; + t_rm_reject = 0; + t_check_resend = 0; + t_reject = 0; + + seq_printf(m, "=========\nICP state\n=========\n"); + + kvm_for_each_vcpu(i, vcpu, kvm) { + struct kvmppc_icp *icp = vcpu->arch.icp; + union kvmppc_icp_state state; + + if (!icp) + continue; + + state.raw = READ_ONCE(icp->state.raw); + seq_printf(m, "cpu server %#lx XIRR:%#x PPRI:%#x CPPR:%#x MFRR:%#x OUT:%d NR:%d\n", + icp->server_num, state.xisr, + state.pending_pri, state.cppr, state.mfrr, + state.out_ee, state.need_resend); + t_rm_kick_vcpu += icp->n_rm_kick_vcpu; + t_rm_notify_eoi += icp->n_rm_notify_eoi; + t_rm_check_resend += icp->n_rm_check_resend; + t_rm_reject += icp->n_rm_reject; + t_check_resend += icp->n_check_resend; + t_reject += 
icp->n_reject; + } + + seq_printf(m, "ICP Guest->Host totals: kick_vcpu=%lu check_resend=%lu reject=%lu notify_eoi=%lu\n", + t_rm_kick_vcpu, t_rm_check_resend, + t_rm_reject, t_rm_notify_eoi); + seq_printf(m, "ICP Real Mode totals: check_resend=%lu resend=%lu\n", + t_check_resend, t_reject); + for (icsid = 0; icsid <= KVMPPC_XICS_MAX_ICS_ID; icsid++) { + struct kvmppc_ics *ics = xics->ics[icsid]; + + if (!ics) + continue; + + seq_printf(m, "=========\nICS state for ICS 0x%x\n=========\n", + icsid); + + local_irq_save(flags); + arch_spin_lock(&ics->lock); + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + struct ics_irq_state *irq = &ics->irq_state[i]; + + seq_printf(m, "irq 0x%06x: server %#x prio %#x save prio %#x asserted %d resend %d masked pending %d\n", + irq->number, irq->server, irq->priority, + irq->saved_priority, irq->asserted, + irq->resend, irq->masked_pending); + + } + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); + } + return 0; +} + +static int xics_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, xics_debug_show, inode->i_private); +} + +static const struct file_operations xics_debug_fops = { + .open = xics_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static void xics_debugfs_init(struct kvmppc_xics *xics) +{ + char *name; + + name = kasprintf(GFP_KERNEL, "kvm-xics-%p", xics); + if (!name) { + pr_err("%s: no memory for name\n", __func__); + return; + } + + xics->dentry = debugfs_create_file(name, S_IRUGO, powerpc_debugfs_root, + xics, &xics_debug_fops); + + pr_debug("%s: created %s\n", __func__, name); + kfree(name); +} + +static struct kvmppc_ics *kvmppc_xics_create_ics(struct kvm *kvm, + struct kvmppc_xics *xics, int irq) +{ + struct kvmppc_ics *ics; + int i, icsid; + + icsid = irq >> KVMPPC_XICS_ICS_SHIFT; + + mutex_lock(&kvm->lock); + + /* ICS already exists - somebody else got here first */ + if (xics->ics[icsid]) + goto out; + + /* Create the ICS */ + 
ics = kzalloc(sizeof(struct kvmppc_ics), GFP_KERNEL); + if (!ics) + goto out; + + ics->icsid = icsid; + + for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) { + ics->irq_state[i].number = (icsid << KVMPPC_XICS_ICS_SHIFT) | i; + ics->irq_state[i].priority = MASKED; + ics->irq_state[i].saved_priority = MASKED; + } + smp_wmb(); + xics->ics[icsid] = ics; + + if (icsid > xics->max_icsid) + xics->max_icsid = icsid; + + out: + mutex_unlock(&kvm->lock); + return xics->ics[icsid]; +} + +int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server_num) +{ + struct kvmppc_icp *icp; + + if (!vcpu->kvm->arch.xics) + return -ENODEV; + + if (kvmppc_xics_find_server(vcpu->kvm, server_num)) + return -EEXIST; + + icp = kzalloc(sizeof(struct kvmppc_icp), GFP_KERNEL); + if (!icp) + return -ENOMEM; + + icp->vcpu = vcpu; + icp->server_num = server_num; + icp->state.mfrr = MASKED; + icp->state.pending_pri = MASKED; + vcpu->arch.icp = icp; + + XICS_DBG("created server for vcpu %d\n", vcpu->vcpu_id); + + return 0; +} + +u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu) +{ + struct kvmppc_icp *icp = vcpu->arch.icp; + union kvmppc_icp_state state; + + if (!icp) + return 0; + state = icp->state; + return ((u64)state.cppr << KVM_REG_PPC_ICP_CPPR_SHIFT) | + ((u64)state.xisr << KVM_REG_PPC_ICP_XISR_SHIFT) | + ((u64)state.mfrr << KVM_REG_PPC_ICP_MFRR_SHIFT) | + ((u64)state.pending_pri << KVM_REG_PPC_ICP_PPRI_SHIFT); +} + +int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval) +{ + struct kvmppc_icp *icp = vcpu->arch.icp; + struct kvmppc_xics *xics = vcpu->kvm->arch.xics; + union kvmppc_icp_state old_state, new_state; + struct kvmppc_ics *ics; + u8 cppr, mfrr, pending_pri; + u32 xisr; + u16 src; + bool resend; + + if (!icp || !xics) + return -ENOENT; + + cppr = icpval >> KVM_REG_PPC_ICP_CPPR_SHIFT; + xisr = (icpval >> KVM_REG_PPC_ICP_XISR_SHIFT) & + KVM_REG_PPC_ICP_XISR_MASK; + mfrr = icpval >> KVM_REG_PPC_ICP_MFRR_SHIFT; + pending_pri = icpval >> KVM_REG_PPC_ICP_PPRI_SHIFT; + + /* Require 
the new state to be internally consistent */ + if (xisr == 0) { + if (pending_pri != 0xff) + return -EINVAL; + } else if (xisr == XICS_IPI) { + if (pending_pri != mfrr || pending_pri >= cppr) + return -EINVAL; + } else { + if (pending_pri >= mfrr || pending_pri >= cppr) + return -EINVAL; + ics = kvmppc_xics_find_ics(xics, xisr, &src); + if (!ics) + return -EINVAL; + } + + new_state.raw = 0; + new_state.cppr = cppr; + new_state.xisr = xisr; + new_state.mfrr = mfrr; + new_state.pending_pri = pending_pri; + + /* + * Deassert the CPU interrupt request. + * icp_try_update will reassert it if necessary. + */ + kvmppc_book3s_dequeue_irqprio(icp->vcpu, + BOOK3S_INTERRUPT_EXTERNAL_LEVEL); + + /* + * Note that if we displace an interrupt from old_state.xisr, + * we don't mark it as rejected. We expect userspace to set + * the state of the interrupt sources to be consistent with + * the ICP states (either before or afterwards, which doesn't + * matter). We do handle resends due to CPPR becoming less + * favoured because that is necessary to end up with a + * consistent state in the situation where userspace restores + * the ICS states before the ICP states. 
+ */ + do { + old_state = READ_ONCE(icp->state); + + if (new_state.mfrr <= old_state.mfrr) { + resend = false; + new_state.need_resend = old_state.need_resend; + } else { + resend = old_state.need_resend; + new_state.need_resend = 0; + } + } while (!icp_try_update(icp, old_state, new_state, false)); + + if (resend) + icp_check_resend(xics, icp); + + return 0; +} + +static int xics_get_source(struct kvmppc_xics *xics, long irq, u64 addr) +{ + int ret; + struct kvmppc_ics *ics; + struct ics_irq_state *irqp; + u64 __user *ubufp = (u64 __user *) addr; + u16 idx; + u64 val, prio; + unsigned long flags; + + ics = kvmppc_xics_find_ics(xics, irq, &idx); + if (!ics) + return -ENOENT; + + irqp = &ics->irq_state[idx]; + local_irq_save(flags); + arch_spin_lock(&ics->lock); + ret = -ENOENT; + if (irqp->exists) { + val = irqp->server; + prio = irqp->priority; + if (prio == MASKED) { + val |= KVM_XICS_MASKED; + prio = irqp->saved_priority; + } + val |= prio << KVM_XICS_PRIORITY_SHIFT; + if (irqp->asserted) + val |= KVM_XICS_LEVEL_SENSITIVE | KVM_XICS_PENDING; + else if (irqp->masked_pending || irqp->resend) + val |= KVM_XICS_PENDING; + ret = 0; + } + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); + + if (!ret && put_user(val, ubufp)) + ret = -EFAULT; + + return ret; +} + +static int xics_set_source(struct kvmppc_xics *xics, long irq, u64 addr) +{ + struct kvmppc_ics *ics; + struct ics_irq_state *irqp; + u64 __user *ubufp = (u64 __user *) addr; + u16 idx; + u64 val; + u8 prio; + u32 server; + unsigned long flags; + + if (irq < KVMPPC_XICS_FIRST_IRQ || irq >= KVMPPC_XICS_NR_IRQS) + return -ENOENT; + + ics = kvmppc_xics_find_ics(xics, irq, &idx); + if (!ics) { + ics = kvmppc_xics_create_ics(xics->kvm, xics, irq); + if (!ics) + return -ENOMEM; + } + irqp = &ics->irq_state[idx]; + if (get_user(val, ubufp)) + return -EFAULT; + + server = val & KVM_XICS_DESTINATION_MASK; + prio = val >> KVM_XICS_PRIORITY_SHIFT; + if (prio != MASKED && + kvmppc_xics_find_server(xics->kvm, 
server) == NULL) + return -EINVAL; + + local_irq_save(flags); + arch_spin_lock(&ics->lock); + irqp->server = server; + irqp->saved_priority = prio; + if (val & KVM_XICS_MASKED) + prio = MASKED; + irqp->priority = prio; + irqp->resend = 0; + irqp->masked_pending = 0; + irqp->asserted = 0; + if ((val & KVM_XICS_PENDING) && (val & KVM_XICS_LEVEL_SENSITIVE)) + irqp->asserted = 1; + irqp->exists = 1; + arch_spin_unlock(&ics->lock); + local_irq_restore(flags); + + if (val & KVM_XICS_PENDING) + icp_deliver_irq(xics, NULL, irqp->number); + + return 0; +} + +int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, + bool line_status) +{ + struct kvmppc_xics *xics = kvm->arch.xics; + + return ics_deliver_irq(xics, irq, level); +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *irq_entry, struct kvm *kvm, + int irq_source_id, int level, bool line_status) +{ + if (!level) + return -1; + return kvm_set_irq(kvm, irq_source_id, irq_entry->gsi, + level, line_status); +} + +static int xics_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct kvmppc_xics *xics = dev->private; + + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + return xics_set_source(xics, attr->attr, attr->addr); + } + return -ENXIO; +} + +static int xics_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct kvmppc_xics *xics = dev->private; + + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + return xics_get_source(xics, attr->attr, attr->addr); + } + return -ENXIO; +} + +static int xics_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_XICS_GRP_SOURCES: + if (attr->attr >= KVMPPC_XICS_FIRST_IRQ && + attr->attr < KVMPPC_XICS_NR_IRQS) + return 0; + break; + } + return -ENXIO; +} + +static void kvmppc_xics_free(struct kvm_device *dev) +{ + struct kvmppc_xics *xics = dev->private; + int i; + struct kvm *kvm = xics->kvm; + + debugfs_remove(xics->dentry); + + if (kvm) + 
kvm->arch.xics = NULL; + + for (i = 0; i <= xics->max_icsid; i++) + kfree(xics->ics[i]); + kfree(xics); + kfree(dev); +} + +static int kvmppc_xics_create(struct kvm_device *dev, u32 type) +{ + struct kvmppc_xics *xics; + struct kvm *kvm = dev->kvm; + int ret = 0; + + xics = kzalloc(sizeof(*xics), GFP_KERNEL); + if (!xics) + return -ENOMEM; + + dev->private = xics; + xics->dev = dev; + xics->kvm = kvm; + + /* Already there ? */ + mutex_lock(&kvm->lock); + if (kvm->arch.xics) + ret = -EEXIST; + else + kvm->arch.xics = xics; + mutex_unlock(&kvm->lock); + + if (ret) { + kfree(xics); + return ret; + } + + xics_debugfs_init(xics); + +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + if (cpu_has_feature(CPU_FTR_ARCH_206)) { + /* Enable real mode support */ + xics->real_mode = ENABLE_REALMODE; + xics->real_mode_dbg = DEBUG_REALMODE; + } +#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ + + return 0; +} + +struct kvm_device_ops kvm_xics_ops = { + .name = "kvm-xics", + .create = kvmppc_xics_create, + .destroy = kvmppc_xics_free, + .set_attr = xics_set_attr, + .get_attr = xics_get_attr, + .has_attr = xics_has_attr, +}; + +int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, + u32 xcpu) +{ + struct kvmppc_xics *xics = dev->private; + int r = -EBUSY; + + if (dev->ops != &kvm_xics_ops) + return -EPERM; + if (xics->kvm != vcpu->kvm) + return -EPERM; + if (vcpu->arch.irq_type) + return -EBUSY; + + r = kvmppc_xics_create_icp(vcpu, xcpu); + if (!r) + vcpu->arch.irq_type = KVMPPC_IRQ_XICS; + + return r; +} + +void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) +{ + if (!vcpu->arch.icp) + return; + kfree(vcpu->arch.icp); + vcpu->arch.icp = NULL; + vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT; +} + +static int xics_set_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + return kvm_set_irq(kvm, irq_source_id, e->gsi, level, line_status); +} + +int kvm_irq_map_gsi(struct kvm *kvm, + struct kvm_kernel_irq_routing_entry 
*entries, int gsi) +{ + entries->gsi = gsi; + entries->type = KVM_IRQ_ROUTING_IRQCHIP; + entries->set = xics_set_irq; + entries->irqchip.irqchip = 0; + entries->irqchip.pin = gsi; + return 1; +} + +int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin) +{ + return pin; +} diff --git a/kernel/arch/powerpc/kvm/book3s_xics.h b/kernel/arch/powerpc/kvm/book3s_xics.h new file mode 100644 index 000000000..56ea44f98 --- /dev/null +++ b/kernel/arch/powerpc/kvm/book3s_xics.h @@ -0,0 +1,144 @@ +/* + * Copyright 2012 Michael Ellerman, IBM Corporation. + * Copyright 2012 Benjamin Herrenschmidt, IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef _KVM_PPC_BOOK3S_XICS_H +#define _KVM_PPC_BOOK3S_XICS_H + +/* + * We use a two-level tree to store interrupt source information. + * There are up to 1024 ICS nodes, each of which can represent + * 1024 sources. + */ +#define KVMPPC_XICS_MAX_ICS_ID 1023 +#define KVMPPC_XICS_ICS_SHIFT 10 +#define KVMPPC_XICS_IRQ_PER_ICS (1 << KVMPPC_XICS_ICS_SHIFT) +#define KVMPPC_XICS_SRC_MASK (KVMPPC_XICS_IRQ_PER_ICS - 1) + +/* + * Interrupt source numbers below this are reserved, for example + * 0 is "no interrupt", and 2 is used for IPIs. 
+ */ +#define KVMPPC_XICS_FIRST_IRQ 16 +#define KVMPPC_XICS_NR_IRQS ((KVMPPC_XICS_MAX_ICS_ID + 1) * \ + KVMPPC_XICS_IRQ_PER_ICS) + +/* Priority value to use for disabling an interrupt */ +#define MASKED 0xff + +/* State for one irq source */ +struct ics_irq_state { + u32 number; + u32 server; + u8 priority; + u8 saved_priority; + u8 resend; + u8 masked_pending; + u8 asserted; /* Only for LSI */ + u8 exists; +}; + +/* Atomic ICP state, updated with a single compare & swap */ +union kvmppc_icp_state { + unsigned long raw; + struct { + u8 out_ee:1; + u8 need_resend:1; + u8 cppr; + u8 mfrr; + u8 pending_pri; + u32 xisr; + }; +}; + +/* One bit per ICS */ +#define ICP_RESEND_MAP_SIZE (KVMPPC_XICS_MAX_ICS_ID / BITS_PER_LONG + 1) + +struct kvmppc_icp { + struct kvm_vcpu *vcpu; + unsigned long server_num; + union kvmppc_icp_state state; + unsigned long resend_map[ICP_RESEND_MAP_SIZE]; + + /* Real mode might find something too hard, here's the action + * it might request from virtual mode + */ +#define XICS_RM_KICK_VCPU 0x1 +#define XICS_RM_CHECK_RESEND 0x2 +#define XICS_RM_REJECT 0x4 +#define XICS_RM_NOTIFY_EOI 0x8 + u32 rm_action; + struct kvm_vcpu *rm_kick_target; + struct kvmppc_icp *rm_resend_icp; + u32 rm_reject; + u32 rm_eoied_irq; + + /* Counters for each reason we exited real mode */ + unsigned long n_rm_kick_vcpu; + unsigned long n_rm_check_resend; + unsigned long n_rm_reject; + unsigned long n_rm_notify_eoi; + /* Counters for handling ICP processing in real mode */ + unsigned long n_check_resend; + unsigned long n_reject; + + /* Debug stuff for real mode */ + union kvmppc_icp_state rm_dbgstate; + struct kvm_vcpu *rm_dbgtgt; +}; + +struct kvmppc_ics { + arch_spinlock_t lock; + u16 icsid; + struct ics_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS]; +}; + +struct kvmppc_xics { + struct kvm *kvm; + struct kvm_device *dev; + struct dentry *dentry; + u32 max_icsid; + bool real_mode; + bool real_mode_dbg; + u32 err_noics; + u32 err_noicp; + struct kvmppc_ics 
*ics[KVMPPC_XICS_MAX_ICS_ID + 1]; +}; + +static inline struct kvmppc_icp *kvmppc_xics_find_server(struct kvm *kvm, + u32 nr) +{ + struct kvm_vcpu *vcpu = NULL; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->arch.icp && nr == vcpu->arch.icp->server_num) + return vcpu->arch.icp; + } + return NULL; +} + +static inline struct kvmppc_ics *kvmppc_xics_find_ics(struct kvmppc_xics *xics, + u32 irq, u16 *source) +{ + u32 icsid = irq >> KVMPPC_XICS_ICS_SHIFT; + u16 src = irq & KVMPPC_XICS_SRC_MASK; + struct kvmppc_ics *ics; + + if (source) + *source = src; + if (icsid > KVMPPC_XICS_MAX_ICS_ID) + return NULL; + ics = xics->ics[icsid]; + if (!ics) + return NULL; + return ics; +} + + +#endif /* _KVM_PPC_BOOK3S_XICS_H */ diff --git a/kernel/arch/powerpc/kvm/booke.c b/kernel/arch/powerpc/kvm/booke.c new file mode 100644 index 000000000..6c1316a15 --- /dev/null +++ b/kernel/arch/powerpc/kvm/booke.c @@ -0,0 +1,2160 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright IBM Corp. 2007 + * Copyright 2010-2011 Freescale Semiconductor, Inc. 
+ * + * Authors: Hollis Blanchard <hollisb@us.ibm.com> + * Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com> + * Scott Wood <scottwood@freescale.com> + * Varun Sethi <varun.sethi@freescale.com> + */ + +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <linux/gfp.h> +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/fs.h> + +#include <asm/cputable.h> +#include <asm/uaccess.h> +#include <asm/kvm_ppc.h> +#include <asm/cacheflush.h> +#include <asm/dbell.h> +#include <asm/hw_irq.h> +#include <asm/irq.h> +#include <asm/time.h> + +#include "timing.h" +#include "booke.h" + +#define CREATE_TRACE_POINTS +#include "trace_booke.h" + +unsigned long kvmppc_booke_handlers; + +#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM +#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU + +struct kvm_stats_debugfs_item debugfs_entries[] = { + { "mmio", VCPU_STAT(mmio_exits) }, + { "sig", VCPU_STAT(signal_exits) }, + { "itlb_r", VCPU_STAT(itlb_real_miss_exits) }, + { "itlb_v", VCPU_STAT(itlb_virt_miss_exits) }, + { "dtlb_r", VCPU_STAT(dtlb_real_miss_exits) }, + { "dtlb_v", VCPU_STAT(dtlb_virt_miss_exits) }, + { "sysc", VCPU_STAT(syscall_exits) }, + { "isi", VCPU_STAT(isi_exits) }, + { "dsi", VCPU_STAT(dsi_exits) }, + { "inst_emu", VCPU_STAT(emulated_inst_exits) }, + { "dec", VCPU_STAT(dec_exits) }, + { "ext_intr", VCPU_STAT(ext_intr_exits) }, + { "halt_successful_poll", VCPU_STAT(halt_successful_poll) }, + { "halt_wakeup", VCPU_STAT(halt_wakeup) }, + { "doorbell", VCPU_STAT(dbell_exits) }, + { "guest doorbell", VCPU_STAT(gdbell_exits) }, + { "remote_tlb_flush", VM_STAT(remote_tlb_flush) }, + { NULL } +}; + +/* TODO: use vcpu_printf() */ +void kvmppc_dump_vcpu(struct kvm_vcpu *vcpu) +{ + int i; + + printk("pc: %08lx msr: %08llx\n", vcpu->arch.pc, vcpu->arch.shared->msr); + printk("lr: %08lx ctr: %08lx\n", vcpu->arch.lr, vcpu->arch.ctr); + printk("srr0: %08llx srr1: %08llx\n", vcpu->arch.shared->srr0, + 
vcpu->arch.shared->srr1); + + printk("exceptions: %08lx\n", vcpu->arch.pending_exceptions); + + for (i = 0; i < 32; i += 4) { + printk("gpr%02d: %08lx %08lx %08lx %08lx\n", i, + kvmppc_get_gpr(vcpu, i), + kvmppc_get_gpr(vcpu, i+1), + kvmppc_get_gpr(vcpu, i+2), + kvmppc_get_gpr(vcpu, i+3)); + } +} + +#ifdef CONFIG_SPE +void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + enable_kernel_spe(); + kvmppc_save_guest_spe(vcpu); + vcpu->arch.shadow_msr &= ~MSR_SPE; + preempt_enable(); +} + +static void kvmppc_vcpu_enable_spe(struct kvm_vcpu *vcpu) +{ + preempt_disable(); + enable_kernel_spe(); + kvmppc_load_guest_spe(vcpu); + vcpu->arch.shadow_msr |= MSR_SPE; + preempt_enable(); +} + +static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.shared->msr & MSR_SPE) { + if (!(vcpu->arch.shadow_msr & MSR_SPE)) + kvmppc_vcpu_enable_spe(vcpu); + } else if (vcpu->arch.shadow_msr & MSR_SPE) { + kvmppc_vcpu_disable_spe(vcpu); + } +} +#else +static void kvmppc_vcpu_sync_spe(struct kvm_vcpu *vcpu) +{ +} +#endif + +/* + * Load up guest vcpu FP state if it's needed. + * It also set the MSR_FP in thread so that host know + * we're holding FPU, and then host can help to save + * guest vcpu FP state if other threads require to use FPU. + * This simulates an FP unavailable fault. + * + * It requires to be called with preemption disabled. + */ +static inline void kvmppc_load_guest_fp(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_PPC_FPU + if (!(current->thread.regs->msr & MSR_FP)) { + enable_kernel_fp(); + load_fp_state(&vcpu->arch.fp); + current->thread.fp_save_area = &vcpu->arch.fp; + current->thread.regs->msr |= MSR_FP; + } +#endif +} + +/* + * Save guest vcpu FP state into thread. + * It requires to be called with preemption disabled. 
+ */ +static inline void kvmppc_save_guest_fp(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_PPC_FPU + if (current->thread.regs->msr & MSR_FP) + giveup_fpu(current); + current->thread.fp_save_area = NULL; +#endif +} + +static void kvmppc_vcpu_sync_fpu(struct kvm_vcpu *vcpu) +{ +#if defined(CONFIG_PPC_FPU) && !defined(CONFIG_KVM_BOOKE_HV) + /* We always treat the FP bit as enabled from the host + perspective, so only need to adjust the shadow MSR */ + vcpu->arch.shadow_msr &= ~MSR_FP; + vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_FP; +#endif +} + +/* + * Simulate AltiVec unavailable fault to load guest state + * from thread to AltiVec unit. + * It requires to be called with preemption disabled. + */ +static inline void kvmppc_load_guest_altivec(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_ALTIVEC + if (cpu_has_feature(CPU_FTR_ALTIVEC)) { + if (!(current->thread.regs->msr & MSR_VEC)) { + enable_kernel_altivec(); + load_vr_state(&vcpu->arch.vr); + current->thread.vr_save_area = &vcpu->arch.vr; + current->thread.regs->msr |= MSR_VEC; + } + } +#endif +} + +/* + * Save guest vcpu AltiVec state into thread. + * It requires to be called with preemption disabled. + */ +static inline void kvmppc_save_guest_altivec(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_ALTIVEC + if (cpu_has_feature(CPU_FTR_ALTIVEC)) { + if (current->thread.regs->msr & MSR_VEC) + giveup_altivec(current); + current->thread.vr_save_area = NULL; + } +#endif +} + +static void kvmppc_vcpu_sync_debug(struct kvm_vcpu *vcpu) +{ + /* Synchronize guest's desire to get debug interrupts into shadow MSR */ +#ifndef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_msr &= ~MSR_DE; + vcpu->arch.shadow_msr |= vcpu->arch.shared->msr & MSR_DE; +#endif + + /* Force enable debug interrupts when user space wants to debug */ + if (vcpu->guest_debug) { +#ifdef CONFIG_KVM_BOOKE_HV + /* + * Since there is no shadow MSR, sync MSR_DE into the guest + * visible MSR. 
+ */ + vcpu->arch.shared->msr |= MSR_DE; +#else + vcpu->arch.shadow_msr |= MSR_DE; + vcpu->arch.shared->msr &= ~MSR_DE; +#endif + } +} + +/* + * Helper function for "full" MSR writes. No need to call this if only + * EE/CE/ME/DE/RI are changing. + */ +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr) +{ + u32 old_msr = vcpu->arch.shared->msr; + +#ifdef CONFIG_KVM_BOOKE_HV + new_msr |= MSR_GS; +#endif + + vcpu->arch.shared->msr = new_msr; + + kvmppc_mmu_msr_notify(vcpu, old_msr); + kvmppc_vcpu_sync_spe(vcpu); + kvmppc_vcpu_sync_fpu(vcpu); + kvmppc_vcpu_sync_debug(vcpu); +} + +static void kvmppc_booke_queue_irqprio(struct kvm_vcpu *vcpu, + unsigned int priority) +{ + trace_kvm_booke_queue_irqprio(vcpu, priority); + set_bit(priority, &vcpu->arch.pending_exceptions); +} + +void kvmppc_core_queue_dtlb_miss(struct kvm_vcpu *vcpu, + ulong dear_flags, ulong esr_flags) +{ + vcpu->arch.queued_dear = dear_flags; + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DTLB_MISS); +} + +void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, + ulong dear_flags, ulong esr_flags) +{ + vcpu->arch.queued_dear = dear_flags; + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DATA_STORAGE); +} + +void kvmppc_core_queue_itlb_miss(struct kvm_vcpu *vcpu) +{ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS); +} + +void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong esr_flags) +{ + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_INST_STORAGE); +} + +static void kvmppc_core_queue_alignment(struct kvm_vcpu *vcpu, ulong dear_flags, + ulong esr_flags) +{ + vcpu->arch.queued_dear = dear_flags; + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALIGNMENT); +} + +void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags) +{ + vcpu->arch.queued_esr = esr_flags; + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM); 
+} + +void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) +{ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DECREMENTER); +} + +int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu) +{ + return test_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions); +} + +void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu) +{ + clear_bit(BOOKE_IRQPRIO_DECREMENTER, &vcpu->arch.pending_exceptions); +} + +void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, + struct kvm_interrupt *irq) +{ + unsigned int prio = BOOKE_IRQPRIO_EXTERNAL; + + if (irq->irq == KVM_INTERRUPT_SET_LEVEL) + prio = BOOKE_IRQPRIO_EXTERNAL_LEVEL; + + kvmppc_booke_queue_irqprio(vcpu, prio); +} + +void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu) +{ + clear_bit(BOOKE_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions); + clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); +} + +static void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu) +{ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG); +} + +static void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu) +{ + clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions); +} + +void kvmppc_core_queue_debug(struct kvm_vcpu *vcpu) +{ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DEBUG); +} + +void kvmppc_core_dequeue_debug(struct kvm_vcpu *vcpu) +{ + clear_bit(BOOKE_IRQPRIO_DEBUG, &vcpu->arch.pending_exceptions); +} + +static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ + kvmppc_set_srr0(vcpu, srr0); + kvmppc_set_srr1(vcpu, srr1); +} + +static void set_guest_csrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ + vcpu->arch.csrr0 = srr0; + vcpu->arch.csrr1 = srr1; +} + +static void set_guest_dsrr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) +{ + if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC)) { + vcpu->arch.dsrr0 = srr0; + vcpu->arch.dsrr1 = srr1; + } else { + set_guest_csrr(vcpu, srr0, srr1); + } +} + +static void set_guest_mcsrr(struct kvm_vcpu *vcpu, unsigned long 
srr0, u32 srr1) +{ + vcpu->arch.mcsrr0 = srr0; + vcpu->arch.mcsrr1 = srr1; +} + +/* Deliver the interrupt of the corresponding priority, if possible. */ +static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, + unsigned int priority) +{ + int allowed = 0; + ulong msr_mask = 0; + bool update_esr = false, update_dear = false, update_epr = false; + ulong crit_raw = vcpu->arch.shared->critical; + ulong crit_r1 = kvmppc_get_gpr(vcpu, 1); + bool crit; + bool keep_irq = false; + enum int_class int_class; + ulong new_msr = vcpu->arch.shared->msr; + + /* Truncate crit indicators in 32 bit mode */ + if (!(vcpu->arch.shared->msr & MSR_SF)) { + crit_raw &= 0xffffffff; + crit_r1 &= 0xffffffff; + } + + /* Critical section when crit == r1 */ + crit = (crit_raw == crit_r1); + /* ... and we're in supervisor mode */ + crit = crit && !(vcpu->arch.shared->msr & MSR_PR); + + if (priority == BOOKE_IRQPRIO_EXTERNAL_LEVEL) { + priority = BOOKE_IRQPRIO_EXTERNAL; + keep_irq = true; + } + + if ((priority == BOOKE_IRQPRIO_EXTERNAL) && vcpu->arch.epr_flags) + update_epr = true; + + switch (priority) { + case BOOKE_IRQPRIO_DTLB_MISS: + case BOOKE_IRQPRIO_DATA_STORAGE: + case BOOKE_IRQPRIO_ALIGNMENT: + update_dear = true; + /* fall through */ + case BOOKE_IRQPRIO_INST_STORAGE: + case BOOKE_IRQPRIO_PROGRAM: + update_esr = true; + /* fall through */ + case BOOKE_IRQPRIO_ITLB_MISS: + case BOOKE_IRQPRIO_SYSCALL: + case BOOKE_IRQPRIO_FP_UNAVAIL: +#ifdef CONFIG_SPE_POSSIBLE + case BOOKE_IRQPRIO_SPE_UNAVAIL: + case BOOKE_IRQPRIO_SPE_FP_DATA: + case BOOKE_IRQPRIO_SPE_FP_ROUND: +#endif +#ifdef CONFIG_ALTIVEC + case BOOKE_IRQPRIO_ALTIVEC_UNAVAIL: + case BOOKE_IRQPRIO_ALTIVEC_ASSIST: +#endif + case BOOKE_IRQPRIO_AP_UNAVAIL: + allowed = 1; + msr_mask = MSR_CE | MSR_ME | MSR_DE; + int_class = INT_CLASS_NONCRIT; + break; + case BOOKE_IRQPRIO_WATCHDOG: + case BOOKE_IRQPRIO_CRITICAL: + case BOOKE_IRQPRIO_DBELL_CRIT: + allowed = vcpu->arch.shared->msr & MSR_CE; + allowed = allowed && !crit; + msr_mask = 
MSR_ME; + int_class = INT_CLASS_CRIT; + break; + case BOOKE_IRQPRIO_MACHINE_CHECK: + allowed = vcpu->arch.shared->msr & MSR_ME; + allowed = allowed && !crit; + int_class = INT_CLASS_MC; + break; + case BOOKE_IRQPRIO_DECREMENTER: + case BOOKE_IRQPRIO_FIT: + keep_irq = true; + /* fall through */ + case BOOKE_IRQPRIO_EXTERNAL: + case BOOKE_IRQPRIO_DBELL: + allowed = vcpu->arch.shared->msr & MSR_EE; + allowed = allowed && !crit; + msr_mask = MSR_CE | MSR_ME | MSR_DE; + int_class = INT_CLASS_NONCRIT; + break; + case BOOKE_IRQPRIO_DEBUG: + allowed = vcpu->arch.shared->msr & MSR_DE; + allowed = allowed && !crit; + msr_mask = MSR_ME; + if (cpu_has_feature(CPU_FTR_DEBUG_LVL_EXC)) + int_class = INT_CLASS_DBG; + else + int_class = INT_CLASS_CRIT; + + break; + } + + if (allowed) { + switch (int_class) { + case INT_CLASS_NONCRIT: + set_guest_srr(vcpu, vcpu->arch.pc, + vcpu->arch.shared->msr); + break; + case INT_CLASS_CRIT: + set_guest_csrr(vcpu, vcpu->arch.pc, + vcpu->arch.shared->msr); + break; + case INT_CLASS_DBG: + set_guest_dsrr(vcpu, vcpu->arch.pc, + vcpu->arch.shared->msr); + break; + case INT_CLASS_MC: + set_guest_mcsrr(vcpu, vcpu->arch.pc, + vcpu->arch.shared->msr); + break; + } + + vcpu->arch.pc = vcpu->arch.ivpr | vcpu->arch.ivor[priority]; + if (update_esr == true) + kvmppc_set_esr(vcpu, vcpu->arch.queued_esr); + if (update_dear == true) + kvmppc_set_dar(vcpu, vcpu->arch.queued_dear); + if (update_epr == true) { + if (vcpu->arch.epr_flags & KVMPPC_EPR_USER) + kvm_make_request(KVM_REQ_EPR_EXIT, vcpu); + else if (vcpu->arch.epr_flags & KVMPPC_EPR_KERNEL) { + BUG_ON(vcpu->arch.irq_type != KVMPPC_IRQ_MPIC); + kvmppc_mpic_set_epr(vcpu); + } + } + + new_msr &= msr_mask; +#if defined(CONFIG_64BIT) + if (vcpu->arch.epcr & SPRN_EPCR_ICM) + new_msr |= MSR_CM; +#endif + kvmppc_set_msr(vcpu, new_msr); + + if (!keep_irq) + clear_bit(priority, &vcpu->arch.pending_exceptions); + } + +#ifdef CONFIG_KVM_BOOKE_HV + /* + * If an interrupt is pending but masked, raise a guest doorbell 
+ * so that we are notified when the guest enables the relevant + * MSR bit. + */ + if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_EE) + kvmppc_set_pending_interrupt(vcpu, INT_CLASS_NONCRIT); + if (vcpu->arch.pending_exceptions & BOOKE_IRQMASK_CE) + kvmppc_set_pending_interrupt(vcpu, INT_CLASS_CRIT); + if (vcpu->arch.pending_exceptions & BOOKE_IRQPRIO_MACHINE_CHECK) + kvmppc_set_pending_interrupt(vcpu, INT_CLASS_MC); +#endif + + return allowed; +} + +/* + * Return the number of jiffies until the next timeout. If the timeout is + * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA + * because the larger value can break the timer APIs. + */ +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) +{ + u64 tb, wdt_tb, wdt_ticks = 0; + u64 nr_jiffies = 0; + u32 period = TCR_GET_WP(vcpu->arch.tcr); + + wdt_tb = 1ULL << (63 - period); + tb = get_tb(); + /* + * The watchdog timeout will hapeen when TB bit corresponding + * to watchdog will toggle from 0 to 1. + */ + if (tb & wdt_tb) + wdt_ticks = wdt_tb; + + wdt_ticks += wdt_tb - (tb & (wdt_tb - 1)); + + /* Convert timebase ticks to jiffies */ + nr_jiffies = wdt_ticks; + + if (do_div(nr_jiffies, tb_ticks_per_jiffy)) + nr_jiffies++; + + return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA); +} + +static void arm_next_watchdog(struct kvm_vcpu *vcpu) +{ + unsigned long nr_jiffies; + unsigned long flags; + + /* + * If TSR_ENW and TSR_WIS are not set then no need to exit to + * userspace, so clear the KVM_REQ_WATCHDOG request. + */ + if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS)) + clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests); + + spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); + nr_jiffies = watchdog_next_timeout(vcpu); + /* + * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA + * then do not run the watchdog timer as this can break timer APIs. 
+ */ + if (nr_jiffies < NEXT_TIMER_MAX_DELTA) + mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies); + else + del_timer(&vcpu->arch.wdt_timer); + spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags); +} + +void kvmppc_watchdog_func(unsigned long data) +{ + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; + u32 tsr, new_tsr; + int final; + + do { + new_tsr = tsr = vcpu->arch.tsr; + final = 0; + + /* Time out event */ + if (tsr & TSR_ENW) { + if (tsr & TSR_WIS) + final = 1; + else + new_tsr = tsr | TSR_WIS; + } else { + new_tsr = tsr | TSR_ENW; + } + } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr); + + if (new_tsr & TSR_WIS) { + smp_wmb(); + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); + kvm_vcpu_kick(vcpu); + } + + /* + * If this is final watchdog expiry and some action is required + * then exit to userspace. + */ + if (final && (vcpu->arch.tcr & TCR_WRC_MASK) && + vcpu->arch.watchdog_enabled) { + smp_wmb(); + kvm_make_request(KVM_REQ_WATCHDOG, vcpu); + kvm_vcpu_kick(vcpu); + } + + /* + * Stop running the watchdog timer after final expiration to + * prevent the host from being flooded with timers if the + * guest sets a short period. + * Timers will resume when TSR/TCR is updated next time. 
+ */ + if (!final) + arm_next_watchdog(vcpu); +} + +static void update_timer_ints(struct kvm_vcpu *vcpu) +{ + if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) + kvmppc_core_queue_dec(vcpu); + else + kvmppc_core_dequeue_dec(vcpu); + + if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS)) + kvmppc_core_queue_watchdog(vcpu); + else + kvmppc_core_dequeue_watchdog(vcpu); +} + +static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) +{ + unsigned long *pending = &vcpu->arch.pending_exceptions; + unsigned int priority; + + priority = __ffs(*pending); + while (priority < BOOKE_IRQPRIO_MAX) { + if (kvmppc_booke_irqprio_deliver(vcpu, priority)) + break; + + priority = find_next_bit(pending, + BITS_PER_BYTE * sizeof(*pending), + priority + 1); + } + + /* Tell the guest about our interrupt status */ + vcpu->arch.shared->int_pending = !!*pending; +} + +/* Check pending exceptions and deliver one, if possible. */ +int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) +{ + int r = 0; + WARN_ON_ONCE(!irqs_disabled()); + + kvmppc_core_check_exceptions(vcpu); + + if (vcpu->requests) { + /* Exception delivery raised request; start over */ + return 1; + } + + if (vcpu->arch.shared->msr & MSR_WE) { + local_irq_enable(); + kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + hard_irq_disable(); + + kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); + r = 1; + }; + + return r; +} + +int kvmppc_core_check_requests(struct kvm_vcpu *vcpu) +{ + int r = 1; /* Indicate we want to get back into the guest */ + + if (kvm_check_request(KVM_REQ_PENDING_TIMER, vcpu)) + update_timer_ints(vcpu); +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) + kvmppc_core_flush_tlb(vcpu); +#endif + + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu)) { + vcpu->run->exit_reason = KVM_EXIT_WATCHDOG; + r = 0; + } + + if (kvm_check_request(KVM_REQ_EPR_EXIT, vcpu)) { + vcpu->run->epr.epr = 0; + 
vcpu->arch.epr_needed = true; + vcpu->run->exit_reason = KVM_EXIT_EPR; + r = 0; + } + + return r; +} + +int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) +{ + int ret, s; + struct debug_reg debug; + + if (!vcpu->arch.sane) { + kvm_run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return -EINVAL; + } + + s = kvmppc_prepare_to_enter(vcpu); + if (s <= 0) { + ret = s; + goto out; + } + /* interrupts now hard-disabled */ + +#ifdef CONFIG_PPC_FPU + /* Save userspace FPU state in stack */ + enable_kernel_fp(); + + /* + * Since we can't trap on MSR_FP in GS-mode, we consider the guest + * as always using the FPU. + */ + kvmppc_load_guest_fp(vcpu); +#endif + +#ifdef CONFIG_ALTIVEC + /* Save userspace AltiVec state in stack */ + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + enable_kernel_altivec(); + /* + * Since we can't trap on MSR_VEC in GS-mode, we consider the guest + * as always using the AltiVec. + */ + kvmppc_load_guest_altivec(vcpu); +#endif + + /* Switch to guest debug context */ + debug = vcpu->arch.dbg_reg; + switch_booke_debug_regs(&debug); + debug = current->thread.debug; + current->thread.debug = vcpu->arch.dbg_reg; + + vcpu->arch.pgdir = current->mm->pgd; + kvmppc_fix_ee_before_entry(); + + ret = __kvmppc_vcpu_run(kvm_run, vcpu); + + /* No need for kvm_guest_exit. It's done in handle_exit. + We also get here with interrupts enabled. 
*/ + + /* Switch back to user space debug context */ + switch_booke_debug_regs(&debug); + current->thread.debug = debug; + +#ifdef CONFIG_PPC_FPU + kvmppc_save_guest_fp(vcpu); +#endif + +#ifdef CONFIG_ALTIVEC + kvmppc_save_guest_altivec(vcpu); +#endif + +out: + vcpu->mode = OUTSIDE_GUEST_MODE; + return ret; +} + +static int emulation_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + enum emulation_result er; + + er = kvmppc_emulate_instruction(run, vcpu); + switch (er) { + case EMULATE_DONE: + /* don't overwrite subtypes, just account kvm_stats */ + kvmppc_account_exit_stat(vcpu, EMULATED_INST_EXITS); + /* Future optimization: only reload non-volatiles if + * they were actually modified by emulation. */ + return RESUME_GUEST_NV; + + case EMULATE_AGAIN: + return RESUME_GUEST; + + case EMULATE_FAIL: + printk(KERN_CRIT "%s: emulation at %lx failed (%08x)\n", + __func__, vcpu->arch.pc, vcpu->arch.last_inst); + /* For debugging, encode the failing instruction and + * report it to userspace. */ + run->hw.hardware_exit_reason = ~0ULL << 32; + run->hw.hardware_exit_reason |= vcpu->arch.last_inst; + kvmppc_core_queue_program(vcpu, ESR_PIL); + return RESUME_HOST; + + case EMULATE_EXIT_USER: + return RESUME_HOST; + + default: + BUG(); + } +} + +static int kvmppc_handle_debug(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + struct debug_reg *dbg_reg = &(vcpu->arch.dbg_reg); + u32 dbsr = vcpu->arch.dbsr; + + if (vcpu->guest_debug == 0) { + /* + * Debug resources belong to Guest. 
+ * Imprecise debug event is not injected + */ + if (dbsr & DBSR_IDE) { + dbsr &= ~DBSR_IDE; + if (!dbsr) + return RESUME_GUEST; + } + + if (dbsr && (vcpu->arch.shared->msr & MSR_DE) && + (vcpu->arch.dbg_reg.dbcr0 & DBCR0_IDM)) + kvmppc_core_queue_debug(vcpu); + + /* Inject a program interrupt if trap debug is not allowed */ + if ((dbsr & DBSR_TIE) && !(vcpu->arch.shared->msr & MSR_DE)) + kvmppc_core_queue_program(vcpu, ESR_PTR); + + return RESUME_GUEST; + } + + /* + * Debug resource owned by userspace. + * Clear guest dbsr (vcpu->arch.dbsr) + */ + vcpu->arch.dbsr = 0; + run->debug.arch.status = 0; + run->debug.arch.address = vcpu->arch.pc; + + if (dbsr & (DBSR_IAC1 | DBSR_IAC2 | DBSR_IAC3 | DBSR_IAC4)) { + run->debug.arch.status |= KVMPPC_DEBUG_BREAKPOINT; + } else { + if (dbsr & (DBSR_DAC1W | DBSR_DAC2W)) + run->debug.arch.status |= KVMPPC_DEBUG_WATCH_WRITE; + else if (dbsr & (DBSR_DAC1R | DBSR_DAC2R)) + run->debug.arch.status |= KVMPPC_DEBUG_WATCH_READ; + if (dbsr & (DBSR_DAC1R | DBSR_DAC1W)) + run->debug.arch.address = dbg_reg->dac1; + else if (dbsr & (DBSR_DAC2R | DBSR_DAC2W)) + run->debug.arch.address = dbg_reg->dac2; + } + + return RESUME_HOST; +} + +static void kvmppc_fill_pt_regs(struct pt_regs *regs) +{ + ulong r1, ip, msr, lr; + + asm("mr %0, 1" : "=r"(r1)); + asm("mflr %0" : "=r"(lr)); + asm("mfmsr %0" : "=r"(msr)); + asm("bl 1f; 1: mflr %0" : "=r"(ip)); + + memset(regs, 0, sizeof(*regs)); + regs->gpr[1] = r1; + regs->nip = ip; + regs->msr = msr; + regs->link = lr; +} + +/* + * For interrupts needed to be handled by host interrupt handlers, + * corresponding host handler are called from here in similar way + * (but not exact) as they are called from low level handler + * (such as from arch/powerpc/kernel/head_fsl_booke.S). 
+ */ +static void kvmppc_restart_interrupt(struct kvm_vcpu *vcpu, + unsigned int exit_nr) +{ + struct pt_regs regs; + + switch (exit_nr) { + case BOOKE_INTERRUPT_EXTERNAL: + kvmppc_fill_pt_regs(&regs); + do_IRQ(&regs); + break; + case BOOKE_INTERRUPT_DECREMENTER: + kvmppc_fill_pt_regs(&regs); + timer_interrupt(&regs); + break; +#if defined(CONFIG_PPC_DOORBELL) + case BOOKE_INTERRUPT_DOORBELL: + kvmppc_fill_pt_regs(&regs); + doorbell_exception(&regs); + break; +#endif + case BOOKE_INTERRUPT_MACHINE_CHECK: + /* FIXME */ + break; + case BOOKE_INTERRUPT_PERFORMANCE_MONITOR: + kvmppc_fill_pt_regs(&regs); + performance_monitor_exception(&regs); + break; + case BOOKE_INTERRUPT_WATCHDOG: + kvmppc_fill_pt_regs(&regs); +#ifdef CONFIG_BOOKE_WDT + WatchdogException(&regs); +#else + unknown_exception(&regs); +#endif + break; + case BOOKE_INTERRUPT_CRITICAL: + unknown_exception(&regs); + break; + case BOOKE_INTERRUPT_DEBUG: + /* Save DBSR before preemption is enabled */ + vcpu->arch.dbsr = mfspr(SPRN_DBSR); + kvmppc_clear_dbsr(); + break; + } +} + +static int kvmppc_resume_inst_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + enum emulation_result emulated, u32 last_inst) +{ + switch (emulated) { + case EMULATE_AGAIN: + return RESUME_GUEST; + + case EMULATE_FAIL: + pr_debug("%s: load instruction from guest address %lx failed\n", + __func__, vcpu->arch.pc); + /* For debugging, encode the failing instruction and + * report it to userspace. 
*/ + run->hw.hardware_exit_reason = ~0ULL << 32; + run->hw.hardware_exit_reason |= last_inst; + kvmppc_core_queue_program(vcpu, ESR_PIL); + return RESUME_HOST; + + default: + BUG(); + } +} + +/** + * kvmppc_handle_exit + * + * Return value is in the form (errcode<<2 | RESUME_FLAG_HOST | RESUME_FLAG_NV) + */ +int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int exit_nr) +{ + int r = RESUME_HOST; + int s; + int idx; + u32 last_inst = KVM_INST_FETCH_FAILED; + enum emulation_result emulated = EMULATE_DONE; + + /* update before a new last_exit_type is rewritten */ + kvmppc_update_timing_stats(vcpu); + + /* restart interrupts if they were meant for the host */ + kvmppc_restart_interrupt(vcpu, exit_nr); + + /* + * get last instruction before beeing preempted + * TODO: for e6500 check also BOOKE_INTERRUPT_LRAT_ERROR & ESR_DATA + */ + switch (exit_nr) { + case BOOKE_INTERRUPT_DATA_STORAGE: + case BOOKE_INTERRUPT_DTLB_MISS: + case BOOKE_INTERRUPT_HV_PRIV: + emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); + break; + case BOOKE_INTERRUPT_PROGRAM: + /* SW breakpoints arrive as illegal instructions on HV */ + if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) + emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); + break; + default: + break; + } + + local_irq_enable(); + + trace_kvm_exit(exit_nr, vcpu); + kvm_guest_exit(); + + run->exit_reason = KVM_EXIT_UNKNOWN; + run->ready_for_interrupt_injection = 1; + + if (emulated != EMULATE_DONE) { + r = kvmppc_resume_inst_load(run, vcpu, emulated, last_inst); + goto out; + } + + switch (exit_nr) { + case BOOKE_INTERRUPT_MACHINE_CHECK: + printk("MACHINE CHECK: %lx\n", mfspr(SPRN_MCSR)); + kvmppc_dump_vcpu(vcpu); + /* For debugging, send invalid exit reason to user space */ + run->hw.hardware_exit_reason = ~1ULL << 32; + run->hw.hardware_exit_reason |= mfspr(SPRN_MCSR); + r = RESUME_HOST; + break; + + case BOOKE_INTERRUPT_EXTERNAL: + kvmppc_account_exit(vcpu, EXT_INTR_EXITS); + r = 
RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_DECREMENTER: + kvmppc_account_exit(vcpu, DEC_EXITS); + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_WATCHDOG: + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_DOORBELL: + kvmppc_account_exit(vcpu, DBELL_EXITS); + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_GUEST_DBELL_CRIT: + kvmppc_account_exit(vcpu, GDBELL_EXITS); + + /* + * We are here because there is a pending guest interrupt + * which could not be delivered as MSR_CE or MSR_ME was not + * set. Once we break from here we will retry delivery. + */ + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_GUEST_DBELL: + kvmppc_account_exit(vcpu, GDBELL_EXITS); + + /* + * We are here because there is a pending guest interrupt + * which could not be delivered as MSR_EE was not set. Once + * we break from here we will retry delivery. + */ + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_PERFORMANCE_MONITOR: + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_HV_PRIV: + r = emulation_exit(run, vcpu); + break; + + case BOOKE_INTERRUPT_PROGRAM: + if ((vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) && + (last_inst == KVMPPC_INST_SW_BREAKPOINT)) { + /* + * We are here because of an SW breakpoint instr, + * so lets return to host to handle. + */ + r = kvmppc_handle_debug(run, vcpu); + run->exit_reason = KVM_EXIT_DEBUG; + kvmppc_account_exit(vcpu, DEBUG_EXITS); + break; + } + + if (vcpu->arch.shared->msr & (MSR_PR | MSR_GS)) { + /* + * Program traps generated by user-level software must + * be handled by the guest kernel. + * + * In GS mode, hypervisor privileged instructions trap + * on BOOKE_INTERRUPT_HV_PRIV, not here, so these are + * actual program interrupts, handled by the guest. 
+ */ + kvmppc_core_queue_program(vcpu, vcpu->arch.fault_esr); + r = RESUME_GUEST; + kvmppc_account_exit(vcpu, USR_PR_INST); + break; + } + + r = emulation_exit(run, vcpu); + break; + + case BOOKE_INTERRUPT_FP_UNAVAIL: + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL); + kvmppc_account_exit(vcpu, FP_UNAVAIL); + r = RESUME_GUEST; + break; + +#ifdef CONFIG_SPE + case BOOKE_INTERRUPT_SPE_UNAVAIL: { + if (vcpu->arch.shared->msr & MSR_SPE) + kvmppc_vcpu_enable_spe(vcpu); + else + kvmppc_booke_queue_irqprio(vcpu, + BOOKE_IRQPRIO_SPE_UNAVAIL); + r = RESUME_GUEST; + break; + } + + case BOOKE_INTERRUPT_SPE_FP_DATA: + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_DATA); + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_SPE_FP_ROUND: + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SPE_FP_ROUND); + r = RESUME_GUEST; + break; +#elif defined(CONFIG_SPE_POSSIBLE) + case BOOKE_INTERRUPT_SPE_UNAVAIL: + /* + * Guest wants SPE, but host kernel doesn't support it. Send + * an "unimplemented operation" program check to the guest. + */ + kvmppc_core_queue_program(vcpu, ESR_PUO | ESR_SPV); + r = RESUME_GUEST; + break; + + /* + * These really should never happen without CONFIG_SPE, + * as we should never enable the real MSR[SPE] in the guest. + */ + case BOOKE_INTERRUPT_SPE_FP_DATA: + case BOOKE_INTERRUPT_SPE_FP_ROUND: + printk(KERN_CRIT "%s: unexpected SPE interrupt %u at %08lx\n", + __func__, exit_nr, vcpu->arch.pc); + run->hw.hardware_exit_reason = exit_nr; + r = RESUME_HOST; + break; +#endif /* CONFIG_SPE_POSSIBLE */ + +/* + * On cores with Vector category, KVM is loaded only if CONFIG_ALTIVEC, + * see kvmppc_core_check_processor_compat(). 
+ */ +#ifdef CONFIG_ALTIVEC + case BOOKE_INTERRUPT_ALTIVEC_UNAVAIL: + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALTIVEC_UNAVAIL); + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_ALTIVEC_ASSIST: + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ALTIVEC_ASSIST); + r = RESUME_GUEST; + break; +#endif + + case BOOKE_INTERRUPT_DATA_STORAGE: + kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dear, + vcpu->arch.fault_esr); + kvmppc_account_exit(vcpu, DSI_EXITS); + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_INST_STORAGE: + kvmppc_core_queue_inst_storage(vcpu, vcpu->arch.fault_esr); + kvmppc_account_exit(vcpu, ISI_EXITS); + r = RESUME_GUEST; + break; + + case BOOKE_INTERRUPT_ALIGNMENT: + kvmppc_core_queue_alignment(vcpu, vcpu->arch.fault_dear, + vcpu->arch.fault_esr); + r = RESUME_GUEST; + break; + +#ifdef CONFIG_KVM_BOOKE_HV + case BOOKE_INTERRUPT_HV_SYSCALL: + if (!(vcpu->arch.shared->msr & MSR_PR)) { + kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); + } else { + /* + * hcall from guest userspace -- send privileged + * instruction program check. + */ + kvmppc_core_queue_program(vcpu, ESR_PPR); + } + + r = RESUME_GUEST; + break; +#else + case BOOKE_INTERRUPT_SYSCALL: + if (!(vcpu->arch.shared->msr & MSR_PR) && + (((u32)kvmppc_get_gpr(vcpu, 0)) == KVM_SC_MAGIC_R0)) { + /* KVM PV hypercalls */ + kvmppc_set_gpr(vcpu, 3, kvmppc_kvm_pv(vcpu)); + r = RESUME_GUEST; + } else { + /* Guest syscalls */ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_SYSCALL); + } + kvmppc_account_exit(vcpu, SYSCALL_EXITS); + r = RESUME_GUEST; + break; +#endif + + case BOOKE_INTERRUPT_DTLB_MISS: { + unsigned long eaddr = vcpu->arch.fault_dear; + int gtlb_index; + gpa_t gpaddr; + gfn_t gfn; + +#ifdef CONFIG_KVM_E500V2 + if (!(vcpu->arch.shared->msr & MSR_PR) && + (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) { + kvmppc_map_magic(vcpu); + kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS); + r = RESUME_GUEST; + + break; + } +#endif + + /* Check the guest TLB. 
*/ + gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr); + if (gtlb_index < 0) { + /* The guest didn't have a mapping for it. */ + kvmppc_core_queue_dtlb_miss(vcpu, + vcpu->arch.fault_dear, + vcpu->arch.fault_esr); + kvmppc_mmu_dtlb_miss(vcpu); + kvmppc_account_exit(vcpu, DTLB_REAL_MISS_EXITS); + r = RESUME_GUEST; + break; + } + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr); + gfn = gpaddr >> PAGE_SHIFT; + + if (kvm_is_visible_gfn(vcpu->kvm, gfn)) { + /* The guest TLB had a mapping, but the shadow TLB + * didn't, and it is RAM. This could be because: + * a) the entry is mapping the host kernel, or + * b) the guest used a large mapping which we're faking + * Either way, we need to satisfy the fault without + * invoking the guest. */ + kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); + kvmppc_account_exit(vcpu, DTLB_VIRT_MISS_EXITS); + r = RESUME_GUEST; + } else { + /* Guest has mapped and accessed a page which is not + * actually RAM. */ + vcpu->arch.paddr_accessed = gpaddr; + vcpu->arch.vaddr_accessed = eaddr; + r = kvmppc_emulate_mmio(run, vcpu); + kvmppc_account_exit(vcpu, MMIO_EXITS); + } + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + break; + } + + case BOOKE_INTERRUPT_ITLB_MISS: { + unsigned long eaddr = vcpu->arch.pc; + gpa_t gpaddr; + gfn_t gfn; + int gtlb_index; + + r = RESUME_GUEST; + + /* Check the guest TLB. */ + gtlb_index = kvmppc_mmu_itlb_index(vcpu, eaddr); + if (gtlb_index < 0) { + /* The guest didn't have a mapping for it. */ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_ITLB_MISS); + kvmppc_mmu_itlb_miss(vcpu); + kvmppc_account_exit(vcpu, ITLB_REAL_MISS_EXITS); + break; + } + + kvmppc_account_exit(vcpu, ITLB_VIRT_MISS_EXITS); + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr); + gfn = gpaddr >> PAGE_SHIFT; + + if (kvm_is_visible_gfn(vcpu->kvm, gfn)) { + /* The guest TLB had a mapping, but the shadow TLB + * didn't. 
This could be because: + * a) the entry is mapping the host kernel, or + * b) the guest used a large mapping which we're faking + * Either way, we need to satisfy the fault without + * invoking the guest. */ + kvmppc_mmu_map(vcpu, eaddr, gpaddr, gtlb_index); + } else { + /* Guest mapped and leaped at non-RAM! */ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_MACHINE_CHECK); + } + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + break; + } + + case BOOKE_INTERRUPT_DEBUG: { + r = kvmppc_handle_debug(run, vcpu); + if (r == RESUME_HOST) + run->exit_reason = KVM_EXIT_DEBUG; + kvmppc_account_exit(vcpu, DEBUG_EXITS); + break; + } + + default: + printk(KERN_EMERG "exit_nr %d\n", exit_nr); + BUG(); + } + +out: + /* + * To avoid clobbering exit_reason, only check for signals if we + * aren't already exiting to userspace for some other reason. + */ + if (!(r & RESUME_HOST)) { + s = kvmppc_prepare_to_enter(vcpu); + if (s <= 0) + r = (s << 2) | RESUME_HOST | (r & RESUME_FLAG_NV); + else { + /* interrupts now hard-disabled */ + kvmppc_fix_ee_before_entry(); + kvmppc_load_guest_fp(vcpu); + kvmppc_load_guest_altivec(vcpu); + } + } + + return r; +} + +static void kvmppc_set_tsr(struct kvm_vcpu *vcpu, u32 new_tsr) +{ + u32 old_tsr = vcpu->arch.tsr; + + vcpu->arch.tsr = new_tsr; + + if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) + arm_next_watchdog(vcpu); + + update_timer_ints(vcpu); +} + +/* Initial guest state: 16MB mapping 0 -> 0, PC = 0, MSR = 0, R1 = 16MB */ +int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) +{ + int i; + int r; + + vcpu->arch.pc = 0; + vcpu->arch.shared->pir = vcpu->vcpu_id; + kvmppc_set_gpr(vcpu, 1, (16<<20) - 8); /* -8 for the callee-save LR slot */ + kvmppc_set_msr(vcpu, 0); + +#ifndef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_msr = MSR_USER | MSR_IS | MSR_DS; + vcpu->arch.shadow_pid = 1; + vcpu->arch.shared->msr = 0; +#endif + + /* Eye-catching numbers so we know if the guest takes an interrupt + * before it's programmed its own IVPR/IVORs. 
*/ + vcpu->arch.ivpr = 0x55550000; + for (i = 0; i < BOOKE_IRQPRIO_MAX; i++) + vcpu->arch.ivor[i] = 0x7700 | i * 4; + + kvmppc_init_timing_stats(vcpu); + + r = kvmppc_core_vcpu_setup(vcpu); + kvmppc_sanity_check(vcpu); + return r; +} + +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) +{ + /* setup watchdog timer once */ + spin_lock_init(&vcpu->arch.wdt_lock); + setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, + (unsigned long)vcpu); + + /* + * Clear DBSR.MRR to avoid guest debug interrupt as + * this is of host interest + */ + mtspr(SPRN_DBSR, DBSR_MRR); + return 0; +} + +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + del_timer_sync(&vcpu->arch.wdt_timer); +} + +int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + int i; + + regs->pc = vcpu->arch.pc; + regs->cr = kvmppc_get_cr(vcpu); + regs->ctr = vcpu->arch.ctr; + regs->lr = vcpu->arch.lr; + regs->xer = kvmppc_get_xer(vcpu); + regs->msr = vcpu->arch.shared->msr; + regs->srr0 = kvmppc_get_srr0(vcpu); + regs->srr1 = kvmppc_get_srr1(vcpu); + regs->pid = vcpu->arch.pid; + regs->sprg0 = kvmppc_get_sprg0(vcpu); + regs->sprg1 = kvmppc_get_sprg1(vcpu); + regs->sprg2 = kvmppc_get_sprg2(vcpu); + regs->sprg3 = kvmppc_get_sprg3(vcpu); + regs->sprg4 = kvmppc_get_sprg4(vcpu); + regs->sprg5 = kvmppc_get_sprg5(vcpu); + regs->sprg6 = kvmppc_get_sprg6(vcpu); + regs->sprg7 = kvmppc_get_sprg7(vcpu); + + for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) + regs->gpr[i] = kvmppc_get_gpr(vcpu, i); + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) +{ + int i; + + vcpu->arch.pc = regs->pc; + kvmppc_set_cr(vcpu, regs->cr); + vcpu->arch.ctr = regs->ctr; + vcpu->arch.lr = regs->lr; + kvmppc_set_xer(vcpu, regs->xer); + kvmppc_set_msr(vcpu, regs->msr); + kvmppc_set_srr0(vcpu, regs->srr0); + kvmppc_set_srr1(vcpu, regs->srr1); + kvmppc_set_pid(vcpu, regs->pid); + kvmppc_set_sprg0(vcpu, regs->sprg0); + kvmppc_set_sprg1(vcpu, regs->sprg1); + 
kvmppc_set_sprg2(vcpu, regs->sprg2); + kvmppc_set_sprg3(vcpu, regs->sprg3); + kvmppc_set_sprg4(vcpu, regs->sprg4); + kvmppc_set_sprg5(vcpu, regs->sprg5); + kvmppc_set_sprg6(vcpu, regs->sprg6); + kvmppc_set_sprg7(vcpu, regs->sprg7); + + for (i = 0; i < ARRAY_SIZE(regs->gpr); i++) + kvmppc_set_gpr(vcpu, i, regs->gpr[i]); + + return 0; +} + +static void get_sregs_base(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + u64 tb = get_tb(); + + sregs->u.e.features |= KVM_SREGS_E_BASE; + + sregs->u.e.csrr0 = vcpu->arch.csrr0; + sregs->u.e.csrr1 = vcpu->arch.csrr1; + sregs->u.e.mcsr = vcpu->arch.mcsr; + sregs->u.e.esr = kvmppc_get_esr(vcpu); + sregs->u.e.dear = kvmppc_get_dar(vcpu); + sregs->u.e.tsr = vcpu->arch.tsr; + sregs->u.e.tcr = vcpu->arch.tcr; + sregs->u.e.dec = kvmppc_get_dec(vcpu, tb); + sregs->u.e.tb = tb; + sregs->u.e.vrsave = vcpu->arch.vrsave; +} + +static int set_sregs_base(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + if (!(sregs->u.e.features & KVM_SREGS_E_BASE)) + return 0; + + vcpu->arch.csrr0 = sregs->u.e.csrr0; + vcpu->arch.csrr1 = sregs->u.e.csrr1; + vcpu->arch.mcsr = sregs->u.e.mcsr; + kvmppc_set_esr(vcpu, sregs->u.e.esr); + kvmppc_set_dar(vcpu, sregs->u.e.dear); + vcpu->arch.vrsave = sregs->u.e.vrsave; + kvmppc_set_tcr(vcpu, sregs->u.e.tcr); + + if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_DEC) { + vcpu->arch.dec = sregs->u.e.dec; + kvmppc_emulate_dec(vcpu); + } + + if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) + kvmppc_set_tsr(vcpu, sregs->u.e.tsr); + + return 0; +} + +static void get_sregs_arch206(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + sregs->u.e.features |= KVM_SREGS_E_ARCH206; + + sregs->u.e.pir = vcpu->vcpu_id; + sregs->u.e.mcsrr0 = vcpu->arch.mcsrr0; + sregs->u.e.mcsrr1 = vcpu->arch.mcsrr1; + sregs->u.e.decar = vcpu->arch.decar; + sregs->u.e.ivpr = vcpu->arch.ivpr; +} + +static int set_sregs_arch206(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + if (!(sregs->u.e.features & 
KVM_SREGS_E_ARCH206)) + return 0; + + if (sregs->u.e.pir != vcpu->vcpu_id) + return -EINVAL; + + vcpu->arch.mcsrr0 = sregs->u.e.mcsrr0; + vcpu->arch.mcsrr1 = sregs->u.e.mcsrr1; + vcpu->arch.decar = sregs->u.e.decar; + vcpu->arch.ivpr = sregs->u.e.ivpr; + + return 0; +} + +int kvmppc_get_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + sregs->u.e.features |= KVM_SREGS_E_IVOR; + + sregs->u.e.ivor_low[0] = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]; + sregs->u.e.ivor_low[1] = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]; + sregs->u.e.ivor_low[2] = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]; + sregs->u.e.ivor_low[3] = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]; + sregs->u.e.ivor_low[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]; + sregs->u.e.ivor_low[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]; + sregs->u.e.ivor_low[6] = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]; + sregs->u.e.ivor_low[7] = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]; + sregs->u.e.ivor_low[8] = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]; + sregs->u.e.ivor_low[9] = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]; + sregs->u.e.ivor_low[10] = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]; + sregs->u.e.ivor_low[11] = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]; + sregs->u.e.ivor_low[12] = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]; + sregs->u.e.ivor_low[13] = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; + sregs->u.e.ivor_low[14] = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; + sregs->u.e.ivor_low[15] = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; + return 0; +} + +int kvmppc_set_sregs_ivor(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) + return 0; + + vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = sregs->u.e.ivor_low[0]; + vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = sregs->u.e.ivor_low[1]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = sregs->u.e.ivor_low[2]; + vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = sregs->u.e.ivor_low[3]; + vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = 
sregs->u.e.ivor_low[4]; + vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = sregs->u.e.ivor_low[5]; + vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = sregs->u.e.ivor_low[6]; + vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = sregs->u.e.ivor_low[7]; + vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = sregs->u.e.ivor_low[8]; + vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = sregs->u.e.ivor_low[9]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = sregs->u.e.ivor_low[10]; + vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = sregs->u.e.ivor_low[11]; + vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = sregs->u.e.ivor_low[12]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = sregs->u.e.ivor_low[13]; + vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = sregs->u.e.ivor_low[14]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = sregs->u.e.ivor_low[15]; + + return 0; +} + +int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + sregs->pvr = vcpu->arch.pvr; + + get_sregs_base(vcpu, sregs); + get_sregs_arch206(vcpu, sregs); + return vcpu->kvm->arch.kvm_ops->get_sregs(vcpu, sregs); +} + +int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + int ret; + + if (vcpu->arch.pvr != sregs->pvr) + return -EINVAL; + + ret = set_sregs_base(vcpu, sregs); + if (ret < 0) + return ret; + + ret = set_sregs_arch206(vcpu, sregs); + if (ret < 0) + return ret; + + return vcpu->kvm->arch.kvm_ops->set_sregs(vcpu, sregs); +} + +int kvmppc_get_one_reg(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + + switch (id) { + case KVM_REG_PPC_IAC1: + *val = get_reg_val(id, vcpu->arch.dbg_reg.iac1); + break; + case KVM_REG_PPC_IAC2: + *val = get_reg_val(id, vcpu->arch.dbg_reg.iac2); + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + case KVM_REG_PPC_IAC3: + *val = get_reg_val(id, vcpu->arch.dbg_reg.iac3); + break; + case KVM_REG_PPC_IAC4: + *val = get_reg_val(id, vcpu->arch.dbg_reg.iac4); + break; +#endif + case KVM_REG_PPC_DAC1: + *val = get_reg_val(id, vcpu->arch.dbg_reg.dac1); + break; + case 
KVM_REG_PPC_DAC2: + *val = get_reg_val(id, vcpu->arch.dbg_reg.dac2); + break; + case KVM_REG_PPC_EPR: { + u32 epr = kvmppc_get_epr(vcpu); + *val = get_reg_val(id, epr); + break; + } +#if defined(CONFIG_64BIT) + case KVM_REG_PPC_EPCR: + *val = get_reg_val(id, vcpu->arch.epcr); + break; +#endif + case KVM_REG_PPC_TCR: + *val = get_reg_val(id, vcpu->arch.tcr); + break; + case KVM_REG_PPC_TSR: + *val = get_reg_val(id, vcpu->arch.tsr); + break; + case KVM_REG_PPC_DEBUG_INST: + *val = get_reg_val(id, KVMPPC_INST_SW_BREAKPOINT); + break; + case KVM_REG_PPC_VRSAVE: + *val = get_reg_val(id, vcpu->arch.vrsave); + break; + default: + r = vcpu->kvm->arch.kvm_ops->get_one_reg(vcpu, id, val); + break; + } + + return r; +} + +int kvmppc_set_one_reg(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + + switch (id) { + case KVM_REG_PPC_IAC1: + vcpu->arch.dbg_reg.iac1 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_IAC2: + vcpu->arch.dbg_reg.iac2 = set_reg_val(id, *val); + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + case KVM_REG_PPC_IAC3: + vcpu->arch.dbg_reg.iac3 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_IAC4: + vcpu->arch.dbg_reg.iac4 = set_reg_val(id, *val); + break; +#endif + case KVM_REG_PPC_DAC1: + vcpu->arch.dbg_reg.dac1 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_DAC2: + vcpu->arch.dbg_reg.dac2 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_EPR: { + u32 new_epr = set_reg_val(id, *val); + kvmppc_set_epr(vcpu, new_epr); + break; + } +#if defined(CONFIG_64BIT) + case KVM_REG_PPC_EPCR: { + u32 new_epcr = set_reg_val(id, *val); + kvmppc_set_epcr(vcpu, new_epcr); + break; + } +#endif + case KVM_REG_PPC_OR_TSR: { + u32 tsr_bits = set_reg_val(id, *val); + kvmppc_set_tsr_bits(vcpu, tsr_bits); + break; + } + case KVM_REG_PPC_CLEAR_TSR: { + u32 tsr_bits = set_reg_val(id, *val); + kvmppc_clr_tsr_bits(vcpu, tsr_bits); + break; + } + case KVM_REG_PPC_TSR: { + u32 tsr = set_reg_val(id, *val); + kvmppc_set_tsr(vcpu, tsr); + break; + } 
+ case KVM_REG_PPC_TCR: { + u32 tcr = set_reg_val(id, *val); + kvmppc_set_tcr(vcpu, tcr); + break; + } + case KVM_REG_PPC_VRSAVE: + vcpu->arch.vrsave = set_reg_val(id, *val); + break; + default: + r = vcpu->kvm->arch.kvm_ops->set_one_reg(vcpu, id, val); + break; + } + + return r; +} + +int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + return -ENOTSUPP; +} + +int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) +{ + return -ENOTSUPP; +} + +int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + int r; + + r = kvmppc_core_vcpu_translate(vcpu, tr); + return r; +} + +int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log) +{ + return -ENOTSUPP; +} + +void kvmppc_core_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ +} + +int kvmppc_core_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) +{ + return 0; +} + +int kvmppc_core_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem) +{ + return 0; +} + +void kvmppc_core_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old) +{ +} + +void kvmppc_core_flush_memslot(struct kvm *kvm, struct kvm_memory_slot *memslot) +{ +} + +void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr) +{ +#if defined(CONFIG_64BIT) + vcpu->arch.epcr = new_epcr; +#ifdef CONFIG_KVM_BOOKE_HV + vcpu->arch.shadow_epcr &= ~SPRN_EPCR_GICM; + if (vcpu->arch.epcr & SPRN_EPCR_ICM) + vcpu->arch.shadow_epcr |= SPRN_EPCR_GICM; +#endif +#endif +} + +void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr) +{ + vcpu->arch.tcr = new_tcr; + arm_next_watchdog(vcpu); + update_timer_ints(vcpu); +} + +void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) +{ + set_bits(tsr_bits, &vcpu->arch.tsr); + smp_wmb(); + kvm_make_request(KVM_REQ_PENDING_TIMER, 
vcpu); + kvm_vcpu_kick(vcpu); +} + +void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) +{ + clear_bits(tsr_bits, &vcpu->arch.tsr); + + /* + * We may have stopped the watchdog due to + * being stuck on final expiration. + */ + if (tsr_bits & (TSR_ENW | TSR_WIS)) + arm_next_watchdog(vcpu); + + update_timer_ints(vcpu); +} + +void kvmppc_decrementer_func(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.tcr & TCR_ARE) { + vcpu->arch.dec = vcpu->arch.decar; + kvmppc_emulate_dec(vcpu); + } + + kvmppc_set_tsr_bits(vcpu, TSR_DIS); +} + +static int kvmppc_booke_add_breakpoint(struct debug_reg *dbg_reg, + uint64_t addr, int index) +{ + switch (index) { + case 0: + dbg_reg->dbcr0 |= DBCR0_IAC1; + dbg_reg->iac1 = addr; + break; + case 1: + dbg_reg->dbcr0 |= DBCR0_IAC2; + dbg_reg->iac2 = addr; + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + case 2: + dbg_reg->dbcr0 |= DBCR0_IAC3; + dbg_reg->iac3 = addr; + break; + case 3: + dbg_reg->dbcr0 |= DBCR0_IAC4; + dbg_reg->iac4 = addr; + break; +#endif + default: + return -EINVAL; + } + + dbg_reg->dbcr0 |= DBCR0_IDM; + return 0; +} + +static int kvmppc_booke_add_watchpoint(struct debug_reg *dbg_reg, uint64_t addr, + int type, int index) +{ + switch (index) { + case 0: + if (type & KVMPPC_DEBUG_WATCH_READ) + dbg_reg->dbcr0 |= DBCR0_DAC1R; + if (type & KVMPPC_DEBUG_WATCH_WRITE) + dbg_reg->dbcr0 |= DBCR0_DAC1W; + dbg_reg->dac1 = addr; + break; + case 1: + if (type & KVMPPC_DEBUG_WATCH_READ) + dbg_reg->dbcr0 |= DBCR0_DAC2R; + if (type & KVMPPC_DEBUG_WATCH_WRITE) + dbg_reg->dbcr0 |= DBCR0_DAC2W; + dbg_reg->dac2 = addr; + break; + default: + return -EINVAL; + } + + dbg_reg->dbcr0 |= DBCR0_IDM; + return 0; +} +void kvm_guest_protect_msr(struct kvm_vcpu *vcpu, ulong prot_bitmap, bool set) +{ + /* XXX: Add similar MSR protection for BookE-PR */ +#ifdef CONFIG_KVM_BOOKE_HV + BUG_ON(prot_bitmap & ~(MSRP_UCLEP | MSRP_DEP | MSRP_PMMP)); + if (set) { + if (prot_bitmap & MSR_UCLE) + vcpu->arch.shadow_msrp |= MSRP_UCLEP; + if (prot_bitmap & MSR_DE) + 
vcpu->arch.shadow_msrp |= MSRP_DEP; + if (prot_bitmap & MSR_PMM) + vcpu->arch.shadow_msrp |= MSRP_PMMP; + } else { + if (prot_bitmap & MSR_UCLE) + vcpu->arch.shadow_msrp &= ~MSRP_UCLEP; + if (prot_bitmap & MSR_DE) + vcpu->arch.shadow_msrp &= ~MSRP_DEP; + if (prot_bitmap & MSR_PMM) + vcpu->arch.shadow_msrp &= ~MSRP_PMMP; + } +#endif +} + +int kvmppc_xlate(struct kvm_vcpu *vcpu, ulong eaddr, enum xlate_instdata xlid, + enum xlate_readwrite xlrw, struct kvmppc_pte *pte) +{ + int gtlb_index; + gpa_t gpaddr; + +#ifdef CONFIG_KVM_E500V2 + if (!(vcpu->arch.shared->msr & MSR_PR) && + (eaddr & PAGE_MASK) == vcpu->arch.magic_page_ea) { + pte->eaddr = eaddr; + pte->raddr = (vcpu->arch.magic_page_pa & PAGE_MASK) | + (eaddr & ~PAGE_MASK); + pte->vpage = eaddr >> PAGE_SHIFT; + pte->may_read = true; + pte->may_write = true; + pte->may_execute = true; + + return 0; + } +#endif + + /* Check the guest TLB. */ + switch (xlid) { + case XLATE_INST: + gtlb_index = kvmppc_mmu_itlb_index(vcpu, eaddr); + break; + case XLATE_DATA: + gtlb_index = kvmppc_mmu_dtlb_index(vcpu, eaddr); + break; + default: + BUG(); + } + + /* Do we have a TLB entry at all? 
*/ + if (gtlb_index < 0) + return -ENOENT; + + gpaddr = kvmppc_mmu_xlate(vcpu, gtlb_index, eaddr); + + pte->eaddr = eaddr; + pte->raddr = (gpaddr & PAGE_MASK) | (eaddr & ~PAGE_MASK); + pte->vpage = eaddr >> PAGE_SHIFT; + + /* XXX read permissions from the guest TLB */ + pte->may_read = true; + pte->may_write = true; + pte->may_execute = true; + + return 0; +} + +int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, + struct kvm_guest_debug *dbg) +{ + struct debug_reg *dbg_reg; + int n, b = 0, w = 0; + + if (!(dbg->control & KVM_GUESTDBG_ENABLE)) { + vcpu->arch.dbg_reg.dbcr0 = 0; + vcpu->guest_debug = 0; + kvm_guest_protect_msr(vcpu, MSR_DE, false); + return 0; + } + + kvm_guest_protect_msr(vcpu, MSR_DE, true); + vcpu->guest_debug = dbg->control; + vcpu->arch.dbg_reg.dbcr0 = 0; + + if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) + vcpu->arch.dbg_reg.dbcr0 |= DBCR0_IDM | DBCR0_IC; + + /* Code below handles only HW breakpoints */ + dbg_reg = &(vcpu->arch.dbg_reg); + +#ifdef CONFIG_KVM_BOOKE_HV + /* + * On BookE-HV (e500mc) the guest is always executed with MSR.GS=1 + * DBCR1 and DBCR2 are set to trigger debug events when MSR.PR is 0 + */ + dbg_reg->dbcr1 = 0; + dbg_reg->dbcr2 = 0; +#else + /* + * On BookE-PR (e500v2) the guest is always executed with MSR.PR=1 + * We set DBCR1 and DBCR2 to only trigger debug events when MSR.PR + * is set. 
+ */ + dbg_reg->dbcr1 = DBCR1_IAC1US | DBCR1_IAC2US | DBCR1_IAC3US | + DBCR1_IAC4US; + dbg_reg->dbcr2 = DBCR2_DAC1US | DBCR2_DAC2US; +#endif + + if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) + return 0; + + for (n = 0; n < (KVMPPC_BOOKE_IAC_NUM + KVMPPC_BOOKE_DAC_NUM); n++) { + uint64_t addr = dbg->arch.bp[n].addr; + uint32_t type = dbg->arch.bp[n].type; + + if (type == KVMPPC_DEBUG_NONE) + continue; + + if (type & !(KVMPPC_DEBUG_WATCH_READ | + KVMPPC_DEBUG_WATCH_WRITE | + KVMPPC_DEBUG_BREAKPOINT)) + return -EINVAL; + + if (type & KVMPPC_DEBUG_BREAKPOINT) { + /* Setting H/W breakpoint */ + if (kvmppc_booke_add_breakpoint(dbg_reg, addr, b++)) + return -EINVAL; + } else { + /* Setting H/W watchpoint */ + if (kvmppc_booke_add_watchpoint(dbg_reg, addr, + type, w++)) + return -EINVAL; + } + } + + return 0; +} + +void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + vcpu->cpu = smp_processor_id(); + current->thread.kvm_vcpu = vcpu; +} + +void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu) +{ + current->thread.kvm_vcpu = NULL; + vcpu->cpu = -1; + + /* Clear pending debug event in DBSR */ + kvmppc_clear_dbsr(); +} + +void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->mmu_destroy(vcpu); +} + +int kvmppc_core_init_vm(struct kvm *kvm) +{ + return kvm->arch.kvm_ops->init_vm(kvm); +} + +struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm *kvm, unsigned int id) +{ + return kvm->arch.kvm_ops->vcpu_create(kvm, id); +} + +void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_free(vcpu); +} + +void kvmppc_core_destroy_vm(struct kvm *kvm) +{ + kvm->arch.kvm_ops->destroy_vm(kvm); +} + +void kvmppc_core_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_load(vcpu, cpu); +} + +void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu) +{ + vcpu->kvm->arch.kvm_ops->vcpu_put(vcpu); +} + +int __init kvmppc_booke_init(void) +{ +#ifndef CONFIG_KVM_BOOKE_HV + unsigned long ivor[16]; + unsigned long *handler 
= kvmppc_booke_handler_addr; + unsigned long max_ivor = 0; + unsigned long handler_len; + int i; + + /* We install our own exception handlers by hijacking IVPR. IVPR must + * be 16-bit aligned, so we need a 64KB allocation. */ + kvmppc_booke_handlers = __get_free_pages(GFP_KERNEL | __GFP_ZERO, + VCPU_SIZE_ORDER); + if (!kvmppc_booke_handlers) + return -ENOMEM; + + /* XXX make sure our handlers are smaller than Linux's */ + + /* Copy our interrupt handlers to match host IVORs. That way we don't + * have to swap the IVORs on every guest/host transition. */ + ivor[0] = mfspr(SPRN_IVOR0); + ivor[1] = mfspr(SPRN_IVOR1); + ivor[2] = mfspr(SPRN_IVOR2); + ivor[3] = mfspr(SPRN_IVOR3); + ivor[4] = mfspr(SPRN_IVOR4); + ivor[5] = mfspr(SPRN_IVOR5); + ivor[6] = mfspr(SPRN_IVOR6); + ivor[7] = mfspr(SPRN_IVOR7); + ivor[8] = mfspr(SPRN_IVOR8); + ivor[9] = mfspr(SPRN_IVOR9); + ivor[10] = mfspr(SPRN_IVOR10); + ivor[11] = mfspr(SPRN_IVOR11); + ivor[12] = mfspr(SPRN_IVOR12); + ivor[13] = mfspr(SPRN_IVOR13); + ivor[14] = mfspr(SPRN_IVOR14); + ivor[15] = mfspr(SPRN_IVOR15); + + for (i = 0; i < 16; i++) { + if (ivor[i] > max_ivor) + max_ivor = i; + + handler_len = handler[i + 1] - handler[i]; + memcpy((void *)kvmppc_booke_handlers + ivor[i], + (void *)handler[i], handler_len); + } + + handler_len = handler[max_ivor + 1] - handler[max_ivor]; + flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + + ivor[max_ivor] + handler_len); +#endif /* !BOOKE_HV */ + return 0; +} + +void __exit kvmppc_booke_exit(void) +{ + free_pages(kvmppc_booke_handlers, VCPU_SIZE_ORDER); + kvm_exit(); +} diff --git a/kernel/arch/powerpc/kvm/booke.h b/kernel/arch/powerpc/kvm/booke.h new file mode 100644 index 000000000..22ba08ea6 --- /dev/null +++ b/kernel/arch/powerpc/kvm/booke.h @@ -0,0 +1,129 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright IBM Corp. 2008 + * + * Authors: Hollis Blanchard <hollisb@us.ibm.com> + */ + +#ifndef __KVM_BOOKE_H__ +#define __KVM_BOOKE_H__ + +#include <linux/types.h> +#include <linux/kvm_host.h> +#include <asm/kvm_ppc.h> +#include <asm/switch_to.h> +#include "timing.h" + +/* interrupt priority ordering */ +#define BOOKE_IRQPRIO_DATA_STORAGE 0 +#define BOOKE_IRQPRIO_INST_STORAGE 1 +#define BOOKE_IRQPRIO_ALIGNMENT 2 +#define BOOKE_IRQPRIO_PROGRAM 3 +#define BOOKE_IRQPRIO_FP_UNAVAIL 4 +#ifdef CONFIG_SPE_POSSIBLE +#define BOOKE_IRQPRIO_SPE_UNAVAIL 5 +#define BOOKE_IRQPRIO_SPE_FP_DATA 6 +#define BOOKE_IRQPRIO_SPE_FP_ROUND 7 +#endif +#ifdef CONFIG_PPC_E500MC +#define BOOKE_IRQPRIO_ALTIVEC_UNAVAIL 5 +#define BOOKE_IRQPRIO_ALTIVEC_ASSIST 6 +#endif +#define BOOKE_IRQPRIO_SYSCALL 8 +#define BOOKE_IRQPRIO_AP_UNAVAIL 9 +#define BOOKE_IRQPRIO_DTLB_MISS 10 +#define BOOKE_IRQPRIO_ITLB_MISS 11 +#define BOOKE_IRQPRIO_MACHINE_CHECK 12 +#define BOOKE_IRQPRIO_DEBUG 13 +#define BOOKE_IRQPRIO_CRITICAL 14 +#define BOOKE_IRQPRIO_WATCHDOG 15 +#define BOOKE_IRQPRIO_EXTERNAL 16 +#define BOOKE_IRQPRIO_FIT 17 +#define BOOKE_IRQPRIO_DECREMENTER 18 +#define BOOKE_IRQPRIO_PERFORMANCE_MONITOR 19 +/* Internal pseudo-irqprio for level triggered externals */ +#define BOOKE_IRQPRIO_EXTERNAL_LEVEL 20 +#define BOOKE_IRQPRIO_DBELL 21 +#define BOOKE_IRQPRIO_DBELL_CRIT 22 +#define BOOKE_IRQPRIO_MAX 23 + +#define BOOKE_IRQMASK_EE ((1 << BOOKE_IRQPRIO_EXTERNAL_LEVEL) | \ + (1 << BOOKE_IRQPRIO_PERFORMANCE_MONITOR) | \ + (1 <<
BOOKE_IRQPRIO_DBELL) | \ + (1 << BOOKE_IRQPRIO_DECREMENTER) | \ + (1 << BOOKE_IRQPRIO_FIT) | \ + (1 << BOOKE_IRQPRIO_EXTERNAL)) + +#define BOOKE_IRQMASK_CE ((1 << BOOKE_IRQPRIO_DBELL_CRIT) | \ + (1 << BOOKE_IRQPRIO_WATCHDOG) | \ + (1 << BOOKE_IRQPRIO_CRITICAL)) + +extern unsigned long kvmppc_booke_handlers; +extern unsigned long kvmppc_booke_handler_addr[]; + +void kvmppc_set_msr(struct kvm_vcpu *vcpu, u32 new_msr); +void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr); + +void kvmppc_set_epcr(struct kvm_vcpu *vcpu, u32 new_epcr); +void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr); +void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); +void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits); + +int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val); +int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val); + +/* low-level asm code to transfer guest state */ +void kvmppc_load_guest_spe(struct kvm_vcpu *vcpu); +void kvmppc_save_guest_spe(struct kvm_vcpu *vcpu); + +/* high-level function, manages flags, host state */ +void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu); + +void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu); +void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu); + +enum int_class { + INT_CLASS_NONCRIT, + INT_CLASS_CRIT, + INT_CLASS_MC, + INT_CLASS_DBG, +}; + +void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type); + +extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu); +extern int kvmppc_core_emulate_op_e500(struct kvm_run *run, + struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val); +extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val); +extern void kvmppc_mmu_destroy_e500(struct kvm_vcpu 
*vcpu); +extern int kvmppc_core_emulate_op_e500(struct kvm_run *run, + struct kvm_vcpu *vcpu, + unsigned int inst, int *advance); +extern int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong spr_val); +extern int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, + ulong *spr_val); + +static inline void kvmppc_clear_dbsr(void) +{ + mtspr(SPRN_DBSR, mfspr(SPRN_DBSR)); +} +#endif /* __KVM_BOOKE_H__ */ diff --git a/kernel/arch/powerpc/kvm/booke_emulate.c b/kernel/arch/powerpc/kvm/booke_emulate.c new file mode 100644 index 000000000..a82f64502 --- /dev/null +++ b/kernel/arch/powerpc/kvm/booke_emulate.c @@ -0,0 +1,522 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright IBM Corp. 2008 + * Copyright 2011 Freescale Semiconductor, Inc. 
+ * + * Authors: Hollis Blanchard <hollisb@us.ibm.com> + */ + +#include <linux/kvm_host.h> +#include <asm/disassemble.h> + +#include "booke.h" + +#define OP_19_XOP_RFI 50 +#define OP_19_XOP_RFCI 51 +#define OP_19_XOP_RFDI 39 + +#define OP_31_XOP_MFMSR 83 +#define OP_31_XOP_WRTEE 131 +#define OP_31_XOP_MTMSR 146 +#define OP_31_XOP_WRTEEI 163 + +static void kvmppc_emul_rfi(struct kvm_vcpu *vcpu) +{ + vcpu->arch.pc = vcpu->arch.shared->srr0; + kvmppc_set_msr(vcpu, vcpu->arch.shared->srr1); +} + +static void kvmppc_emul_rfdi(struct kvm_vcpu *vcpu) +{ + vcpu->arch.pc = vcpu->arch.dsrr0; + kvmppc_set_msr(vcpu, vcpu->arch.dsrr1); +} + +static void kvmppc_emul_rfci(struct kvm_vcpu *vcpu) +{ + vcpu->arch.pc = vcpu->arch.csrr0; + kvmppc_set_msr(vcpu, vcpu->arch.csrr1); +} + +int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + int emulated = EMULATE_DONE; + int rs = get_rs(inst); + int rt = get_rt(inst); + + switch (get_op(inst)) { + case 19: + switch (get_xop(inst)) { + case OP_19_XOP_RFI: + kvmppc_emul_rfi(vcpu); + kvmppc_set_exit_type(vcpu, EMULATED_RFI_EXITS); + *advance = 0; + break; + + case OP_19_XOP_RFCI: + kvmppc_emul_rfci(vcpu); + kvmppc_set_exit_type(vcpu, EMULATED_RFCI_EXITS); + *advance = 0; + break; + + case OP_19_XOP_RFDI: + kvmppc_emul_rfdi(vcpu); + kvmppc_set_exit_type(vcpu, EMULATED_RFDI_EXITS); + *advance = 0; + break; + + default: + emulated = EMULATE_FAIL; + break; + } + break; + + case 31: + switch (get_xop(inst)) { + + case OP_31_XOP_MFMSR: + kvmppc_set_gpr(vcpu, rt, vcpu->arch.shared->msr); + kvmppc_set_exit_type(vcpu, EMULATED_MFMSR_EXITS); + break; + + case OP_31_XOP_MTMSR: + kvmppc_set_exit_type(vcpu, EMULATED_MTMSR_EXITS); + kvmppc_set_msr(vcpu, kvmppc_get_gpr(vcpu, rs)); + break; + + case OP_31_XOP_WRTEE: + vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE) + | (kvmppc_get_gpr(vcpu, rs) & MSR_EE); + kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); + break; + + case 
OP_31_XOP_WRTEEI: + vcpu->arch.shared->msr = (vcpu->arch.shared->msr & ~MSR_EE) + | (inst & MSR_EE); + kvmppc_set_exit_type(vcpu, EMULATED_WRTEE_EXITS); + break; + + default: + emulated = EMULATE_FAIL; + } + + break; + + default: + emulated = EMULATE_FAIL; + } + + return emulated; +} + +/* + * NOTE: some of these registers are not emulated on BOOKE_HV (GS-mode). + * Their backing store is in real registers, and these functions + * will return the wrong result if called for them in another context + * (such as debugging). + */ +int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) +{ + int emulated = EMULATE_DONE; + bool debug_inst = false; + + switch (sprn) { + case SPRN_DEAR: + vcpu->arch.shared->dar = spr_val; + break; + case SPRN_ESR: + vcpu->arch.shared->esr = spr_val; + break; + case SPRN_CSRR0: + vcpu->arch.csrr0 = spr_val; + break; + case SPRN_CSRR1: + vcpu->arch.csrr1 = spr_val; + break; + case SPRN_DSRR0: + vcpu->arch.dsrr0 = spr_val; + break; + case SPRN_DSRR1: + vcpu->arch.dsrr1 = spr_val; + break; + case SPRN_IAC1: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + vcpu->arch.dbg_reg.iac1 = spr_val; + break; + case SPRN_IAC2: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + vcpu->arch.dbg_reg.iac2 = spr_val; + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + case SPRN_IAC3: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + vcpu->arch.dbg_reg.iac3 = spr_val; + break; + case SPRN_IAC4: + /* + * If userspace is debugging guest then guest + * can not access debug registers. 
+ */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + vcpu->arch.dbg_reg.iac4 = spr_val; + break; +#endif + case SPRN_DAC1: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + vcpu->arch.dbg_reg.dac1 = spr_val; + break; + case SPRN_DAC2: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + vcpu->arch.dbg_reg.dac2 = spr_val; + break; + case SPRN_DBCR0: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + spr_val &= (DBCR0_IDM | DBCR0_IC | DBCR0_BT | DBCR0_TIE | + DBCR0_IAC1 | DBCR0_IAC2 | DBCR0_IAC3 | DBCR0_IAC4 | + DBCR0_DAC1R | DBCR0_DAC1W | DBCR0_DAC2R | DBCR0_DAC2W); + + vcpu->arch.dbg_reg.dbcr0 = spr_val; + break; + case SPRN_DBCR1: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + vcpu->arch.dbg_reg.dbcr1 = spr_val; + break; + case SPRN_DBCR2: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + debug_inst = true; + vcpu->arch.dbg_reg.dbcr2 = spr_val; + break; + case SPRN_DBSR: + /* + * If userspace is debugging guest then guest + * can not access debug registers. + */ + if (vcpu->guest_debug) + break; + + vcpu->arch.dbsr &= ~spr_val; + if (!(vcpu->arch.dbsr & ~DBSR_IDE)) + kvmppc_core_dequeue_debug(vcpu); + break; + case SPRN_TSR: + kvmppc_clr_tsr_bits(vcpu, spr_val); + break; + case SPRN_TCR: + /* + * WRC is a 2-bit field that is supposed to preserve its + * value once written to non-zero. 
+ */ + if (vcpu->arch.tcr & TCR_WRC_MASK) { + spr_val &= ~TCR_WRC_MASK; + spr_val |= vcpu->arch.tcr & TCR_WRC_MASK; + } + kvmppc_set_tcr(vcpu, spr_val); + break; + + case SPRN_DECAR: + vcpu->arch.decar = spr_val; + break; + /* + * Note: SPRG4-7 are user-readable. + * These values are loaded into the real SPRGs when resuming the + * guest (PR-mode only). + */ + case SPRN_SPRG4: + kvmppc_set_sprg4(vcpu, spr_val); + break; + case SPRN_SPRG5: + kvmppc_set_sprg5(vcpu, spr_val); + break; + case SPRN_SPRG6: + kvmppc_set_sprg6(vcpu, spr_val); + break; + case SPRN_SPRG7: + kvmppc_set_sprg7(vcpu, spr_val); + break; + + case SPRN_IVPR: + vcpu->arch.ivpr = spr_val; +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GIVPR, spr_val); +#endif + break; + case SPRN_IVOR0: + vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL] = spr_val; + break; + case SPRN_IVOR1: + vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK] = spr_val; + break; + case SPRN_IVOR2: + vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE] = spr_val; +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GIVOR2, spr_val); +#endif + break; + case SPRN_IVOR3: + vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE] = spr_val; + break; + case SPRN_IVOR4: + vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL] = spr_val; + break; + case SPRN_IVOR5: + vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT] = spr_val; + break; + case SPRN_IVOR6: + vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM] = spr_val; + break; + case SPRN_IVOR7: + vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL] = spr_val; + break; + case SPRN_IVOR8: + vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL] = spr_val; +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_GIVOR8, spr_val); +#endif + break; + case SPRN_IVOR9: + vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL] = spr_val; + break; + case SPRN_IVOR10: + vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER] = spr_val; + break; + case SPRN_IVOR11: + vcpu->arch.ivor[BOOKE_IRQPRIO_FIT] = spr_val; + break; + case SPRN_IVOR12: + vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG] = spr_val; + break; + case SPRN_IVOR13: + 
vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS] = spr_val; + break; + case SPRN_IVOR14: + vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS] = spr_val; + break; + case SPRN_IVOR15: + vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG] = spr_val; + break; + case SPRN_MCSR: + vcpu->arch.mcsr &= ~spr_val; + break; +#if defined(CONFIG_64BIT) + case SPRN_EPCR: + kvmppc_set_epcr(vcpu, spr_val); +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr); +#endif + break; +#endif + default: + emulated = EMULATE_FAIL; + } + + if (debug_inst) { + current->thread.debug = vcpu->arch.dbg_reg; + switch_booke_debug_regs(&vcpu->arch.dbg_reg); + } + return emulated; +} + +int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) +{ + int emulated = EMULATE_DONE; + + switch (sprn) { + case SPRN_IVPR: + *spr_val = vcpu->arch.ivpr; + break; + case SPRN_DEAR: + *spr_val = vcpu->arch.shared->dar; + break; + case SPRN_ESR: + *spr_val = vcpu->arch.shared->esr; + break; + case SPRN_EPR: + *spr_val = vcpu->arch.epr; + break; + case SPRN_CSRR0: + *spr_val = vcpu->arch.csrr0; + break; + case SPRN_CSRR1: + *spr_val = vcpu->arch.csrr1; + break; + case SPRN_DSRR0: + *spr_val = vcpu->arch.dsrr0; + break; + case SPRN_DSRR1: + *spr_val = vcpu->arch.dsrr1; + break; + case SPRN_IAC1: + *spr_val = vcpu->arch.dbg_reg.iac1; + break; + case SPRN_IAC2: + *spr_val = vcpu->arch.dbg_reg.iac2; + break; +#if CONFIG_PPC_ADV_DEBUG_IACS > 2 + case SPRN_IAC3: + *spr_val = vcpu->arch.dbg_reg.iac3; + break; + case SPRN_IAC4: + *spr_val = vcpu->arch.dbg_reg.iac4; + break; +#endif + case SPRN_DAC1: + *spr_val = vcpu->arch.dbg_reg.dac1; + break; + case SPRN_DAC2: + *spr_val = vcpu->arch.dbg_reg.dac2; + break; + case SPRN_DBCR0: + *spr_val = vcpu->arch.dbg_reg.dbcr0; + if (vcpu->guest_debug) + *spr_val = *spr_val | DBCR0_EDM; + break; + case SPRN_DBCR1: + *spr_val = vcpu->arch.dbg_reg.dbcr1; + break; + case SPRN_DBCR2: + *spr_val = vcpu->arch.dbg_reg.dbcr2; + break; + case SPRN_DBSR: + *spr_val = vcpu->arch.dbsr; + 
break; + case SPRN_TSR: + *spr_val = vcpu->arch.tsr; + break; + case SPRN_TCR: + *spr_val = vcpu->arch.tcr; + break; + + case SPRN_IVOR0: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_CRITICAL]; + break; + case SPRN_IVOR1: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_MACHINE_CHECK]; + break; + case SPRN_IVOR2: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]; + break; + case SPRN_IVOR3: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_INST_STORAGE]; + break; + case SPRN_IVOR4: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_EXTERNAL]; + break; + case SPRN_IVOR5: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ALIGNMENT]; + break; + case SPRN_IVOR6: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_PROGRAM]; + break; + case SPRN_IVOR7: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_FP_UNAVAIL]; + break; + case SPRN_IVOR8: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]; + break; + case SPRN_IVOR9: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_AP_UNAVAIL]; + break; + case SPRN_IVOR10: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DECREMENTER]; + break; + case SPRN_IVOR11: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_FIT]; + break; + case SPRN_IVOR12: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_WATCHDOG]; + break; + case SPRN_IVOR13: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DTLB_MISS]; + break; + case SPRN_IVOR14: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ITLB_MISS]; + break; + case SPRN_IVOR15: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DEBUG]; + break; + case SPRN_MCSR: + *spr_val = vcpu->arch.mcsr; + break; +#if defined(CONFIG_64BIT) + case SPRN_EPCR: + *spr_val = vcpu->arch.epcr; + break; +#endif + + default: + emulated = EMULATE_FAIL; + } + + return emulated; +} diff --git a/kernel/arch/powerpc/kvm/booke_interrupts.S b/kernel/arch/powerpc/kvm/booke_interrupts.S new file mode 100644 index 000000000..84c308a9a --- /dev/null +++ b/kernel/arch/powerpc/kvm/booke_interrupts.S @@ -0,0 +1,547 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of 
the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright IBM Corp. 2007 + * Copyright 2011 Freescale Semiconductor, Inc. + * + * Authors: Hollis Blanchard <hollisb@us.ibm.com> + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/asm-offsets.h> + +/* The host stack layout: */ +#define HOST_R1 0 /* Implied by stwu. */ +#define HOST_CALLEE_LR 4 +#define HOST_RUN 8 +/* r2 is special: it holds 'current', and it is made nonvolatile in the + * kernel with the -ffixed-r2 gcc option. */ +#define HOST_R2 12 +#define HOST_CR 16 +#define HOST_NV_GPRS 20 +#define __HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * 4)) +#define HOST_NV_GPR(n) __HOST_NV_GPR(__REG_##n) +#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + 4) +#define HOST_STACK_SIZE (((HOST_MIN_STACK_SIZE + 15) / 16) * 16) /* Align. */ +#define HOST_STACK_LR (HOST_STACK_SIZE + 4) /* In caller stack frame.
*/ + +#define NEED_INST_MASK ((1<<BOOKE_INTERRUPT_PROGRAM) | \ + (1<<BOOKE_INTERRUPT_DTLB_MISS) | \ + (1<<BOOKE_INTERRUPT_DEBUG)) + +#define NEED_DEAR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \ + (1<<BOOKE_INTERRUPT_DTLB_MISS) | \ + (1<<BOOKE_INTERRUPT_ALIGNMENT)) + +#define NEED_ESR_MASK ((1<<BOOKE_INTERRUPT_DATA_STORAGE) | \ + (1<<BOOKE_INTERRUPT_INST_STORAGE) | \ + (1<<BOOKE_INTERRUPT_PROGRAM) | \ + (1<<BOOKE_INTERRUPT_DTLB_MISS) | \ + (1<<BOOKE_INTERRUPT_ALIGNMENT)) + +.macro __KVM_HANDLER ivor_nr scratch srr0 + /* Get pointer to vcpu and record exit number. */ + mtspr \scratch , r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + stw r3, VCPU_GPR(R3)(r4) + stw r5, VCPU_GPR(R5)(r4) + stw r6, VCPU_GPR(R6)(r4) + mfspr r3, \scratch + mfctr r5 + stw r3, VCPU_GPR(R4)(r4) + stw r5, VCPU_CTR(r4) + mfspr r3, \srr0 + lis r6, kvmppc_resume_host@h + stw r3, VCPU_PC(r4) + li r5, \ivor_nr + ori r6, r6, kvmppc_resume_host@l + mtctr r6 + bctr +.endm + +.macro KVM_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) + __KVM_HANDLER \ivor_nr \scratch \srr0 +.endm + +.macro KVM_DBG_HANDLER ivor_nr scratch srr0 +_GLOBAL(kvmppc_handler_\ivor_nr) + mtspr \scratch, r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + stw r3, VCPU_CRIT_SAVE(r4) + mfcr r3 + mfspr r4, SPRN_CSRR1 + andi. 
r4, r4, MSR_PR + bne 1f + /* debug interrupt happened in enter/exit path */ + mfspr r4, SPRN_CSRR1 + rlwinm r4, r4, 0, ~MSR_DE + mtspr SPRN_CSRR1, r4 + lis r4, 0xffff + ori r4, r4, 0xffff + mtspr SPRN_DBSR, r4 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + mtcr r3 + lwz r3, VCPU_CRIT_SAVE(r4) + mfspr r4, \scratch + rfci +1: /* debug interrupt happened in guest */ + mtcr r3 + mfspr r4, SPRN_SPRG_THREAD + lwz r4, THREAD_KVM_VCPU(r4) + lwz r3, VCPU_CRIT_SAVE(r4) + mfspr r4, \scratch + __KVM_HANDLER \ivor_nr \scratch \srr0 +.endm + +.macro KVM_HANDLER_ADDR ivor_nr + .long kvmppc_handler_\ivor_nr +.endm + +.macro KVM_HANDLER_END + .long kvmppc_handlers_end +.endm + +_GLOBAL(kvmppc_handlers_start) +KVM_HANDLER BOOKE_INTERRUPT_CRITICAL SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 +KVM_HANDLER BOOKE_INTERRUPT_MACHINE_CHECK SPRN_SPRG_RSCRATCH_MC SPRN_MCSRR0 +KVM_HANDLER BOOKE_INTERRUPT_DATA_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_INST_STORAGE SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_EXTERNAL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_ALIGNMENT SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_PROGRAM SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_FP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_SYSCALL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_AP_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_DECREMENTER SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_FIT SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_WATCHDOG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 +KVM_HANDLER BOOKE_INTERRUPT_DTLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_ITLB_MISS SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_DBG_HANDLER BOOKE_INTERRUPT_DEBUG SPRN_SPRG_RSCRATCH_CRIT SPRN_CSRR0 +KVM_HANDLER BOOKE_INTERRUPT_SPE_UNAVAIL SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER BOOKE_INTERRUPT_SPE_FP_DATA SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +KVM_HANDLER 
BOOKE_INTERRUPT_SPE_FP_ROUND SPRN_SPRG_RSCRATCH0 SPRN_SRR0 +_GLOBAL(kvmppc_handlers_end) + +/* Registers: + * SPRG_SCRATCH0: guest r4 + * r4: vcpu pointer + * r5: KVM exit number + */ +_GLOBAL(kvmppc_resume_host) + mfcr r3 + stw r3, VCPU_CR(r4) + stw r7, VCPU_GPR(R7)(r4) + stw r8, VCPU_GPR(R8)(r4) + stw r9, VCPU_GPR(R9)(r4) + + li r6, 1 + slw r6, r6, r5 + +#ifdef CONFIG_KVM_EXIT_TIMING + /* save exit time */ +1: + mfspr r7, SPRN_TBRU + mfspr r8, SPRN_TBRL + mfspr r9, SPRN_TBRU + cmpw r9, r7 + bne 1b + stw r8, VCPU_TIMING_EXIT_TBL(r4) + stw r9, VCPU_TIMING_EXIT_TBU(r4) +#endif + + /* Save the faulting instruction and all GPRs for emulation. */ + andi. r7, r6, NEED_INST_MASK + beq ..skip_inst_copy + mfspr r9, SPRN_SRR0 + mfmsr r8 + ori r7, r8, MSR_DS + mtmsr r7 + isync + lwz r9, 0(r9) + mtmsr r8 + isync + stw r9, VCPU_LAST_INST(r4) + + stw r15, VCPU_GPR(R15)(r4) + stw r16, VCPU_GPR(R16)(r4) + stw r17, VCPU_GPR(R17)(r4) + stw r18, VCPU_GPR(R18)(r4) + stw r19, VCPU_GPR(R19)(r4) + stw r20, VCPU_GPR(R20)(r4) + stw r21, VCPU_GPR(R21)(r4) + stw r22, VCPU_GPR(R22)(r4) + stw r23, VCPU_GPR(R23)(r4) + stw r24, VCPU_GPR(R24)(r4) + stw r25, VCPU_GPR(R25)(r4) + stw r26, VCPU_GPR(R26)(r4) + stw r27, VCPU_GPR(R27)(r4) + stw r28, VCPU_GPR(R28)(r4) + stw r29, VCPU_GPR(R29)(r4) + stw r30, VCPU_GPR(R30)(r4) + stw r31, VCPU_GPR(R31)(r4) +..skip_inst_copy: + + /* Also grab DEAR and ESR before the host can clobber them. */ + + andi. r7, r6, NEED_DEAR_MASK + beq ..skip_dear + mfspr r9, SPRN_DEAR + stw r9, VCPU_FAULT_DEAR(r4) +..skip_dear: + + andi. r7, r6, NEED_ESR_MASK + beq ..skip_esr + mfspr r9, SPRN_ESR + stw r9, VCPU_FAULT_ESR(r4) +..skip_esr: + + /* Save remaining volatile guest register state to vcpu. */ + stw r0, VCPU_GPR(R0)(r4) + stw r1, VCPU_GPR(R1)(r4) + stw r2, VCPU_GPR(R2)(r4) + stw r10, VCPU_GPR(R10)(r4) + stw r11, VCPU_GPR(R11)(r4) + stw r12, VCPU_GPR(R12)(r4) + stw r13, VCPU_GPR(R13)(r4) + stw r14, VCPU_GPR(R14)(r4) /* We need a NV GPR below. 
*/ + mflr r3 + stw r3, VCPU_LR(r4) + mfxer r3 + stw r3, VCPU_XER(r4) + + /* Restore host stack pointer and PID before IVPR, since the host + * exception handlers use them. */ + lwz r1, VCPU_HOST_STACK(r4) + lwz r3, VCPU_HOST_PID(r4) + mtspr SPRN_PID, r3 + +#ifdef CONFIG_FSL_BOOKE + /* we cheat and know that Linux doesn't use PID1 which is always 0 */ + lis r3, 0 + mtspr SPRN_PID1, r3 +#endif + + /* Restore host IVPR before re-enabling interrupts. We cheat and know + * that Linux IVPR is always 0xc0000000. */ + lis r3, 0xc000 + mtspr SPRN_IVPR, r3 + + /* Switch to kernel stack and jump to handler. */ + LOAD_REG_ADDR(r3, kvmppc_handle_exit) + mtctr r3 + lwz r3, HOST_RUN(r1) + lwz r2, HOST_R2(r1) + mr r14, r4 /* Save vcpu pointer. */ + + bctrl /* kvmppc_handle_exit() */ + + /* Restore vcpu pointer and the nonvolatiles we used. */ + mr r4, r14 + lwz r14, VCPU_GPR(R14)(r4) + + /* Sometimes instruction emulation must restore complete GPR state. */ + andi. r5, r3, RESUME_FLAG_NV + beq ..skip_nv_load + lwz r15, VCPU_GPR(R15)(r4) + lwz r16, VCPU_GPR(R16)(r4) + lwz r17, VCPU_GPR(R17)(r4) + lwz r18, VCPU_GPR(R18)(r4) + lwz r19, VCPU_GPR(R19)(r4) + lwz r20, VCPU_GPR(R20)(r4) + lwz r21, VCPU_GPR(R21)(r4) + lwz r22, VCPU_GPR(R22)(r4) + lwz r23, VCPU_GPR(R23)(r4) + lwz r24, VCPU_GPR(R24)(r4) + lwz r25, VCPU_GPR(R25)(r4) + lwz r26, VCPU_GPR(R26)(r4) + lwz r27, VCPU_GPR(R27)(r4) + lwz r28, VCPU_GPR(R28)(r4) + lwz r29, VCPU_GPR(R29)(r4) + lwz r30, VCPU_GPR(R30)(r4) + lwz r31, VCPU_GPR(R31)(r4) +..skip_nv_load: + + /* Should we return to the guest? */ + andi. r5, r3, RESUME_FLAG_HOST + beq lightweight_exit + + srawi r3, r3, 2 /* Shift -ERR back down. */ + +heavyweight_exit: + /* Not returning to guest. */ + +#ifdef CONFIG_SPE + /* save guest SPEFSCR and load host SPEFSCR */ + mfspr r9, SPRN_SPEFSCR + stw r9, VCPU_SPEFSCR(r4) + lwz r9, VCPU_HOST_SPEFSCR(r4) + mtspr SPRN_SPEFSCR, r9 +#endif + + /* We already saved guest volatile register state; now save the + * non-volatiles. 
*/ + stw r15, VCPU_GPR(R15)(r4) + stw r16, VCPU_GPR(R16)(r4) + stw r17, VCPU_GPR(R17)(r4) + stw r18, VCPU_GPR(R18)(r4) + stw r19, VCPU_GPR(R19)(r4) + stw r20, VCPU_GPR(R20)(r4) + stw r21, VCPU_GPR(R21)(r4) + stw r22, VCPU_GPR(R22)(r4) + stw r23, VCPU_GPR(R23)(r4) + stw r24, VCPU_GPR(R24)(r4) + stw r25, VCPU_GPR(R25)(r4) + stw r26, VCPU_GPR(R26)(r4) + stw r27, VCPU_GPR(R27)(r4) + stw r28, VCPU_GPR(R28)(r4) + stw r29, VCPU_GPR(R29)(r4) + stw r30, VCPU_GPR(R30)(r4) + stw r31, VCPU_GPR(R31)(r4) + + /* Load host non-volatile register state from host stack. */ + lwz r14, HOST_NV_GPR(R14)(r1) + lwz r15, HOST_NV_GPR(R15)(r1) + lwz r16, HOST_NV_GPR(R16)(r1) + lwz r17, HOST_NV_GPR(R17)(r1) + lwz r18, HOST_NV_GPR(R18)(r1) + lwz r19, HOST_NV_GPR(R19)(r1) + lwz r20, HOST_NV_GPR(R20)(r1) + lwz r21, HOST_NV_GPR(R21)(r1) + lwz r22, HOST_NV_GPR(R22)(r1) + lwz r23, HOST_NV_GPR(R23)(r1) + lwz r24, HOST_NV_GPR(R24)(r1) + lwz r25, HOST_NV_GPR(R25)(r1) + lwz r26, HOST_NV_GPR(R26)(r1) + lwz r27, HOST_NV_GPR(R27)(r1) + lwz r28, HOST_NV_GPR(R28)(r1) + lwz r29, HOST_NV_GPR(R29)(r1) + lwz r30, HOST_NV_GPR(R30)(r1) + lwz r31, HOST_NV_GPR(R31)(r1) + + /* Return to kvm_vcpu_run(). */ + lwz r4, HOST_STACK_LR(r1) + lwz r5, HOST_CR(r1) + addi r1, r1, HOST_STACK_SIZE + mtlr r4 + mtcr r5 + /* r3 still contains the return code from kvmppc_handle_exit(). */ + blr + + +/* Registers: + * r3: kvm_run pointer + * r4: vcpu pointer + */ +_GLOBAL(__kvmppc_vcpu_run) + stwu r1, -HOST_STACK_SIZE(r1) + stw r1, VCPU_HOST_STACK(r4) /* Save stack pointer to vcpu. */ + + /* Save host state to stack. */ + stw r3, HOST_RUN(r1) + mflr r3 + stw r3, HOST_STACK_LR(r1) + mfcr r5 + stw r5, HOST_CR(r1) + + /* Save host non-volatile register state to stack. 
*/ + stw r14, HOST_NV_GPR(R14)(r1) + stw r15, HOST_NV_GPR(R15)(r1) + stw r16, HOST_NV_GPR(R16)(r1) + stw r17, HOST_NV_GPR(R17)(r1) + stw r18, HOST_NV_GPR(R18)(r1) + stw r19, HOST_NV_GPR(R19)(r1) + stw r20, HOST_NV_GPR(R20)(r1) + stw r21, HOST_NV_GPR(R21)(r1) + stw r22, HOST_NV_GPR(R22)(r1) + stw r23, HOST_NV_GPR(R23)(r1) + stw r24, HOST_NV_GPR(R24)(r1) + stw r25, HOST_NV_GPR(R25)(r1) + stw r26, HOST_NV_GPR(R26)(r1) + stw r27, HOST_NV_GPR(R27)(r1) + stw r28, HOST_NV_GPR(R28)(r1) + stw r29, HOST_NV_GPR(R29)(r1) + stw r30, HOST_NV_GPR(R30)(r1) + stw r31, HOST_NV_GPR(R31)(r1) + + /* Load guest non-volatiles. */ + lwz r14, VCPU_GPR(R14)(r4) + lwz r15, VCPU_GPR(R15)(r4) + lwz r16, VCPU_GPR(R16)(r4) + lwz r17, VCPU_GPR(R17)(r4) + lwz r18, VCPU_GPR(R18)(r4) + lwz r19, VCPU_GPR(R19)(r4) + lwz r20, VCPU_GPR(R20)(r4) + lwz r21, VCPU_GPR(R21)(r4) + lwz r22, VCPU_GPR(R22)(r4) + lwz r23, VCPU_GPR(R23)(r4) + lwz r24, VCPU_GPR(R24)(r4) + lwz r25, VCPU_GPR(R25)(r4) + lwz r26, VCPU_GPR(R26)(r4) + lwz r27, VCPU_GPR(R27)(r4) + lwz r28, VCPU_GPR(R28)(r4) + lwz r29, VCPU_GPR(R29)(r4) + lwz r30, VCPU_GPR(R30)(r4) + lwz r31, VCPU_GPR(R31)(r4) + +#ifdef CONFIG_SPE + /* save host SPEFSCR and load guest SPEFSCR */ + mfspr r3, SPRN_SPEFSCR + stw r3, VCPU_HOST_SPEFSCR(r4) + lwz r3, VCPU_SPEFSCR(r4) + mtspr SPRN_SPEFSCR, r3 +#endif + +lightweight_exit: + stw r2, HOST_R2(r1) + + mfspr r3, SPRN_PID + stw r3, VCPU_HOST_PID(r4) + lwz r3, VCPU_SHADOW_PID(r4) + mtspr SPRN_PID, r3 + +#ifdef CONFIG_FSL_BOOKE + lwz r3, VCPU_SHADOW_PID1(r4) + mtspr SPRN_PID1, r3 +#endif + + /* Load some guest volatiles. */ + lwz r0, VCPU_GPR(R0)(r4) + lwz r2, VCPU_GPR(R2)(r4) + lwz r9, VCPU_GPR(R9)(r4) + lwz r10, VCPU_GPR(R10)(r4) + lwz r11, VCPU_GPR(R11)(r4) + lwz r12, VCPU_GPR(R12)(r4) + lwz r13, VCPU_GPR(R13)(r4) + lwz r3, VCPU_LR(r4) + mtlr r3 + lwz r3, VCPU_XER(r4) + mtxer r3 + + /* Switch the IVPR. XXX If we take a TLB miss after this we're screwed, + * so how do we make sure vcpu won't fault? 
*/ + lis r8, kvmppc_booke_handlers@ha + lwz r8, kvmppc_booke_handlers@l(r8) + mtspr SPRN_IVPR, r8 + + lwz r5, VCPU_SHARED(r4) + + /* Can't switch the stack pointer until after IVPR is switched, + * because host interrupt handlers would get confused. */ + lwz r1, VCPU_GPR(R1)(r4) + + /* + * Host interrupt handlers may have clobbered these + * guest-readable SPRGs, or the guest kernel may have + * written directly to the shared area, so we + * need to reload them here with the guest's values. + */ + PPC_LD(r3, VCPU_SHARED_SPRG4, r5) + mtspr SPRN_SPRG4W, r3 + PPC_LD(r3, VCPU_SHARED_SPRG5, r5) + mtspr SPRN_SPRG5W, r3 + PPC_LD(r3, VCPU_SHARED_SPRG6, r5) + mtspr SPRN_SPRG6W, r3 + PPC_LD(r3, VCPU_SHARED_SPRG7, r5) + mtspr SPRN_SPRG7W, r3 + +#ifdef CONFIG_KVM_EXIT_TIMING + /* save enter time */ +1: + mfspr r6, SPRN_TBRU + mfspr r7, SPRN_TBRL + mfspr r8, SPRN_TBRU + cmpw r8, r6 + bne 1b + stw r7, VCPU_TIMING_LAST_ENTER_TBL(r4) + stw r8, VCPU_TIMING_LAST_ENTER_TBU(r4) +#endif + + /* Finish loading guest volatiles and jump to guest. */ + lwz r3, VCPU_CTR(r4) + lwz r5, VCPU_CR(r4) + lwz r6, VCPU_PC(r4) + lwz r7, VCPU_SHADOW_MSR(r4) + mtctr r3 + mtcr r5 + mtsrr0 r6 + mtsrr1 r7 + lwz r5, VCPU_GPR(R5)(r4) + lwz r6, VCPU_GPR(R6)(r4) + lwz r7, VCPU_GPR(R7)(r4) + lwz r8, VCPU_GPR(R8)(r4) + + /* Clear any debug events which occurred since we disabled MSR[DE]. + * XXX This gives us a 3-instruction window in which a breakpoint + * intended for guest context could fire in the host instead. 
*/ + lis r3, 0xffff + ori r3, r3, 0xffff + mtspr SPRN_DBSR, r3 + + lwz r3, VCPU_GPR(R3)(r4) + lwz r4, VCPU_GPR(R4)(r4) + rfi + + .data + .align 4 + .globl kvmppc_booke_handler_addr +kvmppc_booke_handler_addr: +KVM_HANDLER_ADDR BOOKE_INTERRUPT_CRITICAL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_MACHINE_CHECK +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DATA_STORAGE +KVM_HANDLER_ADDR BOOKE_INTERRUPT_INST_STORAGE +KVM_HANDLER_ADDR BOOKE_INTERRUPT_EXTERNAL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_ALIGNMENT +KVM_HANDLER_ADDR BOOKE_INTERRUPT_PROGRAM +KVM_HANDLER_ADDR BOOKE_INTERRUPT_FP_UNAVAIL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SYSCALL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_AP_UNAVAIL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DECREMENTER +KVM_HANDLER_ADDR BOOKE_INTERRUPT_FIT +KVM_HANDLER_ADDR BOOKE_INTERRUPT_WATCHDOG +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DTLB_MISS +KVM_HANDLER_ADDR BOOKE_INTERRUPT_ITLB_MISS +KVM_HANDLER_ADDR BOOKE_INTERRUPT_DEBUG +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_UNAVAIL +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_DATA +KVM_HANDLER_ADDR BOOKE_INTERRUPT_SPE_FP_ROUND +KVM_HANDLER_END /*Always keep this in end*/ + +#ifdef CONFIG_SPE +_GLOBAL(kvmppc_save_guest_spe) + cmpi 0,r3,0 + beqlr- + SAVE_32EVRS(0, r4, r3, VCPU_EVR) + evxor evr6, evr6, evr6 + evmwumiaa evr6, evr6, evr6 + li r4,VCPU_ACC + evstddx evr6, r4, r3 /* save acc */ + blr + +_GLOBAL(kvmppc_load_guest_spe) + cmpi 0,r3,0 + beqlr- + li r4,VCPU_ACC + evlddx evr6,r4,r3 + evmra evr6,evr6 /* load acc */ + REST_32EVRS(0, r4, r3, VCPU_EVR) + blr +#endif diff --git a/kernel/arch/powerpc/kvm/bookehv_interrupts.S b/kernel/arch/powerpc/kvm/bookehv_interrupts.S new file mode 100644 index 000000000..81bd8a07a --- /dev/null +++ b/kernel/arch/powerpc/kvm/bookehv_interrupts.S @@ -0,0 +1,689 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright (C) 2010-2011 Freescale Semiconductor, Inc. + * + * Author: Varun Sethi <varun.sethi@freescale.com> + * Author: Scott Wood <scotwood@freescale.com> + * Author: Mihai Caraman <mihai.caraman@freescale.com> + * + * This file is derived from arch/powerpc/kvm/booke_interrupts.S + */ + +#include <asm/ppc_asm.h> +#include <asm/kvm_asm.h> +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/asm-compat.h> +#include <asm/asm-offsets.h> +#include <asm/bitsperlong.h> + +#ifdef CONFIG_64BIT +#include <asm/exception-64e.h> +#include <asm/hw_irq.h> +#include <asm/irqflags.h> +#else +#include "../kernel/head_booke.h" /* for THREAD_NORMSAVE() */ +#endif + +#define LONGBYTES (BITS_PER_LONG / 8) + +#define VCPU_GUEST_SPRG(n) (VCPU_GUEST_SPRGS + (n * LONGBYTES)) + +/* The host stack layout: */ +#define HOST_R1 0 /* Implied by stwu. */ +#define HOST_CALLEE_LR PPC_LR_STKOFF +#define HOST_RUN (HOST_CALLEE_LR + LONGBYTES) +/* + * r2 is special: it holds 'current', and it is made nonvolatile in the + * kernel with the -ffixed-r2 gcc option. + */ +#define HOST_R2 (HOST_RUN + LONGBYTES) +#define HOST_CR (HOST_R2 + LONGBYTES) +#define HOST_NV_GPRS (HOST_CR + LONGBYTES) +#define __HOST_NV_GPR(n) (HOST_NV_GPRS + ((n - 14) * LONGBYTES)) +#define HOST_NV_GPR(n) __HOST_NV_GPR(__REG_##n) +#define HOST_MIN_STACK_SIZE (HOST_NV_GPR(R31) + LONGBYTES) +#define HOST_STACK_SIZE ((HOST_MIN_STACK_SIZE + 15) & ~15) /* Align. */ +/* LR in caller stack frame.
*/ +#define HOST_STACK_LR (HOST_STACK_SIZE + PPC_LR_STKOFF) + +#define NEED_EMU 0x00000001 /* emulation -- save nv regs */ +#define NEED_DEAR 0x00000002 /* save faulting DEAR */ +#define NEED_ESR 0x00000004 /* save faulting ESR */ + +/* + * On entry: + * r4 = vcpu, r5 = srr0, r6 = srr1 + * saved in vcpu: cr, ctr, r3-r13 + */ +.macro kvm_handler_common intno, srr0, flags + /* Restore host stack pointer */ + PPC_STL r1, VCPU_GPR(R1)(r4) + PPC_STL r2, VCPU_GPR(R2)(r4) + PPC_LL r1, VCPU_HOST_STACK(r4) + PPC_LL r2, HOST_R2(r1) + + mfspr r10, SPRN_PID + lwz r8, VCPU_HOST_PID(r4) + PPC_LL r11, VCPU_SHARED(r4) + PPC_STL r14, VCPU_GPR(R14)(r4) /* We need a non-volatile GPR. */ + li r14, \intno + + stw r10, VCPU_GUEST_PID(r4) + mtspr SPRN_PID, r8 + +#ifdef CONFIG_KVM_EXIT_TIMING + /* save exit time */ +1: mfspr r7, SPRN_TBRU + mfspr r8, SPRN_TBRL + mfspr r9, SPRN_TBRU + cmpw r9, r7 + stw r8, VCPU_TIMING_EXIT_TBL(r4) + bne- 1b + stw r9, VCPU_TIMING_EXIT_TBU(r4) +#endif + + oris r8, r6, MSR_CE@h + PPC_STD(r6, VCPU_SHARED_MSR, r11) + ori r8, r8, MSR_ME | MSR_RI + PPC_STL r5, VCPU_PC(r4) + + /* + * Make sure CE/ME/RI are set (if appropriate for exception type) + * whether or not the guest had it set. Since mfmsr/mtmsr are + * somewhat expensive, skip in the common case where the guest + * had all these bits set (and thus they're still set if + * appropriate for the exception type). 
+ */ + cmpw r6, r8 + beq 1f + mfmsr r7 + .if \srr0 != SPRN_MCSRR0 && \srr0 != SPRN_CSRR0 + oris r7, r7, MSR_CE@h + .endif + .if \srr0 != SPRN_MCSRR0 + ori r7, r7, MSR_ME | MSR_RI + .endif + mtmsr r7 +1: + + .if \flags & NEED_EMU + PPC_STL r15, VCPU_GPR(R15)(r4) + PPC_STL r16, VCPU_GPR(R16)(r4) + PPC_STL r17, VCPU_GPR(R17)(r4) + PPC_STL r18, VCPU_GPR(R18)(r4) + PPC_STL r19, VCPU_GPR(R19)(r4) + PPC_STL r20, VCPU_GPR(R20)(r4) + PPC_STL r21, VCPU_GPR(R21)(r4) + PPC_STL r22, VCPU_GPR(R22)(r4) + PPC_STL r23, VCPU_GPR(R23)(r4) + PPC_STL r24, VCPU_GPR(R24)(r4) + PPC_STL r25, VCPU_GPR(R25)(r4) + PPC_STL r26, VCPU_GPR(R26)(r4) + PPC_STL r27, VCPU_GPR(R27)(r4) + PPC_STL r28, VCPU_GPR(R28)(r4) + PPC_STL r29, VCPU_GPR(R29)(r4) + PPC_STL r30, VCPU_GPR(R30)(r4) + PPC_STL r31, VCPU_GPR(R31)(r4) + + /* + * We don't use external PID support. lwepx faults would need to be + * handled by KVM and this implies additional code in DO_KVM (for + * DTB_MISS, DSI and LRAT) to check ESR[EPID] and EPLC[EGS] which + * is too intrusive for the host. Get last instruction in + * kvmppc_get_last_inst().
+ */ + li r9, KVM_INST_FETCH_FAILED + stw r9, VCPU_LAST_INST(r4) + .endif + + .if \flags & NEED_ESR + mfspr r8, SPRN_ESR + PPC_STL r8, VCPU_FAULT_ESR(r4) + .endif + + .if \flags & NEED_DEAR + mfspr r9, SPRN_DEAR + PPC_STL r9, VCPU_FAULT_DEAR(r4) + .endif + + b kvmppc_resume_host +.endm + +#ifdef CONFIG_64BIT +/* Exception types */ +#define EX_GEN 1 +#define EX_GDBELL 2 +#define EX_DBG 3 +#define EX_MC 4 +#define EX_CRIT 5 +#define EX_TLB 6 + +/* + * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h + */ +.macro kvm_handler intno type scratch, paca_ex, ex_r10, ex_r11, srr0, srr1, flags + _GLOBAL(kvmppc_handler_\intno\()_\srr1) + mr r11, r4 + /* + * Get vcpu from Paca: paca->__current.thread->kvm_vcpu + */ + PPC_LL r4, PACACURRENT(r13) + PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4) + stw r10, VCPU_CR(r4) + PPC_STL r11, VCPU_GPR(R4)(r4) + PPC_STL r5, VCPU_GPR(R5)(r4) + PPC_STL r6, VCPU_GPR(R6)(r4) + PPC_STL r8, VCPU_GPR(R8)(r4) + PPC_STL r9, VCPU_GPR(R9)(r4) + .if \type == EX_TLB + PPC_LL r5, EX_TLB_R13(r12) + PPC_LL r6, EX_TLB_R10(r12) + PPC_LL r8, EX_TLB_R11(r12) + mfspr r12, \scratch + .else + mfspr r5, \scratch + PPC_LL r6, (\paca_ex + \ex_r10)(r13) + PPC_LL r8, (\paca_ex + \ex_r11)(r13) + .endif + PPC_STL r5, VCPU_GPR(R13)(r4) + PPC_STL r3, VCPU_GPR(R3)(r4) + PPC_STL r7, VCPU_GPR(R7)(r4) + PPC_STL r12, VCPU_GPR(R12)(r4) + PPC_STL r6, VCPU_GPR(R10)(r4) + PPC_STL r8, VCPU_GPR(R11)(r4) + mfctr r5 + PPC_STL r5, VCPU_CTR(r4) + mfspr r5, \srr0 + mfspr r6, \srr1 + kvm_handler_common \intno, \srr0, \flags +.endm + +#define EX_PARAMS(type) \ + EX_##type, \ + SPRN_SPRG_##type##_SCRATCH, \ + PACA_EX##type, \ + EX_R10, \ + EX_R11 + +#define EX_PARAMS_TLB \ + EX_TLB, \ + SPRN_SPRG_GEN_SCRATCH, \ + PACA_EXTLB, \ + EX_TLB_R10, \ + EX_TLB_R11 + +kvm_handler BOOKE_INTERRUPT_CRITICAL, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_MACHINE_CHECK, EX_PARAMS(MC), \ + SPRN_MCSRR0, SPRN_MCSRR1, 0 +kvm_handler 
BOOKE_INTERRUPT_DATA_STORAGE, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1,(NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_INST_STORAGE, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, NEED_ESR +kvm_handler BOOKE_INTERRUPT_EXTERNAL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_ALIGNMENT, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1,(NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_PROGRAM, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, (NEED_ESR | NEED_EMU) +kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DECREMENTER, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_FIT, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_WATCHDOG, EX_PARAMS(CRIT),\ + SPRN_CSRR0, SPRN_CSRR1, 0 +/* + * Only bolted TLB miss exception handlers are supported for now + */ +kvm_handler BOOKE_INTERRUPT_DTLB_MISS, EX_PARAMS_TLB, \ + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) +kvm_handler BOOKE_INTERRUPT_ITLB_MISS, EX_PARAMS_TLB, \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_ALTIVEC_UNAVAIL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_ALTIVEC_ASSIST, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DOORBELL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_HV_PRIV, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, NEED_EMU +kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, 0 +kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, EX_PARAMS(GDBELL), \ + SPRN_GSRR0, SPRN_GSRR1, 0 +kvm_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 
+kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(DBG), \ + SPRN_DSRR0, SPRN_DSRR1, 0 +kvm_handler BOOKE_INTERRUPT_DEBUG, EX_PARAMS(CRIT), \ + SPRN_CSRR0, SPRN_CSRR1, 0 +kvm_handler BOOKE_INTERRUPT_LRAT_ERROR, EX_PARAMS(GEN), \ + SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR) +#else +/* + * For input register values, see arch/powerpc/include/asm/kvm_booke_hv_asm.h + */ +.macro kvm_handler intno srr0, srr1, flags +_GLOBAL(kvmppc_handler_\intno\()_\srr1) + PPC_LL r11, THREAD_KVM_VCPU(r10) + PPC_STL r3, VCPU_GPR(R3)(r11) + mfspr r3, SPRN_SPRG_RSCRATCH0 + PPC_STL r4, VCPU_GPR(R4)(r11) + PPC_LL r4, THREAD_NORMSAVE(0)(r10) + PPC_STL r5, VCPU_GPR(R5)(r11) + stw r13, VCPU_CR(r11) + mfspr r5, \srr0 + PPC_STL r3, VCPU_GPR(R10)(r11) + PPC_LL r3, THREAD_NORMSAVE(2)(r10) + PPC_STL r6, VCPU_GPR(R6)(r11) + PPC_STL r4, VCPU_GPR(R11)(r11) + mfspr r6, \srr1 + PPC_STL r7, VCPU_GPR(R7)(r11) + PPC_STL r8, VCPU_GPR(R8)(r11) + PPC_STL r9, VCPU_GPR(R9)(r11) + PPC_STL r3, VCPU_GPR(R13)(r11) + mfctr r7 + PPC_STL r12, VCPU_GPR(R12)(r11) + PPC_STL r7, VCPU_CTR(r11) + mr r4, r11 + kvm_handler_common \intno, \srr0, \flags +.endm + +.macro kvm_lvl_handler intno scratch srr0, srr1, flags +_GLOBAL(kvmppc_handler_\intno\()_\srr1) + mfspr r10, SPRN_SPRG_THREAD + PPC_LL r11, THREAD_KVM_VCPU(r10) + PPC_STL r3, VCPU_GPR(R3)(r11) + mfspr r3, \scratch + PPC_STL r4, VCPU_GPR(R4)(r11) + PPC_LL r4, GPR9(r8) + PPC_STL r5, VCPU_GPR(R5)(r11) + stw r9, VCPU_CR(r11) + mfspr r5, \srr0 + PPC_STL r3, VCPU_GPR(R8)(r11) + PPC_LL r3, GPR10(r8) + PPC_STL r6, VCPU_GPR(R6)(r11) + PPC_STL r4, VCPU_GPR(R9)(r11) + mfspr r6, \srr1 + PPC_LL r4, GPR11(r8) + PPC_STL r7, VCPU_GPR(R7)(r11) + PPC_STL r3, VCPU_GPR(R10)(r11) + mfctr r7 + PPC_STL r12, VCPU_GPR(R12)(r11) + PPC_STL r13, VCPU_GPR(R13)(r11) + PPC_STL r4, VCPU_GPR(R11)(r11) + PPC_STL r7, VCPU_CTR(r11) + mr r4, r11 + kvm_handler_common \intno, \srr0, \flags +.endm + +kvm_lvl_handler BOOKE_INTERRUPT_CRITICAL, \ + SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0 
+kvm_lvl_handler BOOKE_INTERRUPT_MACHINE_CHECK, \
+	SPRN_SPRG_RSCRATCH_MC, SPRN_MCSRR0, SPRN_MCSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DATA_STORAGE, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_INST_STORAGE, SPRN_SRR0, SPRN_SRR1, NEED_ESR
+kvm_handler BOOKE_INTERRUPT_EXTERNAL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_ALIGNMENT, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_PROGRAM, SPRN_SRR0, SPRN_SRR1, (NEED_ESR | NEED_EMU)
+kvm_handler BOOKE_INTERRUPT_FP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_AP_UNAVAIL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DECREMENTER, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_FIT, SPRN_SRR0, SPRN_SRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_WATCHDOG, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_DTLB_MISS, \
+	SPRN_SRR0, SPRN_SRR1, (NEED_EMU | NEED_DEAR | NEED_ESR)
+kvm_handler BOOKE_INTERRUPT_ITLB_MISS, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_PERFORMANCE_MONITOR, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_DOORBELL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_DOORBELL_CRITICAL, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_handler BOOKE_INTERRUPT_HV_PRIV, SPRN_SRR0, SPRN_SRR1, NEED_EMU
+kvm_handler BOOKE_INTERRUPT_HV_SYSCALL, SPRN_SRR0, SPRN_SRR1, 0
+kvm_handler BOOKE_INTERRUPT_GUEST_DBELL, SPRN_GSRR0, SPRN_GSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_GUEST_DBELL_CRIT, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
+	SPRN_SPRG_RSCRATCH_CRIT, SPRN_CSRR0, SPRN_CSRR1, 0
+kvm_lvl_handler BOOKE_INTERRUPT_DEBUG, \
+	SPRN_SPRG_RSCRATCH_DBG, SPRN_DSRR0, SPRN_DSRR1, 0
+#endif
+
+/* Registers:
+ *   SPRG_SCRATCH0: guest r10
+ *   r4: vcpu pointer
+ *   r11: vcpu->arch.shared
+ *   r14: KVM exit number
+ *
+ * kvmppc_resume_host saves the remaining guest state (r0, LR, VRSAVE,
+ * XER, SPRG4-7, SPRG9 and the MAS registers), writes the host MAS4 and
+ * MAS6 values kept in the vcpu back to the SPRs, clears the
+ * SPRN_EPCR_DMIUH bit in EPCR and calls kvmppc_handle_exit() with
+ * r3 = value loaded from HOST_RUN(r1), r4 = vcpu and r5 = exit number.
+ *
+ * The value returned in r3 steers the exit:
+ *   - RESUME_FLAG_NV set:     r15-r31 are reloaded from the vcpu;
+ *   - RESUME_FLAG_HOST clear: branch to lightweight_exit;
+ *   - RESUME_FLAG_HOST set:   fall into heavyweight_exit, which saves
+ *     guest r15-r31, restores host r14-r31, LR and CR from the host
+ *     stack frame, pops the frame and returns r3 >> 2 (arithmetic).
+ *
+ * Loads and stores are interleaved and registers are reused as
+ * temporaries throughout, so instruction order must be preserved.
+ */
+_GLOBAL(kvmppc_resume_host)
+	/* Save remaining volatile guest register state to vcpu. */
+	mfspr	r3, SPRN_VRSAVE
+	PPC_STL	r0, VCPU_GPR(R0)(r4)
+	mflr	r5
+	mfspr	r6, SPRN_SPRG4
+	PPC_STL	r5, VCPU_LR(r4)
+	mfspr	r7, SPRN_SPRG5
+	stw	r3, VCPU_VRSAVE(r4)
+#ifdef CONFIG_64BIT
+	PPC_LL	r3, PACA_SPRG_VDSO(r13)		/* written to SPRG_VDSO_WRITE below */
+#endif
+	mfspr	r5, SPRN_SPRG9
+	PPC_STD(r6, VCPU_SHARED_SPRG4, r11)
+	mfspr	r8, SPRN_SPRG6
+	PPC_STD(r7, VCPU_SHARED_SPRG5, r11)
+	mfspr	r9, SPRN_SPRG7
+#ifdef CONFIG_64BIT
+	mtspr	SPRN_SPRG_VDSO_WRITE, r3
+#endif
+	PPC_STD(r5, VCPU_SPRG9, r4)
+	PPC_STD(r8, VCPU_SHARED_SPRG6, r11)
+	mfxer	r3				/* stored to VCPU_XER below */
+	PPC_STD(r9, VCPU_SHARED_SPRG7, r11)
+
+	/* save guest MAS registers and restore host mas4 & mas6 */
+	mfspr	r5, SPRN_MAS0
+	PPC_STL	r3, VCPU_XER(r4)
+	mfspr	r6, SPRN_MAS1
+	stw	r5, VCPU_SHARED_MAS0(r11)
+	mfspr	r7, SPRN_MAS2
+	stw	r6, VCPU_SHARED_MAS1(r11)
+	PPC_STD(r7, VCPU_SHARED_MAS2, r11)
+	mfspr	r5, SPRN_MAS3
+	mfspr	r6, SPRN_MAS4
+	stw	r5, VCPU_SHARED_MAS7_3+4(r11)	/* MAS3 goes to offset +4 of mas7_3 */
+	mfspr	r7, SPRN_MAS6
+	stw	r6, VCPU_SHARED_MAS4(r11)
+	mfspr	r5, SPRN_MAS7
+	lwz	r6, VCPU_HOST_MAS4(r4)
+	stw	r7, VCPU_SHARED_MAS6(r11)
+	lwz	r8, VCPU_HOST_MAS6(r4)
+	mtspr	SPRN_MAS4, r6
+	stw	r5, VCPU_SHARED_MAS7_3+0(r11)	/* MAS7 goes to offset +0 of mas7_3 */
+	mtspr	SPRN_MAS6, r8
+	/* Enable MAS register updates via exception */
+	mfspr	r3, SPRN_EPCR
+	rlwinm	r3, r3, 0, ~SPRN_EPCR_DMIUH
+	mtspr	SPRN_EPCR, r3
+	isync
+
+#ifdef CONFIG_64BIT
+	/*
+	 * We enter with interrupts disabled in hardware, but
+	 * we need to call RECONCILE_IRQ_STATE to ensure
+	 * that the software state is kept in sync.
+	 */
+	RECONCILE_IRQ_STATE(r3,r5)
+#endif
+
+	/* Switch to kernel stack and jump to handler. */
+	PPC_LL	r3, HOST_RUN(r1)
+	mr	r5, r14 /* intno */
+	mr	r14, r4 /* Save vcpu pointer. */
+	bl	kvmppc_handle_exit
+
+	/* Restore vcpu pointer and the nonvolatiles we used. */
+	mr	r4, r14
+	PPC_LL	r14, VCPU_GPR(R14)(r4)
+
+	andi.	r5, r3, RESUME_FLAG_NV
+	beq	skip_nv_load			/* flag clear: keep current r15-r31 */
+	PPC_LL	r15, VCPU_GPR(R15)(r4)
+	PPC_LL	r16, VCPU_GPR(R16)(r4)
+	PPC_LL	r17, VCPU_GPR(R17)(r4)
+	PPC_LL	r18, VCPU_GPR(R18)(r4)
+	PPC_LL	r19, VCPU_GPR(R19)(r4)
+	PPC_LL	r20, VCPU_GPR(R20)(r4)
+	PPC_LL	r21, VCPU_GPR(R21)(r4)
+	PPC_LL	r22, VCPU_GPR(R22)(r4)
+	PPC_LL	r23, VCPU_GPR(R23)(r4)
+	PPC_LL	r24, VCPU_GPR(R24)(r4)
+	PPC_LL	r25, VCPU_GPR(R25)(r4)
+	PPC_LL	r26, VCPU_GPR(R26)(r4)
+	PPC_LL	r27, VCPU_GPR(R27)(r4)
+	PPC_LL	r28, VCPU_GPR(R28)(r4)
+	PPC_LL	r29, VCPU_GPR(R29)(r4)
+	PPC_LL	r30, VCPU_GPR(R30)(r4)
+	PPC_LL	r31, VCPU_GPR(R31)(r4)
+skip_nv_load:
+	/* Should we return to the guest? */
+	andi.	r5, r3, RESUME_FLAG_HOST
+	beq	lightweight_exit
+
+	srawi	r3, r3, 2 /* Shift -ERR back down. */
+
+heavyweight_exit:
+	/* Not returning to guest. */
+	PPC_LL	r5, HOST_STACK_LR(r1)		/* moved to LR just before blr */
+	lwz	r6, HOST_CR(r1)			/* moved to CR just before blr */
+
+	/*
+	 * We already saved guest volatile register state; now save the
+	 * non-volatiles.
+	 */
+
+	PPC_STL	r15, VCPU_GPR(R15)(r4)
+	PPC_STL	r16, VCPU_GPR(R16)(r4)
+	PPC_STL	r17, VCPU_GPR(R17)(r4)
+	PPC_STL	r18, VCPU_GPR(R18)(r4)
+	PPC_STL	r19, VCPU_GPR(R19)(r4)
+	PPC_STL	r20, VCPU_GPR(R20)(r4)
+	PPC_STL	r21, VCPU_GPR(R21)(r4)
+	PPC_STL	r22, VCPU_GPR(R22)(r4)
+	PPC_STL	r23, VCPU_GPR(R23)(r4)
+	PPC_STL	r24, VCPU_GPR(R24)(r4)
+	PPC_STL	r25, VCPU_GPR(R25)(r4)
+	PPC_STL	r26, VCPU_GPR(R26)(r4)
+	PPC_STL	r27, VCPU_GPR(R27)(r4)
+	PPC_STL	r28, VCPU_GPR(R28)(r4)
+	PPC_STL	r29, VCPU_GPR(R29)(r4)
+	PPC_STL	r30, VCPU_GPR(R30)(r4)
+	PPC_STL	r31, VCPU_GPR(R31)(r4)
+
+	/* Load host non-volatile register state from host stack. */
+	PPC_LL	r14, HOST_NV_GPR(R14)(r1)
+	PPC_LL	r15, HOST_NV_GPR(R15)(r1)
+	PPC_LL	r16, HOST_NV_GPR(R16)(r1)
+	PPC_LL	r17, HOST_NV_GPR(R17)(r1)
+	PPC_LL	r18, HOST_NV_GPR(R18)(r1)
+	PPC_LL	r19, HOST_NV_GPR(R19)(r1)
+	PPC_LL	r20, HOST_NV_GPR(R20)(r1)
+	PPC_LL	r21, HOST_NV_GPR(R21)(r1)
+	PPC_LL	r22, HOST_NV_GPR(R22)(r1)
+	PPC_LL	r23, HOST_NV_GPR(R23)(r1)
+	PPC_LL	r24, HOST_NV_GPR(R24)(r1)
+	PPC_LL	r25, HOST_NV_GPR(R25)(r1)
+	PPC_LL	r26, HOST_NV_GPR(R26)(r1)
+	PPC_LL	r27, HOST_NV_GPR(R27)(r1)
+	PPC_LL	r28, HOST_NV_GPR(R28)(r1)
+	PPC_LL	r29, HOST_NV_GPR(R29)(r1)
+	PPC_LL	r30, HOST_NV_GPR(R30)(r1)
+	PPC_LL	r31, HOST_NV_GPR(R31)(r1)
+
+	/* Return to kvm_vcpu_run(). */
+	mtlr	r5
+	mtcr	r6
+	addi	r1, r1, HOST_STACK_SIZE		/* pop the frame pushed by __kvmppc_vcpu_run */
+	/* r3 still contains the return code from kvmppc_handle_exit(). */
+	blr
+
+/* Registers:
+ *   r3: kvm_run pointer
+ *   r4: vcpu pointer
+ *
+ * __kvmppc_vcpu_run pushes a HOST_STACK_SIZE frame, records the new
+ * stack pointer in the vcpu (VCPU_HOST_STACK), saves the kvm_run
+ * pointer, LR, CR and r14-r31 of the host in that frame, loads guest
+ * r14-r31 from the vcpu and falls through into lightweight_exit.
+ * It returns through heavyweight_exit above.
+ */
+_GLOBAL(__kvmppc_vcpu_run)
+	stwu	r1, -HOST_STACK_SIZE(r1)
+	PPC_STL	r1, VCPU_HOST_STACK(r4)	/* Save stack pointer to vcpu. */
+
+	/* Save host state to stack. */
+	PPC_STL	r3, HOST_RUN(r1)
+	mflr	r3
+	mfcr	r5
+	PPC_STL	r3, HOST_STACK_LR(r1)
+
+	stw	r5, HOST_CR(r1)
+
+	/* Save host non-volatile register state to stack. */
+	PPC_STL	r14, HOST_NV_GPR(R14)(r1)
+	PPC_STL	r15, HOST_NV_GPR(R15)(r1)
+	PPC_STL	r16, HOST_NV_GPR(R16)(r1)
+	PPC_STL	r17, HOST_NV_GPR(R17)(r1)
+	PPC_STL	r18, HOST_NV_GPR(R18)(r1)
+	PPC_STL	r19, HOST_NV_GPR(R19)(r1)
+	PPC_STL	r20, HOST_NV_GPR(R20)(r1)
+	PPC_STL	r21, HOST_NV_GPR(R21)(r1)
+	PPC_STL	r22, HOST_NV_GPR(R22)(r1)
+	PPC_STL	r23, HOST_NV_GPR(R23)(r1)
+	PPC_STL	r24, HOST_NV_GPR(R24)(r1)
+	PPC_STL	r25, HOST_NV_GPR(R25)(r1)
+	PPC_STL	r26, HOST_NV_GPR(R26)(r1)
+	PPC_STL	r27, HOST_NV_GPR(R27)(r1)
+	PPC_STL	r28, HOST_NV_GPR(R28)(r1)
+	PPC_STL	r29, HOST_NV_GPR(R29)(r1)
+	PPC_STL	r30, HOST_NV_GPR(R30)(r1)
+	PPC_STL	r31, HOST_NV_GPR(R31)(r1)
+
+	/* Load guest non-volatiles. */
+	PPC_LL	r14, VCPU_GPR(R14)(r4)
+	PPC_LL	r15, VCPU_GPR(R15)(r4)
+	PPC_LL	r16, VCPU_GPR(R16)(r4)
+	PPC_LL	r17, VCPU_GPR(R17)(r4)
+	PPC_LL	r18, VCPU_GPR(R18)(r4)
+	PPC_LL	r19, VCPU_GPR(R19)(r4)
+	PPC_LL	r20, VCPU_GPR(R20)(r4)
+	PPC_LL	r21, VCPU_GPR(R21)(r4)
+	PPC_LL	r22, VCPU_GPR(R22)(r4)
+	PPC_LL	r23, VCPU_GPR(R23)(r4)
+	PPC_LL	r24, VCPU_GPR(R24)(r4)
+	PPC_LL	r25, VCPU_GPR(R25)(r4)
+	PPC_LL	r26, VCPU_GPR(R26)(r4)
+	PPC_LL	r27, VCPU_GPR(R27)(r4)
+	PPC_LL	r28, VCPU_GPR(R28)(r4)
+	PPC_LL	r29, VCPU_GPR(R29)(r4)
+	PPC_LL	r30, VCPU_GPR(R30)(r4)
+	PPC_LL	r31, VCPU_GPR(R31)(r4)
+
+/*
+ * lightweight_exit: (re)enter the guest.  Reached by fall-through from
+ * __kvmppc_vcpu_run and by branch from kvmppc_resume_host, with
+ * r4 = vcpu pointer.  Saves host r2, PID, MAS4 and MAS6, sets the
+ * SPRN_EPCR_DMIUH bit, loads the guest PID, MAS registers, SPRGs and
+ * volatile registers, sets SRR0/SRR1 from VCPU_PC and the shared MSR,
+ * and leaves with rfi.
+ */
+lightweight_exit:
+	PPC_STL	r2, HOST_R2(r1)
+
+	mfspr	r3, SPRN_PID
+	stw	r3, VCPU_HOST_PID(r4)
+	lwz	r3, VCPU_GUEST_PID(r4)
+	mtspr	SPRN_PID, r3
+
+	PPC_LL	r11, VCPU_SHARED(r4)		/* r11 = vcpu->arch.shared */
+	/* Disable MAS register updates via exception */
+	mfspr	r3, SPRN_EPCR
+	oris	r3, r3, SPRN_EPCR_DMIUH@h
+	mtspr	SPRN_EPCR, r3
+	isync
+	/* Save host mas4 and mas6 and load guest MAS registers */
+	mfspr	r3, SPRN_MAS4
+	stw	r3, VCPU_HOST_MAS4(r4)
+	mfspr	r3, SPRN_MAS6
+	stw	r3, VCPU_HOST_MAS6(r4)
+	lwz	r3, VCPU_SHARED_MAS0(r11)
+	lwz	r5, VCPU_SHARED_MAS1(r11)
+	PPC_LD(r6, VCPU_SHARED_MAS2, r11)
+	lwz	r7, VCPU_SHARED_MAS7_3+4(r11)	/* loaded into MAS3 */
+	lwz	r8, VCPU_SHARED_MAS4(r11)
+	mtspr	SPRN_MAS0, r3
+	mtspr	SPRN_MAS1, r5
+	mtspr	SPRN_MAS2, r6
+	mtspr	SPRN_MAS3, r7
+	mtspr	SPRN_MAS4, r8
+	lwz	r3, VCPU_SHARED_MAS6(r11)
+	lwz	r5, VCPU_SHARED_MAS7_3+0(r11)	/* loaded into MAS7 */
+	mtspr	SPRN_MAS6, r3
+	mtspr	SPRN_MAS7, r5
+
+	/*
+	 * Host interrupt handlers may have clobbered these guest-readable
+	 * SPRGs, so we need to reload them here with the guest's values.
+	 */
+	lwz	r3, VCPU_VRSAVE(r4)
+	PPC_LD(r5, VCPU_SHARED_SPRG4, r11)
+	mtspr	SPRN_VRSAVE, r3
+	PPC_LD(r6, VCPU_SHARED_SPRG5, r11)
+	mtspr	SPRN_SPRG4W, r5
+	PPC_LD(r7, VCPU_SHARED_SPRG6, r11)
+	mtspr	SPRN_SPRG5W, r6
+	PPC_LD(r8, VCPU_SHARED_SPRG7, r11)
+	mtspr	SPRN_SPRG6W, r7
+	PPC_LD(r5, VCPU_SPRG9, r4)
+	mtspr	SPRN_SPRG7W, r8
+	mtspr	SPRN_SPRG9, r5
+
+	/* Load some guest volatiles. */
+	PPC_LL	r3, VCPU_LR(r4)
+	PPC_LL	r5, VCPU_XER(r4)
+	PPC_LL	r6, VCPU_CTR(r4)
+	lwz	r7, VCPU_CR(r4)			/* moved to CR by the mtcr below */
+	PPC_LL	r8, VCPU_PC(r4)
+	PPC_LD(r9, VCPU_SHARED_MSR, r11)	/* last use of r11 as the shared pointer */
+	PPC_LL	r0, VCPU_GPR(R0)(r4)
+	PPC_LL	r1, VCPU_GPR(R1)(r4)
+	PPC_LL	r2, VCPU_GPR(R2)(r4)
+	PPC_LL	r10, VCPU_GPR(R10)(r4)
+	PPC_LL	r11, VCPU_GPR(R11)(r4)
+	PPC_LL	r12, VCPU_GPR(R12)(r4)
+	PPC_LL	r13, VCPU_GPR(R13)(r4)
+	mtlr	r3
+	mtxer	r5
+	mtctr	r6
+	mtsrr0	r8				/* guest PC */
+	mtsrr1	r9				/* guest MSR image from the shared area */
+
+#ifdef CONFIG_KVM_EXIT_TIMING
+	/* save enter time */
+1:
+	mfspr	r6, SPRN_TBRU
+	mfspr	r9, SPRN_TBRL
+	mfspr	r8, SPRN_TBRU
+	cmpw	r8, r6
+	stw	r9, VCPU_TIMING_LAST_ENTER_TBL(r4)
+	bne	1b				/* upper half changed between reads: retry */
+	stw	r8, VCPU_TIMING_LAST_ENTER_TBU(r4)
+#endif
+
+	/*
+	 * Don't execute any instruction which can change CR after
+	 * below instruction.
+	 */
+	mtcr	r7
+
+	/* Finish loading guest volatiles and jump to guest. */
+	PPC_LL	r5, VCPU_GPR(R5)(r4)
+	PPC_LL	r6, VCPU_GPR(R6)(r4)
+	PPC_LL	r7, VCPU_GPR(R7)(r4)
+	PPC_LL	r8, VCPU_GPR(R8)(r4)
+	PPC_LL	r9, VCPU_GPR(R9)(r4)
+
+	PPC_LL	r3, VCPU_GPR(R3)(r4)
+	PPC_LL	r4, VCPU_GPR(R4)(r4)		/* r4 is the base register, so it is loaded last */
+	rfi
diff --git a/kernel/arch/powerpc/kvm/e500.c b/kernel/arch/powerpc/kvm/e500.c
new file mode 100644
index 000000000..b29ce752c
--- /dev/null
+++ b/kernel/arch/powerpc/kvm/e500.c
@@ -0,0 +1,571 @@
+/*
+ * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
+ *
+ * Author: Yu Liu, <yu.liu@freescale.com>
+ *
+ * Description:
+ * This file is derived from arch/powerpc/kvm/44x.c,
+ * by Hollis Blanchard <hollisb@us.ibm.com>.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ */ + +#include <linux/kvm_host.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/export.h> +#include <linux/module.h> +#include <linux/miscdevice.h> + +#include <asm/reg.h> +#include <asm/cputable.h> +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> + +#include "../mm/mmu_decl.h" +#include "booke.h" +#include "e500.h" + +struct id { + unsigned long val; + struct id **pentry; +}; + +#define NUM_TIDS 256 + +/* + * This table provide mappings from: + * (guestAS,guestTID,guestPR) --> ID of physical cpu + * guestAS [0..1] + * guestTID [0..255] + * guestPR [0..1] + * ID [1..255] + * Each vcpu keeps one vcpu_id_table. + */ +struct vcpu_id_table { + struct id id[2][NUM_TIDS][2]; +}; + +/* + * This table provide reversed mappings of vcpu_id_table: + * ID --> address of vcpu_id_table item. + * Each physical core has one pcpu_id_table. + */ +struct pcpu_id_table { + struct id *entry[NUM_TIDS]; +}; + +static DEFINE_PER_CPU(struct pcpu_id_table, pcpu_sids); + +/* This variable keeps last used shadow ID on local core. + * The valid range of shadow ID is [1..255] */ +static DEFINE_PER_CPU(unsigned long, pcpu_last_used_sid); + +/* + * Allocate a free shadow id and setup a valid sid mapping in given entry. + * A mapping is only valid when vcpu_id_table and pcpu_id_table are match. + * + * The caller must have preemption disabled, and keep it that way until + * it has finished with the returned shadow id (either written into the + * TLB or arch.shadow_pid, or discarded). + */ +static inline int local_sid_setup_one(struct id *entry) +{ + unsigned long sid; + int ret = -1; + + sid = __this_cpu_inc_return(pcpu_last_used_sid); + if (sid < NUM_TIDS) { + __this_cpu_write(pcpu_sids.entry[sid], entry); + entry->val = sid; + entry->pentry = this_cpu_ptr(&pcpu_sids.entry[sid]); + ret = sid; + } + + /* + * If sid == NUM_TIDS, we've run out of sids. We return -1, and + * the caller will invalidate everything and start over. 
+ * + * sid > NUM_TIDS indicates a race, which we disable preemption to + * avoid. + */ + WARN_ON(sid > NUM_TIDS); + + return ret; +} + +/* + * Check if given entry contain a valid shadow id mapping. + * An ID mapping is considered valid only if + * both vcpu and pcpu know this mapping. + * + * The caller must have preemption disabled, and keep it that way until + * it has finished with the returned shadow id (either written into the + * TLB or arch.shadow_pid, or discarded). + */ +static inline int local_sid_lookup(struct id *entry) +{ + if (entry && entry->val != 0 && + __this_cpu_read(pcpu_sids.entry[entry->val]) == entry && + entry->pentry == this_cpu_ptr(&pcpu_sids.entry[entry->val])) + return entry->val; + return -1; +} + +/* Invalidate all id mappings on local core -- call with preempt disabled */ +static inline void local_sid_destroy_all(void) +{ + __this_cpu_write(pcpu_last_used_sid, 0); + memset(this_cpu_ptr(&pcpu_sids), 0, sizeof(pcpu_sids)); +} + +static void *kvmppc_e500_id_table_alloc(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + vcpu_e500->idt = kzalloc(sizeof(struct vcpu_id_table), GFP_KERNEL); + return vcpu_e500->idt; +} + +static void kvmppc_e500_id_table_free(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + kfree(vcpu_e500->idt); + vcpu_e500->idt = NULL; +} + +/* Map guest pid to shadow. + * We use PID to keep shadow of current guest non-zero PID, + * and use PID1 to keep shadow of guest zero PID. 
+ * So that guest tlbe with TID=0 can be accessed at any time */ +static void kvmppc_e500_recalc_shadow_pid(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + preempt_disable(); + vcpu_e500->vcpu.arch.shadow_pid = kvmppc_e500_get_sid(vcpu_e500, + get_cur_as(&vcpu_e500->vcpu), + get_cur_pid(&vcpu_e500->vcpu), + get_cur_pr(&vcpu_e500->vcpu), 1); + vcpu_e500->vcpu.arch.shadow_pid1 = kvmppc_e500_get_sid(vcpu_e500, + get_cur_as(&vcpu_e500->vcpu), 0, + get_cur_pr(&vcpu_e500->vcpu), 1); + preempt_enable(); +} + +/* Invalidate all mappings on vcpu */ +static void kvmppc_e500_id_table_reset_all(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + memset(vcpu_e500->idt, 0, sizeof(struct vcpu_id_table)); + + /* Update shadow pid when mappings are changed */ + kvmppc_e500_recalc_shadow_pid(vcpu_e500); +} + +/* Invalidate one ID mapping on vcpu */ +static inline void kvmppc_e500_id_table_reset_one( + struct kvmppc_vcpu_e500 *vcpu_e500, + int as, int pid, int pr) +{ + struct vcpu_id_table *idt = vcpu_e500->idt; + + BUG_ON(as >= 2); + BUG_ON(pid >= NUM_TIDS); + BUG_ON(pr >= 2); + + idt->id[as][pid][pr].val = 0; + idt->id[as][pid][pr].pentry = NULL; + + /* Update shadow pid when mappings are changed */ + kvmppc_e500_recalc_shadow_pid(vcpu_e500); +} + +/* + * Map guest (vcpu,AS,ID,PR) to physical core shadow id. + * This function first lookup if a valid mapping exists, + * if not, then creates a new one. + * + * The caller must have preemption disabled, and keep it that way until + * it has finished with the returned shadow id (either written into the + * TLB or arch.shadow_pid, or discarded). 
+ */ +unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, + unsigned int as, unsigned int gid, + unsigned int pr, int avoid_recursion) +{ + struct vcpu_id_table *idt = vcpu_e500->idt; + int sid; + + BUG_ON(as >= 2); + BUG_ON(gid >= NUM_TIDS); + BUG_ON(pr >= 2); + + sid = local_sid_lookup(&idt->id[as][gid][pr]); + + while (sid <= 0) { + /* No mapping yet */ + sid = local_sid_setup_one(&idt->id[as][gid][pr]); + if (sid <= 0) { + _tlbil_all(); + local_sid_destroy_all(); + } + + /* Update shadow pid when mappings are changed */ + if (!avoid_recursion) + kvmppc_e500_recalc_shadow_pid(vcpu_e500); + } + + return sid; +} + +unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_entry *gtlbe) +{ + return kvmppc_e500_get_sid(to_e500(vcpu), get_tlb_ts(gtlbe), + get_tlb_tid(gtlbe), get_cur_pr(vcpu), 0); +} + +void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + if (vcpu->arch.pid != pid) { + vcpu_e500->pid[0] = vcpu->arch.pid = pid; + kvmppc_e500_recalc_shadow_pid(vcpu_e500); + } +} + +/* gtlbe must not be mapped by more than one host tlbe */ +void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe) +{ + struct vcpu_id_table *idt = vcpu_e500->idt; + unsigned int pr, tid, ts, pid; + u32 val, eaddr; + unsigned long flags; + + ts = get_tlb_ts(gtlbe); + tid = get_tlb_tid(gtlbe); + + preempt_disable(); + + /* One guest ID may be mapped to two shadow IDs */ + for (pr = 0; pr < 2; pr++) { + /* + * The shadow PID can have a valid mapping on at most one + * host CPU. In the common case, it will be valid on this + * CPU, in which case we do a local invalidation of the + * specific address. + * + * If the shadow PID is not valid on the current host CPU, + * we invalidate the entire shadow PID. 
+ */ + pid = local_sid_lookup(&idt->id[ts][tid][pr]); + if (pid <= 0) { + kvmppc_e500_id_table_reset_one(vcpu_e500, ts, tid, pr); + continue; + } + + /* + * The guest is invalidating a 4K entry which is in a PID + * that has a valid shadow mapping on this host CPU. We + * search host TLB to invalidate it's shadow TLB entry, + * similar to __tlbil_va except that we need to look in AS1. + */ + val = (pid << MAS6_SPID_SHIFT) | MAS6_SAS; + eaddr = get_tlb_eaddr(gtlbe); + + local_irq_save(flags); + + mtspr(SPRN_MAS6, val); + asm volatile("tlbsx 0, %[eaddr]" : : [eaddr] "r" (eaddr)); + val = mfspr(SPRN_MAS1); + if (val & MAS1_VALID) { + mtspr(SPRN_MAS1, val & ~MAS1_VALID); + asm volatile("tlbwe"); + } + + local_irq_restore(flags); + } + + preempt_enable(); +} + +void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + kvmppc_e500_id_table_reset_all(vcpu_e500); +} + +void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) +{ + /* Recalc shadow pid since MSR changes */ + kvmppc_e500_recalc_shadow_pid(to_e500(vcpu)); +} + +static void kvmppc_core_vcpu_load_e500(struct kvm_vcpu *vcpu, int cpu) +{ + kvmppc_booke_vcpu_load(vcpu, cpu); + + /* Shadow PID may be expired on local core */ + kvmppc_e500_recalc_shadow_pid(to_e500(vcpu)); +} + +static void kvmppc_core_vcpu_put_e500(struct kvm_vcpu *vcpu) +{ +#ifdef CONFIG_SPE + if (vcpu->arch.shadow_msr & MSR_SPE) + kvmppc_vcpu_disable_spe(vcpu); +#endif + + kvmppc_booke_vcpu_put(vcpu); +} + +int kvmppc_core_check_processor_compat(void) +{ + int r; + + if (strcmp(cur_cpu_spec->cpu_name, "e500v2") == 0) + r = 0; + else + r = -ENOTSUPP; + + return r; +} + +static void kvmppc_e500_tlb_setup(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + struct kvm_book3e_206_tlb_entry *tlbe; + + /* Insert large initial mapping for guest. */ + tlbe = get_entry(vcpu_e500, 1, 0); + tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_256M); + tlbe->mas2 = 0; + tlbe->mas7_3 = E500_TLB_SUPER_PERM_MASK; + + /* 4K map for serial output. 
Used by kernel wrapper. */
+	tlbe = get_entry(vcpu_e500, 1, 1);
+	tlbe->mas1 = MAS1_VALID | MAS1_TSIZE(BOOK3E_PAGESZ_4K);
+	tlbe->mas2 = (0xe0004500 & 0xFFFFF000) | MAS2_I | MAS2_G;
+	tlbe->mas7_3 = (0xe0004500 & 0xFFFFF000) | E500_TLB_SUPER_PERM_MASK;
+}
+
+/*
+ * Give a fresh vcpu its initial guest TLB entries and identification
+ * registers (PVR and SVR are copied from the host).  Always returns 0.
+ */
+int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *e500 = to_e500(vcpu);
+
+	kvmppc_e500_tlb_setup(e500);
+
+	vcpu->arch.cpu_type = KVM_CPU_E500V2;
+
+	/* Identification registers mirror the host */
+	vcpu->arch.pvr = mfspr(SPRN_PVR);
+	e500->svr = mfspr(SPRN_SVR);
+
+	return 0;
+}
+
+/*
+ * Fill @sregs with the e500-specific state: feature bits, the
+ * Freescale implementation block (svr/hid0/mcar), the four "high"
+ * IVORs, then the common IVOR and TLB state.  Always returns 0.
+ */
+static int kvmppc_core_get_sregs_e500(struct kvm_vcpu *vcpu,
+				      struct kvm_sregs *sregs)
+{
+	/* ivor_high[n] is sourced from vcpu->arch.ivor[high_prio[n]] */
+	static const unsigned int high_prio[] = {
+		BOOKE_IRQPRIO_SPE_UNAVAIL,
+		BOOKE_IRQPRIO_SPE_FP_DATA,
+		BOOKE_IRQPRIO_SPE_FP_ROUND,
+		BOOKE_IRQPRIO_PERFORMANCE_MONITOR,
+	};
+	struct kvmppc_vcpu_e500 *e500 = to_e500(vcpu);
+	unsigned int n;
+
+	sregs->u.e.features |= KVM_SREGS_E_ARCH206_MMU | KVM_SREGS_E_SPE |
+			       KVM_SREGS_E_PM;
+	sregs->u.e.impl_id = KVM_SREGS_E_IMPL_FSL;
+
+	sregs->u.e.impl.fsl.features = 0;
+	sregs->u.e.impl.fsl.svr = e500->svr;
+	sregs->u.e.impl.fsl.hid0 = e500->hid0;
+	sregs->u.e.impl.fsl.mcar = e500->mcar;
+
+	for (n = 0; n < ARRAY_SIZE(high_prio); n++)
+		sregs->u.e.ivor_high[n] = vcpu->arch.ivor[high_prio[n]];
+
+	kvmppc_get_sregs_ivor(vcpu, sregs);
+	kvmppc_get_sregs_e500_tlb(vcpu, sregs);
+
+	return 0;
+}
+
+static int kvmppc_core_set_sregs_e500(struct kvm_vcpu *vcpu,
+				      struct kvm_sregs *sregs)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+	int ret;
+
+	if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) {
+		vcpu_e500->svr = sregs->u.e.impl.fsl.svr;
+		vcpu_e500->hid0 = sregs->u.e.impl.fsl.hid0;
+		vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar;
+	}
+
+	ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs);
+	if (ret < 0)
+		return ret;
+
+	if (!(sregs->u.e.features & KVM_SREGS_E_IVOR))
+		return 0;
+
+	if (sregs->u.e.features & KVM_SREGS_E_SPE) {
+
vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] =
+			sregs->u.e.ivor_high[0];
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] =
+			sregs->u.e.ivor_high[1];
+		vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] =
+			sregs->u.e.ivor_high[2];
+	}
+
+	if (sregs->u.e.features & KVM_SREGS_E_PM) {
+		vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] =
+			sregs->u.e.ivor_high[3];
+	}
+
+	return kvmppc_set_sregs_ivor(vcpu, sregs);
+}
+
+/* get_one_reg hook: every id is delegated to the e500 TLB code. */
+static int kvmppc_get_one_reg_e500(struct kvm_vcpu *vcpu, u64 id,
+				   union kvmppc_one_reg *val)
+{
+	int r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val);
+	return r;
+}
+
+/*
+ * set_one_reg hook: every id is delegated to the e500 TLB code.
+ * This must call the *set* helper; calling the get helper here would
+ * overwrite the caller's value instead of applying it.
+ */
+static int kvmppc_set_one_reg_e500(struct kvm_vcpu *vcpu, u64 id,
+				   union kvmppc_one_reg *val)
+{
+	int r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val);
+	return r;
+}
+
+/*
+ * Allocate and initialise an e500 vcpu.
+ *
+ * Returns the new vcpu, or an ERR_PTR() carrying a negative errno.
+ * Every failure path sets err before jumping to the unwind labels:
+ * ERR_PTR(0) is NULL, which callers checking IS_ERR() would treat as
+ * success.  The labels undo the steps in reverse order.
+ */
+static struct kvm_vcpu *kvmppc_core_vcpu_create_e500(struct kvm *kvm,
+						     unsigned int id)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500;
+	struct kvm_vcpu *vcpu;
+	int err;
+
+	vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+	if (!vcpu_e500) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	vcpu = &vcpu_e500->vcpu;
+	err = kvm_vcpu_init(vcpu, kvm, id);
+	if (err)
+		goto free_vcpu;
+
+	if (kvmppc_e500_id_table_alloc(vcpu_e500) == NULL) {
+		/* err is still 0 from kvm_vcpu_init() at this point */
+		err = -ENOMEM;
+		goto uninit_vcpu;
+	}
+
+	err = kvmppc_e500_tlb_init(vcpu_e500);
+	if (err)
+		goto uninit_id;
+
+	vcpu->arch.shared = (void*)__get_free_page(GFP_KERNEL|__GFP_ZERO);
+	if (!vcpu->arch.shared) {
+		/* err is still 0 from kvmppc_e500_tlb_init() at this point */
+		err = -ENOMEM;
+		goto uninit_tlb;
+	}
+
+	return vcpu;
+
+uninit_tlb:
+	kvmppc_e500_tlb_uninit(vcpu_e500);
+uninit_id:
+	kvmppc_e500_id_table_free(vcpu_e500);
+uninit_vcpu:
+	kvm_vcpu_uninit(vcpu);
+free_vcpu:
+	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
+out:
+	return ERR_PTR(err);
+}
+
+/* Tear down a vcpu in the reverse order of kvmppc_core_vcpu_create_e500(). */
+static void kvmppc_core_vcpu_free_e500(struct kvm_vcpu *vcpu)
+{
+	struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu);
+
+	free_page((unsigned long)vcpu->arch.shared);
+	kvmppc_e500_tlb_uninit(vcpu_e500);
+	kvmppc_e500_id_table_free(vcpu_e500);
+	kvm_vcpu_uninit(vcpu);
+	kmem_cache_free(kvm_vcpu_cache, vcpu_e500);
+}
+
+static int
kvmppc_core_init_vm_e500(struct kvm *kvm) +{ + return 0; +} + +static void kvmppc_core_destroy_vm_e500(struct kvm *kvm) +{ +} + +static struct kvmppc_ops kvm_ops_e500 = { + .get_sregs = kvmppc_core_get_sregs_e500, + .set_sregs = kvmppc_core_set_sregs_e500, + .get_one_reg = kvmppc_get_one_reg_e500, + .set_one_reg = kvmppc_set_one_reg_e500, + .vcpu_load = kvmppc_core_vcpu_load_e500, + .vcpu_put = kvmppc_core_vcpu_put_e500, + .vcpu_create = kvmppc_core_vcpu_create_e500, + .vcpu_free = kvmppc_core_vcpu_free_e500, + .mmu_destroy = kvmppc_mmu_destroy_e500, + .init_vm = kvmppc_core_init_vm_e500, + .destroy_vm = kvmppc_core_destroy_vm_e500, + .emulate_op = kvmppc_core_emulate_op_e500, + .emulate_mtspr = kvmppc_core_emulate_mtspr_e500, + .emulate_mfspr = kvmppc_core_emulate_mfspr_e500, +}; + +static int __init kvmppc_e500_init(void) +{ + int r, i; + unsigned long ivor[3]; + /* Process remaining handlers above the generic first 16 */ + unsigned long *handler = &kvmppc_booke_handler_addr[16]; + unsigned long handler_len; + unsigned long max_ivor = 0; + + r = kvmppc_core_check_processor_compat(); + if (r) + goto err_out; + + r = kvmppc_booke_init(); + if (r) + goto err_out; + + /* copy extra E500 exception handlers */ + ivor[0] = mfspr(SPRN_IVOR32); + ivor[1] = mfspr(SPRN_IVOR33); + ivor[2] = mfspr(SPRN_IVOR34); + for (i = 0; i < 3; i++) { + if (ivor[i] > ivor[max_ivor]) + max_ivor = i; + + handler_len = handler[i + 1] - handler[i]; + memcpy((void *)kvmppc_booke_handlers + ivor[i], + (void *)handler[i], handler_len); + } + handler_len = handler[max_ivor + 1] - handler[max_ivor]; + flush_icache_range(kvmppc_booke_handlers, kvmppc_booke_handlers + + ivor[max_ivor] + handler_len); + + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + if (r) + goto err_out; + kvm_ops_e500.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_e500; + +err_out: + return r; +} + +static void __exit kvmppc_e500_exit(void) +{ + kvmppc_pr_ops = NULL; + kvmppc_booke_exit(); +} + 
+module_init(kvmppc_e500_init); +module_exit(kvmppc_e500_exit); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); diff --git a/kernel/arch/powerpc/kvm/e500.h b/kernel/arch/powerpc/kvm/e500.h new file mode 100644 index 000000000..72920bed3 --- /dev/null +++ b/kernel/arch/powerpc/kvm/e500.h @@ -0,0 +1,342 @@ +/* + * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu <yu.liu@freescale.com> + * Scott Wood <scottwood@freescale.com> + * Ashish Kalra <ashish.kalra@freescale.com> + * Varun Sethi <varun.sethi@freescale.com> + * + * Description: + * This file is based on arch/powerpc/kvm/44x_tlb.h and + * arch/powerpc/include/asm/kvm_44x.h by Hollis Blanchard <hollisb@us.ibm.com>, + * Copyright IBM Corp. 2007-2008 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#ifndef KVM_E500_H +#define KVM_E500_H + +#include <linux/kvm_host.h> +#include <asm/mmu-book3e.h> +#include <asm/tlb.h> +#include <asm/cputhreads.h> + +enum vcpu_ftr { + VCPU_FTR_MMU_V2 +}; + +#define E500_PID_NUM 3 +#define E500_TLB_NUM 2 + +/* entry is mapped somewhere in host TLB */ +#define E500_TLB_VALID (1 << 31) +/* TLB1 entry is mapped by host TLB1, tracked by bitmaps */ +#define E500_TLB_BITMAP (1 << 30) +/* TLB1 entry is mapped by host TLB0 */ +#define E500_TLB_TLB0 (1 << 29) +/* bits [6-5] MAS2_X1 and MAS2_X0 and [4-0] bits for WIMGE */ +#define E500_TLB_MAS2_ATTR (0x7f) + +struct tlbe_ref { + pfn_t pfn; /* valid only for TLB0, except briefly */ + unsigned int flags; /* E500_TLB_* */ +}; + +struct tlbe_priv { + struct tlbe_ref ref; +}; + +#ifdef CONFIG_KVM_E500V2 +struct vcpu_id_table; +#endif + +struct kvmppc_e500_tlb_params { + int entries, ways, sets; +}; + +struct kvmppc_vcpu_e500 { + struct kvm_vcpu vcpu; + + /* Unmodified copy of the guest's TLB -- shared with host userspace. 
*/ + struct kvm_book3e_206_tlb_entry *gtlb_arch; + + /* Starting entry number in gtlb_arch[] */ + int gtlb_offset[E500_TLB_NUM]; + + /* KVM internal information associated with each guest TLB entry */ + struct tlbe_priv *gtlb_priv[E500_TLB_NUM]; + + struct kvmppc_e500_tlb_params gtlb_params[E500_TLB_NUM]; + + unsigned int gtlb_nv[E500_TLB_NUM]; + + unsigned int host_tlb1_nv; + + u32 svr; + u32 l1csr0; + u32 l1csr1; + u32 hid0; + u32 hid1; + u64 mcar; + + struct page **shared_tlb_pages; + int num_shared_tlb_pages; + + u64 *g2h_tlb1_map; + unsigned int *h2g_tlb1_rmap; + + /* Minimum and maximum address mapped my TLB1 */ + unsigned long tlb1_min_eaddr; + unsigned long tlb1_max_eaddr; + +#ifdef CONFIG_KVM_E500V2 + u32 pid[E500_PID_NUM]; + + /* vcpu id table */ + struct vcpu_id_table *idt; +#endif +}; + +static inline struct kvmppc_vcpu_e500 *to_e500(struct kvm_vcpu *vcpu) +{ + return container_of(vcpu, struct kvmppc_vcpu_e500, vcpu); +} + + +/* This geometry is the legacy default -- can be overridden by userspace */ +#define KVM_E500_TLB0_WAY_SIZE 128 +#define KVM_E500_TLB0_WAY_NUM 2 + +#define KVM_E500_TLB0_SIZE (KVM_E500_TLB0_WAY_SIZE * KVM_E500_TLB0_WAY_NUM) +#define KVM_E500_TLB1_SIZE 16 + +#define index_of(tlbsel, esel) (((tlbsel) << 16) | ((esel) & 0xFFFF)) +#define tlbsel_of(index) ((index) >> 16) +#define esel_of(index) ((index) & 0xFFFF) + +#define E500_TLB_USER_PERM_MASK (MAS3_UX|MAS3_UR|MAS3_UW) +#define E500_TLB_SUPER_PERM_MASK (MAS3_SX|MAS3_SR|MAS3_SW) +#define MAS2_ATTRIB_MASK \ + (MAS2_X0 | MAS2_X1 | MAS2_E | MAS2_G) +#define MAS3_ATTRIB_MASK \ + (MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3 \ + | E500_TLB_USER_PERM_MASK | E500_TLB_SUPER_PERM_MASK) + +int kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, + ulong value); +int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu); +int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu); +int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea); +int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, 
gva_t ea); +int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea); +int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500); +void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500); + +void kvmppc_get_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); +int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs); + +int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val); +int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val); + +#ifdef CONFIG_KVM_E500V2 +unsigned int kvmppc_e500_get_sid(struct kvmppc_vcpu_e500 *vcpu_e500, + unsigned int as, unsigned int gid, + unsigned int pr, int avoid_recursion); +#endif + +/* TLB helper functions */ +static inline unsigned int +get_tlb_size(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 >> 7) & 0x1f; +} + +static inline gva_t get_tlb_eaddr(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return tlbe->mas2 & MAS2_EPN; +} + +static inline u64 get_tlb_bytes(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + unsigned int pgsize = get_tlb_size(tlbe); + return 1ULL << 10 << pgsize; +} + +static inline gva_t get_tlb_end(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + u64 bytes = get_tlb_bytes(tlbe); + return get_tlb_eaddr(tlbe) + bytes - 1; +} + +static inline u64 get_tlb_raddr(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return tlbe->mas7_3 & ~0xfffULL; +} + +static inline unsigned int +get_tlb_tid(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 >> 16) & 0xff; +} + +static inline unsigned int +get_tlb_ts(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 >> 12) & 0x1; +} + +static inline unsigned int +get_tlb_v(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 >> 31) & 0x1; +} + +static inline unsigned int +get_tlb_iprot(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 >> 30) & 0x1; +} + +static inline 
unsigned int +get_tlb_tsize(const struct kvm_book3e_206_tlb_entry *tlbe) +{ + return (tlbe->mas1 & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT; +} + +static inline unsigned int get_cur_pid(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.pid & 0xff; +} + +static inline unsigned int get_cur_as(struct kvm_vcpu *vcpu) +{ + return !!(vcpu->arch.shared->msr & (MSR_IS | MSR_DS)); +} + +static inline unsigned int get_cur_pr(struct kvm_vcpu *vcpu) +{ + return !!(vcpu->arch.shared->msr & MSR_PR); +} + +static inline unsigned int get_cur_spid(const struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.shared->mas6 >> 16) & 0xff; +} + +static inline unsigned int get_cur_sas(const struct kvm_vcpu *vcpu) +{ + return vcpu->arch.shared->mas6 & 0x1; +} + +static inline unsigned int get_tlb_tlbsel(const struct kvm_vcpu *vcpu) +{ + /* + * Manual says that tlbsel has 2 bits wide. + * Since we only have two TLBs, only lower bit is used. + */ + return (vcpu->arch.shared->mas0 >> 28) & 0x1; +} + +static inline unsigned int get_tlb_nv_bit(const struct kvm_vcpu *vcpu) +{ + return vcpu->arch.shared->mas0 & 0xfff; +} + +static inline unsigned int get_tlb_esel_bit(const struct kvm_vcpu *vcpu) +{ + return (vcpu->arch.shared->mas0 >> 16) & 0xfff; +} + +static inline int tlbe_is_host_safe(const struct kvm_vcpu *vcpu, + const struct kvm_book3e_206_tlb_entry *tlbe) +{ + gpa_t gpa; + + if (!get_tlb_v(tlbe)) + return 0; + +#ifndef CONFIG_KVM_BOOKE_HV + /* Does it match current guest AS? */ + /* XXX what about IS != DS? */ + if (get_tlb_ts(tlbe) != !!(vcpu->arch.shared->msr & MSR_IS)) + return 0; +#endif + + gpa = get_tlb_raddr(tlbe); + if (!gfn_to_memslot(vcpu->kvm, gpa >> PAGE_SHIFT)) + /* Mapping is not for RAM. 
*/ + return 0; + + return 1; +} + +static inline struct kvm_book3e_206_tlb_entry *get_entry( + struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, int entry) +{ + int offset = vcpu_e500->gtlb_offset[tlbsel]; + return &vcpu_e500->gtlb_arch[offset + entry]; +} + +void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe); +void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500); + +#ifdef CONFIG_KVM_BOOKE_HV +#define kvmppc_e500_get_tlb_stid(vcpu, gtlbe) get_tlb_tid(gtlbe) +#define get_tlbmiss_tid(vcpu) get_cur_pid(vcpu) +#define get_tlb_sts(gtlbe) (gtlbe->mas1 & MAS1_TS) + +/* + * These functions should be called with preemption disabled + * and the returned value is valid only in that context + */ +static inline int get_thread_specific_lpid(int vm_lpid) +{ + int vcpu_lpid = vm_lpid; + + if (threads_per_core == 2) + vcpu_lpid |= smp_processor_id() & 1; + + return vcpu_lpid; +} + +static inline int get_lpid(struct kvm_vcpu *vcpu) +{ + return get_thread_specific_lpid(vcpu->kvm->arch.lpid); +} +#else +unsigned int kvmppc_e500_get_tlb_stid(struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_entry *gtlbe); + +static inline unsigned int get_tlbmiss_tid(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + unsigned int tidseld = (vcpu->arch.shared->mas4 >> 16) & 0xf; + + return vcpu_e500->pid[tidseld]; +} + +/* Force TS=1 for all guest mappings. 
*/ +#define get_tlb_sts(gtlbe) (MAS1_TS) +#endif /* !BOOKE_HV */ + +static inline bool has_feature(const struct kvm_vcpu *vcpu, + enum vcpu_ftr ftr) +{ + bool has_ftr; + switch (ftr) { + case VCPU_FTR_MMU_V2: + has_ftr = ((vcpu->arch.mmucfg & MMUCFG_MAVN) == MMUCFG_MAVN_V2); + break; + default: + return false; + } + return has_ftr; +} + +#endif /* KVM_E500_H */ diff --git a/kernel/arch/powerpc/kvm/e500_emulate.c b/kernel/arch/powerpc/kvm/e500_emulate.c new file mode 100644 index 000000000..ce7291c79 --- /dev/null +++ b/kernel/arch/powerpc/kvm/e500_emulate.c @@ -0,0 +1,430 @@ +/* + * Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu, <yu.liu@freescale.com> + * + * Description: + * This file is derived from arch/powerpc/kvm/44x_emulate.c, + * by Hollis Blanchard <hollisb@us.ibm.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ */ + +#include <asm/kvm_ppc.h> +#include <asm/disassemble.h> +#include <asm/dbell.h> + +#include "booke.h" +#include "e500.h" + +#define XOP_DCBTLS 166 +#define XOP_MSGSND 206 +#define XOP_MSGCLR 238 +#define XOP_TLBIVAX 786 +#define XOP_TLBSX 914 +#define XOP_TLBRE 946 +#define XOP_TLBWE 978 +#define XOP_TLBILX 18 +#define XOP_EHPRIV 270 + +#ifdef CONFIG_KVM_E500MC +static int dbell2prio(ulong param) +{ + int msg = param & PPC_DBELL_TYPE_MASK; + int prio = -1; + + switch (msg) { + case PPC_DBELL_TYPE(PPC_DBELL): + prio = BOOKE_IRQPRIO_DBELL; + break; + case PPC_DBELL_TYPE(PPC_DBELL_CRIT): + prio = BOOKE_IRQPRIO_DBELL_CRIT; + break; + default: + break; + } + + return prio; +} + +static int kvmppc_e500_emul_msgclr(struct kvm_vcpu *vcpu, int rb) +{ + ulong param = vcpu->arch.gpr[rb]; + int prio = dbell2prio(param); + + if (prio < 0) + return EMULATE_FAIL; + + clear_bit(prio, &vcpu->arch.pending_exceptions); + return EMULATE_DONE; +} + +/* + * Emulate msgsnd: decode the doorbell message held in GPR[rb] and post the + * matching doorbell exception to every vcpu whose PIR equals the message's + * PIR field, or to all vcpus when the broadcast bit is set. Returns + * EMULATE_FAIL when the message type is not a supported doorbell. + */ +static int kvmppc_e500_emul_msgsnd(struct kvm_vcpu *vcpu, int rb) +{ + ulong param = vcpu->arch.gpr[rb]; + /* The message type is in the register contents, not the register number. */ + int prio = dbell2prio(param); + int pir = param & PPC_DBELL_PIR_MASK; + int i; + struct kvm_vcpu *cvcpu; + + if (prio < 0) + return EMULATE_FAIL; + + kvm_for_each_vcpu(i, cvcpu, vcpu->kvm) { + int cpir = cvcpu->arch.shared->pir; + if ((param & PPC_DBELL_MSG_BRDCAST) || (cpir == pir)) { + set_bit(prio, &cvcpu->arch.pending_exceptions); + kvm_vcpu_kick(cvcpu); + } + } + + return EMULATE_DONE; +} +#endif + +static int kvmppc_e500_emul_ehpriv(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + int emulated = EMULATE_DONE; + + switch (get_oc(inst)) { + case EHPRIV_OC_DEBUG: + run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.address = vcpu->arch.pc; + run->debug.arch.status = 0; + kvmppc_account_exit(vcpu, DEBUG_EXITS); + emulated = EMULATE_EXIT_USER; + *advance = 0; + break; + default: + emulated = EMULATE_FAIL; + } + return emulated; +} + +static int kvmppc_e500_emul_dcbtls(struct kvm_vcpu *vcpu) +{
+ struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + /* Always fail to lock the cache */ + vcpu_e500->l1csr0 |= L1CSR0_CUL; + return EMULATE_DONE; +} + +int kvmppc_core_emulate_op_e500(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int inst, int *advance) +{ + int emulated = EMULATE_DONE; + int ra = get_ra(inst); + int rb = get_rb(inst); + int rt = get_rt(inst); + gva_t ea; + + switch (get_op(inst)) { + case 31: + switch (get_xop(inst)) { + + case XOP_DCBTLS: + emulated = kvmppc_e500_emul_dcbtls(vcpu); + break; + +#ifdef CONFIG_KVM_E500MC + case XOP_MSGSND: + emulated = kvmppc_e500_emul_msgsnd(vcpu, rb); + break; + + case XOP_MSGCLR: + emulated = kvmppc_e500_emul_msgclr(vcpu, rb); + break; +#endif + + case XOP_TLBRE: + emulated = kvmppc_e500_emul_tlbre(vcpu); + break; + + case XOP_TLBWE: + emulated = kvmppc_e500_emul_tlbwe(vcpu); + break; + + case XOP_TLBSX: + ea = kvmppc_get_ea_indexed(vcpu, ra, rb); + emulated = kvmppc_e500_emul_tlbsx(vcpu, ea); + break; + + case XOP_TLBILX: { + int type = rt & 0x3; + ea = kvmppc_get_ea_indexed(vcpu, ra, rb); + emulated = kvmppc_e500_emul_tlbilx(vcpu, type, ea); + break; + } + + case XOP_TLBIVAX: + ea = kvmppc_get_ea_indexed(vcpu, ra, rb); + emulated = kvmppc_e500_emul_tlbivax(vcpu, ea); + break; + + case XOP_EHPRIV: + emulated = kvmppc_e500_emul_ehpriv(run, vcpu, inst, + advance); + break; + + default: + emulated = EMULATE_FAIL; + } + + break; + + default: + emulated = EMULATE_FAIL; + } + + if (emulated == EMULATE_FAIL) + emulated = kvmppc_booke_emulate_op(run, vcpu, inst, advance); + + return emulated; +} + +int kvmppc_core_emulate_mtspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int emulated = EMULATE_DONE; + + switch (sprn) { +#ifndef CONFIG_KVM_BOOKE_HV + case SPRN_PID: + kvmppc_set_pid(vcpu, spr_val); + break; + case SPRN_PID1: + if (spr_val != 0) + return EMULATE_FAIL; + vcpu_e500->pid[1] = spr_val; + break; + case SPRN_PID2: + if (spr_val != 
0) + return EMULATE_FAIL; + vcpu_e500->pid[2] = spr_val; + break; + case SPRN_MAS0: + vcpu->arch.shared->mas0 = spr_val; + break; + case SPRN_MAS1: + vcpu->arch.shared->mas1 = spr_val; + break; + case SPRN_MAS2: + vcpu->arch.shared->mas2 = spr_val; + break; + case SPRN_MAS3: + vcpu->arch.shared->mas7_3 &= ~(u64)0xffffffff; + vcpu->arch.shared->mas7_3 |= spr_val; + break; + case SPRN_MAS4: + vcpu->arch.shared->mas4 = spr_val; + break; + case SPRN_MAS6: + vcpu->arch.shared->mas6 = spr_val; + break; + case SPRN_MAS7: + vcpu->arch.shared->mas7_3 &= (u64)0xffffffff; + vcpu->arch.shared->mas7_3 |= (u64)spr_val << 32; + break; +#endif + case SPRN_L1CSR0: + vcpu_e500->l1csr0 = spr_val; + vcpu_e500->l1csr0 &= ~(L1CSR0_DCFI | L1CSR0_CLFC); + break; + case SPRN_L1CSR1: + vcpu_e500->l1csr1 = spr_val; + vcpu_e500->l1csr1 &= ~(L1CSR1_ICFI | L1CSR1_ICLFR); + break; + case SPRN_HID0: + vcpu_e500->hid0 = spr_val; + break; + case SPRN_HID1: + vcpu_e500->hid1 = spr_val; + break; + + case SPRN_MMUCSR0: + emulated = kvmppc_e500_emul_mt_mmucsr0(vcpu_e500, + spr_val); + break; + + case SPRN_PWRMGTCR0: + /* + * Guest relies on host power management configurations + * Treat the request as a general store + */ + vcpu->arch.pwrmgtcr0 = spr_val; + break; + + /* extra exceptions */ +#ifdef CONFIG_SPE_POSSIBLE + case SPRN_IVOR32: + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL] = spr_val; + break; + case SPRN_IVOR33: + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA] = spr_val; + break; + case SPRN_IVOR34: + vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND] = spr_val; + break; +#endif +#ifdef CONFIG_ALTIVEC + case SPRN_IVOR32: + vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_UNAVAIL] = spr_val; + break; + case SPRN_IVOR33: + vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_ASSIST] = spr_val; + break; +#endif + case SPRN_IVOR35: + vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = spr_val; + break; +#ifdef CONFIG_KVM_BOOKE_HV + case SPRN_IVOR36: + vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] = spr_val; + break; + case SPRN_IVOR37: + 
vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] = spr_val; + break; +#endif + default: + emulated = kvmppc_booke_emulate_mtspr(vcpu, sprn, spr_val); + } + + return emulated; +} + +int kvmppc_core_emulate_mfspr_e500(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int emulated = EMULATE_DONE; + + switch (sprn) { +#ifndef CONFIG_KVM_BOOKE_HV + case SPRN_PID: + *spr_val = vcpu_e500->pid[0]; + break; + case SPRN_PID1: + *spr_val = vcpu_e500->pid[1]; + break; + case SPRN_PID2: + *spr_val = vcpu_e500->pid[2]; + break; + case SPRN_MAS0: + *spr_val = vcpu->arch.shared->mas0; + break; + case SPRN_MAS1: + *spr_val = vcpu->arch.shared->mas1; + break; + case SPRN_MAS2: + *spr_val = vcpu->arch.shared->mas2; + break; + case SPRN_MAS3: + *spr_val = (u32)vcpu->arch.shared->mas7_3; + break; + case SPRN_MAS4: + *spr_val = vcpu->arch.shared->mas4; + break; + case SPRN_MAS6: + *spr_val = vcpu->arch.shared->mas6; + break; + case SPRN_MAS7: + *spr_val = vcpu->arch.shared->mas7_3 >> 32; + break; +#endif + case SPRN_DECAR: + *spr_val = vcpu->arch.decar; + break; + case SPRN_TLB0CFG: + *spr_val = vcpu->arch.tlbcfg[0]; + break; + case SPRN_TLB1CFG: + *spr_val = vcpu->arch.tlbcfg[1]; + break; + case SPRN_TLB0PS: + if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) + return EMULATE_FAIL; + *spr_val = vcpu->arch.tlbps[0]; + break; + case SPRN_TLB1PS: + if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) + return EMULATE_FAIL; + *spr_val = vcpu->arch.tlbps[1]; + break; + case SPRN_L1CSR0: + *spr_val = vcpu_e500->l1csr0; + break; + case SPRN_L1CSR1: + *spr_val = vcpu_e500->l1csr1; + break; + case SPRN_HID0: + *spr_val = vcpu_e500->hid0; + break; + case SPRN_HID1: + *spr_val = vcpu_e500->hid1; + break; + case SPRN_SVR: + *spr_val = vcpu_e500->svr; + break; + + case SPRN_MMUCSR0: + *spr_val = 0; + break; + + case SPRN_MMUCFG: + *spr_val = vcpu->arch.mmucfg; + break; + case SPRN_EPTCFG: + if (!has_feature(vcpu, VCPU_FTR_MMU_V2)) + return EMULATE_FAIL; + /* + * Legacy 
Linux guests access EPTCFG register even if the E.PT + * category is disabled in the VM. Give them a chance to live. + */ + *spr_val = vcpu->arch.eptcfg; + break; + + case SPRN_PWRMGTCR0: + *spr_val = vcpu->arch.pwrmgtcr0; + break; + + /* extra exceptions */ +#ifdef CONFIG_SPE_POSSIBLE + case SPRN_IVOR32: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_UNAVAIL]; + break; + case SPRN_IVOR33: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_DATA]; + break; + case SPRN_IVOR34: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_SPE_FP_ROUND]; + break; +#endif +#ifdef CONFIG_ALTIVEC + case SPRN_IVOR32: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_UNAVAIL]; + break; + case SPRN_IVOR33: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_ALTIVEC_ASSIST]; + break; +#endif + case SPRN_IVOR35: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; + break; +#ifdef CONFIG_KVM_BOOKE_HV + case SPRN_IVOR36: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL]; + break; + case SPRN_IVOR37: + *spr_val = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT]; + break; +#endif + default: + emulated = kvmppc_booke_emulate_mfspr(vcpu, sprn, spr_val); + } + + return emulated; +} + diff --git a/kernel/arch/powerpc/kvm/e500_mmu.c b/kernel/arch/powerpc/kvm/e500_mmu.c new file mode 100644 index 000000000..50860e919 --- /dev/null +++ b/kernel/arch/powerpc/kvm/e500_mmu.c @@ -0,0 +1,962 @@ +/* + * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu, yu.liu@freescale.com + * Scott Wood, scottwood@freescale.com + * Ashish Kalra, ashish.kalra@freescale.com + * Varun Sethi, varun.sethi@freescale.com + * Alexander Graf, agraf@suse.de + * + * Description: + * This file is based on arch/powerpc/kvm/44x_tlb.c, + * by Hollis Blanchard <hollisb@us.ibm.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <linux/log2.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <linux/vmalloc.h> +#include <linux/hugetlb.h> +#include <asm/kvm_ppc.h> + +#include "e500.h" +#include "trace_booke.h" +#include "timing.h" +#include "e500_mmu_host.h" + +static inline unsigned int gtlb0_get_next_victim( + struct kvmppc_vcpu_e500 *vcpu_e500) +{ + unsigned int victim; + + victim = vcpu_e500->gtlb_nv[0]++; + if (unlikely(vcpu_e500->gtlb_nv[0] >= vcpu_e500->gtlb_params[0].ways)) + vcpu_e500->gtlb_nv[0] = 0; + + return victim; +} + +static int tlb0_set_base(gva_t addr, int sets, int ways) +{ + int set_base; + + set_base = (addr >> PAGE_SHIFT) & (sets - 1); + set_base *= ways; + + return set_base; +} + +static int gtlb0_set_base(struct kvmppc_vcpu_e500 *vcpu_e500, gva_t addr) +{ + return tlb0_set_base(addr, vcpu_e500->gtlb_params[0].sets, + vcpu_e500->gtlb_params[0].ways); +} + +static unsigned int get_tlb_esel(struct kvm_vcpu *vcpu, int tlbsel) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int esel = get_tlb_esel_bit(vcpu); + + if (tlbsel == 0) { + esel &= vcpu_e500->gtlb_params[0].ways - 1; + esel += gtlb0_set_base(vcpu_e500, vcpu->arch.shared->mas2); + } else { + esel &= vcpu_e500->gtlb_params[tlbsel].entries - 1; + } + + return esel; +} + +/* Search the guest TLB for a matching entry. 
*/ +static int kvmppc_e500_tlb_index(struct kvmppc_vcpu_e500 *vcpu_e500, + gva_t eaddr, int tlbsel, unsigned int pid, int as) +{ + int size = vcpu_e500->gtlb_params[tlbsel].entries; + unsigned int set_base, offset; + int i; + + if (tlbsel == 0) { + set_base = gtlb0_set_base(vcpu_e500, eaddr); + size = vcpu_e500->gtlb_params[0].ways; + } else { + if (eaddr < vcpu_e500->tlb1_min_eaddr || + eaddr > vcpu_e500->tlb1_max_eaddr) + return -1; + set_base = 0; + } + + offset = vcpu_e500->gtlb_offset[tlbsel]; + + for (i = 0; i < size; i++) { + struct kvm_book3e_206_tlb_entry *tlbe = + &vcpu_e500->gtlb_arch[offset + set_base + i]; + unsigned int tid; + + if (eaddr < get_tlb_eaddr(tlbe)) + continue; + + if (eaddr > get_tlb_end(tlbe)) + continue; + + tid = get_tlb_tid(tlbe); + if (tid && (tid != pid)) + continue; + + if (!get_tlb_v(tlbe)) + continue; + + if (get_tlb_ts(tlbe) != as && as != -1) + continue; + + return set_base + i; + } + + return -1; +} + +static inline void kvmppc_e500_deliver_tlb_miss(struct kvm_vcpu *vcpu, + gva_t eaddr, int as) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + unsigned int victim, tsized; + int tlbsel; + + /* since we only have two TLBs, only lower bit is used. */ + tlbsel = (vcpu->arch.shared->mas4 >> 28) & 0x1; + victim = (tlbsel == 0) ? gtlb0_get_next_victim(vcpu_e500) : 0; + tsized = (vcpu->arch.shared->mas4 >> 7) & 0x1f; + + vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(victim) + | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); + vcpu->arch.shared->mas1 = MAS1_VALID | (as ? MAS1_TS : 0) + | MAS1_TID(get_tlbmiss_tid(vcpu)) + | MAS1_TSIZE(tsized); + vcpu->arch.shared->mas2 = (eaddr & MAS2_EPN) + | (vcpu->arch.shared->mas4 & MAS2_ATTRIB_MASK); + vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | MAS3_U2 | MAS3_U3; + vcpu->arch.shared->mas6 = (vcpu->arch.shared->mas6 & MAS6_SPID1) + | (get_cur_pid(vcpu) << 16) + | (as ? 
MAS6_SAS : 0); +} + +static void kvmppc_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int size = vcpu_e500->gtlb_params[1].entries; + unsigned int offset; + gva_t eaddr; + int i; + + vcpu_e500->tlb1_min_eaddr = ~0UL; + vcpu_e500->tlb1_max_eaddr = 0; + offset = vcpu_e500->gtlb_offset[1]; + + for (i = 0; i < size; i++) { + struct kvm_book3e_206_tlb_entry *tlbe = + &vcpu_e500->gtlb_arch[offset + i]; + + if (!get_tlb_v(tlbe)) + continue; + + eaddr = get_tlb_eaddr(tlbe); + vcpu_e500->tlb1_min_eaddr = + min(vcpu_e500->tlb1_min_eaddr, eaddr); + + eaddr = get_tlb_end(tlbe); + vcpu_e500->tlb1_max_eaddr = + max(vcpu_e500->tlb1_max_eaddr, eaddr); + } +} + +static int kvmppc_need_recalc_tlb1map_range(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe) +{ + unsigned long start, end, size; + + size = get_tlb_bytes(gtlbe); + start = get_tlb_eaddr(gtlbe) & ~(size - 1); + end = start + size - 1; + + return vcpu_e500->tlb1_min_eaddr == start || + vcpu_e500->tlb1_max_eaddr == end; +} + +/* This function is supposed to be called for a adding a new valid tlb entry */ +static void kvmppc_set_tlb1map_range(struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_entry *gtlbe) +{ + unsigned long start, end, size; + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + if (!get_tlb_v(gtlbe)) + return; + + size = get_tlb_bytes(gtlbe); + start = get_tlb_eaddr(gtlbe) & ~(size - 1); + end = start + size - 1; + + vcpu_e500->tlb1_min_eaddr = min(vcpu_e500->tlb1_min_eaddr, start); + vcpu_e500->tlb1_max_eaddr = max(vcpu_e500->tlb1_max_eaddr, end); +} + +static inline int kvmppc_e500_gtlbe_invalidate( + struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel, int esel) +{ + struct kvm_book3e_206_tlb_entry *gtlbe = + get_entry(vcpu_e500, tlbsel, esel); + + if (unlikely(get_tlb_iprot(gtlbe))) + return -1; + + if (tlbsel == 1 && kvmppc_need_recalc_tlb1map_range(vcpu_e500, gtlbe)) + kvmppc_recalc_tlb1map_range(vcpu_e500); + + gtlbe->mas1 = 0; + + return 0; +} + +int 
kvmppc_e500_emul_mt_mmucsr0(struct kvmppc_vcpu_e500 *vcpu_e500, ulong value) +{ + int esel; + + if (value & MMUCSR0_TLB0FI) + for (esel = 0; esel < vcpu_e500->gtlb_params[0].entries; esel++) + kvmppc_e500_gtlbe_invalidate(vcpu_e500, 0, esel); + if (value & MMUCSR0_TLB1FI) + for (esel = 0; esel < vcpu_e500->gtlb_params[1].entries; esel++) + kvmppc_e500_gtlbe_invalidate(vcpu_e500, 1, esel); + + /* Invalidate all host shadow mappings */ + kvmppc_core_flush_tlb(&vcpu_e500->vcpu); + + return EMULATE_DONE; +} + +int kvmppc_e500_emul_tlbivax(struct kvm_vcpu *vcpu, gva_t ea) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + unsigned int ia; + int esel, tlbsel; + + ia = (ea >> 2) & 0x1; + + /* since we only have two TLBs, only lower bit is used. */ + tlbsel = (ea >> 3) & 0x1; + + if (ia) { + /* invalidate all entries */ + for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; + esel++) + kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); + } else { + ea &= 0xfffff000; + esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, + get_cur_pid(vcpu), -1); + if (esel >= 0) + kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); + } + + /* Invalidate all host shadow mappings */ + kvmppc_core_flush_tlb(&vcpu_e500->vcpu); + + return EMULATE_DONE; +} + +static void tlbilx_all(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, + int pid, int type) +{ + struct kvm_book3e_206_tlb_entry *tlbe; + int tid, esel; + + /* invalidate all entries */ + for (esel = 0; esel < vcpu_e500->gtlb_params[tlbsel].entries; esel++) { + tlbe = get_entry(vcpu_e500, tlbsel, esel); + tid = get_tlb_tid(tlbe); + if (type == 0 || tid == pid) { + inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); + kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); + } + } +} + +static void tlbilx_one(struct kvmppc_vcpu_e500 *vcpu_e500, int pid, + gva_t ea) +{ + int tlbsel, esel; + + for (tlbsel = 0; tlbsel < 2; tlbsel++) { + esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, -1); + if (esel >= 0) { + 
inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); + kvmppc_e500_gtlbe_invalidate(vcpu_e500, tlbsel, esel); + break; + } + } +} + +int kvmppc_e500_emul_tlbilx(struct kvm_vcpu *vcpu, int type, gva_t ea) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int pid = get_cur_spid(vcpu); + + if (type == 0 || type == 1) { + tlbilx_all(vcpu_e500, 0, pid, type); + tlbilx_all(vcpu_e500, 1, pid, type); + } else if (type == 3) { + tlbilx_one(vcpu_e500, pid, ea); + } + + return EMULATE_DONE; +} + +int kvmppc_e500_emul_tlbre(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int tlbsel, esel; + struct kvm_book3e_206_tlb_entry *gtlbe; + + tlbsel = get_tlb_tlbsel(vcpu); + esel = get_tlb_esel(vcpu, tlbsel); + + gtlbe = get_entry(vcpu_e500, tlbsel, esel); + vcpu->arch.shared->mas0 &= ~MAS0_NV(~0); + vcpu->arch.shared->mas0 |= MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); + vcpu->arch.shared->mas1 = gtlbe->mas1; + vcpu->arch.shared->mas2 = gtlbe->mas2; + vcpu->arch.shared->mas7_3 = gtlbe->mas7_3; + + return EMULATE_DONE; +} + +int kvmppc_e500_emul_tlbsx(struct kvm_vcpu *vcpu, gva_t ea) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int as = !!get_cur_sas(vcpu); + unsigned int pid = get_cur_spid(vcpu); + int esel, tlbsel; + struct kvm_book3e_206_tlb_entry *gtlbe = NULL; + + for (tlbsel = 0; tlbsel < 2; tlbsel++) { + esel = kvmppc_e500_tlb_index(vcpu_e500, ea, tlbsel, pid, as); + if (esel >= 0) { + gtlbe = get_entry(vcpu_e500, tlbsel, esel); + break; + } + } + + if (gtlbe) { + esel &= vcpu_e500->gtlb_params[tlbsel].ways - 1; + + vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) | MAS0_ESEL(esel) + | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); + vcpu->arch.shared->mas1 = gtlbe->mas1; + vcpu->arch.shared->mas2 = gtlbe->mas2; + vcpu->arch.shared->mas7_3 = gtlbe->mas7_3; + } else { + int victim; + + /* since we only have two TLBs, only lower bit is used. */ + tlbsel = vcpu->arch.shared->mas4 >> 28 & 0x1; + victim = (tlbsel == 0) ? 
gtlb0_get_next_victim(vcpu_e500) : 0; + + vcpu->arch.shared->mas0 = MAS0_TLBSEL(tlbsel) + | MAS0_ESEL(victim) + | MAS0_NV(vcpu_e500->gtlb_nv[tlbsel]); + /* + * No match: seed MAS1 with the search PID from MAS6 and the + * default TSIZE from MAS4, and set TS according to the search + * address space bit (MAS6[SAS]). + */ + vcpu->arch.shared->mas1 = + (vcpu->arch.shared->mas6 & MAS6_SPID0) + | ((vcpu->arch.shared->mas6 & MAS6_SAS) ? MAS1_TS : 0) + | (vcpu->arch.shared->mas4 & MAS4_TSIZED(~0)); + vcpu->arch.shared->mas2 &= MAS2_EPN; + vcpu->arch.shared->mas2 |= vcpu->arch.shared->mas4 & + MAS2_ATTRIB_MASK; + vcpu->arch.shared->mas7_3 &= MAS3_U0 | MAS3_U1 | + MAS3_U2 | MAS3_U3; + } + + kvmppc_set_exit_type(vcpu, EMULATED_TLBSX_EXITS); + return EMULATE_DONE; +} + +int kvmppc_e500_emul_tlbwe(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct kvm_book3e_206_tlb_entry *gtlbe; + int tlbsel, esel; + int recal = 0; + int idx; + + tlbsel = get_tlb_tlbsel(vcpu); + esel = get_tlb_esel(vcpu, tlbsel); + + gtlbe = get_entry(vcpu_e500, tlbsel, esel); + + if (get_tlb_v(gtlbe)) { + inval_gtlbe_on_host(vcpu_e500, tlbsel, esel); + if ((tlbsel == 1) && + kvmppc_need_recalc_tlb1map_range(vcpu_e500, gtlbe)) + recal = 1; + } + + gtlbe->mas1 = vcpu->arch.shared->mas1; + gtlbe->mas2 = vcpu->arch.shared->mas2; + if (!(vcpu->arch.shared->msr & MSR_CM)) + gtlbe->mas2 &= 0xffffffffUL; + gtlbe->mas7_3 = vcpu->arch.shared->mas7_3; + + trace_kvm_booke206_gtlb_write(vcpu->arch.shared->mas0, gtlbe->mas1, + gtlbe->mas2, gtlbe->mas7_3); + + if (tlbsel == 1) { + /* + * If a valid tlb1 entry is overwritten then recalculate the + * min/max TLB1 map address range otherwise no need to look + * in tlb1 array. + */ + if (recal) + kvmppc_recalc_tlb1map_range(vcpu_e500); + else + kvmppc_set_tlb1map_range(vcpu, gtlbe); + } + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + /* Invalidate shadow mappings for the about-to-be-clobbered TLBE.
*/ + if (tlbe_is_host_safe(vcpu, gtlbe)) { + u64 eaddr = get_tlb_eaddr(gtlbe); + u64 raddr = get_tlb_raddr(gtlbe); + + if (tlbsel == 0) { + gtlbe->mas1 &= ~MAS1_TSIZE(~0); + gtlbe->mas1 |= MAS1_TSIZE(BOOK3E_PAGESZ_4K); + } + + /* Premap the faulting page */ + kvmppc_mmu_map(vcpu, eaddr, raddr, index_of(tlbsel, esel)); + } + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + kvmppc_set_exit_type(vcpu, EMULATED_TLBWE_EXITS); + return EMULATE_DONE; +} + +static int kvmppc_e500_tlb_search(struct kvm_vcpu *vcpu, + gva_t eaddr, unsigned int pid, int as) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int esel, tlbsel; + + for (tlbsel = 0; tlbsel < 2; tlbsel++) { + esel = kvmppc_e500_tlb_index(vcpu_e500, eaddr, tlbsel, pid, as); + if (esel >= 0) + return index_of(tlbsel, esel); + } + + return -1; +} + +/* 'linear_address' is actually an encoding of AS|PID|EADDR . */ +int kvmppc_core_vcpu_translate(struct kvm_vcpu *vcpu, + struct kvm_translation *tr) +{ + int index; + gva_t eaddr; + u8 pid; + u8 as; + + eaddr = tr->linear_address; + pid = (tr->linear_address >> 32) & 0xff; + as = (tr->linear_address >> 40) & 0x1; + + index = kvmppc_e500_tlb_search(vcpu, eaddr, pid, as); + if (index < 0) { + tr->valid = 0; + return 0; + } + + tr->physical_address = kvmppc_mmu_xlate(vcpu, index, eaddr); + /* XXX what does "writeable" and "usermode" even mean? 
*/ + tr->valid = 1; + + return 0; +} + + +int kvmppc_mmu_itlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) +{ + unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS); + + return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); +} + +int kvmppc_mmu_dtlb_index(struct kvm_vcpu *vcpu, gva_t eaddr) +{ + unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS); + + return kvmppc_e500_tlb_search(vcpu, eaddr, get_cur_pid(vcpu), as); +} + +void kvmppc_mmu_itlb_miss(struct kvm_vcpu *vcpu) +{ + unsigned int as = !!(vcpu->arch.shared->msr & MSR_IS); + + kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.pc, as); +} + +void kvmppc_mmu_dtlb_miss(struct kvm_vcpu *vcpu) +{ + unsigned int as = !!(vcpu->arch.shared->msr & MSR_DS); + + kvmppc_e500_deliver_tlb_miss(vcpu, vcpu->arch.fault_dear, as); +} + +gpa_t kvmppc_mmu_xlate(struct kvm_vcpu *vcpu, unsigned int index, + gva_t eaddr) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct kvm_book3e_206_tlb_entry *gtlbe; + u64 pgmask; + + gtlbe = get_entry(vcpu_e500, tlbsel_of(index), esel_of(index)); + pgmask = get_tlb_bytes(gtlbe) - 1; + + return get_tlb_raddr(gtlbe) | (eaddr & pgmask); +} + +void kvmppc_mmu_destroy_e500(struct kvm_vcpu *vcpu) +{ +} + +/*****************************************/ + +static void free_gtlb(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int i; + + kvmppc_core_flush_tlb(&vcpu_e500->vcpu); + kfree(vcpu_e500->g2h_tlb1_map); + kfree(vcpu_e500->gtlb_priv[0]); + kfree(vcpu_e500->gtlb_priv[1]); + + if (vcpu_e500->shared_tlb_pages) { + vfree((void *)(round_down((uintptr_t)vcpu_e500->gtlb_arch, + PAGE_SIZE))); + + for (i = 0; i < vcpu_e500->num_shared_tlb_pages; i++) { + set_page_dirty_lock(vcpu_e500->shared_tlb_pages[i]); + put_page(vcpu_e500->shared_tlb_pages[i]); + } + + vcpu_e500->num_shared_tlb_pages = 0; + + kfree(vcpu_e500->shared_tlb_pages); + vcpu_e500->shared_tlb_pages = NULL; + } else { + kfree(vcpu_e500->gtlb_arch); + } + + vcpu_e500->gtlb_arch = NULL; +} + +void kvmppc_get_sregs_e500_tlb(struct 
kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + sregs->u.e.mas0 = vcpu->arch.shared->mas0; + sregs->u.e.mas1 = vcpu->arch.shared->mas1; + sregs->u.e.mas2 = vcpu->arch.shared->mas2; + sregs->u.e.mas7_3 = vcpu->arch.shared->mas7_3; + sregs->u.e.mas4 = vcpu->arch.shared->mas4; + sregs->u.e.mas6 = vcpu->arch.shared->mas6; + + sregs->u.e.mmucfg = vcpu->arch.mmucfg; + sregs->u.e.tlbcfg[0] = vcpu->arch.tlbcfg[0]; + sregs->u.e.tlbcfg[1] = vcpu->arch.tlbcfg[1]; + sregs->u.e.tlbcfg[2] = 0; + sregs->u.e.tlbcfg[3] = 0; +} + +int kvmppc_set_sregs_e500_tlb(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) +{ + if (sregs->u.e.features & KVM_SREGS_E_ARCH206_MMU) { + vcpu->arch.shared->mas0 = sregs->u.e.mas0; + vcpu->arch.shared->mas1 = sregs->u.e.mas1; + vcpu->arch.shared->mas2 = sregs->u.e.mas2; + vcpu->arch.shared->mas7_3 = sregs->u.e.mas7_3; + vcpu->arch.shared->mas4 = sregs->u.e.mas4; + vcpu->arch.shared->mas6 = sregs->u.e.mas6; + } + + return 0; +} + +int kvmppc_get_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + long int i; + + switch (id) { + case KVM_REG_PPC_MAS0: + *val = get_reg_val(id, vcpu->arch.shared->mas0); + break; + case KVM_REG_PPC_MAS1: + *val = get_reg_val(id, vcpu->arch.shared->mas1); + break; + case KVM_REG_PPC_MAS2: + *val = get_reg_val(id, vcpu->arch.shared->mas2); + break; + case KVM_REG_PPC_MAS7_3: + *val = get_reg_val(id, vcpu->arch.shared->mas7_3); + break; + case KVM_REG_PPC_MAS4: + *val = get_reg_val(id, vcpu->arch.shared->mas4); + break; + case KVM_REG_PPC_MAS6: + *val = get_reg_val(id, vcpu->arch.shared->mas6); + break; + case KVM_REG_PPC_MMUCFG: + *val = get_reg_val(id, vcpu->arch.mmucfg); + break; + case KVM_REG_PPC_EPTCFG: + *val = get_reg_val(id, vcpu->arch.eptcfg); + break; + case KVM_REG_PPC_TLB0CFG: + case KVM_REG_PPC_TLB1CFG: + case KVM_REG_PPC_TLB2CFG: + case KVM_REG_PPC_TLB3CFG: + i = id - KVM_REG_PPC_TLB0CFG; + *val = get_reg_val(id, vcpu->arch.tlbcfg[i]); + break; + case KVM_REG_PPC_TLB0PS: + 
case KVM_REG_PPC_TLB1PS: + case KVM_REG_PPC_TLB2PS: + case KVM_REG_PPC_TLB3PS: + i = id - KVM_REG_PPC_TLB0PS; + *val = get_reg_val(id, vcpu->arch.tlbps[i]); + break; + default: + r = -EINVAL; + break; + } + + return r; +} + +int kvmppc_set_one_reg_e500_tlb(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + long int i; + + switch (id) { + case KVM_REG_PPC_MAS0: + vcpu->arch.shared->mas0 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS1: + vcpu->arch.shared->mas1 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS2: + vcpu->arch.shared->mas2 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS7_3: + vcpu->arch.shared->mas7_3 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS4: + vcpu->arch.shared->mas4 = set_reg_val(id, *val); + break; + case KVM_REG_PPC_MAS6: + vcpu->arch.shared->mas6 = set_reg_val(id, *val); + break; + /* Only allow MMU registers to be set to the config supported by KVM */ + case KVM_REG_PPC_MMUCFG: { + u32 reg = set_reg_val(id, *val); + if (reg != vcpu->arch.mmucfg) + r = -EINVAL; + break; + } + case KVM_REG_PPC_EPTCFG: { + u32 reg = set_reg_val(id, *val); + if (reg != vcpu->arch.eptcfg) + r = -EINVAL; + break; + } + case KVM_REG_PPC_TLB0CFG: + case KVM_REG_PPC_TLB1CFG: + case KVM_REG_PPC_TLB2CFG: + case KVM_REG_PPC_TLB3CFG: { + /* MMU geometry (N_ENTRY/ASSOC) can be set only using SW_TLB */ + u32 reg = set_reg_val(id, *val); + i = id - KVM_REG_PPC_TLB0CFG; + if (reg != vcpu->arch.tlbcfg[i]) + r = -EINVAL; + break; + } + case KVM_REG_PPC_TLB0PS: + case KVM_REG_PPC_TLB1PS: + case KVM_REG_PPC_TLB2PS: + case KVM_REG_PPC_TLB3PS: { + u32 reg = set_reg_val(id, *val); + i = id - KVM_REG_PPC_TLB0PS; + if (reg != vcpu->arch.tlbps[i]) + r = -EINVAL; + break; + } + default: + r = -EINVAL; + break; + } + + return r; +} + +static int vcpu_mmu_geometry_update(struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_params *params) +{ + vcpu->arch.tlbcfg[0] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + if 
(params->tlb_sizes[0] <= 2048) + vcpu->arch.tlbcfg[0] |= params->tlb_sizes[0]; + vcpu->arch.tlbcfg[0] |= params->tlb_ways[0] << TLBnCFG_ASSOC_SHIFT; + + vcpu->arch.tlbcfg[1] &= ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu->arch.tlbcfg[1] |= params->tlb_sizes[1]; + vcpu->arch.tlbcfg[1] |= params->tlb_ways[1] << TLBnCFG_ASSOC_SHIFT; + return 0; +} + +/* + * Install a userspace-supplied guest TLB array for this vcpu. + * + * Only KVM_MMU_FSL_BOOKE_NOHV is accepted. The requested geometry is + * validated, the user pages backing cfg->array are pinned and mapped with + * vmap(), the per-entry private state is allocated, and only then is the + * vcpu's current guest TLB released and replaced. + * + * Returns 0 on success or a negative errno: -EINVAL for an unsupported + * type or geometry, -EFAULT if the parameters cannot be copied or the + * array cannot be fully pinned, -ENOMEM on allocation failure. + */ +int kvm_vcpu_ioctl_config_tlb(struct kvm_vcpu *vcpu, + struct kvm_config_tlb *cfg) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct kvm_book3e_206_tlb_params params; + char *virt; + struct page **pages; + struct tlbe_priv *privs[2] = {}; + u64 *g2h_bitmap = NULL; + size_t array_len; + u32 sets; + int num_pages, ret, i; + + if (cfg->mmu_type != KVM_MMU_FSL_BOOKE_NOHV) + return -EINVAL; + + if (copy_from_user(&params, (void __user *)(uintptr_t)cfg->params, + sizeof(params))) + return -EFAULT; + + /* TLB1 must be fully associative with at most 64 entries. */ + if (params.tlb_sizes[1] > 64) + return -EINVAL; + if (params.tlb_ways[1] != params.tlb_sizes[1]) + return -EINVAL; + /* TLB2 and TLB3 are not supported. */ + if (params.tlb_sizes[2] != 0 || params.tlb_sizes[3] != 0) + return -EINVAL; + if (params.tlb_ways[2] != 0 || params.tlb_ways[3] != 0) + return -EINVAL; + + if (!is_power_of_2(params.tlb_ways[0])) + return -EINVAL; + + sets = params.tlb_sizes[0] >> ilog2(params.tlb_ways[0]); + if (!is_power_of_2(sets)) + return -EINVAL; + + /* + * tlb_sizes[] are 32-bit values from userspace: refuse sizes whose + * byte count would wrap size_t and do the sum in size_t, otherwise + * an oversized TLB0 could pass the cfg->array_len check below. + */ + if (params.tlb_sizes[0] > + (~(size_t)0) / sizeof(struct kvm_book3e_206_tlb_entry) - + params.tlb_sizes[1]) + return -EINVAL; + + array_len = (size_t)params.tlb_sizes[0] + params.tlb_sizes[1]; + array_len *= sizeof(struct kvm_book3e_206_tlb_entry); + + if (cfg->array_len < array_len) + return -EINVAL; + + num_pages = DIV_ROUND_UP(cfg->array + array_len - 1, PAGE_SIZE) - + cfg->array / PAGE_SIZE; + /* kmalloc_array() fails cleanly if the element count overflows. */ + pages = kmalloc_array(num_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) + return -ENOMEM; + + ret = get_user_pages_fast(cfg->array, num_pages, 1, pages); + if (ret < 0) + goto err_pages; + + if (ret != num_pages) { + /* Only the pages actually pinned must be released. */ + num_pages = ret; + ret = -EFAULT; + goto err_put_page; + } + + virt = vmap(pages, num_pages, VM_MAP, PAGE_KERNEL); + if (!virt) { + ret = -ENOMEM; + goto err_put_page; + } + + privs[0] = kzalloc(sizeof(struct
tlbe_priv) * params.tlb_sizes[0], + GFP_KERNEL); + privs[1] = kzalloc(sizeof(struct tlbe_priv) * params.tlb_sizes[1], + GFP_KERNEL); + + if (!privs[0] || !privs[1]) { + ret = -ENOMEM; + goto err_privs; + } + + g2h_bitmap = kzalloc(sizeof(u64) * params.tlb_sizes[1], + GFP_KERNEL); + if (!g2h_bitmap) { + ret = -ENOMEM; + goto err_privs; + } + + /* Nothing can fail from here on: release the old guest TLB. */ + free_gtlb(vcpu_e500); + + vcpu_e500->gtlb_priv[0] = privs[0]; + vcpu_e500->gtlb_priv[1] = privs[1]; + vcpu_e500->g2h_tlb1_map = g2h_bitmap; + + /* cfg->array need not be page aligned; keep its in-page offset. */ + vcpu_e500->gtlb_arch = (struct kvm_book3e_206_tlb_entry *) + (virt + (cfg->array & (PAGE_SIZE - 1))); + + vcpu_e500->gtlb_params[0].entries = params.tlb_sizes[0]; + vcpu_e500->gtlb_params[1].entries = params.tlb_sizes[1]; + + vcpu_e500->gtlb_offset[0] = 0; + vcpu_e500->gtlb_offset[1] = params.tlb_sizes[0]; + + /* Update vcpu's MMU geometry based on SW_TLB input */ + vcpu_mmu_geometry_update(vcpu, &params); + + vcpu_e500->shared_tlb_pages = pages; + vcpu_e500->num_shared_tlb_pages = num_pages; + + vcpu_e500->gtlb_params[0].ways = params.tlb_ways[0]; + vcpu_e500->gtlb_params[0].sets = sets; + + vcpu_e500->gtlb_params[1].ways = params.tlb_sizes[1]; + vcpu_e500->gtlb_params[1].sets = 1; + + kvmppc_recalc_tlb1map_range(vcpu_e500); + return 0; + +err_privs: + kfree(privs[0]); + kfree(privs[1]); + /* Drop the kernel mapping created by vmap() so it is not leaked. */ + vunmap(virt); + +err_put_page: + for (i = 0; i < num_pages; i++) + put_page(pages[i]); + +err_pages: + kfree(pages); + return ret; +} + +int kvm_vcpu_ioctl_dirty_tlb(struct kvm_vcpu *vcpu, + struct kvm_dirty_tlb *dirty) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + kvmppc_recalc_tlb1map_range(vcpu_e500); + kvmppc_core_flush_tlb(vcpu); + return 0; +} + +/* Vcpu's MMU default configuration */ +static int vcpu_mmu_init(struct kvm_vcpu *vcpu, + struct kvmppc_e500_tlb_params *params) +{ + /* Initialize RASIZE, PIDSIZE, NTLBS and MAVN fields with host values*/ + vcpu->arch.mmucfg = mfspr(SPRN_MMUCFG) & ~MMUCFG_LPIDSIZE; + + /* Initialize TLBnCFG fields with host values and SW_TLB geometry*/ +
vcpu->arch.tlbcfg[0] = mfspr(SPRN_TLB0CFG) & + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu->arch.tlbcfg[0] |= params[0].entries; + vcpu->arch.tlbcfg[0] |= params[0].ways << TLBnCFG_ASSOC_SHIFT; + + vcpu->arch.tlbcfg[1] = mfspr(SPRN_TLB1CFG) & + ~(TLBnCFG_N_ENTRY | TLBnCFG_ASSOC); + vcpu->arch.tlbcfg[1] |= params[1].entries; + vcpu->arch.tlbcfg[1] |= params[1].ways << TLBnCFG_ASSOC_SHIFT; + + if (has_feature(vcpu, VCPU_FTR_MMU_V2)) { + vcpu->arch.tlbps[0] = mfspr(SPRN_TLB0PS); + vcpu->arch.tlbps[1] = mfspr(SPRN_TLB1PS); + + vcpu->arch.mmucfg &= ~MMUCFG_LRAT; + + /* Guest mmu emulation currently doesn't handle E.PT */ + vcpu->arch.eptcfg = 0; + vcpu->arch.tlbcfg[0] &= ~TLBnCFG_PT; + vcpu->arch.tlbcfg[1] &= ~TLBnCFG_IND; + } + + return 0; +} + +int kvmppc_e500_tlb_init(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + struct kvm_vcpu *vcpu = &vcpu_e500->vcpu; + int entry_size = sizeof(struct kvm_book3e_206_tlb_entry); + int entries = KVM_E500_TLB0_SIZE + KVM_E500_TLB1_SIZE; + + if (e500_mmu_host_init(vcpu_e500)) + goto err; + + vcpu_e500->gtlb_params[0].entries = KVM_E500_TLB0_SIZE; + vcpu_e500->gtlb_params[1].entries = KVM_E500_TLB1_SIZE; + + vcpu_e500->gtlb_params[0].ways = KVM_E500_TLB0_WAY_NUM; + vcpu_e500->gtlb_params[0].sets = + KVM_E500_TLB0_SIZE / KVM_E500_TLB0_WAY_NUM; + + vcpu_e500->gtlb_params[1].ways = KVM_E500_TLB1_SIZE; + vcpu_e500->gtlb_params[1].sets = 1; + + vcpu_e500->gtlb_arch = kmalloc(entries * entry_size, GFP_KERNEL); + if (!vcpu_e500->gtlb_arch) + return -ENOMEM; + + vcpu_e500->gtlb_offset[0] = 0; + vcpu_e500->gtlb_offset[1] = KVM_E500_TLB0_SIZE; + + vcpu_e500->gtlb_priv[0] = kzalloc(sizeof(struct tlbe_ref) * + vcpu_e500->gtlb_params[0].entries, + GFP_KERNEL); + if (!vcpu_e500->gtlb_priv[0]) + goto err; + + vcpu_e500->gtlb_priv[1] = kzalloc(sizeof(struct tlbe_ref) * + vcpu_e500->gtlb_params[1].entries, + GFP_KERNEL); + if (!vcpu_e500->gtlb_priv[1]) + goto err; + + vcpu_e500->g2h_tlb1_map = kzalloc(sizeof(u64) * + vcpu_e500->gtlb_params[1].entries, + 
GFP_KERNEL); + if (!vcpu_e500->g2h_tlb1_map) + goto err; + + vcpu_mmu_init(vcpu, vcpu_e500->gtlb_params); + + kvmppc_recalc_tlb1map_range(vcpu_e500); + return 0; + +err: + free_gtlb(vcpu_e500); + return -1; +} + +void kvmppc_e500_tlb_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + free_gtlb(vcpu_e500); + e500_mmu_host_uninit(vcpu_e500); +} diff --git a/kernel/arch/powerpc/kvm/e500_mmu_host.c b/kernel/arch/powerpc/kvm/e500_mmu_host.c new file mode 100644 index 000000000..4d33e199e --- /dev/null +++ b/kernel/arch/powerpc/kvm/e500_mmu_host.c @@ -0,0 +1,813 @@ +/* + * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Yu Liu, yu.liu@freescale.com + * Scott Wood, scottwood@freescale.com + * Ashish Kalra, ashish.kalra@freescale.com + * Varun Sethi, varun.sethi@freescale.com + * Alexander Graf, agraf@suse.de + * + * Description: + * This file is based on arch/powerpc/kvm/44x_tlb.c, + * by Hollis Blanchard <hollisb@us.ibm.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. 
+ */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> +#include <linux/highmem.h> +#include <linux/log2.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <linux/vmalloc.h> +#include <linux/hugetlb.h> +#include <asm/kvm_ppc.h> + +#include "e500.h" +#include "timing.h" +#include "e500_mmu_host.h" + +#include "trace_booke.h" + +#define to_htlb1_esel(esel) (host_tlb_params[1].entries - (esel) - 1) + +static struct kvmppc_e500_tlb_params host_tlb_params[E500_TLB_NUM]; + +static inline unsigned int tlb1_max_shadow_size(void) +{ + /* reserve one entry for magic page */ + return host_tlb_params[1].entries - tlbcam_index - 1; +} + +static inline u32 e500_shadow_mas3_attrib(u32 mas3, int usermode) +{ + /* Mask off reserved bits. */ + mas3 &= MAS3_ATTRIB_MASK; + +#ifndef CONFIG_KVM_BOOKE_HV + if (!usermode) { + /* Guest is in supervisor mode, + * so we need to translate guest + * supervisor permissions into user permissions. 
*/ + mas3 &= ~E500_TLB_USER_PERM_MASK; + mas3 |= (mas3 & E500_TLB_SUPER_PERM_MASK) << 1; + } + mas3 |= E500_TLB_SUPER_PERM_MASK; +#endif + return mas3; +} + +/* + * writing shadow tlb entry to host TLB + */ +static inline void __write_host_tlbe(struct kvm_book3e_206_tlb_entry *stlbe, + uint32_t mas0, + uint32_t lpid) +{ + unsigned long flags; + + local_irq_save(flags); + mtspr(SPRN_MAS0, mas0); + mtspr(SPRN_MAS1, stlbe->mas1); + mtspr(SPRN_MAS2, (unsigned long)stlbe->mas2); + mtspr(SPRN_MAS3, (u32)stlbe->mas7_3); + mtspr(SPRN_MAS7, (u32)(stlbe->mas7_3 >> 32)); +#ifdef CONFIG_KVM_BOOKE_HV + mtspr(SPRN_MAS8, MAS8_TGS | get_thread_specific_lpid(lpid)); +#endif + asm volatile("isync; tlbwe" : : : "memory"); + +#ifdef CONFIG_KVM_BOOKE_HV + /* Must clear mas8 for other host tlbwe's */ + mtspr(SPRN_MAS8, 0); + isync(); +#endif + local_irq_restore(flags); + + trace_kvm_booke206_stlb_write(mas0, stlbe->mas8, stlbe->mas1, + stlbe->mas2, stlbe->mas7_3); +} + +/* + * Acquire a mas0 with victim hint, as if we just took a TLB miss. + * + * We don't care about the address we're searching for, other than that it's + * in the right set and is not present in the TLB. Using a zero PID and a + * userspace address means we don't have to set and then restore MAS5, or + * calculate a proper MAS6 value. 
+ */ +static u32 get_host_mas0(unsigned long eaddr) +{ + unsigned long flags; + u32 mas0; + u32 mas4; + + local_irq_save(flags); + mtspr(SPRN_MAS6, 0); + mas4 = mfspr(SPRN_MAS4); + mtspr(SPRN_MAS4, mas4 & ~MAS4_TLBSEL_MASK); + asm volatile("tlbsx 0, %0" : : "b" (eaddr & ~CONFIG_PAGE_OFFSET)); + mas0 = mfspr(SPRN_MAS0); + mtspr(SPRN_MAS4, mas4); + local_irq_restore(flags); + + return mas0; +} + +/* sesel is for tlb1 only */ +static inline void write_host_tlbe(struct kvmppc_vcpu_e500 *vcpu_e500, + int tlbsel, int sesel, struct kvm_book3e_206_tlb_entry *stlbe) +{ + u32 mas0; + + if (tlbsel == 0) { + mas0 = get_host_mas0(stlbe->mas2); + __write_host_tlbe(stlbe, mas0, vcpu_e500->vcpu.kvm->arch.lpid); + } else { + __write_host_tlbe(stlbe, + MAS0_TLBSEL(1) | + MAS0_ESEL(to_htlb1_esel(sesel)), + vcpu_e500->vcpu.kvm->arch.lpid); + } +} + +/* sesel is for tlb1 only */ +static void write_stlbe(struct kvmppc_vcpu_e500 *vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe, + struct kvm_book3e_206_tlb_entry *stlbe, + int stlbsel, int sesel) +{ + int stid; + + preempt_disable(); + stid = kvmppc_e500_get_tlb_stid(&vcpu_e500->vcpu, gtlbe); + + stlbe->mas1 |= MAS1_TID(stid); + write_host_tlbe(vcpu_e500, stlbsel, sesel, stlbe); + preempt_enable(); +} + +#ifdef CONFIG_KVM_E500V2 +/* XXX should be a hook in the gva2hpa translation */ +void kvmppc_map_magic(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct kvm_book3e_206_tlb_entry magic; + ulong shared_page = ((ulong)vcpu->arch.shared) & PAGE_MASK; + unsigned int stid; + pfn_t pfn; + + pfn = (pfn_t)virt_to_phys((void *)shared_page) >> PAGE_SHIFT; + get_page(pfn_to_page(pfn)); + + preempt_disable(); + stid = kvmppc_e500_get_sid(vcpu_e500, 0, 0, 0, 0); + + magic.mas1 = MAS1_VALID | MAS1_TS | MAS1_TID(stid) | + MAS1_TSIZE(BOOK3E_PAGESZ_4K); + magic.mas2 = vcpu->arch.magic_page_ea | MAS2_M; + magic.mas7_3 = ((u64)pfn << PAGE_SHIFT) | + MAS3_SW | MAS3_SR | MAS3_UW | MAS3_UR; + magic.mas8 = 0; + + 
__write_host_tlbe(&magic, MAS0_TLBSEL(1) | MAS0_ESEL(tlbcam_index), 0); + preempt_enable(); +} +#endif + +void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, + int esel) +{ + struct kvm_book3e_206_tlb_entry *gtlbe = + get_entry(vcpu_e500, tlbsel, esel); + struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[tlbsel][esel].ref; + + /* Don't bother with unmapped entries */ + if (!(ref->flags & E500_TLB_VALID)) { + WARN(ref->flags & (E500_TLB_BITMAP | E500_TLB_TLB0), + "%s: flags %x\n", __func__, ref->flags); + WARN_ON(tlbsel == 1 && vcpu_e500->g2h_tlb1_map[esel]); + } + + if (tlbsel == 1 && ref->flags & E500_TLB_BITMAP) { + u64 tmp = vcpu_e500->g2h_tlb1_map[esel]; + int hw_tlb_indx; + unsigned long flags; + + local_irq_save(flags); + while (tmp) { + hw_tlb_indx = __ilog2_u64(tmp & -tmp); + mtspr(SPRN_MAS0, + MAS0_TLBSEL(1) | + MAS0_ESEL(to_htlb1_esel(hw_tlb_indx))); + mtspr(SPRN_MAS1, 0); + asm volatile("tlbwe"); + vcpu_e500->h2g_tlb1_rmap[hw_tlb_indx] = 0; + tmp &= tmp - 1; + } + mb(); + vcpu_e500->g2h_tlb1_map[esel] = 0; + ref->flags &= ~(E500_TLB_BITMAP | E500_TLB_VALID); + local_irq_restore(flags); + } + + if (tlbsel == 1 && ref->flags & E500_TLB_TLB0) { + /* + * TLB1 entry is backed by 4k pages. This should happen + * rarely and is not worth optimizing. Invalidate everything. 
+ */ + kvmppc_e500_tlbil_all(vcpu_e500); + ref->flags &= ~(E500_TLB_TLB0 | E500_TLB_VALID); + } + + /* + * If TLB entry is still valid then it's a TLB0 entry, and thus + * backed by at most one host tlbe per shadow pid + */ + if (ref->flags & E500_TLB_VALID) + kvmppc_e500_tlbil_one(vcpu_e500, gtlbe); + + /* Mark the TLB as not backed by the host anymore */ + ref->flags = 0; +} + +static inline int tlbe_is_writable(struct kvm_book3e_206_tlb_entry *tlbe) +{ + return tlbe->mas7_3 & (MAS3_SW|MAS3_UW); +} + +static inline void kvmppc_e500_ref_setup(struct tlbe_ref *ref, + struct kvm_book3e_206_tlb_entry *gtlbe, + pfn_t pfn, unsigned int wimg) +{ + ref->pfn = pfn; + ref->flags = E500_TLB_VALID; + + /* Use guest supplied MAS2_G and MAS2_E */ + ref->flags |= (gtlbe->mas2 & MAS2_ATTRIB_MASK) | wimg; + + /* Mark the page accessed */ + kvm_set_pfn_accessed(pfn); + + if (tlbe_is_writable(gtlbe)) + kvm_set_pfn_dirty(pfn); +} + +static inline void kvmppc_e500_ref_release(struct tlbe_ref *ref) +{ + if (ref->flags & E500_TLB_VALID) { + /* FIXME: don't log bogus pfn for TLB1 */ + trace_kvm_booke206_ref_release(ref->pfn, ref->flags); + ref->flags = 0; + } +} + +static void clear_tlb1_bitmap(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + if (vcpu_e500->g2h_tlb1_map) + memset(vcpu_e500->g2h_tlb1_map, 0, + sizeof(u64) * vcpu_e500->gtlb_params[1].entries); + if (vcpu_e500->h2g_tlb1_rmap) + memset(vcpu_e500->h2g_tlb1_rmap, 0, + sizeof(unsigned int) * host_tlb_params[1].entries); +} + +static void clear_tlb_privs(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + int tlbsel; + int i; + + for (tlbsel = 0; tlbsel <= 1; tlbsel++) { + for (i = 0; i < vcpu_e500->gtlb_params[tlbsel].entries; i++) { + struct tlbe_ref *ref = + &vcpu_e500->gtlb_priv[tlbsel][i].ref; + kvmppc_e500_ref_release(ref); + } + } +} + +void kvmppc_core_flush_tlb(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + kvmppc_e500_tlbil_all(vcpu_e500); + clear_tlb_privs(vcpu_e500); + clear_tlb1_bitmap(vcpu_e500); 
+} + +/* TID must be supplied by the caller */ +static void kvmppc_e500_setup_stlbe( + struct kvm_vcpu *vcpu, + struct kvm_book3e_206_tlb_entry *gtlbe, + int tsize, struct tlbe_ref *ref, u64 gvaddr, + struct kvm_book3e_206_tlb_entry *stlbe) +{ + pfn_t pfn = ref->pfn; + u32 pr = vcpu->arch.shared->msr & MSR_PR; + + BUG_ON(!(ref->flags & E500_TLB_VALID)); + + /* Force IPROT=0 for all guest mappings. */ + stlbe->mas1 = MAS1_TSIZE(tsize) | get_tlb_sts(gtlbe) | MAS1_VALID; + stlbe->mas2 = (gvaddr & MAS2_EPN) | (ref->flags & E500_TLB_MAS2_ATTR); + stlbe->mas7_3 = ((u64)pfn << PAGE_SHIFT) | + e500_shadow_mas3_attrib(gtlbe->mas7_3, pr); +} + +/* + * Build the shadow (host) TLB entry for one guest mapping. + * + * Looks up the host pfn behind gfn and picks the page size to use (for + * tlbsel 1 the backing VMA is inspected for VM_PFNMAP / VM_HUGETLB under + * mmap_sem). With kvm->mmu_lock held it then reads the WIMGE bits from the + * Linux PTE of the host virtual address and fills in ref and stlbe. + * + * Returns 0 on success, -EINVAL if the gfn has no usable host page or its + * PTE is not present, or -EAGAIN if an MMU notifier invalidation raced + * with the lookup. + */ +static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500, + u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, + int tlbsel, struct kvm_book3e_206_tlb_entry *stlbe, + struct tlbe_ref *ref) +{ + struct kvm_memory_slot *slot; + unsigned long pfn = 0; /* silence GCC warning */ + unsigned long hva; + int pfnmap = 0; + int tsize = BOOK3E_PAGESZ_4K; + int ret = 0; + unsigned long mmu_seq; + struct kvm *kvm = vcpu_e500->vcpu.kvm; + unsigned long tsize_pages = 0; + pte_t *ptep; + unsigned int wimg = 0; + pgd_t *pgdir; + unsigned long flags; + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + /* + * Translate guest physical to true physical, acquiring + * a page reference if it is normal, non-reserved memory. + * + * gfn_to_memslot() must succeed because otherwise we wouldn't + * have gotten this far. Eventually we should just pass the slot + * pointer through from the first lookup. + */ + slot = gfn_to_memslot(vcpu_e500->vcpu.kvm, gfn); + hva = gfn_to_hva_memslot(slot, gfn); + + if (tlbsel == 1) { + struct vm_area_struct *vma; + down_read(&current->mm->mmap_sem); + + vma = find_vma(current->mm, hva); + if (vma && hva >= vma->vm_start && + (vma->vm_flags & VM_PFNMAP)) { + /* + * This VMA is a physically contiguous region (e.g.
+ * /dev/mem) that bypasses normal Linux page + * management. Find the overlap between the + * vma and the memslot. + */ + + unsigned long start, end; + unsigned long slot_start, slot_end; + + pfnmap = 1; + + start = vma->vm_pgoff; + end = start + + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); + + pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT); + + slot_start = pfn - (gfn - slot->base_gfn); + slot_end = slot_start + slot->npages; + + if (start < slot_start) + start = slot_start; + if (end > slot_end) + end = slot_end; + + tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> + MAS1_TSIZE_SHIFT; + + /* + * e500 doesn't implement the lowest tsize bit, + * or 1K pages. + */ + tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); + + /* + * Now find the largest tsize (up to what the guest + * requested) that will cover gfn, stay within the + * range, and for which gfn and pfn are mutually + * aligned. + */ + + for (; tsize > BOOK3E_PAGESZ_4K; tsize -= 2) { + unsigned long gfn_start, gfn_end; + tsize_pages = 1 << (tsize - 2); + + gfn_start = gfn & ~(tsize_pages - 1); + gfn_end = gfn_start + tsize_pages; + + if (gfn_start + pfn - gfn < start) + continue; + if (gfn_end + pfn - gfn > end) + continue; + if ((gfn & (tsize_pages - 1)) != + (pfn & (tsize_pages - 1))) + continue; + + gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); + pfn &= ~(tsize_pages - 1); + break; + } + } else if (vma && hva >= vma->vm_start && + (vma->vm_flags & VM_HUGETLB)) { + unsigned long psize = vma_kernel_pagesize(vma); + + tsize = (gtlbe->mas1 & MAS1_TSIZE_MASK) >> + MAS1_TSIZE_SHIFT; + + /* + * Take the largest page size that satisfies both host + * and guest mapping + */ + tsize = min(__ilog2(psize) - 10, tsize); + + /* + * e500 doesn't implement the lowest tsize bit, + * or 1K pages.
+ */ + tsize = max(BOOK3E_PAGESZ_4K, tsize & ~1); + } + + up_read(&current->mm->mmap_sem); + } + + if (likely(!pfnmap)) { + tsize_pages = 1 << (tsize + 10 - PAGE_SHIFT); + pfn = gfn_to_pfn_memslot(slot, gfn); + if (is_error_noslot_pfn(pfn)) { + if (printk_ratelimit()) + pr_err("%s: real page not found for gfn %lx\n", + __func__, (long)gfn); + return -EINVAL; + } + + /* Align guest and physical address to page map boundaries */ + pfn &= ~(tsize_pages - 1); + gvaddr &= ~((tsize_pages << PAGE_SHIFT) - 1); + } + + spin_lock(&kvm->mmu_lock); + if (mmu_notifier_retry(kvm, mmu_seq)) { + ret = -EAGAIN; + goto out; + } + + + pgdir = vcpu_e500->vcpu.arch.pgdir; + /* + * We are just looking at the wimg bits, so we don't + * care much about the trans splitting bit. + * We are holding kvm->mmu_lock so a notifier invalidate + * can't run hence pfn won't change. + */ + local_irq_save(flags); + ptep = find_linux_pte_or_hugepte(pgdir, hva, NULL); + if (ptep) { + pte_t pte = READ_ONCE(*ptep); + + if (pte_present(pte)) { + wimg = (pte_val(pte) >> PTE_WIMGE_SHIFT) & + MAS2_WIMGE_MASK; + } else { + local_irq_restore(flags); + pr_err_ratelimited("%s: pte not present: gfn %lx,pfn %lx\n", + __func__, (long)gfn, pfn); + ret = -EINVAL; + goto out; + } + } + /* + * Re-enable interrupts on every remaining path, including the one + * where no PTE was found (wimg then stays 0). + */ + local_irq_restore(flags); + kvmppc_e500_ref_setup(ref, gtlbe, pfn, wimg); + + kvmppc_e500_setup_stlbe(&vcpu_e500->vcpu, gtlbe, tsize, + ref, gvaddr, stlbe); + + /* Clear i-cache for new pages */ + kvmppc_mmu_flush_icache(pfn); + +out: + spin_unlock(&kvm->mmu_lock); + + /* Drop refcount on page, so that mmu notifiers can clear it */ + kvm_release_pfn_clean(pfn); + + return ret; +} + +/* XXX only map the one-one case, for now use TLB0 */ +static int kvmppc_e500_tlb0_map(struct kvmppc_vcpu_e500 *vcpu_e500, int esel, + struct kvm_book3e_206_tlb_entry *stlbe) +{ + struct kvm_book3e_206_tlb_entry *gtlbe; + struct tlbe_ref *ref; + int stlbsel = 0; + int sesel = 0; + int r; + + gtlbe = get_entry(vcpu_e500, 0, esel); + ref =
&vcpu_e500->gtlb_priv[0][esel].ref; + + r = kvmppc_e500_shadow_map(vcpu_e500, get_tlb_eaddr(gtlbe), + get_tlb_raddr(gtlbe) >> PAGE_SHIFT, + gtlbe, 0, stlbe, ref); + if (r) + return r; + + write_stlbe(vcpu_e500, gtlbe, stlbe, stlbsel, sesel); + + return 0; +} + +static int kvmppc_e500_tlb1_map_tlb1(struct kvmppc_vcpu_e500 *vcpu_e500, + struct tlbe_ref *ref, + int esel) +{ + unsigned int sesel = vcpu_e500->host_tlb1_nv++; + + if (unlikely(vcpu_e500->host_tlb1_nv >= tlb1_max_shadow_size())) + vcpu_e500->host_tlb1_nv = 0; + + if (vcpu_e500->h2g_tlb1_rmap[sesel]) { + unsigned int idx = vcpu_e500->h2g_tlb1_rmap[sesel] - 1; + vcpu_e500->g2h_tlb1_map[idx] &= ~(1ULL << sesel); + } + + vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_BITMAP; + vcpu_e500->g2h_tlb1_map[esel] |= (u64)1 << sesel; + vcpu_e500->h2g_tlb1_rmap[sesel] = esel + 1; + WARN_ON(!(ref->flags & E500_TLB_VALID)); + + return sesel; +} + +/* Caller must ensure that the specified guest TLB entry is safe to insert into + * the shadow TLB. 
*/ +/* For both one-one and one-to-many */ +static int kvmppc_e500_tlb1_map(struct kvmppc_vcpu_e500 *vcpu_e500, + u64 gvaddr, gfn_t gfn, struct kvm_book3e_206_tlb_entry *gtlbe, + struct kvm_book3e_206_tlb_entry *stlbe, int esel) +{ + struct tlbe_ref *ref = &vcpu_e500->gtlb_priv[1][esel].ref; + int sesel; + int r; + + r = kvmppc_e500_shadow_map(vcpu_e500, gvaddr, gfn, gtlbe, 1, stlbe, + ref); + if (r) + return r; + + /* Use TLB0 when we can only map a page with 4k */ + if (get_tlb_tsize(stlbe) == BOOK3E_PAGESZ_4K) { + vcpu_e500->gtlb_priv[1][esel].ref.flags |= E500_TLB_TLB0; + write_stlbe(vcpu_e500, gtlbe, stlbe, 0, 0); + return 0; + } + + /* Otherwise map into TLB1 */ + sesel = kvmppc_e500_tlb1_map_tlb1(vcpu_e500, ref, esel); + write_stlbe(vcpu_e500, gtlbe, stlbe, 1, sesel); + + return 0; +} + +void kvmppc_mmu_map(struct kvm_vcpu *vcpu, u64 eaddr, gpa_t gpaddr, + unsigned int index) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + struct tlbe_priv *priv; + struct kvm_book3e_206_tlb_entry *gtlbe, stlbe; + int tlbsel = tlbsel_of(index); + int esel = esel_of(index); + + gtlbe = get_entry(vcpu_e500, tlbsel, esel); + + switch (tlbsel) { + case 0: + priv = &vcpu_e500->gtlb_priv[tlbsel][esel]; + + /* Triggers after clear_tlb_privs or on initial mapping */ + if (!(priv->ref.flags & E500_TLB_VALID)) { + kvmppc_e500_tlb0_map(vcpu_e500, esel, &stlbe); + } else { + kvmppc_e500_setup_stlbe(vcpu, gtlbe, BOOK3E_PAGESZ_4K, + &priv->ref, eaddr, &stlbe); + write_stlbe(vcpu_e500, gtlbe, &stlbe, 0, 0); + } + break; + + case 1: { + gfn_t gfn = gpaddr >> PAGE_SHIFT; + kvmppc_e500_tlb1_map(vcpu_e500, eaddr, gfn, gtlbe, &stlbe, + esel); + break; + } + + default: + BUG(); + break; + } +} + +#ifdef CONFIG_KVM_BOOKE_HV +int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type, + u32 *instr) +{ + gva_t geaddr; + hpa_t addr; + hfn_t pfn; + hva_t eaddr; + u32 mas1, mas2, mas3; + u64 mas7_mas3; + struct page *page; + unsigned int addr_space, psize_shift; + bool pr; + 
unsigned long flags; + + /* Search TLB for guest pc to get the real address */ + geaddr = kvmppc_get_pc(vcpu); + + addr_space = (vcpu->arch.shared->msr & MSR_IS) >> MSR_IR_LG; + + local_irq_save(flags); + mtspr(SPRN_MAS6, (vcpu->arch.pid << MAS6_SPID_SHIFT) | addr_space); + mtspr(SPRN_MAS5, MAS5_SGS | get_lpid(vcpu)); + asm volatile("tlbsx 0, %[geaddr]\n" : : + [geaddr] "r" (geaddr)); + mtspr(SPRN_MAS5, 0); + mtspr(SPRN_MAS8, 0); + mas1 = mfspr(SPRN_MAS1); + mas2 = mfspr(SPRN_MAS2); + mas3 = mfspr(SPRN_MAS3); +#ifdef CONFIG_64BIT + mas7_mas3 = mfspr(SPRN_MAS7_MAS3); +#else + mas7_mas3 = ((u64)mfspr(SPRN_MAS7) << 32) | mas3; +#endif + local_irq_restore(flags); + + /* + * If the TLB entry for guest pc was evicted, return to the guest. + * There are high chances to find a valid TLB entry next time. + */ + if (!(mas1 & MAS1_VALID)) + return EMULATE_AGAIN; + + /* + * Another thread may rewrite the TLB entry in parallel, don't + * execute from the address if the execute permission is not set + */ + pr = vcpu->arch.shared->msr & MSR_PR; + if (unlikely((pr && !(mas3 & MAS3_UX)) || + (!pr && !(mas3 & MAS3_SX)))) { + pr_err_ratelimited( + "%s: Instruction emulation from guest address %08lx without execute permission\n", + __func__, geaddr); + return EMULATE_AGAIN; + } + + /* + * The real address will be mapped by a cacheable, memory coherent, + * write-back page. Check for mismatches when LRAT is used. 
+ */ + if (has_feature(vcpu, VCPU_FTR_MMU_V2) && + unlikely((mas2 & MAS2_I) || (mas2 & MAS2_W) || !(mas2 & MAS2_M))) { + pr_err_ratelimited( + "%s: Instruction emulation from guest address %08lx mismatches storage attributes\n", + __func__, geaddr); + return EMULATE_AGAIN; + } + + /* Get pfn */ + psize_shift = MAS1_GET_TSIZE(mas1) + 10; + addr = (mas7_mas3 & (~0ULL << psize_shift)) | + (geaddr & ((1ULL << psize_shift) - 1ULL)); + pfn = addr >> PAGE_SHIFT; + + /* Guard against emulation from devices area */ + if (unlikely(!page_is_ram(pfn))) { + pr_err_ratelimited("%s: Instruction emulation from non-RAM host address %08llx is not supported\n", + __func__, addr); + return EMULATE_AGAIN; + } + + /* Map a page and get guest's instruction */ + page = pfn_to_page(pfn); + eaddr = (unsigned long)kmap_atomic(page); + *instr = *(u32 *)(eaddr | (unsigned long)(addr & ~PAGE_MASK)); + kunmap_atomic((u32 *)eaddr); + + return EMULATE_DONE; +} +#else +int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type, + u32 *instr) +{ + return EMULATE_AGAIN; +} +#endif + +/************* MMU Notifiers *************/ + +int kvm_unmap_hva(struct kvm *kvm, unsigned long hva) +{ + trace_kvm_unmap_hva(hva); + + /* + * Flush all shadow tlb entries everywhere. 
This is slow, but + * we are 100% sure that we catch the to be unmapped page + */ + kvm_flush_remote_tlbs(kvm); + + return 0; +} + +int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end) +{ + /* kvm_unmap_hva flushes everything anyways */ + kvm_unmap_hva(kvm, start); + + return 0; +} + +int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) +{ + /* XXX could be more clever ;) */ + return 0; +} + +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + /* XXX could be more clever ;) */ + return 0; +} + +void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) +{ + /* The page will get remapped properly on its next fault */ + kvm_unmap_hva(kvm, hva); +} + +/*****************************************/ + +/* + * Record the host TLB0/TLB1 geometry read from TLB0CFG/TLB1CFG and + * allocate the per-vcpu host-to-guest TLB1 reverse map. + * + * Returns 0 on success, -ENODEV if the host TLB configuration is unusable, + * or -ENOMEM if the reverse map cannot be allocated. + */ +int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + host_tlb_params[0].entries = mfspr(SPRN_TLB0CFG) & TLBnCFG_N_ENTRY; + host_tlb_params[1].entries = mfspr(SPRN_TLB1CFG) & TLBnCFG_N_ENTRY; + + /* + * This should never happen on real e500 hardware, but is + * architecturally possible -- e.g. in some weird nested + * virtualization case.
+ */ + if (host_tlb_params[0].entries == 0 || + host_tlb_params[1].entries == 0) { + pr_err("%s: need to know host tlb size\n", __func__); + return -ENODEV; + } + + host_tlb_params[0].ways = (mfspr(SPRN_TLB0CFG) & TLBnCFG_ASSOC) >> + TLBnCFG_ASSOC_SHIFT; + /* TLB1 is treated as fully associative: one set, all entries. */ + host_tlb_params[1].ways = host_tlb_params[1].entries; + + if (!is_power_of_2(host_tlb_params[0].entries) || + !is_power_of_2(host_tlb_params[0].ways) || + host_tlb_params[0].entries < host_tlb_params[0].ways || + host_tlb_params[0].ways == 0) { + pr_err("%s: bad tlb0 host config: %u entries %u ways\n", + __func__, host_tlb_params[0].entries, + host_tlb_params[0].ways); + return -ENODEV; + } + + host_tlb_params[0].sets = + host_tlb_params[0].entries / host_tlb_params[0].ways; + host_tlb_params[1].sets = 1; + + vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) * + host_tlb_params[1].entries, + GFP_KERNEL); + /* An allocation failure is out-of-memory, not a bad argument. */ + if (!vcpu_e500->h2g_tlb1_rmap) + return -ENOMEM; + + return 0; +} + +void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + kfree(vcpu_e500->h2g_tlb1_rmap); +} diff --git a/kernel/arch/powerpc/kvm/e500_mmu_host.h b/kernel/arch/powerpc/kvm/e500_mmu_host.h new file mode 100644 index 000000000..7624835b7 --- /dev/null +++ b/kernel/arch/powerpc/kvm/e500_mmu_host.h @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation.
+ */ + +#ifndef KVM_E500_MMU_HOST_H +#define KVM_E500_MMU_HOST_H + +void inval_gtlbe_on_host(struct kvmppc_vcpu_e500 *vcpu_e500, int tlbsel, + int esel); + +int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500); +void e500_mmu_host_uninit(struct kvmppc_vcpu_e500 *vcpu_e500); + +#endif /* KVM_E500_MMU_HOST_H */ diff --git a/kernel/arch/powerpc/kvm/e500mc.c b/kernel/arch/powerpc/kvm/e500mc.c new file mode 100644 index 000000000..cda695de8 --- /dev/null +++ b/kernel/arch/powerpc/kvm/e500mc.c @@ -0,0 +1,443 @@ +/* + * Copyright (C) 2010,2012 Freescale Semiconductor, Inc. All rights reserved. + * + * Author: Varun Sethi, <varun.sethi@freescale.com> + * + * Description: + * This file is derived from arch/powerpc/kvm/e500.c, + * by Yu Liu <yu.liu@freescale.com>. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + */ + +#include <linux/kvm_host.h> +#include <linux/slab.h> +#include <linux/err.h> +#include <linux/export.h> +#include <linux/miscdevice.h> +#include <linux/module.h> + +#include <asm/reg.h> +#include <asm/cputable.h> +#include <asm/tlbflush.h> +#include <asm/kvm_ppc.h> +#include <asm/dbell.h> + +#include "booke.h" +#include "e500.h" + +void kvmppc_set_pending_interrupt(struct kvm_vcpu *vcpu, enum int_class type) +{ + enum ppc_dbell dbell_type; + unsigned long tag; + + switch (type) { + case INT_CLASS_NONCRIT: + dbell_type = PPC_G_DBELL; + break; + case INT_CLASS_CRIT: + dbell_type = PPC_G_DBELL_CRIT; + break; + case INT_CLASS_MC: + dbell_type = PPC_G_DBELL_MC; + break; + default: + WARN_ONCE(1, "%s: unknown int type %d\n", __func__, type); + return; + } + + preempt_disable(); + tag = PPC_DBELL_LPID(get_lpid(vcpu)) | vcpu->vcpu_id; + mb(); + ppc_msgsnd(dbell_type, 0, tag); + preempt_enable(); +} + +/* gtlbe must not be mapped by more than one host tlb entry */ +void kvmppc_e500_tlbil_one(struct kvmppc_vcpu_e500 
*vcpu_e500, + struct kvm_book3e_206_tlb_entry *gtlbe) +{ + unsigned int tid, ts; + gva_t eaddr; + u32 val; + unsigned long flags; + + ts = get_tlb_ts(gtlbe); + tid = get_tlb_tid(gtlbe); + + /* We search the host TLB to invalidate its shadow TLB entry */ + val = (tid << 16) | ts; + eaddr = get_tlb_eaddr(gtlbe); + + local_irq_save(flags); + + mtspr(SPRN_MAS6, val); + mtspr(SPRN_MAS5, MAS5_SGS | get_lpid(&vcpu_e500->vcpu)); + + asm volatile("tlbsx 0, %[eaddr]\n" : : [eaddr] "r" (eaddr)); + val = mfspr(SPRN_MAS1); + if (val & MAS1_VALID) { + mtspr(SPRN_MAS1, val & ~MAS1_VALID); + asm volatile("tlbwe"); + } + mtspr(SPRN_MAS5, 0); + /* NOTE: tlbsx also updates mas8, so clear it for host tlbwe */ + mtspr(SPRN_MAS8, 0); + isync(); + + local_irq_restore(flags); +} + +void kvmppc_e500_tlbil_all(struct kvmppc_vcpu_e500 *vcpu_e500) +{ + unsigned long flags; + + local_irq_save(flags); + mtspr(SPRN_MAS5, MAS5_SGS | get_lpid(&vcpu_e500->vcpu)); + asm volatile("tlbilxlpid"); + mtspr(SPRN_MAS5, 0); + local_irq_restore(flags); +} + +void kvmppc_set_pid(struct kvm_vcpu *vcpu, u32 pid) +{ + vcpu->arch.pid = pid; +} + +void kvmppc_mmu_msr_notify(struct kvm_vcpu *vcpu, u32 old_msr) +{ +} + +/* We use two lpids per VM */ +static DEFINE_PER_CPU(struct kvm_vcpu *[KVMPPC_NR_LPIDS], last_vcpu_of_lpid); + +static void kvmppc_core_vcpu_load_e500mc(struct kvm_vcpu *vcpu, int cpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + kvmppc_booke_vcpu_load(vcpu, cpu); + + mtspr(SPRN_LPID, get_lpid(vcpu)); + mtspr(SPRN_EPCR, vcpu->arch.shadow_epcr); + mtspr(SPRN_GPIR, vcpu->vcpu_id); + mtspr(SPRN_MSRP, vcpu->arch.shadow_msrp); + vcpu->arch.eplc = EPC_EGS | (get_lpid(vcpu) << EPC_ELPID_SHIFT); + vcpu->arch.epsc = vcpu->arch.eplc; + mtspr(SPRN_EPLC, vcpu->arch.eplc); + mtspr(SPRN_EPSC, vcpu->arch.epsc); + + mtspr(SPRN_GIVPR, vcpu->arch.ivpr); + mtspr(SPRN_GIVOR2, vcpu->arch.ivor[BOOKE_IRQPRIO_DATA_STORAGE]); + mtspr(SPRN_GIVOR8, vcpu->arch.ivor[BOOKE_IRQPRIO_SYSCALL]); + mtspr(SPRN_GSPRG0, 
(unsigned long)vcpu->arch.shared->sprg0); + mtspr(SPRN_GSPRG1, (unsigned long)vcpu->arch.shared->sprg1); + mtspr(SPRN_GSPRG2, (unsigned long)vcpu->arch.shared->sprg2); + mtspr(SPRN_GSPRG3, (unsigned long)vcpu->arch.shared->sprg3); + + mtspr(SPRN_GSRR0, vcpu->arch.shared->srr0); + mtspr(SPRN_GSRR1, vcpu->arch.shared->srr1); + + mtspr(SPRN_GEPR, vcpu->arch.epr); + mtspr(SPRN_GDEAR, vcpu->arch.shared->dar); + mtspr(SPRN_GESR, vcpu->arch.shared->esr); + + if (vcpu->arch.oldpir != mfspr(SPRN_PIR) || + __this_cpu_read(last_vcpu_of_lpid[get_lpid(vcpu)]) != vcpu) { + kvmppc_e500_tlbil_all(vcpu_e500); + __this_cpu_write(last_vcpu_of_lpid[get_lpid(vcpu)], vcpu); + } +} + +static void kvmppc_core_vcpu_put_e500mc(struct kvm_vcpu *vcpu) +{ + vcpu->arch.eplc = mfspr(SPRN_EPLC); + vcpu->arch.epsc = mfspr(SPRN_EPSC); + + vcpu->arch.shared->sprg0 = mfspr(SPRN_GSPRG0); + vcpu->arch.shared->sprg1 = mfspr(SPRN_GSPRG1); + vcpu->arch.shared->sprg2 = mfspr(SPRN_GSPRG2); + vcpu->arch.shared->sprg3 = mfspr(SPRN_GSPRG3); + + vcpu->arch.shared->srr0 = mfspr(SPRN_GSRR0); + vcpu->arch.shared->srr1 = mfspr(SPRN_GSRR1); + + vcpu->arch.epr = mfspr(SPRN_GEPR); + vcpu->arch.shared->dar = mfspr(SPRN_GDEAR); + vcpu->arch.shared->esr = mfspr(SPRN_GESR); + + vcpu->arch.oldpir = mfspr(SPRN_PIR); + + kvmppc_booke_vcpu_put(vcpu); +} + +int kvmppc_core_check_processor_compat(void) +{ + int r; + + if (strcmp(cur_cpu_spec->cpu_name, "e500mc") == 0) + r = 0; + else if (strcmp(cur_cpu_spec->cpu_name, "e5500") == 0) + r = 0; +#ifdef CONFIG_ALTIVEC + /* + * Since guests have the privilege to enable AltiVec, we need AltiVec + * support in the host to save/restore their context. + * Don't use CPU_FTR_ALTIVEC to identify cores with AltiVec unit + * because it's cleared in the absence of CONFIG_ALTIVEC! 
+ */ + else if (strcmp(cur_cpu_spec->cpu_name, "e6500") == 0) + r = 0; +#endif + else + r = -ENOTSUPP; + + return r; +} + +int kvmppc_core_vcpu_setup(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + vcpu->arch.shadow_epcr = SPRN_EPCR_DSIGS | SPRN_EPCR_DGTMI | \ + SPRN_EPCR_DUVD; +#ifdef CONFIG_64BIT + vcpu->arch.shadow_epcr |= SPRN_EPCR_ICM; +#endif + vcpu->arch.shadow_msrp = MSRP_UCLEP | MSRP_PMMP; + + vcpu->arch.pvr = mfspr(SPRN_PVR); + vcpu_e500->svr = mfspr(SPRN_SVR); + + vcpu->arch.cpu_type = KVM_CPU_E500MC; + + return 0; +} + +static int kvmppc_core_get_sregs_e500mc(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + sregs->u.e.features |= KVM_SREGS_E_ARCH206_MMU | KVM_SREGS_E_PM | + KVM_SREGS_E_PC; + sregs->u.e.impl_id = KVM_SREGS_E_IMPL_FSL; + + sregs->u.e.impl.fsl.features = 0; + sregs->u.e.impl.fsl.svr = vcpu_e500->svr; + sregs->u.e.impl.fsl.hid0 = vcpu_e500->hid0; + sregs->u.e.impl.fsl.mcar = vcpu_e500->mcar; + + kvmppc_get_sregs_e500_tlb(vcpu, sregs); + + sregs->u.e.ivor_high[3] = + vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR]; + sregs->u.e.ivor_high[4] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL]; + sregs->u.e.ivor_high[5] = vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT]; + + return kvmppc_get_sregs_ivor(vcpu, sregs); +} + +static int kvmppc_core_set_sregs_e500mc(struct kvm_vcpu *vcpu, + struct kvm_sregs *sregs) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + int ret; + + if (sregs->u.e.impl_id == KVM_SREGS_E_IMPL_FSL) { + vcpu_e500->svr = sregs->u.e.impl.fsl.svr; + vcpu_e500->hid0 = sregs->u.e.impl.fsl.hid0; + vcpu_e500->mcar = sregs->u.e.impl.fsl.mcar; + } + + ret = kvmppc_set_sregs_e500_tlb(vcpu, sregs); + if (ret < 0) + return ret; + + if (!(sregs->u.e.features & KVM_SREGS_E_IVOR)) + return 0; + + if (sregs->u.e.features & KVM_SREGS_E_PM) { + vcpu->arch.ivor[BOOKE_IRQPRIO_PERFORMANCE_MONITOR] = + sregs->u.e.ivor_high[3]; + } + + if 
(sregs->u.e.features & KVM_SREGS_E_PC) { + vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL] = + sregs->u.e.ivor_high[4]; + vcpu->arch.ivor[BOOKE_IRQPRIO_DBELL_CRIT] = + sregs->u.e.ivor_high[5]; + } + + return kvmppc_set_sregs_ivor(vcpu, sregs); +} + +static int kvmppc_get_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + + switch (id) { + case KVM_REG_PPC_SPRG9: + *val = get_reg_val(id, vcpu->arch.sprg9); + break; + default: + r = kvmppc_get_one_reg_e500_tlb(vcpu, id, val); + } + + return r; +} + +static int kvmppc_set_one_reg_e500mc(struct kvm_vcpu *vcpu, u64 id, + union kvmppc_one_reg *val) +{ + int r = 0; + + switch (id) { + case KVM_REG_PPC_SPRG9: + vcpu->arch.sprg9 = set_reg_val(id, *val); + break; + default: + r = kvmppc_set_one_reg_e500_tlb(vcpu, id, val); + } + + return r; +} + +static struct kvm_vcpu *kvmppc_core_vcpu_create_e500mc(struct kvm *kvm, + unsigned int id) +{ + struct kvmppc_vcpu_e500 *vcpu_e500; + struct kvm_vcpu *vcpu; + int err; + + vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL); + if (!vcpu_e500) { + err = -ENOMEM; + goto out; + } + vcpu = &vcpu_e500->vcpu; + + /* Invalid PIR value -- this LPID doesn't have valid state on any cpu */ + vcpu->arch.oldpir = 0xffffffff; + + err = kvm_vcpu_init(vcpu, kvm, id); + if (err) + goto free_vcpu; + + err = kvmppc_e500_tlb_init(vcpu_e500); + if (err) + goto uninit_vcpu; + + vcpu->arch.shared = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + if (!vcpu->arch.shared) + goto uninit_tlb; + + return vcpu; + +uninit_tlb: + kvmppc_e500_tlb_uninit(vcpu_e500); +uninit_vcpu: + kvm_vcpu_uninit(vcpu); + +free_vcpu: + kmem_cache_free(kvm_vcpu_cache, vcpu_e500); +out: + return ERR_PTR(err); +} + +static void kvmppc_core_vcpu_free_e500mc(struct kvm_vcpu *vcpu) +{ + struct kvmppc_vcpu_e500 *vcpu_e500 = to_e500(vcpu); + + free_page((unsigned long)vcpu->arch.shared); + kvmppc_e500_tlb_uninit(vcpu_e500); + kvm_vcpu_uninit(vcpu); + kmem_cache_free(kvm_vcpu_cache, vcpu_e500); +} + 
+static int kvmppc_core_init_vm_e500mc(struct kvm *kvm) +{ + int lpid; + + lpid = kvmppc_alloc_lpid(); + if (lpid < 0) + return lpid; + + /* + * Use two lpids per VM on cores with two threads like e6500. Use + * even numbers to speed up vcpu lpid computation with consecutive lpids + * per VM. vm1 will use lpids 2 and 3, vm2 lpids 4 and 5, and so on. + */ + if (threads_per_core == 2) + lpid <<= 1; + + kvm->arch.lpid = lpid; + return 0; +} + +static void kvmppc_core_destroy_vm_e500mc(struct kvm *kvm) +{ + int lpid = kvm->arch.lpid; + + if (threads_per_core == 2) + lpid >>= 1; + + kvmppc_free_lpid(lpid); +} + +static struct kvmppc_ops kvm_ops_e500mc = { + .get_sregs = kvmppc_core_get_sregs_e500mc, + .set_sregs = kvmppc_core_set_sregs_e500mc, + .get_one_reg = kvmppc_get_one_reg_e500mc, + .set_one_reg = kvmppc_set_one_reg_e500mc, + .vcpu_load = kvmppc_core_vcpu_load_e500mc, + .vcpu_put = kvmppc_core_vcpu_put_e500mc, + .vcpu_create = kvmppc_core_vcpu_create_e500mc, + .vcpu_free = kvmppc_core_vcpu_free_e500mc, + .mmu_destroy = kvmppc_mmu_destroy_e500, + .init_vm = kvmppc_core_init_vm_e500mc, + .destroy_vm = kvmppc_core_destroy_vm_e500mc, + .emulate_op = kvmppc_core_emulate_op_e500, + .emulate_mtspr = kvmppc_core_emulate_mtspr_e500, + .emulate_mfspr = kvmppc_core_emulate_mfspr_e500, +}; + +static int __init kvmppc_e500mc_init(void) +{ + int r; + + r = kvmppc_booke_init(); + if (r) + goto err_out; + + /* + * Use two lpids per VM on dual threaded processors like e6500 + * to work around the lack of tlb write conditional instruction. + * Expose half the number of available hardware lpids to the lpid + * allocator. 
+ */ + kvmppc_init_lpid(KVMPPC_NR_LPIDS/threads_per_core); + kvmppc_claim_lpid(0); /* host */ + + r = kvm_init(NULL, sizeof(struct kvmppc_vcpu_e500), 0, THIS_MODULE); + if (r) + goto err_out; + kvm_ops_e500mc.owner = THIS_MODULE; + kvmppc_pr_ops = &kvm_ops_e500mc; + +err_out: + return r; +} + +static void __exit kvmppc_e500mc_exit(void) +{ + kvmppc_pr_ops = NULL; + kvmppc_booke_exit(); +} + +module_init(kvmppc_e500mc_init); +module_exit(kvmppc_e500mc_exit); +MODULE_ALIAS_MISCDEV(KVM_MINOR); +MODULE_ALIAS("devname:kvm"); diff --git a/kernel/arch/powerpc/kvm/emulate.c b/kernel/arch/powerpc/kvm/emulate.c new file mode 100644 index 000000000..5cc2e7af3 --- /dev/null +++ b/kernel/arch/powerpc/kvm/emulate.c @@ -0,0 +1,317 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright IBM Corp. 2007 + * Copyright 2011 Freescale Semiconductor, Inc. 
+ * + * Authors: Hollis Blanchard <hollisb@us.ibm.com> + */ + +#include <linux/jiffies.h> +#include <linux/hrtimer.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm_host.h> +#include <linux/clockchips.h> + +#include <asm/reg.h> +#include <asm/time.h> +#include <asm/byteorder.h> +#include <asm/kvm_ppc.h> +#include <asm/disassemble.h> +#include <asm/ppc-opcode.h> +#include "timing.h" +#include "trace.h" + +void kvmppc_emulate_dec(struct kvm_vcpu *vcpu) +{ + unsigned long dec_nsec; + unsigned long long dec_time; + + pr_debug("mtDEC: %x\n", vcpu->arch.dec); + hrtimer_try_to_cancel(&vcpu->arch.dec_timer); + +#ifdef CONFIG_PPC_BOOK3S + /* mtdec lowers the interrupt line when positive. */ + kvmppc_core_dequeue_dec(vcpu); + + /* POWER4+ triggers a dec interrupt if the value is < 0 */ + if (vcpu->arch.dec & 0x80000000) { + kvmppc_core_queue_dec(vcpu); + return; + } +#endif + +#ifdef CONFIG_BOOKE + /* On BOOKE, DEC = 0 is as good as decrementer not enabled */ + if (vcpu->arch.dec == 0) + return; +#endif + + /* + * The decrementer ticks at the same rate as the timebase, so + * that's how we convert the guest DEC value to the number of + * host ticks. + */ + + dec_time = vcpu->arch.dec; + /* + * Guest timebase ticks at the same frequency as host decrementer. + * So use the host decrementer calculations for decrementer emulation. 
+ */ + dec_time = dec_time << decrementer_clockevent.shift; + do_div(dec_time, decrementer_clockevent.mult); + dec_nsec = do_div(dec_time, NSEC_PER_SEC); + hrtimer_start(&vcpu->arch.dec_timer, + ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL); + vcpu->arch.dec_jiffies = get_tb(); +} + +u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb) +{ + u64 jd = tb - vcpu->arch.dec_jiffies; + +#ifdef CONFIG_BOOKE + if (vcpu->arch.dec < jd) + return 0; +#endif + + return vcpu->arch.dec - jd; +} + +static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs) +{ + enum emulation_result emulated = EMULATE_DONE; + ulong spr_val = kvmppc_get_gpr(vcpu, rs); + + switch (sprn) { + case SPRN_SRR0: + kvmppc_set_srr0(vcpu, spr_val); + break; + case SPRN_SRR1: + kvmppc_set_srr1(vcpu, spr_val); + break; + + /* XXX We need to context-switch the timebase for + * watchdog and FIT. */ + case SPRN_TBWL: break; + case SPRN_TBWU: break; + + case SPRN_DEC: + vcpu->arch.dec = spr_val; + kvmppc_emulate_dec(vcpu); + break; + + case SPRN_SPRG0: + kvmppc_set_sprg0(vcpu, spr_val); + break; + case SPRN_SPRG1: + kvmppc_set_sprg1(vcpu, spr_val); + break; + case SPRN_SPRG2: + kvmppc_set_sprg2(vcpu, spr_val); + break; + case SPRN_SPRG3: + kvmppc_set_sprg3(vcpu, spr_val); + break; + + /* PIR can legally be written, but we ignore it */ + case SPRN_PIR: break; + + default: + emulated = vcpu->kvm->arch.kvm_ops->emulate_mtspr(vcpu, sprn, + spr_val); + if (emulated == EMULATE_FAIL) + printk(KERN_INFO "mtspr: unknown spr " + "0x%x\n", sprn); + break; + } + + kvmppc_set_exit_type(vcpu, EMULATED_MTSPR_EXITS); + + return emulated; +} + +static int kvmppc_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, int rt) +{ + enum emulation_result emulated = EMULATE_DONE; + ulong spr_val = 0; + + switch (sprn) { + case SPRN_SRR0: + spr_val = kvmppc_get_srr0(vcpu); + break; + case SPRN_SRR1: + spr_val = kvmppc_get_srr1(vcpu); + break; + case SPRN_PVR: + spr_val = vcpu->arch.pvr; + break; + case SPRN_PIR: + spr_val = 
vcpu->vcpu_id; + break; + + /* Note: mftb and TBRL/TBWL are user-accessible, so + * the guest can always access the real TB anyways. + * In fact, we probably will never see these traps. */ + case SPRN_TBWL: + spr_val = get_tb() >> 32; + break; + case SPRN_TBWU: + spr_val = get_tb(); + break; + + case SPRN_SPRG0: + spr_val = kvmppc_get_sprg0(vcpu); + break; + case SPRN_SPRG1: + spr_val = kvmppc_get_sprg1(vcpu); + break; + case SPRN_SPRG2: + spr_val = kvmppc_get_sprg2(vcpu); + break; + case SPRN_SPRG3: + spr_val = kvmppc_get_sprg3(vcpu); + break; + /* Note: SPRG4-7 are user-readable, so we don't get + * a trap. */ + + case SPRN_DEC: + spr_val = kvmppc_get_dec(vcpu, get_tb()); + break; + default: + emulated = vcpu->kvm->arch.kvm_ops->emulate_mfspr(vcpu, sprn, + &spr_val); + if (unlikely(emulated == EMULATE_FAIL)) { + printk(KERN_INFO "mfspr: unknown spr " + "0x%x\n", sprn); + } + break; + } + + if (emulated == EMULATE_DONE) + kvmppc_set_gpr(vcpu, rt, spr_val); + kvmppc_set_exit_type(vcpu, EMULATED_MFSPR_EXITS); + + return emulated; +} + +/* XXX Should probably auto-generate instruction decoding for a particular core + * from opcode tables in the future. 
*/ +int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + u32 inst; + int rs, rt, sprn; + enum emulation_result emulated; + int advance = 1; + + /* this default type might be overwritten by subcategories */ + kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS); + + emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst); + if (emulated != EMULATE_DONE) + return emulated; + + pr_debug("Emulating opcode %d / %d\n", get_op(inst), get_xop(inst)); + + rs = get_rs(inst); + rt = get_rt(inst); + sprn = get_sprn(inst); + + switch (get_op(inst)) { + case OP_TRAP: +#ifdef CONFIG_PPC_BOOK3S + case OP_TRAP_64: + kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP); +#else + kvmppc_core_queue_program(vcpu, + vcpu->arch.shared->esr | ESR_PTR); +#endif + advance = 0; + break; + + case 31: + switch (get_xop(inst)) { + + case OP_31_XOP_TRAP: +#ifdef CONFIG_64BIT + case OP_31_XOP_TRAP_64: +#endif +#ifdef CONFIG_PPC_BOOK3S + kvmppc_core_queue_program(vcpu, SRR1_PROGTRAP); +#else + kvmppc_core_queue_program(vcpu, + vcpu->arch.shared->esr | ESR_PTR); +#endif + advance = 0; + break; + + case OP_31_XOP_MFSPR: + emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt); + break; + + case OP_31_XOP_MTSPR: + emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs); + break; + + case OP_31_XOP_TLBSYNC: + break; + + default: + /* Attempt core-specific emulation below. */ + emulated = EMULATE_FAIL; + } + break; + + case 0: + /* + * Instruction with primary opcode 0. Based on PowerISA + * these are illegal instructions. 
+ */ + if (inst == KVMPPC_INST_SW_BREAKPOINT) { + run->exit_reason = KVM_EXIT_DEBUG; + run->debug.arch.address = kvmppc_get_pc(vcpu); + emulated = EMULATE_EXIT_USER; + advance = 0; + } else + emulated = EMULATE_FAIL; + + break; + + default: + emulated = EMULATE_FAIL; + } + + if (emulated == EMULATE_FAIL) { + emulated = vcpu->kvm->arch.kvm_ops->emulate_op(run, vcpu, inst, + &advance); + if (emulated == EMULATE_AGAIN) { + advance = 0; + } else if (emulated == EMULATE_FAIL) { + advance = 0; + printk(KERN_ERR "Couldn't emulate instruction 0x%08x " + "(op %d xop %d)\n", inst, get_op(inst), get_xop(inst)); + kvmppc_core_queue_program(vcpu, 0); + } + } + + trace_kvm_ppc_instr(inst, kvmppc_get_pc(vcpu), emulated); + + /* Advance past emulated instruction. */ + if (advance) + kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); + + return emulated; +} +EXPORT_SYMBOL_GPL(kvmppc_emulate_instruction); diff --git a/kernel/arch/powerpc/kvm/emulate_loadstore.c b/kernel/arch/powerpc/kvm/emulate_loadstore.c new file mode 100644 index 000000000..6d3c0ee1d --- /dev/null +++ b/kernel/arch/powerpc/kvm/emulate_loadstore.c @@ -0,0 +1,272 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright IBM Corp. 2007 + * Copyright 2011 Freescale Semiconductor, Inc. 
+ * + * Authors: Hollis Blanchard <hollisb@us.ibm.com> + */ + +#include <linux/jiffies.h> +#include <linux/hrtimer.h> +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm_host.h> +#include <linux/clockchips.h> + +#include <asm/reg.h> +#include <asm/time.h> +#include <asm/byteorder.h> +#include <asm/kvm_ppc.h> +#include <asm/disassemble.h> +#include <asm/ppc-opcode.h> +#include "timing.h" +#include "trace.h" + +/* XXX to do: + * lhax + * lhaux + * lswx + * lswi + * stswx + * stswi + * lha + * lhau + * lmw + * stmw + * + */ +int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + u32 inst; + int ra, rs, rt; + enum emulation_result emulated; + int advance = 1; + + /* this default type might be overwritten by subcategories */ + kvmppc_set_exit_type(vcpu, EMULATED_INST_EXITS); + + emulated = kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst); + if (emulated != EMULATE_DONE) + return emulated; + + ra = get_ra(inst); + rs = get_rs(inst); + rt = get_rt(inst); + + switch (get_op(inst)) { + case 31: + switch (get_xop(inst)) { + case OP_31_XOP_LWZX: + emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); + break; + + case OP_31_XOP_LBZX: + emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); + break; + + case OP_31_XOP_LBZUX: + emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_STWX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 4, 1); + break; + + case OP_31_XOP_STBX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 1, 1); + break; + + case OP_31_XOP_STBUX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 1, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_LHAX: + emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); + break; + + case OP_31_XOP_LHZX: + emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); + break; + + case 
OP_31_XOP_LHZUX: + emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_STHX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 2, 1); + break; + + case OP_31_XOP_STHUX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 2, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_DCBST: + case OP_31_XOP_DCBF: + case OP_31_XOP_DCBI: + /* Do nothing. The guest is performing dcbi because + * hardware DMA is not snooped by the dcache, but + * emulated DMA either goes through the dcache as + * normal writes, or the host kernel has handled dcache + * coherence. */ + break; + + case OP_31_XOP_LWBRX: + emulated = kvmppc_handle_load(run, vcpu, rt, 4, 0); + break; + + case OP_31_XOP_STWBRX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 4, 0); + break; + + case OP_31_XOP_LHBRX: + emulated = kvmppc_handle_load(run, vcpu, rt, 2, 0); + break; + + case OP_31_XOP_STHBRX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 2, 0); + break; + + default: + emulated = EMULATE_FAIL; + break; + } + break; + + case OP_LWZ: + emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); + break; + + /* TBD: Add support for other 64 bit load variants like ldu, ldux, ldx etc. 
*/ + case OP_LD: + rt = get_rt(inst); + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + break; + + case OP_LWZU: + emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_LBZ: + emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); + break; + + case OP_LBZU: + emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_STW: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 4, 1); + break; + + /* TBD: Add support for other 64 bit store variants like stdu, stdux, stdx etc. */ + case OP_STD: + rs = get_rs(inst); + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 8, 1); + break; + + case OP_STWU: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_STB: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 1, 1); + break; + + case OP_STBU: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 1, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_LHZ: + emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); + break; + + case OP_LHZU: + emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_LHA: + emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); + break; + + case OP_LHAU: + emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_STH: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 2, 1); + break; + + case OP_STHU: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), + 2, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + default: + emulated = EMULATE_FAIL; + break; + } + + if (emulated == EMULATE_FAIL) { 
+ advance = 0; + kvmppc_core_queue_program(vcpu, 0); + } + + trace_kvm_ppc_instr(inst, kvmppc_get_pc(vcpu), emulated); + + /* Advance past emulated instruction. */ + if (advance) + kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4); + + return emulated; +} diff --git a/kernel/arch/powerpc/kvm/fpu.S b/kernel/arch/powerpc/kvm/fpu.S new file mode 100644 index 000000000..bf68d5975 --- /dev/null +++ b/kernel/arch/powerpc/kvm/fpu.S @@ -0,0 +1,283 @@ +/* + * FPU helper code to use FPU operations from inside the kernel + * + * Copyright (C) 2010 Alexander Graf (agraf@suse.de) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <asm/reg.h> +#include <asm/page.h> +#include <asm/mmu.h> +#include <asm/pgtable.h> +#include <asm/cputable.h> +#include <asm/cache.h> +#include <asm/thread_info.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> + +/* Instructions operating on single parameters */ + +/* + * Single operation with one input operand + * + * R3 = (double*)&fpscr + * R4 = (short*)&result + * R5 = (short*)&param1 + */ +#define FPS_ONE_IN(name) \ +_GLOBAL(fps_ ## name); \ + lfd 0,0(r3); /* load up fpscr value */ \ + MTFSF_L(0); \ + lfs 0,0(r5); \ + \ + name 0,0; \ + \ + stfs 0,0(r4); \ + mffs 0; \ + stfd 0,0(r3); /* save new fpscr value */ \ + blr + +/* + * Single operation with two input operands + * + * R3 = (double*)&fpscr + * R4 = (short*)&result + * R5 = (short*)&param1 + * R6 = (short*)&param2 + */ +#define FPS_TWO_IN(name) \ +_GLOBAL(fps_ ## name); \ + lfd 0,0(r3); /* load up fpscr value */ \ + MTFSF_L(0); \ + lfs 0,0(r5); \ + lfs 1,0(r6); \ + \ + name 0,0,1; \ + \ + stfs 0,0(r4); \ + mffs 0; \ + stfd 0,0(r3); /* save new fpscr value */ \ + blr + +/* + * Single operation with three input operands + * + * R3 = (double*)&fpscr + * R4 = (short*)&result + 
* R5 = (short*)&param1 + * R6 = (short*)&param2 + * R7 = (short*)&param3 + */ +#define FPS_THREE_IN(name) \ +_GLOBAL(fps_ ## name); \ + lfd 0,0(r3); /* load up fpscr value */ \ + MTFSF_L(0); \ + lfs 0,0(r5); \ + lfs 1,0(r6); \ + lfs 2,0(r7); \ + \ + name 0,0,1,2; \ + \ + stfs 0,0(r4); \ + mffs 0; \ + stfd 0,0(r3); /* save new fpscr value */ \ + blr + +FPS_ONE_IN(fres) +FPS_ONE_IN(frsqrte) +FPS_ONE_IN(fsqrts) +FPS_TWO_IN(fadds) +FPS_TWO_IN(fdivs) +FPS_TWO_IN(fmuls) +FPS_TWO_IN(fsubs) +FPS_THREE_IN(fmadds) +FPS_THREE_IN(fmsubs) +FPS_THREE_IN(fnmadds) +FPS_THREE_IN(fnmsubs) +FPS_THREE_IN(fsel) + + +/* Instructions operating on double parameters */ + +/* + * Beginning of double instruction processing + * + * R3 = (double*)&fpscr + * R4 = (u32*)&cr + * R5 = (double*)&result + * R6 = (double*)&param1 + * R7 = (double*)&param2 [load_two] + * R8 = (double*)&param3 [load_three] + * LR = instruction call function + */ +fpd_load_three: + lfd 2,0(r8) /* load param3 */ +fpd_load_two: + lfd 1,0(r7) /* load param2 */ +fpd_load_one: + lfd 0,0(r6) /* load param1 */ +fpd_load_none: + lfd 3,0(r3) /* load up fpscr value */ + MTFSF_L(3) + lwz r6, 0(r4) /* load cr */ + mtcr r6 + blr + +/* + * End of double instruction processing + * + * R3 = (double*)&fpscr + * R4 = (u32*)&cr + * R5 = (double*)&result + * LR = caller of instruction call function + */ +fpd_return: + mfcr r6 + stfd 0,0(r5) /* save result */ + mffs 0 + stfd 0,0(r3) /* save new fpscr value */ + stw r6,0(r4) /* save new cr value */ + blr + +/* + * Double operation with no input operand + * + * R3 = (double*)&fpscr + * R4 = (u32*)&cr + * R5 = (double*)&result + */ +#define FPD_NONE_IN(name) \ +_GLOBAL(fpd_ ## name); \ + mflr r12; \ + bl fpd_load_none; \ + mtlr r12; \ + \ + name. 
0; /* call instruction */ \ + b fpd_return + +/* + * Double operation with one input operand + * + * R3 = (double*)&fpscr + * R4 = (u32*)&cr + * R5 = (double*)&result + * R6 = (double*)&param1 + */ +#define FPD_ONE_IN(name) \ +_GLOBAL(fpd_ ## name); \ + mflr r12; \ + bl fpd_load_one; \ + mtlr r12; \ + \ + name. 0,0; /* call instruction */ \ + b fpd_return + +/* + * Double operation with two input operands + * + * R3 = (double*)&fpscr + * R4 = (u32*)&cr + * R5 = (double*)&result + * R6 = (double*)&param1 + * R7 = (double*)&param2 + * R8 = (double*)&param3 + */ +#define FPD_TWO_IN(name) \ +_GLOBAL(fpd_ ## name); \ + mflr r12; \ + bl fpd_load_two; \ + mtlr r12; \ + \ + name. 0,0,1; /* call instruction */ \ + b fpd_return + +/* + * CR Double operation with two input operands + * + * R3 = (double*)&fpscr + * R4 = (u32*)&cr + * R5 = (double*)&param1 + * R6 = (double*)&param2 + * R7 = (double*)&param3 + */ +#define FPD_TWO_IN_CR(name) \ +_GLOBAL(fpd_ ## name); \ + lfd 1,0(r6); /* load param2 */ \ + lfd 0,0(r5); /* load param1 */ \ + lfd 3,0(r3); /* load up fpscr value */ \ + MTFSF_L(3); \ + lwz r6, 0(r4); /* load cr */ \ + mtcr r6; \ + \ + name 0,0,1; /* call instruction */ \ + mfcr r6; \ + mffs 0; \ + stfd 0,0(r3); /* save new fpscr value */ \ + blr + +/* + * Double operation with three input operands + * + * R3 = (double*)&fpscr + * R4 = (u32*)&cr + * R5 = (double*)&result + * R6 = (double*)&param1 + * R7 = (double*)&param2 + * R8 = (double*)&param3 + */ +#define FPD_THREE_IN(name) \ +_GLOBAL(fpd_ ## name); \ + mflr r12; \ + bl fpd_load_three; \ + mtlr r12; \ + \ + name. 
0,0,1,2; /* call instruction */ \ + b fpd_return + +FPD_ONE_IN(fsqrts) +FPD_ONE_IN(frsqrtes) +FPD_ONE_IN(fres) +FPD_ONE_IN(frsp) +FPD_ONE_IN(fctiw) +FPD_ONE_IN(fctiwz) +FPD_ONE_IN(fsqrt) +FPD_ONE_IN(fre) +FPD_ONE_IN(frsqrte) +FPD_ONE_IN(fneg) +FPD_ONE_IN(fabs) +FPD_TWO_IN(fadds) +FPD_TWO_IN(fsubs) +FPD_TWO_IN(fdivs) +FPD_TWO_IN(fmuls) +FPD_TWO_IN_CR(fcmpu) +FPD_TWO_IN(fcpsgn) +FPD_TWO_IN(fdiv) +FPD_TWO_IN(fadd) +FPD_TWO_IN(fmul) +FPD_TWO_IN_CR(fcmpo) +FPD_TWO_IN(fsub) +FPD_THREE_IN(fmsubs) +FPD_THREE_IN(fmadds) +FPD_THREE_IN(fnmsubs) +FPD_THREE_IN(fnmadds) +FPD_THREE_IN(fsel) +FPD_THREE_IN(fmsub) +FPD_THREE_IN(fmadd) +FPD_THREE_IN(fnmsub) +FPD_THREE_IN(fnmadd) + +_GLOBAL(kvm_cvt_fd) + lfs 0,0(r3) + stfd 0,0(r4) + blr + +_GLOBAL(kvm_cvt_df) + lfd 0,0(r3) + stfs 0,0(r4) + blr diff --git a/kernel/arch/powerpc/kvm/irq.h b/kernel/arch/powerpc/kvm/irq.h new file mode 100644 index 000000000..5a9a10b90 --- /dev/null +++ b/kernel/arch/powerpc/kvm/irq.h @@ -0,0 +1,20 @@ +#ifndef __IRQ_H +#define __IRQ_H + +#include <linux/kvm_host.h> + +static inline int irqchip_in_kernel(struct kvm *kvm) +{ + int ret = 0; + +#ifdef CONFIG_KVM_MPIC + ret = ret || (kvm->arch.mpic != NULL); +#endif +#ifdef CONFIG_KVM_XICS + ret = ret || (kvm->arch.xics != NULL); +#endif + smp_rmb(); + return ret; +} + +#endif diff --git a/kernel/arch/powerpc/kvm/mpic.c b/kernel/arch/powerpc/kvm/mpic.c new file mode 100644 index 000000000..6249cdc83 --- /dev/null +++ b/kernel/arch/powerpc/kvm/mpic.c @@ -0,0 +1,1852 @@ +/* + * OpenPIC emulation + * + * Copyright (c) 2004 Jocelyn Mayer + * 2011 Alexander Graf + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * 
furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/kvm_host.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/anon_inodes.h> +#include <asm/uaccess.h> +#include <asm/mpic.h> +#include <asm/kvm_para.h> +#include <asm/kvm_host.h> +#include <asm/kvm_ppc.h> +#include <kvm/iodev.h> + +#define MAX_CPU 32 +#define MAX_SRC 256 +#define MAX_TMR 4 +#define MAX_IPI 4 +#define MAX_MSI 8 +#define MAX_IRQ (MAX_SRC + MAX_IPI + MAX_TMR) +#define VID 0x03 /* MPIC version ID */ + +/* OpenPIC capability flags */ +#define OPENPIC_FLAG_IDR_CRIT (1 << 0) +#define OPENPIC_FLAG_ILR (2 << 0) + +/* OpenPIC address map */ +#define OPENPIC_REG_SIZE 0x40000 +#define OPENPIC_GLB_REG_START 0x0 +#define OPENPIC_GLB_REG_SIZE 0x10F0 +#define OPENPIC_TMR_REG_START 0x10F0 +#define OPENPIC_TMR_REG_SIZE 0x220 +#define OPENPIC_MSI_REG_START 0x1600 +#define OPENPIC_MSI_REG_SIZE 0x200 +#define OPENPIC_SUMMARY_REG_START 0x3800 +#define OPENPIC_SUMMARY_REG_SIZE 0x800 +#define OPENPIC_SRC_REG_START 0x10000 +#define OPENPIC_SRC_REG_SIZE (MAX_SRC * 0x20) +#define OPENPIC_CPU_REG_START 0x20000 +#define OPENPIC_CPU_REG_SIZE (0x100 + ((MAX_CPU - 1) * 0x1000)) + +struct fsl_mpic_info { + int max_ext; +}; + +static struct fsl_mpic_info fsl_mpic_20 = { + .max_ext = 12, +}; + +static struct 
fsl_mpic_info fsl_mpic_42 = { + .max_ext = 12, +}; + +#define FRR_NIRQ_SHIFT 16 +#define FRR_NCPU_SHIFT 8 +#define FRR_VID_SHIFT 0 + +#define VID_REVISION_1_2 2 +#define VID_REVISION_1_3 3 + +#define VIR_GENERIC 0x00000000 /* Generic Vendor ID */ + +#define GCR_RESET 0x80000000 +#define GCR_MODE_PASS 0x00000000 +#define GCR_MODE_MIXED 0x20000000 +#define GCR_MODE_PROXY 0x60000000 + +#define TBCR_CI 0x80000000 /* count inhibit */ +#define TCCR_TOG 0x80000000 /* toggles when decrement to zero */ + +#define IDR_EP_SHIFT 31 +#define IDR_EP_MASK (1 << IDR_EP_SHIFT) +#define IDR_CI0_SHIFT 30 +#define IDR_CI1_SHIFT 29 +#define IDR_P1_SHIFT 1 +#define IDR_P0_SHIFT 0 + +#define ILR_INTTGT_MASK 0x000000ff +#define ILR_INTTGT_INT 0x00 +#define ILR_INTTGT_CINT 0x01 /* critical */ +#define ILR_INTTGT_MCP 0x02 /* machine check */ +#define NUM_OUTPUTS 3 + +#define MSIIR_OFFSET 0x140 +#define MSIIR_SRS_SHIFT 29 +#define MSIIR_SRS_MASK (0x7 << MSIIR_SRS_SHIFT) +#define MSIIR_IBS_SHIFT 24 +#define MSIIR_IBS_MASK (0x1f << MSIIR_IBS_SHIFT) + +static int get_current_cpu(void) +{ +#if defined(CONFIG_KVM) && defined(CONFIG_BOOKE) + struct kvm_vcpu *vcpu = current->thread.kvm_vcpu; + return vcpu ? vcpu->arch.irq_cpu_id : -1; +#else + /* XXX */ + return -1; +#endif +} + +static int openpic_cpu_write_internal(void *opaque, gpa_t addr, + u32 val, int idx); +static int openpic_cpu_read_internal(void *opaque, gpa_t addr, + u32 *ptr, int idx); +static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ, + uint32_t val); + +enum irq_type { + IRQ_TYPE_NORMAL = 0, + IRQ_TYPE_FSLINT, /* FSL internal interrupt -- level only */ + IRQ_TYPE_FSLSPECIAL, /* FSL timer/IPI interrupt, edge, no polarity */ +}; + +struct irq_queue { + /* Round up to the nearest 64 IRQs so that the queue length + * won't change when moving between 32 and 64 bit hosts. 
+	 */
+	unsigned long queue[BITS_TO_LONGS((MAX_IRQ + 63) & ~63)];
+	int next; /* IRQ picked by the last IRQ_check(), -1 if none */
+	int priority; /* IVPR priority of 'next', -1 if none */
+};
+/* State of one interrupt source (external, internal, IPI or timer). */
+struct irq_source {
+	uint32_t ivpr; /* IRQ vector/priority register */
+	uint32_t idr; /* IRQ destination register */
+	uint32_t destmask; /* bitmap of CPU destinations */
+	int last_cpu;
+	int output; /* IRQ level, e.g. ILR_INTTGT_INT */
+	int pending; /* TRUE if IRQ is pending */
+	enum irq_type type;
+	bool level:1; /* level-triggered */
+	bool nomask:1; /* critical interrupts ignore mask on some FSL MPICs */
+};
+
+#define IVPR_MASK_SHIFT 31
+#define IVPR_MASK_MASK (1 << IVPR_MASK_SHIFT)
+#define IVPR_ACTIVITY_SHIFT 30
+#define IVPR_ACTIVITY_MASK (1 << IVPR_ACTIVITY_SHIFT)
+#define IVPR_MODE_SHIFT 29
+#define IVPR_MODE_MASK (1 << IVPR_MODE_SHIFT)
+#define IVPR_POLARITY_SHIFT 23
+#define IVPR_POLARITY_MASK (1 << IVPR_POLARITY_SHIFT)
+#define IVPR_SENSE_SHIFT 22
+#define IVPR_SENSE_MASK (1 << IVPR_SENSE_SHIFT)
+
+#define IVPR_PRIORITY_MASK (0xF << 16)
+#define IVPR_PRIORITY(_ivprr_) ((int)(((_ivprr_) & IVPR_PRIORITY_MASK) >> 16))
+#define IVPR_VECTOR(opp, _ivprr_) ((_ivprr_) & (opp)->vector_mask)
+
+/* IDR[EP/CI] are only for FSL MPIC prior to v4.0 */
+#define IDR_EP 0x80000000 /* external pin */
+#define IDR_CI 0x40000000 /* critical interrupt */
+/* Per-CPU delivery state; one entry of openpic.dst[]. */
+struct irq_dest {
+	struct kvm_vcpu *vcpu; /* NULL makes mpic_irq_raise/lower a no-op */
+
+	int32_t ctpr; /* CPU current task priority */
+	struct irq_queue raised;
+	struct irq_queue servicing;
+
+	/* Count of IRQ sources asserting on non-INT outputs */
+	uint32_t outputs_active[NUM_OUTPUTS];
+};
+
+#define MAX_MMIO_REGIONS 10
+/* Complete state of one emulated MPIC instance. */
+struct openpic {
+	struct kvm *kvm;
+	struct kvm_device *dev;
+	struct kvm_io_device mmio;
+	const struct mem_reg *mmio_regions[MAX_MMIO_REGIONS];
+	int num_mmio_regions;
+
+	gpa_t reg_base;
+	spinlock_t lock;
+
+	/* Behavior control */
+	struct fsl_mpic_info *fsl;
+	uint32_t model;
+	uint32_t flags;
+	uint32_t nb_irqs;
+	uint32_t vid;
+	uint32_t vir; /* Vendor identification register */
+	uint32_t vector_mask;
+	uint32_t tfrr_reset;
+
uint32_t ivpr_reset;
+	uint32_t idr_reset;
+	uint32_t brr1;
+	uint32_t mpic_mode_mask;
+
+	/* Global registers */
+	uint32_t frr; /* Feature reporting register */
+	uint32_t gcr; /* Global configuration register */
+	uint32_t pir; /* Processor initialization register */
+	uint32_t spve; /* Spurious vector register */
+	uint32_t tfrr; /* Timer frequency reporting register */
+	/* Source registers */
+	struct irq_source src[MAX_IRQ];
+	/* Local registers per output pin */
+	struct irq_dest dst[MAX_CPU];
+	uint32_t nb_cpus;
+	/* Timer registers */
+	struct {
+		uint32_t tccr; /* Global timer current count register */
+		uint32_t tbcr; /* Global timer base count register */
+	} timers[MAX_TMR];
+	/* Shared MSI registers */
+	struct {
+		uint32_t msir; /* Shared Message Signaled Interrupt Register */
+	} msi[MAX_MSI];
+	uint32_t max_irq;
+	uint32_t irq_ipi0; /* index in src[] of the first IPI source */
+	uint32_t irq_tim0; /* index in src[] of the first timer source */
+	uint32_t irq_msi; /* index in src[] of the first MSI source */
+};
+
+/*
+ * Assert @output towards the vcpu attached to @dst.  Does nothing when no
+ * vcpu is attached.  Only ILR_INTTGT_INT is delivered, through
+ * kvm_vcpu_ioctl_interrupt() with KVM_INTERRUPT_SET_LEVEL; every other
+ * output is ignored for now (see the TODO).
+ */
+static void mpic_irq_raise(struct openpic *opp, struct irq_dest *dst,
+			   int output)
+{
+	struct kvm_interrupt irq = {
+		.irq = KVM_INTERRUPT_SET_LEVEL,
+	};
+
+	if (!dst->vcpu) {
+		pr_debug("%s: destination cpu %d does not exist\n",
+			 __func__, (int)(dst - &opp->dst[0]));
+		return;
+	}
+
+	pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id,
+		output);
+
+	if (output != ILR_INTTGT_INT) /* TODO */
+		return;
+
+	kvm_vcpu_ioctl_interrupt(dst->vcpu, &irq);
+}
+/*
+ * Counterpart of mpic_irq_raise(): withdraw the INT output from the vcpu
+ * attached to @dst via kvmppc_core_dequeue_external().  Other outputs and
+ * destinations without a vcpu are ignored.
+ */
+static void mpic_irq_lower(struct openpic *opp, struct irq_dest *dst,
+			   int output)
+{
+	if (!dst->vcpu) {
+		pr_debug("%s: destination cpu %d does not exist\n",
+			 __func__, (int)(dst - &opp->dst[0]));
+		return;
+	}
+
+	pr_debug("%s: cpu %d output %d\n", __func__, dst->vcpu->arch.irq_cpu_id,
+		output);
+
+	if (output != ILR_INTTGT_INT) /* TODO */
+		return;
+
+	kvmppc_core_dequeue_external(dst->vcpu);
+}
+/* Mark @n_IRQ as present in queue @q. */
+static inline void IRQ_setbit(struct irq_queue *q, int n_IRQ)
+{
+	set_bit(n_IRQ, q->queue);
+}
+/* Remove @n_IRQ from queue @q. */
+static inline void IRQ_resetbit(struct irq_queue *q, int n_IRQ)
+{
+	clear_bit(n_IRQ,
q->queue);
+}
+/*
+ * Scan bits [0, opp->max_irq) of @q and store in q->next / q->priority the
+ * queued IRQ whose IVPR priority is highest.  The comparison is strict, so
+ * the lowest-numbered IRQ wins a tie.  Both fields are -1 when the queue
+ * is empty.
+ */
+static void IRQ_check(struct openpic *opp, struct irq_queue *q)
+{
+	int irq = -1;
+	int next = -1;
+	int priority = -1;
+
+	for (;;) {
+		irq = find_next_bit(q->queue, opp->max_irq, irq + 1);
+		if (irq == opp->max_irq) /* no further bit set */
+			break;
+
+		pr_debug("IRQ_check: irq %d set ivpr_pr=%d pr=%d\n",
+			irq, IVPR_PRIORITY(opp->src[irq].ivpr), priority);
+
+		if (IVPR_PRIORITY(opp->src[irq].ivpr) > priority) {
+			next = irq;
+			priority = IVPR_PRIORITY(opp->src[irq].ivpr);
+		}
+	}
+
+	q->next = next;
+	q->priority = priority;
+}
+/* Rescan @q with IRQ_check() and return the selected IRQ, or -1 if none. */
+static int IRQ_get_next(struct openpic *opp, struct irq_queue *q)
+{
+	/* XXX: optimize */
+	IRQ_check(opp, q);
+
+	return q->next;
+}
+/*
+ * Apply a state change of source @n_IRQ to destination @n_CPU.
+ * @active is the new state of the source, @was_active the previous one.
+ * Sources routed to a non-INT output are reference counted in
+ * dst->outputs_active[]; INT sources go through the raised queue and the
+ * ctpr / servicing priority checks.
+ */
+static void IRQ_local_pipe(struct openpic *opp, int n_CPU, int n_IRQ,
+			   bool active, bool was_active)
+{
+	struct irq_dest *dst;
+	struct irq_source *src;
+	int priority;
+
+	dst = &opp->dst[n_CPU];
+	src = &opp->src[n_IRQ];
+
+	pr_debug("%s: IRQ %d active %d was %d\n",
+		__func__, n_IRQ, active, was_active);
+
+	if (src->output != ILR_INTTGT_INT) {
+		pr_debug("%s: output %d irq %d active %d was %d count %d\n",
+			__func__, src->output, n_IRQ, active, was_active,
+			dst->outputs_active[src->output]);
+
+		/* On Freescale MPIC, critical interrupts ignore priority,
+		 * IACK, EOI, etc. Before MPIC v4.1 they also ignore
+		 * masking.
+		 */
+		if (active) {
+			if (!was_active &&
+			    dst->outputs_active[src->output]++ == 0) { /* first asserter raises the line */
+				pr_debug("%s: Raise OpenPIC output %d cpu %d irq %d\n",
+					__func__, src->output, n_CPU, n_IRQ);
+				mpic_irq_raise(opp, dst, src->output);
+			}
+		} else {
+			if (was_active &&
+			    --dst->outputs_active[src->output] == 0) { /* last asserter lowers the line */
+				pr_debug("%s: Lower OpenPIC output %d cpu %d irq %d\n",
+					__func__, src->output, n_CPU, n_IRQ);
+				mpic_irq_lower(opp, dst, src->output);
+			}
+		}
+
+		return;
+	}
+
+	priority = IVPR_PRIORITY(src->ivpr);
+
+	/* Even if the interrupt doesn't have enough priority,
+	 * it is still raised, in case ctpr is lowered later.
+	 */
+	if (active)
+		IRQ_setbit(&dst->raised, n_IRQ);
+	else
+		IRQ_resetbit(&dst->raised, n_IRQ);
+
+	IRQ_check(opp, &dst->raised); /* refresh raised.next / raised.priority */
+
+	if (active && priority <= dst->ctpr) {
+		pr_debug("%s: IRQ %d priority %d too low for ctpr %d on CPU %d\n",
+			__func__, n_IRQ, priority, dst->ctpr, n_CPU);
+		active = 0; /* fall through to the "inactive" handling below */
+	}
+
+	if (active) {
+		if (IRQ_get_next(opp, &dst->servicing) >= 0 &&
+		    priority <= dst->servicing.priority) {
+			pr_debug("%s: IRQ %d is hidden by servicing IRQ %d on CPU %d\n",
+				__func__, n_IRQ, dst->servicing.next, n_CPU);
+		} else {
+			pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d/%d\n",
+				__func__, n_CPU, n_IRQ, dst->raised.next);
+			mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+		}
+	} else {
+		IRQ_get_next(opp, &dst->servicing); /* called only to refresh servicing.priority */
+		if (dst->raised.priority > dst->ctpr &&
+		    dst->raised.priority > dst->servicing.priority) {
+			pr_debug("%s: IRQ %d inactive, IRQ %d prio %d above %d/%d, CPU %d\n",
+				__func__, n_IRQ, dst->raised.next,
+				dst->raised.priority, dst->ctpr,
+				dst->servicing.priority, n_CPU);
+			/* IRQ line stays asserted */
+		} else {
+			pr_debug("%s: IRQ %d inactive, current prio %d/%d, CPU %d\n",
+				__func__, n_IRQ, dst->ctpr,
+				dst->servicing.priority, n_CPU);
+			mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+		}
+	}
+}
+
+/* update pic state because registers for n_IRQ have changed value
+ *
+ * Recomputes whether the source is active (pending and not masked, unless
+ * nomask is set), mirrors that into the IVPR activity bit and forwards the
+ * change to the destination CPU(s) through IRQ_local_pipe().
+ */
+static void openpic_update_irq(struct openpic *opp, int n_IRQ)
+{
+	struct irq_source *src;
+	bool active, was_active;
+	int i;
+
+	src = &opp->src[n_IRQ];
+	active = src->pending;
+
+	if ((src->ivpr & IVPR_MASK_MASK) && !src->nomask) {
+		/* Interrupt source is disabled */
+		pr_debug("%s: IRQ %d is disabled\n", __func__, n_IRQ);
+		active = false;
+	}
+
+	was_active = !!(src->ivpr & IVPR_ACTIVITY_MASK);
+
+	/*
+	 * We don't have a similar check for already-active because
+	 * ctpr may have changed and we need to withdraw the interrupt.
+	 */
+	if (!active && !was_active) {
+		pr_debug("%s: IRQ %d is already inactive\n", __func__, n_IRQ);
+		return;
+	}
+
+	if (active)
+		src->ivpr |= IVPR_ACTIVITY_MASK;
+	else
+		src->ivpr &= ~IVPR_ACTIVITY_MASK;
+
+	if (src->destmask == 0) {
+		/* No target */
+		pr_debug("%s: IRQ %d has no target\n", __func__, n_IRQ);
+		return;
+	}
+
+	if (src->destmask == (1 << src->last_cpu)) {
+		/* Only one CPU is allowed to receive this IRQ */
+		IRQ_local_pipe(opp, src->last_cpu, n_IRQ, active, was_active);
+	} else if (!(src->ivpr & IVPR_MODE_MASK)) {
+		/* Directed delivery mode */
+		for (i = 0; i < opp->nb_cpus; i++) {
+			if (src->destmask & (1 << i)) {
+				IRQ_local_pipe(opp, i, n_IRQ, active,
+					was_active);
+			}
+		}
+	} else {
+		/* Distributed delivery mode */
+		/* NOTE(review): the loop ends only on a destmask hit or when i
+		 * gets back to last_cpu; confirm destmask always has a bit
+		 * below nb_cpus when this branch is reached.
+		 */
+		for (i = src->last_cpu + 1; i != src->last_cpu; i++) {
+			if (i == opp->nb_cpus)
+				i = 0;
+
+			if (src->destmask & (1 << i)) {
+				IRQ_local_pipe(opp, i, n_IRQ, active,
+					was_active);
+				src->last_cpu = i;
+				break;
+			}
+		}
+	}
+}
+/*
+ * Drive the input line of source @n_IRQ to @level.  Level-sensitive sources
+ * track the line; edge-sensitive sources become pending on a non-zero
+ * level only.  Requests with n_IRQ >= MAX_IRQ are dropped with a one-time
+ * warning.
+ */
+static void openpic_set_irq(void *opaque, int n_IRQ, int level)
+{
+	struct openpic *opp = opaque;
+	struct irq_source *src;
+
+	if (n_IRQ >= MAX_IRQ) {
+		WARN_ONCE(1, "%s: IRQ %d out of range\n", __func__, n_IRQ);
+		return;
+	}
+
+	src = &opp->src[n_IRQ];
+	pr_debug("openpic: set irq %d = %d ivpr=0x%08x\n",
+		n_IRQ, level, src->ivpr);
+	if (src->level) {
+		/* level-sensitive irq */
+		src->pending = level;
+		openpic_update_irq(opp, n_IRQ);
+	} else {
+		/* edge-sensitive irq */
+		if (level) {
+			src->pending = 1;
+			openpic_update_irq(opp, n_IRQ);
+		}
+
+		if (src->output != ILR_INTTGT_INT) {
+			/* Edge-triggered interrupts shouldn't be used
+			 * with non-INT delivery, but just in case,
+			 * try to make it do something sane rather than
+			 * cause an interrupt storm. This is close to
+			 * what you'd probably see happen in real hardware.
+			 */
+			src->pending = 0;
+			openpic_update_irq(opp, n_IRQ);
+		}
+	}
+}
+/*
+ * Bring the controller to its reset state: global registers, every source's
+ * IVPR/IDR (from ivpr_reset / idr_reset), every destination's ctpr and
+ * queues, and the timers.  GCR reads GCR_RESET only while this runs and is
+ * cleared on exit.
+ */
+static void openpic_reset(struct openpic *opp)
+{
+	int i;
+
+	opp->gcr = GCR_RESET;
+	/* Initialise controller registers */
+	opp->frr = ((opp->nb_irqs - 1) << FRR_NIRQ_SHIFT) |
+	    (opp->vid << FRR_VID_SHIFT);
+
+	opp->pir = 0;
+	opp->spve = -1 & opp->vector_mask; /* all implemented vector bits set */
+	opp->tfrr = opp->tfrr_reset;
+	/* Initialise IRQ sources */
+	for (i = 0; i < opp->max_irq; i++) {
+		opp->src[i].ivpr = opp->ivpr_reset;
+
+		switch (opp->src[i].type) {
+		case IRQ_TYPE_NORMAL:
+			opp->src[i].level =
+			    !!(opp->ivpr_reset & IVPR_SENSE_MASK);
+			break;
+
+		case IRQ_TYPE_FSLINT:
+			opp->src[i].ivpr |= IVPR_POLARITY_MASK;
+			break;
+
+		case IRQ_TYPE_FSLSPECIAL:
+			break;
+		}
+
+		write_IRQreg_idr(opp, i, opp->idr_reset);
+	}
+	/* Initialise IRQ destinations */
+	for (i = 0; i < MAX_CPU; i++) {
+		opp->dst[i].ctpr = 15;
+		memset(&opp->dst[i].raised, 0, sizeof(struct irq_queue));
+		opp->dst[i].raised.next = -1;
+		memset(&opp->dst[i].servicing, 0, sizeof(struct irq_queue));
+		opp->dst[i].servicing.next = -1;
+	}
+	/* Initialise timers */
+	for (i = 0; i < MAX_TMR; i++) {
+		opp->timers[i].tccr = 0;
+		opp->timers[i].tbcr = TBCR_CI;
+	}
+	/* Go out of RESET state */
+	opp->gcr = 0;
+}
+/* Return the stored IDR of source @n_IRQ. */
+static inline uint32_t read_IRQreg_idr(struct openpic *opp, int n_IRQ)
+{
+	return opp->src[n_IRQ].idr;
+}
+/* Return the output of source @n_IRQ, or all-ones without OPENPIC_FLAG_ILR. */
+static inline uint32_t read_IRQreg_ilr(struct openpic *opp, int n_IRQ)
+{
+	if (opp->flags & OPENPIC_FLAG_ILR)
+		return opp->src[n_IRQ].output;
+
+	return 0xffffffff;
+}
+/* Return the stored IVPR of source @n_IRQ. */
+static inline uint32_t read_IRQreg_ivpr(struct openpic *opp, int n_IRQ)
+{
+	return opp->src[n_IRQ].ivpr;
+}
+/*
+ * Store @val into the IDR of source @n_IRQ, keeping only the bits valid for
+ * the current number of CPUs, and derive destmask (plus output and nomask
+ * when OPENPIC_FLAG_IDR_CRIT is set) from the result.
+ */
+static inline void write_IRQreg_idr(struct openpic *opp, int n_IRQ,
+				    uint32_t val)
+{
+	struct irq_source *src = &opp->src[n_IRQ];
+	uint32_t normal_mask = (1UL << opp->nb_cpus) - 1;
+	uint32_t crit_mask = 0;
+	uint32_t mask = normal_mask;
+	int crit_shift = IDR_EP_SHIFT - opp->nb_cpus;
+	int i;
+
+	if (opp->flags & OPENPIC_FLAG_IDR_CRIT) {
+		crit_mask = mask
<< crit_shift; + mask |= crit_mask | IDR_EP; + } + + src->idr = val & mask; + pr_debug("Set IDR %d to 0x%08x\n", n_IRQ, src->idr); + + if (opp->flags & OPENPIC_FLAG_IDR_CRIT) { + if (src->idr & crit_mask) { + if (src->idr & normal_mask) { + pr_debug("%s: IRQ configured for multiple output types, using critical\n", + __func__); + } + + src->output = ILR_INTTGT_CINT; + src->nomask = true; + src->destmask = 0; + + for (i = 0; i < opp->nb_cpus; i++) { + int n_ci = IDR_CI0_SHIFT - i; + + if (src->idr & (1UL << n_ci)) + src->destmask |= 1UL << i; + } + } else { + src->output = ILR_INTTGT_INT; + src->nomask = false; + src->destmask = src->idr & normal_mask; + } + } else { + src->destmask = src->idr; + } +} + +static inline void write_IRQreg_ilr(struct openpic *opp, int n_IRQ, + uint32_t val) +{ + if (opp->flags & OPENPIC_FLAG_ILR) { + struct irq_source *src = &opp->src[n_IRQ]; + + src->output = val & ILR_INTTGT_MASK; + pr_debug("Set ILR %d to 0x%08x, output %d\n", n_IRQ, src->idr, + src->output); + + /* TODO: on MPIC v4.0 only, set nomask for non-INT */ + } +} + +static inline void write_IRQreg_ivpr(struct openpic *opp, int n_IRQ, + uint32_t val) +{ + uint32_t mask; + + /* NOTE when implementing newer FSL MPIC models: starting with v4.0, + * the polarity bit is read-only on internal interrupts. + */ + mask = IVPR_MASK_MASK | IVPR_PRIORITY_MASK | IVPR_SENSE_MASK | + IVPR_POLARITY_MASK | opp->vector_mask; + + /* ACTIVITY bit is read-only */ + opp->src[n_IRQ].ivpr = + (opp->src[n_IRQ].ivpr & IVPR_ACTIVITY_MASK) | (val & mask); + + /* For FSL internal interrupts, The sense bit is reserved and zero, + * and the interrupt is always level-triggered. Timers and IPIs + * have no sense or polarity bits, and are edge-triggered. 
+	 */
+	switch (opp->src[n_IRQ].type) {
+	case IRQ_TYPE_NORMAL:
+		opp->src[n_IRQ].level =
+		    !!(opp->src[n_IRQ].ivpr & IVPR_SENSE_MASK);
+		break;
+
+	case IRQ_TYPE_FSLINT:
+		opp->src[n_IRQ].ivpr &= ~IVPR_SENSE_MASK;
+		break;
+
+	case IRQ_TYPE_FSLSPECIAL:
+		opp->src[n_IRQ].ivpr &= ~(IVPR_POLARITY_MASK | IVPR_SENSE_MASK);
+		break;
+	}
+
+	openpic_update_irq(opp, n_IRQ);
+	pr_debug("Set IVPR %d to 0x%08x -> 0x%08x\n", n_IRQ, val,
+		opp->src[n_IRQ].ivpr);
+}
+/*
+ * GCR write: a value with GCR_RESET set resets the whole controller;
+ * otherwise only the bits covered by mpic_mode_mask are replaced.
+ */
+static void openpic_gcr_write(struct openpic *opp, uint64_t val)
+{
+	if (val & GCR_RESET) {
+		openpic_reset(opp);
+		return;
+	}
+
+	opp->gcr &= ~opp->mpic_mode_mask;
+	opp->gcr |= val & opp->mpic_mode_mask;
+}
+/*
+ * Write handler for the global register region.  Misaligned addresses and
+ * read-only or unknown registers are silently ignored.  Offsets 0x40-0xB0
+ * are forwarded to the per-CPU handler for the CPU returned by
+ * get_current_cpu().  Returns 0, the per-CPU handler's result, or -ENXIO
+ * for PIR.
+ */
+static int openpic_gbl_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int err = 0;
+
+	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+	if (addr & 0xF)
+		return 0;
+
+	switch (addr) {
+	case 0x00: /* Block Revision Register1 (BRR1) is Readonly */
+		break;
+	case 0x40:
+	case 0x50:
+	case 0x60:
+	case 0x70:
+	case 0x80:
+	case 0x90:
+	case 0xA0:
+	case 0xB0:
+		err = openpic_cpu_write_internal(opp, addr, val,
+						 get_current_cpu());
+		break;
+	case 0x1000: /* FRR */
+		break;
+	case 0x1020: /* GCR */
+		openpic_gcr_write(opp, val);
+		break;
+	case 0x1080: /* VIR */
+		break;
+	case 0x1090: /* PIR */
+		/*
+		 * This register is used to reset a CPU core --
+		 * let userspace handle it.
+		 */
+		err = -ENXIO;
+		break;
+	case 0x10A0: /* IPI_IVPR */
+	case 0x10B0:
+	case 0x10C0:
+	case 0x10D0: {
+		int idx;
+		idx = (addr - 0x10A0) >> 4; /* one IPI register every 0x10 bytes */
+		write_IRQreg_ivpr(opp, opp->irq_ipi0 + idx, val);
+		break;
+	}
+	case 0x10E0: /* SPVE */
+		opp->spve = val & opp->vector_mask;
+		break;
+	default:
+		break;
+	}
+
+	return err;
+}
+/*
+ * Read handler for the global register region.  *ptr is always written;
+ * misaligned or unknown offsets read as 0xFFFFFFFF.  Offsets 0x40-0xB0 are
+ * forwarded to the per-CPU handler for the CPU returned by
+ * get_current_cpu().
+ */
+static int openpic_gbl_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	u32 retval;
+	int err = 0;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	retval = 0xFFFFFFFF;
+	if (addr & 0xF)
+		goto out;
+
+	switch (addr) {
+	case 0x1000: /* FRR */
+		retval = opp->frr;
+		retval |= (opp->nb_cpus - 1) << FRR_NCPU_SHIFT; /* CPU count is filled in at read time */
+		break;
+	case 0x1020: /* GCR */
+		retval = opp->gcr;
+		break;
+	case 0x1080: /* VIR */
+		retval = opp->vir;
+		break;
+	case 0x1090: /* PIR */
+		retval = 0x00000000;
+		break;
+	case 0x00: /* Block Revision Register1 (BRR1) */
+		retval = opp->brr1;
+		break;
+	case 0x40:
+	case 0x50:
+	case 0x60:
+	case 0x70:
+	case 0x80:
+	case 0x90:
+	case 0xA0:
+	case 0xB0:
+		err = openpic_cpu_read_internal(opp, addr,
+			&retval, get_current_cpu());
+		break;
+	case 0x10A0: /* IPI_IVPR */
+	case 0x10B0:
+	case 0x10C0:
+	case 0x10D0:
+		{
+			int idx;
+			idx = (addr - 0x10A0) >> 4;
+			retval = read_IRQreg_ivpr(opp, opp->irq_ipi0 + idx);
+		}
+		break;
+	case 0x10E0: /* SPVE */
+		retval = opp->spve;
+		break;
+	default:
+		break;
+	}
+
+out:
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+	*ptr = retval;
+	return err;
+}
+/*
+ * Write handler for the timer register region.  @addr arrives relative to
+ * the region start and is converted back to a controller offset (+0x10f0)
+ * before decoding: 0x10f0 is TFRR, otherwise bits 7:6 select the timer and
+ * bits 5:4 the register.  Always returns 0.
+ */
+static int openpic_tmr_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int idx;
+
+	addr += 0x10f0;
+
+	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+	if (addr & 0xF)
+		return 0;
+
+	if (addr == 0x10f0) {
+		/* TFRR */
+		opp->tfrr = val;
+		return 0;
+	}
+
+	idx = (addr >> 6) & 0x3;
+	addr = addr & 0x30;
+
+	switch (addr & 0x30) {
+	case 0x00: /* TCCR */
+		break;
+	case 0x10: /* TBCR */
+		if ((opp->timers[idx].tccr & TCCR_TOG) != 0 &&
+		    (val & TBCR_CI) == 0 &&
+
(opp->timers[idx].tbcr & TBCR_CI) != 0)
+			opp->timers[idx].tccr &= ~TCCR_TOG;
+
+		opp->timers[idx].tbcr = val;
+		break;
+	case 0x20: /* TVPR */
+		write_IRQreg_ivpr(opp, opp->irq_tim0 + idx, val);
+		break;
+	case 0x30: /* TDR */
+		write_IRQreg_idr(opp, opp->irq_tim0 + idx, val);
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Read handler for the timer register region.  *ptr is always written
+ * (all-ones for a misaligned access) and 0 is returned.
+ *
+ * @addr arrives relative to OPENPIC_TMR_REG_START.  It is converted back to
+ * a controller offset before decoding, exactly as openpic_tmr_write() does,
+ * so that a read selects the same timer and register as a write to the
+ * same address.  Decoding the relative offset directly would be off by
+ * 0x10: the first timer's TCCR would read back its TBCR, and so on.
+ */
+static int openpic_tmr_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	uint32_t retval = -1;
+	int idx;
+
+	addr += OPENPIC_TMR_REG_START;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	if (addr & 0xF)
+		goto out;
+
+	if (addr == OPENPIC_TMR_REG_START) {
+		/* TFRR */
+		retval = opp->tfrr;
+		goto out;
+	}
+
+	idx = (addr >> 6) & 0x3; /* bits 7:6 select the timer */
+
+	switch (addr & 0x30) { /* bits 5:4 select the register */
+	case 0x00: /* TCCR */
+		retval = opp->timers[idx].tccr;
+		break;
+	case 0x10: /* TBCR */
+		retval = opp->timers[idx].tbcr;
+		break;
+	case 0x20: /* TIPV */
+		retval = read_IRQreg_ivpr(opp, opp->irq_tim0 + idx);
+		break;
+	case 0x30: /* TIDE (TIDR) */
+		retval = read_IRQreg_idr(opp, opp->irq_tim0 + idx);
+		break;
+	}
+
+out:
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+	*ptr = retval;
+	return 0;
+}
+
+/*
+ * Write handler for the interrupt source region: each source owns 0x20
+ * bytes, with IVPR at +0x00, IDR at +0x10 and ILR at +0x18.  Other offsets
+ * are ignored.  Always returns 0.
+ */
+static int openpic_src_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int idx;
+
+	pr_debug("%s: addr %#llx <= %08x\n", __func__, addr, val);
+
+	addr = addr & 0xffff;
+	idx = addr >> 5;
+
+	switch (addr & 0x1f) {
+	case 0x00:
+		write_IRQreg_ivpr(opp, idx, val);
+		break;
+	case 0x10:
+		write_IRQreg_idr(opp, idx, val);
+		break;
+	case 0x18:
+		write_IRQreg_ilr(opp, idx, val);
+		break;
+	}
+
+	return 0;
+}
+
+/*
+ * Read handler for the interrupt source region; same layout as
+ * openpic_src_write().  Unknown offsets read as 0xFFFFFFFF.
+ */
+static int openpic_src_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	uint32_t retval;
+	int idx;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	retval = 0xFFFFFFFF;
+
+	addr = addr & 0xffff;
+	idx = addr >> 5;
+
+	switch (addr & 0x1f) {
+	case 0x00:
+		retval = read_IRQreg_ivpr(opp, idx);
+		break;
+	case 0x10:
+		retval = read_IRQreg_idr(opp, idx);
+		break;
+	case 0x18:
+		retval = read_IRQreg_ilr(opp, idx);
+		break;
+	}
+
+	pr_debug("%s: => 0x%08x\n",
__func__, retval);
+	*ptr = retval;
+	return 0;
+}
+/*
+ * Write handler for the MSI region.  Only MSIIR is writable: the SRS field
+ * (bits 31:29) selects the shared MSI register and its source
+ * (irq_msi + srs), the IBS field (bits 28:24) the bit to set in that
+ * register; the source is then raised.  Other offsets are ignored.
+ */
+static int openpic_msi_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+	int idx = opp->irq_msi;
+	int srs, ibs;
+
+	pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
+	if (addr & 0xF)
+		return 0;
+
+	switch (addr) {
+	case MSIIR_OFFSET:
+		srs = val >> MSIIR_SRS_SHIFT;
+		idx += srs;
+		ibs = (val & MSIIR_IBS_MASK) >> MSIIR_IBS_SHIFT;
+		opp->msi[srs].msir |= 1 << ibs;
+		openpic_set_irq(opp, idx, 1);
+		break;
+	default:
+		/* most registers are read-only, thus ignored */
+		break;
+	}
+
+	return 0;
+}
+/*
+ * Read handler for the MSI region.  Offsets 0x00-0x70 return one MSIR,
+ * clearing it and lowering its source; 0x120 (MSISR) returns one bit per
+ * MSIR that is non-zero.  A misaligned access returns -ENXIO without
+ * writing *ptr.
+ */
+static int openpic_msi_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+	uint32_t r = 0;
+	int i, srs;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+	if (addr & 0xF)
+		return -ENXIO;
+
+	srs = addr >> 4; /* only used for the MSIR cases below */
+
+	switch (addr) {
+	case 0x00:
+	case 0x10:
+	case 0x20:
+	case 0x30:
+	case 0x40:
+	case 0x50:
+	case 0x60:
+	case 0x70: /* MSIRs */
+		r = opp->msi[srs].msir;
+		/* Clear on read */
+		opp->msi[srs].msir = 0;
+		openpic_set_irq(opp, opp->irq_msi + srs, 0);
+		break;
+	case 0x120: /* MSISR */
+		for (i = 0; i < MAX_MSI; i++)
+			r |= (opp->msi[i].msir ?
1 : 0) << i;
+		break;
+	}
+
+	pr_debug("%s: => 0x%08x\n", __func__, r);
+	*ptr = r;
+	return 0;
+}
+/* Summary region read: not implemented yet, every register reads as 0. */
+static int openpic_summary_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	uint32_t r = 0;
+
+	pr_debug("%s: addr %#llx\n", __func__, addr);
+
+	/* TODO: EISR/EIMR */
+
+	*ptr = r;
+	return 0;
+}
+/* Summary region write: not implemented yet, the value is discarded. */
+static int openpic_summary_write(void *opaque, gpa_t addr, u32 val)
+{
+	pr_debug("%s: addr %#llx <= 0x%08x\n", __func__, addr, val);
+
+	/* TODO: EISR/EIMR */
+	return 0;
+}
+/*
+ * Write to a per-CPU register on behalf of CPU @idx.  Only bits 11:4 of
+ * @addr select the register.  A negative @idx or a misaligned address is
+ * ignored.  Handles IPI dispatch (0x40-0x70), CTPR (0x80) and EOI (0xB0);
+ * WHOAMI and IACK are read-only.  Always returns 0.
+ */
+static int openpic_cpu_write_internal(void *opaque, gpa_t addr,
+				      u32 val, int idx)
+{
+	struct openpic *opp = opaque;
+	struct irq_source *src;
+	struct irq_dest *dst;
+	int s_IRQ, n_IRQ;
+
+	pr_debug("%s: cpu %d addr %#llx <= 0x%08x\n", __func__, idx,
+		addr, val);
+
+	if (idx < 0)
+		return 0;
+
+	if (addr & 0xF)
+		return 0;
+
+	dst = &opp->dst[idx];
+	addr &= 0xFF0;
+	switch (addr) {
+	case 0x40: /* IPIDR */
+	case 0x50:
+	case 0x60:
+	case 0x70:
+		idx = (addr - 0x40) >> 4; /* idx now holds the IPI number, not the CPU */
+		/* we use IDE as mask which CPUs to deliver the IPI to still.
*/
+		opp->src[opp->irq_ipi0 + idx].destmask |= val;
+		openpic_set_irq(opp, opp->irq_ipi0 + idx, 1); /* pulse the line high ... */
+		openpic_set_irq(opp, opp->irq_ipi0 + idx, 0); /* ... and low again */
+		break;
+	case 0x80: /* CTPR */
+		dst->ctpr = val & 0x0000000F;
+
+		pr_debug("%s: set CPU %d ctpr to %d, raised %d servicing %d\n",
+			__func__, idx, dst->ctpr, dst->raised.priority,
+			dst->servicing.priority);
+
+		/* re-evaluate the INT line against the new task priority */
+		if (dst->raised.priority <= dst->ctpr) {
+			pr_debug("%s: Lower OpenPIC INT output cpu %d due to ctpr\n",
+				__func__, idx);
+			mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+		} else if (dst->raised.priority > dst->servicing.priority) {
+			pr_debug("%s: Raise OpenPIC INT output cpu %d irq %d\n",
+				__func__, idx, dst->raised.next);
+			mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+		}
+
+		break;
+	case 0x90: /* WHOAMI */
+		/* Read-only register */
+		break;
+	case 0xA0: /* IACK */
+		/* Read-only register */
+		break;
+	case 0xB0: { /* EOI */
+		int notify_eoi;
+
+		pr_debug("EOI\n");
+		s_IRQ = IRQ_get_next(opp, &dst->servicing); /* highest-priority IRQ in service */
+
+		if (s_IRQ < 0) {
+			pr_debug("%s: EOI with no interrupt in service\n",
+				__func__);
+			break;
+		}
+
+		IRQ_resetbit(&dst->servicing, s_IRQ);
+		/* Notify listeners that the IRQ is over */
+		notify_eoi = s_IRQ;
+		/* Set up next servicing IRQ */
+		s_IRQ = IRQ_get_next(opp, &dst->servicing);
+		/* Check queued interrupts.
*/
+		n_IRQ = IRQ_get_next(opp, &dst->raised);
+		src = &opp->src[n_IRQ]; /* only dereferenced below once n_IRQ != -1 is known */
+		if (n_IRQ != -1 &&
+		    (s_IRQ == -1 ||
+		     IVPR_PRIORITY(src->ivpr) > dst->servicing.priority)) {
+			pr_debug("Raise OpenPIC INT output cpu %d irq %d\n",
+				idx, n_IRQ);
+			mpic_irq_raise(opp, dst, ILR_INTTGT_INT);
+		}
+
+		/* the lock is dropped around the ack notification */
+		spin_unlock(&opp->lock);
+		kvm_notify_acked_irq(opp->kvm, 0, notify_eoi);
+		spin_lock(&opp->lock);
+
+		break;
+	}
+	default:
+		break;
+	}
+
+	return 0;
+}
+/* Per-CPU region write: the CPU index is taken from bits 16:12 of @addr. */
+static int openpic_cpu_write(void *opaque, gpa_t addr, u32 val)
+{
+	struct openpic *opp = opaque;
+
+	return openpic_cpu_write_internal(opp, addr, val,
+					 (addr & 0x1f000) >> 12);
+}
+/*
+ * Interrupt acknowledge for destination @dst (CPU number @cpu).  Lowers the
+ * INT output and returns the vector of the highest-priority raised IRQ,
+ * moving it into the servicing queue.  Returns the spurious vector
+ * (opp->spve) when nothing is raised or the raised IRQ is no longer
+ * deliverable.  Edge-sensitive sources are cleared; an IPI is re-triggered
+ * for the CPUs still left in its destmask.
+ */
+static uint32_t openpic_iack(struct openpic *opp, struct irq_dest *dst,
+			     int cpu)
+{
+	struct irq_source *src;
+	int retval, irq;
+
+	pr_debug("Lower OpenPIC INT output\n");
+	mpic_irq_lower(opp, dst, ILR_INTTGT_INT);
+
+	irq = IRQ_get_next(opp, &dst->raised);
+	pr_debug("IACK: irq=%d\n", irq);
+
+	if (irq == -1)
+		/* No more interrupt pending */
+		return opp->spve;
+
+	src = &opp->src[irq];
+	if (!(src->ivpr & IVPR_ACTIVITY_MASK) ||
+	    !(IVPR_PRIORITY(src->ivpr) > dst->ctpr)) {
+		pr_err("%s: bad raised IRQ %d ctpr %d ivpr 0x%08x\n",
+			__func__, irq, dst->ctpr, src->ivpr);
+		openpic_update_irq(opp, irq);
+		retval = opp->spve;
+	} else {
+		/* IRQ enter servicing state */
+		IRQ_setbit(&dst->servicing, irq);
+		retval = IVPR_VECTOR(opp, src->ivpr);
+	}
+
+	if (!src->level) {
+		/* edge-sensitive IRQ */
+		src->ivpr &= ~IVPR_ACTIVITY_MASK;
+		src->pending = 0;
+		IRQ_resetbit(&dst->raised, irq);
+	}
+
+	if ((irq >= opp->irq_ipi0) && (irq < (opp->irq_ipi0 + MAX_IPI))) {
+		src->destmask &= ~(1 << cpu); /* this CPU has now seen the IPI */
+		if (src->destmask && !src->level) {
+			/* trigger on CPUs that didn't know about it yet */
+			openpic_set_irq(opp, irq, 1);
+			openpic_set_irq(opp, irq, 0);
+			/* if all CPUs knew about it, set active bit again */
+			src->ivpr |= IVPR_ACTIVITY_MASK;
+		}
+	}
+
+	return retval;
+}
+/*
+ * When the controller is in proxy mode (GCR_MODE_PROXY), acknowledge the
+ * next interrupt for @vcpu under the MPIC lock and pass the resulting
+ * vector to kvmppc_set_epr().  Does nothing in the other modes.
+ */
+void kvmppc_mpic_set_epr(struct kvm_vcpu *vcpu)
+{
+	struct openpic *opp =
vcpu->arch.mpic;
+	int cpu = vcpu->arch.irq_cpu_id;
+	unsigned long flags;
+
+	spin_lock_irqsave(&opp->lock, flags);
+
+	if ((opp->gcr & opp->mpic_mode_mask) == GCR_MODE_PROXY)
+		kvmppc_set_epr(vcpu, openpic_iack(opp, &opp->dst[cpu], cpu));
+
+	spin_unlock_irqrestore(&opp->lock, flags);
+}
+/*
+ * Read a per-CPU register on behalf of CPU @idx.  Only bits 11:4 of @addr
+ * select the register.  *ptr is always written; a negative @idx, a
+ * misaligned address or an unknown register reads as 0xFFFFFFFF.  Reading
+ * IACK (0xA0) acknowledges an interrupt via openpic_iack().  Always
+ * returns 0.
+ */
+static int openpic_cpu_read_internal(void *opaque, gpa_t addr,
+				     u32 *ptr, int idx)
+{
+	struct openpic *opp = opaque;
+	struct irq_dest *dst;
+	uint32_t retval;
+
+	pr_debug("%s: cpu %d addr %#llx\n", __func__, idx, addr);
+	retval = 0xFFFFFFFF;
+
+	if (idx < 0)
+		goto out;
+
+	if (addr & 0xF)
+		goto out;
+
+	dst = &opp->dst[idx];
+	addr &= 0xFF0;
+	switch (addr) {
+	case 0x80: /* CTPR */
+		retval = dst->ctpr;
+		break;
+	case 0x90: /* WHOAMI */
+		retval = idx;
+		break;
+	case 0xA0: /* IACK */
+		retval = openpic_iack(opp, dst, idx);
+		break;
+	case 0xB0: /* EOI */
+		retval = 0;
+		break;
+	default:
+		break;
+	}
+	pr_debug("%s: => 0x%08x\n", __func__, retval);
+
+out:
+	*ptr = retval;
+	return 0;
+}
+/* Per-CPU region read: the CPU index is taken from bits 16:12 of @addr. */
+static int openpic_cpu_read(void *opaque, gpa_t addr, u32 *ptr)
+{
+	struct openpic *opp = opaque;
+
+	return openpic_cpu_read_internal(opp, addr, ptr,
+					 (addr & 0x1f000) >> 12);
+}
+/*
+ * One register window inside the MPIC address space.  The handlers receive
+ * the address relative to start_addr (see kvm_mpic_read_internal() and
+ * kvm_mpic_write_internal()).
+ */
+struct mem_reg {
+	int (*read)(void *opaque, gpa_t addr, u32 *ptr);
+	int (*write)(void *opaque, gpa_t addr, u32 val);
+	gpa_t start_addr;
+	int size;
+};
+
+static const struct mem_reg openpic_gbl_mmio = {
+	.write = openpic_gbl_write,
+	.read = openpic_gbl_read,
+	.start_addr = OPENPIC_GLB_REG_START,
+	.size = OPENPIC_GLB_REG_SIZE,
+};
+
+static const struct mem_reg openpic_tmr_mmio = {
+	.write = openpic_tmr_write,
+	.read = openpic_tmr_read,
+	.start_addr = OPENPIC_TMR_REG_START,
+	.size = OPENPIC_TMR_REG_SIZE,
+};
+
+static const struct mem_reg openpic_cpu_mmio = {
+	.write = openpic_cpu_write,
+	.read = openpic_cpu_read,
+	.start_addr = OPENPIC_CPU_REG_START,
+	.size = OPENPIC_CPU_REG_SIZE,
+};
+
+static const struct mem_reg openpic_src_mmio = {
+	.write = openpic_src_write,
+	.read =
openpic_src_read,
+	.start_addr = OPENPIC_SRC_REG_START,
+	.size = OPENPIC_SRC_REG_SIZE,
+};
+
+static const struct mem_reg openpic_msi_mmio = {
+	.read = openpic_msi_read,
+	.write = openpic_msi_write,
+	.start_addr = OPENPIC_MSI_REG_START,
+	.size = OPENPIC_MSI_REG_SIZE,
+};
+
+static const struct mem_reg openpic_summary_mmio = {
+	.read = openpic_summary_read,
+	.write = openpic_summary_write,
+	.start_addr = OPENPIC_SUMMARY_REG_START,
+	.size = OPENPIC_SUMMARY_REG_SIZE,
+};
+/*
+ * Append @mr to the list of windows searched on every register access.
+ * Warns and drops the window once MAX_MMIO_REGIONS are registered.
+ */
+static void add_mmio_region(struct openpic *opp, const struct mem_reg *mr)
+{
+	if (opp->num_mmio_regions >= MAX_MMIO_REGIONS) {
+		WARN(1, "kvm mpic: too many mmio regions\n");
+		return;
+	}
+
+	opp->mmio_regions[opp->num_mmio_regions++] = mr;
+}
+/*
+ * Initialisation shared by the Freescale models: registers the MSI and
+ * summary windows, sets the reset values, lays out the IPI and timer
+ * sources directly after the MAX_SRC regular sources, and assigns type and
+ * trigger mode to each source range.
+ */
+static void fsl_common_init(struct openpic *opp)
+{
+	int i;
+	int virq = MAX_SRC;
+
+	add_mmio_region(opp, &openpic_msi_mmio);
+	add_mmio_region(opp, &openpic_summary_mmio);
+
+	opp->vid = VID_REVISION_1_2;
+	opp->vir = VIR_GENERIC;
+	opp->vector_mask = 0xFFFF;
+	opp->tfrr_reset = 0;
+	opp->ivpr_reset = IVPR_MASK_MASK; /* every source starts masked */
+	opp->idr_reset = 1 << 0; /* every source starts routed to CPU 0 */
+	opp->max_irq = MAX_IRQ;
+
+	opp->irq_ipi0 = virq;
+	virq += MAX_IPI;
+	opp->irq_tim0 = virq;
+	virq += MAX_TMR;
+
+	BUG_ON(virq > MAX_IRQ);
+
+	opp->irq_msi = 224;
+
+	for (i = 0; i < opp->fsl->max_ext; i++)
+		opp->src[i].level = false;
+
+	/* Internal interrupts, including message and MSI */
+	for (i = 16; i < MAX_SRC; i++) {
+		opp->src[i].type = IRQ_TYPE_FSLINT;
+		opp->src[i].level = true;
+	}
+
+	/* timers and IPIs */
+	for (i = MAX_SRC; i < virq; i++) {
+		opp->src[i].type = IRQ_TYPE_FSLSPECIAL;
+		opp->src[i].level = false;
+	}
+}
+/*
+ * Dispatch a read at controller offset @addr to the first registered window
+ * containing it.  Returns that handler's result, or -ENXIO (leaving *ptr
+ * untouched) when no window matches.
+ */
+static int kvm_mpic_read_internal(struct openpic *opp, gpa_t addr, u32 *ptr)
+{
+	int i;
+
+	for (i = 0; i < opp->num_mmio_regions; i++) {
+		const struct mem_reg *mr = opp->mmio_regions[i];
+
+		if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
+			continue;
+
+		return mr->read(opp, addr - mr->start_addr, ptr);
+	}
+
+	return -ENXIO;
+}
+
+static int
kvm_mpic_write_internal(struct openpic *opp, gpa_t addr, u32 val)
+{
+	int i;
+
+	/* dispatch to the first registered window that contains addr */
+	for (i = 0; i < opp->num_mmio_regions; i++) {
+		const struct mem_reg *mr = opp->mmio_regions[i];
+
+		if (mr->start_addr > addr || addr >= mr->start_addr + mr->size)
+			continue;
+
+		return mr->write(opp, addr - mr->start_addr, val);
+	}
+
+	return -ENXIO;
+}
+
+/*
+ * MMIO bus read callback.  Reads the 32-bit register containing @addr under
+ * the MPIC lock and copies 4 bytes or the addressed byte to @ptr.
+ *
+ * Returns the register handler's result, or -EINVAL for a misaligned
+ * access or a length other than 1 or 4.
+ *
+ * The staging union is zero-initialised: kvm_mpic_read_internal() and some
+ * handlers can fail without storing a value, and the result is copied to
+ * @ptr regardless of the return code, so without the initialiser
+ * indeterminate stack contents would be handed to the caller.
+ */
+static int kvm_mpic_read(struct kvm_vcpu *vcpu,
+			 struct kvm_io_device *this,
+			 gpa_t addr, int len, void *ptr)
+{
+	struct openpic *opp = container_of(this, struct openpic, mmio);
+	int ret;
+	union {
+		u32 val;
+		u8 bytes[4];
+	} u = { .val = 0 };
+
+	if (addr & (len - 1)) {
+		pr_debug("%s: bad alignment %llx/%d\n",
+			 __func__, addr, len);
+		return -EINVAL;
+	}
+
+	spin_lock_irq(&opp->lock);
+	ret = kvm_mpic_read_internal(opp, addr - opp->reg_base, &u.val);
+	spin_unlock_irq(&opp->lock);
+
+	/*
+	 * Technically only 32-bit accesses are allowed, but be nice to
+	 * people dumping registers a byte at a time -- it works in real
+	 * hardware (reads only, not writes).
+	 */
+	if (len == 4) {
+		*(u32 *)ptr = u.val;
+		pr_debug("%s: addr %llx ret %d len 4 val %x\n",
+			 __func__, addr, ret, u.val);
+	} else if (len == 1) {
+		*(u8 *)ptr = u.bytes[addr & 3];
+		pr_debug("%s: addr %llx ret %d len 1 val %x\n",
+			 __func__, addr, ret, u.bytes[addr & 3]);
+	} else {
+		pr_debug("%s: bad length %d\n", __func__, len);
+		return -EINVAL;
+	}
+
+	return ret;
+}
+
+/*
+ * MMIO bus write callback.  Only aligned 32-bit writes are accepted;
+ * anything else returns -EOPNOTSUPP.  The register write itself runs under
+ * the MPIC lock.
+ */
+static int kvm_mpic_write(struct kvm_vcpu *vcpu,
+			  struct kvm_io_device *this,
+			  gpa_t addr, int len, const void *ptr)
+{
+	struct openpic *opp = container_of(this, struct openpic, mmio);
+	int ret;
+
+	if (len != 4) {
+		pr_debug("%s: bad length %d\n", __func__, len);
+		return -EOPNOTSUPP;
+	}
+	if (addr & 3) {
+		pr_debug("%s: bad alignment %llx/%d\n", __func__, addr, len);
+		return -EOPNOTSUPP;
+	}
+
+	spin_lock_irq(&opp->lock);
+	ret = kvm_mpic_write_internal(opp, addr - opp->reg_base,
+				      *(const u32 *)ptr);
+	spin_unlock_irq(&opp->lock);
+
+	pr_debug("%s: addr %llx ret %d val %x\n",
+		 __func__, addr, ret,
*(const u32 *)ptr);
+
+	return ret;
+}
+
+static const struct kvm_io_device_ops mpic_mmio_ops = {
+	.read = kvm_mpic_read,
+	.write = kvm_mpic_write,
+};
+/*
+ * Register the OPENPIC_REG_SIZE window at opp->reg_base on the KVM MMIO
+ * bus.  The result of kvm_io_bus_register_dev() is not checked here.
+ */
+static void map_mmio(struct openpic *opp)
+{
+	kvm_iodevice_init(&opp->mmio, &mpic_mmio_ops);
+
+	kvm_io_bus_register_dev(opp->kvm, KVM_MMIO_BUS,
+				opp->reg_base, OPENPIC_REG_SIZE,
+				&opp->mmio);
+}
+/* Remove the MPIC window from the KVM MMIO bus. */
+static void unmap_mmio(struct openpic *opp)
+{
+	kvm_io_bus_unregister_dev(opp->kvm, KVM_MMIO_BUS, &opp->mmio);
+}
+/*
+ * KVM_DEV_MPIC_BASE_ADDR setter: read a u64 base from user space, require
+ * it to be aligned to 0x40000 and move the MMIO window there under
+ * slots_lock.  A base of 0 leaves the device unmapped.  Returns 0,
+ * -EFAULT or -EINVAL.
+ */
+static int set_base_addr(struct openpic *opp, struct kvm_device_attr *attr)
+{
+	u64 base;
+
+	if (copy_from_user(&base, (u64 __user *)(long)attr->addr, sizeof(u64)))
+		return -EFAULT;
+
+	if (base & 0x3ffff) {
+		pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx not aligned\n",
+			 __func__, base);
+		return -EINVAL;
+	}
+
+	if (base == opp->reg_base)
+		return 0;
+
+	mutex_lock(&opp->kvm->slots_lock);
+
+	unmap_mmio(opp);
+	opp->reg_base = base;
+
+	pr_debug("kvm mpic %s: KVM_DEV_MPIC_BASE_ADDR %08llx\n",
+		 __func__, base);
+
+	if (base == 0)
+		goto out;
+
+	map_mmio(opp);
+
+out:
+	mutex_unlock(&opp->kvm->slots_lock);
+	return 0;
+}
+
+#define ATTR_SET 0
+#define ATTR_GET 1
+/*
+ * Read (ATTR_GET) or write (ATTR_SET) the register at controller offset
+ * @addr under the MPIC lock.  Returns -ENXIO for an address that is not
+ * 4-byte aligned, otherwise the dispatcher's result.
+ */
+static int access_reg(struct openpic *opp, gpa_t addr, u32 *val, int type)
+{
+	int ret;
+
+	if (addr & 3)
+		return -ENXIO;
+
+	spin_lock_irq(&opp->lock);
+
+	if (type == ATTR_SET)
+		ret = kvm_mpic_write_internal(opp, addr, *val);
+	else
+		ret = kvm_mpic_read_internal(opp, addr, val);
+
+	spin_unlock_irq(&opp->lock);
+
+	pr_debug("%s: type %d addr %llx val %x\n", __func__, type, addr, *val);
+
+	return ret;
+}
+/*
+ * Device attribute setter.  Handles the base address (GRP_MISC), raw
+ * register writes (GRP_REGISTER) and driving a source's input line
+ * (GRP_IRQ_ACTIVE).  Returns -ENXIO for an unknown group or attribute.
+ */
+static int mpic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
+{
+	struct openpic *opp = dev->private;
+	u32 attr32;
+
+	switch (attr->group) {
+	case KVM_DEV_MPIC_GRP_MISC:
+		switch (attr->attr) {
+		case KVM_DEV_MPIC_BASE_ADDR:
+			return set_base_addr(opp, attr);
+		}
+
+		break;
+
+	case KVM_DEV_MPIC_GRP_REGISTER:
+		if (get_user(attr32, (u32 __user *)(long)attr->addr))
+			return -EFAULT;
+
+		return
access_reg(opp, attr->attr, &attr32, ATTR_SET); + + case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: + if (attr->attr > MAX_SRC) + return -EINVAL; + + if (get_user(attr32, (u32 __user *)(long)attr->addr)) + return -EFAULT; + + if (attr32 != 0 && attr32 != 1) + return -EINVAL; + + spin_lock_irq(&opp->lock); + openpic_set_irq(opp, attr->attr, attr32); + spin_unlock_irq(&opp->lock); + return 0; + } + + return -ENXIO; +} + +static int mpic_get_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + struct openpic *opp = dev->private; + u64 attr64; + u32 attr32; + int ret; + + switch (attr->group) { + case KVM_DEV_MPIC_GRP_MISC: + switch (attr->attr) { + case KVM_DEV_MPIC_BASE_ADDR: + mutex_lock(&opp->kvm->slots_lock); + attr64 = opp->reg_base; + mutex_unlock(&opp->kvm->slots_lock); + + if (copy_to_user((u64 __user *)(long)attr->addr, + &attr64, sizeof(u64))) + return -EFAULT; + + return 0; + } + + break; + + case KVM_DEV_MPIC_GRP_REGISTER: + ret = access_reg(opp, attr->attr, &attr32, ATTR_GET); + if (ret) + return ret; + + if (put_user(attr32, (u32 __user *)(long)attr->addr)) + return -EFAULT; + + return 0; + + case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: + if (attr->attr > MAX_SRC) + return -EINVAL; + + spin_lock_irq(&opp->lock); + attr32 = opp->src[attr->attr].pending; + spin_unlock_irq(&opp->lock); + + if (put_user(attr32, (u32 __user *)(long)attr->addr)) + return -EFAULT; + + return 0; + } + + return -ENXIO; +} + +static int mpic_has_attr(struct kvm_device *dev, struct kvm_device_attr *attr) +{ + switch (attr->group) { + case KVM_DEV_MPIC_GRP_MISC: + switch (attr->attr) { + case KVM_DEV_MPIC_BASE_ADDR: + return 0; + } + + break; + + case KVM_DEV_MPIC_GRP_REGISTER: + return 0; + + case KVM_DEV_MPIC_GRP_IRQ_ACTIVE: + if (attr->attr > MAX_SRC) + break; + + return 0; + } + + return -ENXIO; +} + +static void mpic_destroy(struct kvm_device *dev) +{ + struct openpic *opp = dev->private; + + dev->kvm->arch.mpic = NULL; + kfree(opp); + kfree(dev); +} + +static int 
mpic_set_default_irq_routing(struct openpic *opp) +{ + struct kvm_irq_routing_entry *routing; + + /* Create a nop default map, so that dereferencing it still works */ + routing = kzalloc((sizeof(*routing)), GFP_KERNEL); + if (!routing) + return -ENOMEM; + + kvm_set_irq_routing(opp->kvm, routing, 0, 0); + + kfree(routing); + return 0; +} + +static int mpic_create(struct kvm_device *dev, u32 type) +{ + struct openpic *opp; + int ret; + + /* We only support one MPIC at a time for now */ + if (dev->kvm->arch.mpic) + return -EINVAL; + + opp = kzalloc(sizeof(struct openpic), GFP_KERNEL); + if (!opp) + return -ENOMEM; + + dev->private = opp; + opp->kvm = dev->kvm; + opp->dev = dev; + opp->model = type; + spin_lock_init(&opp->lock); + + add_mmio_region(opp, &openpic_gbl_mmio); + add_mmio_region(opp, &openpic_tmr_mmio); + add_mmio_region(opp, &openpic_src_mmio); + add_mmio_region(opp, &openpic_cpu_mmio); + + switch (opp->model) { + case KVM_DEV_TYPE_FSL_MPIC_20: + opp->fsl = &fsl_mpic_20; + opp->brr1 = 0x00400200; + opp->flags |= OPENPIC_FLAG_IDR_CRIT; + opp->nb_irqs = 80; + opp->mpic_mode_mask = GCR_MODE_MIXED; + + fsl_common_init(opp); + + break; + + case KVM_DEV_TYPE_FSL_MPIC_42: + opp->fsl = &fsl_mpic_42; + opp->brr1 = 0x00400402; + opp->flags |= OPENPIC_FLAG_ILR; + opp->nb_irqs = 196; + opp->mpic_mode_mask = GCR_MODE_PROXY; + + fsl_common_init(opp); + + break; + + default: + ret = -ENODEV; + goto err; + } + + ret = mpic_set_default_irq_routing(opp); + if (ret) + goto err; + + openpic_reset(opp); + + smp_wmb(); + dev->kvm->arch.mpic = opp; + + return 0; + +err: + kfree(opp); + return ret; +} + +struct kvm_device_ops kvm_mpic_ops = { + .name = "kvm-mpic", + .create = mpic_create, + .destroy = mpic_destroy, + .set_attr = mpic_set_attr, + .get_attr = mpic_get_attr, + .has_attr = mpic_has_attr, +}; + +int kvmppc_mpic_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, + u32 cpu) +{ + struct openpic *opp = dev->private; + int ret = 0; + + if (dev->ops != 
&kvm_mpic_ops) + return -EPERM; + if (opp->kvm != vcpu->kvm) + return -EPERM; + if (cpu < 0 || cpu >= MAX_CPU) + return -EPERM; + + spin_lock_irq(&opp->lock); + + if (opp->dst[cpu].vcpu) { + ret = -EEXIST; + goto out; + } + if (vcpu->arch.irq_type) { + ret = -EBUSY; + goto out; + } + + opp->dst[cpu].vcpu = vcpu; + opp->nb_cpus = max(opp->nb_cpus, cpu + 1); + + vcpu->arch.mpic = opp; + vcpu->arch.irq_cpu_id = cpu; + vcpu->arch.irq_type = KVMPPC_IRQ_MPIC; + + /* This might need to be changed if GCR gets extended */ + if (opp->mpic_mode_mask == GCR_MODE_PROXY) + vcpu->arch.epr_flags |= KVMPPC_EPR_KERNEL; + +out: + spin_unlock_irq(&opp->lock); + return ret; +} + +/* + * This should only happen immediately before the mpic is destroyed, + * so we shouldn't need to worry about anything still trying to + * access the vcpu pointer. + */ +void kvmppc_mpic_disconnect_vcpu(struct openpic *opp, struct kvm_vcpu *vcpu) +{ + BUG_ON(!opp->dst[vcpu->arch.irq_cpu_id].vcpu); + + opp->dst[vcpu->arch.irq_cpu_id].vcpu = NULL; +} + +/* + * Return value: + * < 0 Interrupt was ignored (masked or not delivered for other reasons) + * = 0 Interrupt was coalesced (previous irq is still pending) + * > 0 Number of CPUs interrupt was delivered to + */ +static int mpic_set_irq(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, + bool line_status) +{ + u32 irq = e->irqchip.pin; + struct openpic *opp = kvm->arch.mpic; + unsigned long flags; + + spin_lock_irqsave(&opp->lock, flags); + openpic_set_irq(opp, irq, level); + spin_unlock_irqrestore(&opp->lock, flags); + + /* All code paths we care about don't check for the return value */ + return 0; +} + +int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e, + struct kvm *kvm, int irq_source_id, int level, bool line_status) +{ + struct openpic *opp = kvm->arch.mpic; + unsigned long flags; + + spin_lock_irqsave(&opp->lock, flags); + + /* + * XXX We ignore the target address for now, as we only support + * a single MSI 
bank. + */ + openpic_msi_write(kvm->arch.mpic, MSIIR_OFFSET, e->msi.data); + spin_unlock_irqrestore(&opp->lock, flags); + + /* All code paths we care about don't check for the return value */ + return 0; +} + +int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e, + const struct kvm_irq_routing_entry *ue) +{ + int r = -EINVAL; + + switch (ue->type) { + case KVM_IRQ_ROUTING_IRQCHIP: + e->set = mpic_set_irq; + e->irqchip.irqchip = ue->u.irqchip.irqchip; + e->irqchip.pin = ue->u.irqchip.pin; + if (e->irqchip.pin >= KVM_IRQCHIP_NUM_PINS) + goto out; + break; + case KVM_IRQ_ROUTING_MSI: + e->set = kvm_set_msi; + e->msi.address_lo = ue->u.msi.address_lo; + e->msi.address_hi = ue->u.msi.address_hi; + e->msi.data = ue->u.msi.data; + break; + default: + goto out; + } + + r = 0; +out: + return r; +} diff --git a/kernel/arch/powerpc/kvm/powerpc.c b/kernel/arch/powerpc/kvm/powerpc.c new file mode 100644 index 000000000..ac3ddf115 --- /dev/null +++ b/kernel/arch/powerpc/kvm/powerpc.c @@ -0,0 +1,1411 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright IBM Corp. 
2007 + * + * Authors: Hollis Blanchard <hollisb@us.ibm.com> + * Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com> + */ + +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/kvm_host.h> +#include <linux/vmalloc.h> +#include <linux/hrtimer.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/module.h> +#include <asm/cputable.h> +#include <asm/uaccess.h> +#include <asm/kvm_ppc.h> +#include <asm/tlbflush.h> +#include <asm/cputhreads.h> +#include <asm/irqflags.h> +#include "timing.h" +#include "irq.h" +#include "../mm/mmu_decl.h" + +#define CREATE_TRACE_POINTS +#include "trace.h" + +struct kvmppc_ops *kvmppc_hv_ops; +EXPORT_SYMBOL_GPL(kvmppc_hv_ops); +struct kvmppc_ops *kvmppc_pr_ops; +EXPORT_SYMBOL_GPL(kvmppc_pr_ops); + + +int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) +{ + return !!(v->arch.pending_exceptions) || + v->requests; +} + +int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) +{ + return 1; +} + +/* + * Common checks before entering the guest world. Call with interrupts + * disabled. + * + * returns: + * + * == 1 if we're ready to go into guest state + * <= 0 if we need to go back to the host with return value + */ +int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu) +{ + int r; + + WARN_ON(irqs_disabled()); + hard_irq_disable(); + + while (true) { + if (need_resched()) { + local_irq_enable(); + cond_resched(); + hard_irq_disable(); + continue; + } + + if (signal_pending(current)) { + kvmppc_account_exit(vcpu, SIGNAL_EXITS); + vcpu->run->exit_reason = KVM_EXIT_INTR; + r = -EINTR; + break; + } + + vcpu->mode = IN_GUEST_MODE; + + /* + * Reading vcpu->requests must happen after setting vcpu->mode, + * so we don't miss a request because the requester sees + * OUTSIDE_GUEST_MODE and assumes we'll be checking requests + * before next entering the guest (and thus doesn't IPI). 
+ */ + smp_mb(); + + if (vcpu->requests) { + /* Make sure we process requests preemptable */ + local_irq_enable(); + trace_kvm_check_requests(vcpu); + r = kvmppc_core_check_requests(vcpu); + hard_irq_disable(); + if (r > 0) + continue; + break; + } + + if (kvmppc_core_prepare_to_enter(vcpu)) { + /* interrupts got enabled in between, so we + are back at square 1 */ + continue; + } + + kvm_guest_enter(); + return 1; + } + + /* return to host */ + local_irq_enable(); + return r; +} +EXPORT_SYMBOL_GPL(kvmppc_prepare_to_enter); + +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) +static void kvmppc_swab_shared(struct kvm_vcpu *vcpu) +{ + struct kvm_vcpu_arch_shared *shared = vcpu->arch.shared; + int i; + + shared->sprg0 = swab64(shared->sprg0); + shared->sprg1 = swab64(shared->sprg1); + shared->sprg2 = swab64(shared->sprg2); + shared->sprg3 = swab64(shared->sprg3); + shared->srr0 = swab64(shared->srr0); + shared->srr1 = swab64(shared->srr1); + shared->dar = swab64(shared->dar); + shared->msr = swab64(shared->msr); + shared->dsisr = swab32(shared->dsisr); + shared->int_pending = swab32(shared->int_pending); + for (i = 0; i < ARRAY_SIZE(shared->sr); i++) + shared->sr[i] = swab32(shared->sr[i]); +} +#endif + +int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) +{ + int nr = kvmppc_get_gpr(vcpu, 11); + int r; + unsigned long __maybe_unused param1 = kvmppc_get_gpr(vcpu, 3); + unsigned long __maybe_unused param2 = kvmppc_get_gpr(vcpu, 4); + unsigned long __maybe_unused param3 = kvmppc_get_gpr(vcpu, 5); + unsigned long __maybe_unused param4 = kvmppc_get_gpr(vcpu, 6); + unsigned long r2 = 0; + + if (!(kvmppc_get_msr(vcpu) & MSR_SF)) { + /* 32 bit mode */ + param1 &= 0xffffffff; + param2 &= 0xffffffff; + param3 &= 0xffffffff; + param4 &= 0xffffffff; + } + + switch (nr) { + case KVM_HCALL_TOKEN(KVM_HC_PPC_MAP_MAGIC_PAGE): + { +#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE) + /* Book3S can be little endian, find it out here */ + int 
shared_big_endian = true; + if (vcpu->arch.intr_msr & MSR_LE) + shared_big_endian = false; + if (shared_big_endian != vcpu->arch.shared_big_endian) + kvmppc_swab_shared(vcpu); + vcpu->arch.shared_big_endian = shared_big_endian; +#endif + + if (!(param2 & MAGIC_PAGE_FLAG_NOT_MAPPED_NX)) { + /* + * Older versions of the Linux magic page code had + * a bug where they would map their trampoline code + * NX. If that's the case, remove !PR NX capability. + */ + vcpu->arch.disable_kernel_nx = true; + kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); + } + + vcpu->arch.magic_page_pa = param1 & ~0xfffULL; + vcpu->arch.magic_page_ea = param2 & ~0xfffULL; + +#ifdef CONFIG_PPC_64K_PAGES + /* + * Make sure our 4k magic page is in the same window of a 64k + * page within the guest and within the host's page. + */ + if ((vcpu->arch.magic_page_pa & 0xf000) != + ((ulong)vcpu->arch.shared & 0xf000)) { + void *old_shared = vcpu->arch.shared; + ulong shared = (ulong)vcpu->arch.shared; + void *new_shared; + + shared &= PAGE_MASK; + shared |= vcpu->arch.magic_page_pa & 0xf000; + new_shared = (void*)shared; + memcpy(new_shared, old_shared, 0x1000); + vcpu->arch.shared = new_shared; + } +#endif + + r2 = KVM_MAGIC_FEAT_SR | KVM_MAGIC_FEAT_MAS0_TO_SPRG7; + + r = EV_SUCCESS; + break; + } + case KVM_HCALL_TOKEN(KVM_HC_FEATURES): + r = EV_SUCCESS; +#if defined(CONFIG_PPC_BOOK3S) || defined(CONFIG_KVM_E500V2) + r2 |= (1 << KVM_FEATURE_MAGIC_PAGE); +#endif + + /* Second return value is in r4 */ + break; + case EV_HCALL_TOKEN(EV_IDLE): + r = EV_SUCCESS; + kvm_vcpu_block(vcpu); + clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + break; + default: + r = EV_UNIMPLEMENTED; + break; + } + + kvmppc_set_gpr(vcpu, 4, r2); + + return r; +} +EXPORT_SYMBOL_GPL(kvmppc_kvm_pv); + +int kvmppc_sanity_check(struct kvm_vcpu *vcpu) +{ + int r = false; + + /* We have to know what CPU to virtualize */ + if (!vcpu->arch.pvr) + goto out; + + /* PAPR only works with book3s_64 */ + if ((vcpu->arch.cpu_type != KVM_CPU_3S_64) && 
vcpu->arch.papr_enabled) + goto out; + + /* HV KVM can only do PAPR mode for now */ + if (!vcpu->arch.papr_enabled && is_kvmppc_hv_enabled(vcpu->kvm)) + goto out; + +#ifdef CONFIG_KVM_BOOKE_HV + if (!cpu_has_feature(CPU_FTR_EMB_HV)) + goto out; +#endif + + r = true; + +out: + vcpu->arch.sane = r; + return r ? 0 : -EINVAL; +} +EXPORT_SYMBOL_GPL(kvmppc_sanity_check); + +int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + enum emulation_result er; + int r; + + er = kvmppc_emulate_loadstore(vcpu); + switch (er) { + case EMULATE_DONE: + /* Future optimization: only reload non-volatiles if they were + * actually modified. */ + r = RESUME_GUEST_NV; + break; + case EMULATE_AGAIN: + r = RESUME_GUEST; + break; + case EMULATE_DO_MMIO: + run->exit_reason = KVM_EXIT_MMIO; + /* We must reload nonvolatiles because "update" load/store + * instructions modify register state. */ + /* Future optimization: only reload non-volatiles if they were + * actually modified. */ + r = RESUME_HOST_NV; + break; + case EMULATE_FAIL: + { + u32 last_inst; + + kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst); + /* XXX Deliver Program interrupt to guest. */ + pr_emerg("%s: emulation failed (%08x)\n", __func__, last_inst); + r = RESUME_HOST; + break; + } + default: + WARN_ON(1); + r = RESUME_GUEST; + } + + return r; +} +EXPORT_SYMBOL_GPL(kvmppc_emulate_mmio); + +int kvmppc_st(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, + bool data) +{ + ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK; + struct kvmppc_pte pte; + int r; + + vcpu->stat.st++; + + r = kvmppc_xlate(vcpu, *eaddr, data ? 
XLATE_DATA : XLATE_INST, + XLATE_WRITE, &pte); + if (r < 0) + return r; + + *eaddr = pte.raddr; + + if (!pte.may_write) + return -EPERM; + + /* Magic page override */ + if (kvmppc_supports_magic_page(vcpu) && mp_pa && + ((pte.raddr & KVM_PAM & PAGE_MASK) == mp_pa) && + !(kvmppc_get_msr(vcpu) & MSR_PR)) { + void *magic = vcpu->arch.shared; + magic += pte.eaddr & 0xfff; + memcpy(magic, ptr, size); + return EMULATE_DONE; + } + + if (kvm_write_guest(vcpu->kvm, pte.raddr, ptr, size)) + return EMULATE_DO_MMIO; + + return EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(kvmppc_st); + +int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, + bool data) +{ + ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK; + struct kvmppc_pte pte; + int rc; + + vcpu->stat.ld++; + + rc = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST, + XLATE_READ, &pte); + if (rc) + return rc; + + *eaddr = pte.raddr; + + if (!pte.may_read) + return -EPERM; + + if (!data && !pte.may_execute) + return -ENOEXEC; + + /* Magic page override */ + if (kvmppc_supports_magic_page(vcpu) && mp_pa && + ((pte.raddr & KVM_PAM & PAGE_MASK) == mp_pa) && + !(kvmppc_get_msr(vcpu) & MSR_PR)) { + void *magic = vcpu->arch.shared; + magic += pte.eaddr & 0xfff; + memcpy(ptr, magic, size); + return EMULATE_DONE; + } + + if (kvm_read_guest(vcpu->kvm, pte.raddr, ptr, size)) + return EMULATE_DO_MMIO; + + return EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(kvmppc_ld); + +int kvm_arch_hardware_enable(void) +{ + return 0; +} + +int kvm_arch_hardware_setup(void) +{ + return 0; +} + +void kvm_arch_check_processor_compat(void *rtn) +{ + *(int *)rtn = kvmppc_core_check_processor_compat(); +} + +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) +{ + struct kvmppc_ops *kvm_ops = NULL; + /* + * if we have both HV and PR enabled, default is HV + */ + if (type == 0) { + if (kvmppc_hv_ops) + kvm_ops = kvmppc_hv_ops; + else + kvm_ops = kvmppc_pr_ops; + if (!kvm_ops) + goto err_out; + } else if (type == KVM_VM_PPC_HV) { + 
if (!kvmppc_hv_ops) + goto err_out; + kvm_ops = kvmppc_hv_ops; + } else if (type == KVM_VM_PPC_PR) { + if (!kvmppc_pr_ops) + goto err_out; + kvm_ops = kvmppc_pr_ops; + } else + goto err_out; + + if (kvm_ops->owner && !try_module_get(kvm_ops->owner)) + return -ENOENT; + + kvm->arch.kvm_ops = kvm_ops; + return kvmppc_core_init_vm(kvm); +err_out: + return -EINVAL; +} + +void kvm_arch_destroy_vm(struct kvm *kvm) +{ + unsigned int i; + struct kvm_vcpu *vcpu; + + kvm_for_each_vcpu(i, vcpu, kvm) + kvm_arch_vcpu_free(vcpu); + + mutex_lock(&kvm->lock); + for (i = 0; i < atomic_read(&kvm->online_vcpus); i++) + kvm->vcpus[i] = NULL; + + atomic_set(&kvm->online_vcpus, 0); + + kvmppc_core_destroy_vm(kvm); + + mutex_unlock(&kvm->lock); + + /* drop the module reference */ + module_put(kvm->arch.kvm_ops->owner); +} + +int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) +{ + int r; + /* Assume we're using HV mode when the HV module is loaded */ + int hv_enabled = kvmppc_hv_ops ? 1 : 0; + + if (kvm) { + /* + * Hooray - we know which VM type we're running on. Depend on + * that rather than the guess above. 
+ */ + hv_enabled = is_kvmppc_hv_enabled(kvm); + } + + switch (ext) { +#ifdef CONFIG_BOOKE + case KVM_CAP_PPC_BOOKE_SREGS: + case KVM_CAP_PPC_BOOKE_WATCHDOG: + case KVM_CAP_PPC_EPR: +#else + case KVM_CAP_PPC_SEGSTATE: + case KVM_CAP_PPC_HIOR: + case KVM_CAP_PPC_PAPR: +#endif + case KVM_CAP_PPC_UNSET_IRQ: + case KVM_CAP_PPC_IRQ_LEVEL: + case KVM_CAP_ENABLE_CAP: + case KVM_CAP_ENABLE_CAP_VM: + case KVM_CAP_ONE_REG: + case KVM_CAP_IOEVENTFD: + case KVM_CAP_DEVICE_CTRL: + r = 1; + break; + case KVM_CAP_PPC_PAIRED_SINGLES: + case KVM_CAP_PPC_OSI: + case KVM_CAP_PPC_GET_PVINFO: +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) + case KVM_CAP_SW_TLB: +#endif + /* We support this only for PR */ + r = !hv_enabled; + break; +#ifdef CONFIG_KVM_MMIO + case KVM_CAP_COALESCED_MMIO: + r = KVM_COALESCED_MMIO_PAGE_OFFSET; + break; +#endif +#ifdef CONFIG_KVM_MPIC + case KVM_CAP_IRQ_MPIC: + r = 1; + break; +#endif + +#ifdef CONFIG_PPC_BOOK3S_64 + case KVM_CAP_SPAPR_TCE: + case KVM_CAP_PPC_ALLOC_HTAB: + case KVM_CAP_PPC_RTAS: + case KVM_CAP_PPC_FIXUP_HCALL: + case KVM_CAP_PPC_ENABLE_HCALL: +#ifdef CONFIG_KVM_XICS + case KVM_CAP_IRQ_XICS: +#endif + r = 1; + break; +#endif /* CONFIG_PPC_BOOK3S_64 */ +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + case KVM_CAP_PPC_SMT: + if (hv_enabled) + r = threads_per_subcore; + else + r = 0; + break; + case KVM_CAP_PPC_RMA: + r = 0; + break; + case KVM_CAP_PPC_HWRNG: + r = kvmppc_hwrng_present(); + break; +#endif + case KVM_CAP_SYNC_MMU: +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + r = hv_enabled; +#elif defined(KVM_ARCH_WANT_MMU_NOTIFIER) + r = 1; +#else + r = 0; +#endif + break; +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + case KVM_CAP_PPC_HTAB_FD: + r = hv_enabled; + break; +#endif + case KVM_CAP_NR_VCPUS: + /* + * Recommending a number of CPUs is somewhat arbitrary; we + * return the number of present CPUs for -HV (since a host + * will have secondary threads "offline"), and for other KVM + * implementations just count online CPUs. 
+ */ + if (hv_enabled) + r = num_present_cpus(); + else + r = num_online_cpus(); + break; + case KVM_CAP_MAX_VCPUS: + r = KVM_MAX_VCPUS; + break; +#ifdef CONFIG_PPC_BOOK3S_64 + case KVM_CAP_PPC_GET_SMMU_INFO: + r = 1; + break; +#endif + default: + r = 0; + break; + } + return r; + +} + +long kvm_arch_dev_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + return -EINVAL; +} + +void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + kvmppc_core_free_memslot(kvm, free, dont); +} + +int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot, + unsigned long npages) +{ + return kvmppc_core_create_memslot(kvm, slot, npages); +} + +int kvm_arch_prepare_memory_region(struct kvm *kvm, + struct kvm_memory_slot *memslot, + struct kvm_userspace_memory_region *mem, + enum kvm_mr_change change) +{ + return kvmppc_core_prepare_memory_region(kvm, memslot, mem); +} + +void kvm_arch_commit_memory_region(struct kvm *kvm, + struct kvm_userspace_memory_region *mem, + const struct kvm_memory_slot *old, + enum kvm_mr_change change) +{ + kvmppc_core_commit_memory_region(kvm, mem, old); +} + +void kvm_arch_flush_shadow_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + kvmppc_core_flush_memslot(kvm, slot); +} + +struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) +{ + struct kvm_vcpu *vcpu; + vcpu = kvmppc_core_vcpu_create(kvm, id); + if (!IS_ERR(vcpu)) { + vcpu->arch.wqp = &vcpu->wq; + kvmppc_create_vcpu_debugfs(vcpu, id); + } + return vcpu; +} + +void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) +{ +} + +void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) +{ + /* Make sure we're not using the vcpu anymore */ + hrtimer_cancel(&vcpu->arch.dec_timer); + + kvmppc_remove_vcpu_debugfs(vcpu); + + switch (vcpu->arch.irq_type) { + case KVMPPC_IRQ_MPIC: + kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu); + break; + case KVMPPC_IRQ_XICS: + kvmppc_xics_free_icp(vcpu); + break; + } 
+ + kvmppc_core_vcpu_free(vcpu); +} + +void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) +{ + kvm_arch_vcpu_free(vcpu); +} + +int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) +{ + return kvmppc_core_pending_dec(vcpu); +} + +enum hrtimer_restart kvmppc_decrementer_wakeup(struct hrtimer *timer) +{ + struct kvm_vcpu *vcpu; + + vcpu = container_of(timer, struct kvm_vcpu, arch.dec_timer); + kvmppc_decrementer_func(vcpu); + + return HRTIMER_NORESTART; +} + +int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) +{ + int ret; + + hrtimer_init(&vcpu->arch.dec_timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); + vcpu->arch.dec_timer.function = kvmppc_decrementer_wakeup; + vcpu->arch.dec_expires = ~(u64)0; + +#ifdef CONFIG_KVM_EXIT_TIMING + mutex_init(&vcpu->arch.exit_timing_lock); +#endif + ret = kvmppc_subarch_vcpu_init(vcpu); + return ret; +} + +void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + kvmppc_mmu_destroy(vcpu); + kvmppc_subarch_vcpu_uninit(vcpu); +} + +void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ +#ifdef CONFIG_BOOKE + /* + * vrsave (formerly usprg0) isn't used by Linux, but may + * be used by the guest. + * + * On non-booke this is associated with Altivec and + * is handled by code in book3s.c. 
+ */ + mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); +#endif + kvmppc_core_vcpu_load(vcpu, cpu); +} + +void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) +{ + kvmppc_core_vcpu_put(vcpu); +#ifdef CONFIG_BOOKE + vcpu->arch.vrsave = mfspr(SPRN_VRSAVE); +#endif +} + +static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, + struct kvm_run *run) +{ + u64 uninitialized_var(gpr); + + if (run->mmio.len > sizeof(gpr)) { + printk(KERN_ERR "bad MMIO length: %d\n", run->mmio.len); + return; + } + + if (!vcpu->arch.mmio_host_swabbed) { + switch (run->mmio.len) { + case 8: gpr = *(u64 *)run->mmio.data; break; + case 4: gpr = *(u32 *)run->mmio.data; break; + case 2: gpr = *(u16 *)run->mmio.data; break; + case 1: gpr = *(u8 *)run->mmio.data; break; + } + } else { + switch (run->mmio.len) { + case 8: gpr = swab64(*(u64 *)run->mmio.data); break; + case 4: gpr = swab32(*(u32 *)run->mmio.data); break; + case 2: gpr = swab16(*(u16 *)run->mmio.data); break; + case 1: gpr = *(u8 *)run->mmio.data; break; + } + } + + if (vcpu->arch.mmio_sign_extend) { + switch (run->mmio.len) { +#ifdef CONFIG_PPC64 + case 4: + gpr = (s64)(s32)gpr; + break; +#endif + case 2: + gpr = (s64)(s16)gpr; + break; + case 1: + gpr = (s64)(s8)gpr; + break; + } + } + + kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); + + switch (vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) { + case KVM_MMIO_REG_GPR: + kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); + break; + case KVM_MMIO_REG_FPR: + VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr; + break; +#ifdef CONFIG_PPC_BOOK3S + case KVM_MMIO_REG_QPR: + vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; + break; + case KVM_MMIO_REG_FQPR: + VCPU_FPR(vcpu, vcpu->arch.io_gpr & KVM_MMIO_REG_MASK) = gpr; + vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; + break; +#endif + default: + BUG(); + } +} + +int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int rt, unsigned int bytes, + int is_default_endian) +{ + int idx, ret; + bool 
host_swabbed; + + /* Pity C doesn't have a logical XOR operator */ + if (kvmppc_need_byteswap(vcpu)) { + host_swabbed = is_default_endian; + } else { + host_swabbed = !is_default_endian; + } + + if (bytes > sizeof(run->mmio.data)) { + printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__, + run->mmio.len); + } + + run->mmio.phys_addr = vcpu->arch.paddr_accessed; + run->mmio.len = bytes; + run->mmio.is_write = 0; + + vcpu->arch.io_gpr = rt; + vcpu->arch.mmio_host_swabbed = host_swabbed; + vcpu->mmio_needed = 1; + vcpu->mmio_is_write = 0; + vcpu->arch.mmio_sign_extend = 0; + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, run->mmio.phys_addr, + bytes, &run->mmio.data); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (!ret) { + kvmppc_complete_mmio_load(vcpu, run); + vcpu->mmio_needed = 0; + return EMULATE_DONE; + } + + return EMULATE_DO_MMIO; +} +EXPORT_SYMBOL_GPL(kvmppc_handle_load); + +/* Same as above, but sign extends */ +int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int rt, unsigned int bytes, + int is_default_endian) +{ + int r; + + vcpu->arch.mmio_sign_extend = 1; + r = kvmppc_handle_load(run, vcpu, rt, bytes, is_default_endian); + + return r; +} + +int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, + u64 val, unsigned int bytes, int is_default_endian) +{ + void *data = run->mmio.data; + int idx, ret; + bool host_swabbed; + + /* Pity C doesn't have a logical XOR operator */ + if (kvmppc_need_byteswap(vcpu)) { + host_swabbed = is_default_endian; + } else { + host_swabbed = !is_default_endian; + } + + if (bytes > sizeof(run->mmio.data)) { + printk(KERN_ERR "%s: bad MMIO length: %d\n", __func__, + run->mmio.len); + } + + run->mmio.phys_addr = vcpu->arch.paddr_accessed; + run->mmio.len = bytes; + run->mmio.is_write = 1; + vcpu->mmio_needed = 1; + vcpu->mmio_is_write = 1; + + /* Store the value at the lowest bytes in 'data'. 
*/ + if (!host_swabbed) { + switch (bytes) { + case 8: *(u64 *)data = val; break; + case 4: *(u32 *)data = val; break; + case 2: *(u16 *)data = val; break; + case 1: *(u8 *)data = val; break; + } + } else { + switch (bytes) { + case 8: *(u64 *)data = swab64(val); break; + case 4: *(u32 *)data = swab32(val); break; + case 2: *(u16 *)data = swab16(val); break; + case 1: *(u8 *)data = val; break; + } + } + + idx = srcu_read_lock(&vcpu->kvm->srcu); + + ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, run->mmio.phys_addr, + bytes, &run->mmio.data); + + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + if (!ret) { + vcpu->mmio_needed = 0; + return EMULATE_DONE; + } + + return EMULATE_DO_MMIO; +} +EXPORT_SYMBOL_GPL(kvmppc_handle_store); + +int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r = 0; + union kvmppc_one_reg val; + int size; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; + + r = kvmppc_get_one_reg(vcpu, reg->id, &val); + if (r == -EINVAL) { + r = 0; + switch (reg->id) { +#ifdef CONFIG_ALTIVEC + case KVM_REG_PPC_VR0 ... 
KVM_REG_PPC_VR31: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0] = val.vval; + break; + case KVM_REG_PPC_VSCR: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vr.vscr.u[3] = set_reg_val(reg->id, val); + break; + case KVM_REG_PPC_VRSAVE: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + vcpu->arch.vrsave = set_reg_val(reg->id, val); + break; +#endif /* CONFIG_ALTIVEC */ + default: + r = -EINVAL; + break; + } + } + + if (r) + return r; + + if (copy_to_user((char __user *)(unsigned long)reg->addr, &val, size)) + r = -EFAULT; + + return r; +} + +int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) +{ + int r; + union kvmppc_one_reg val; + int size; + + size = one_reg_size(reg->id); + if (size > sizeof(val)) + return -EINVAL; + + if (copy_from_user(&val, (char __user *)(unsigned long)reg->addr, size)) + return -EFAULT; + + r = kvmppc_set_one_reg(vcpu, reg->id, &val); + if (r == -EINVAL) { + r = 0; + switch (reg->id) { +#ifdef CONFIG_ALTIVEC + case KVM_REG_PPC_VR0 ... 
KVM_REG_PPC_VR31: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + val.vval = vcpu->arch.vr.vr[reg->id - KVM_REG_PPC_VR0]; + break; + case KVM_REG_PPC_VSCR: + if (!cpu_has_feature(CPU_FTR_ALTIVEC)) { + r = -ENXIO; + break; + } + val = get_reg_val(reg->id, vcpu->arch.vr.vscr.u[3]); + break; + case KVM_REG_PPC_VRSAVE: + val = get_reg_val(reg->id, vcpu->arch.vrsave); + break; +#endif /* CONFIG_ALTIVEC */ + default: + r = -EINVAL; + break; + } + } + + return r; +} + +int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + int r; + sigset_t sigsaved; + + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + + if (vcpu->mmio_needed) { + if (!vcpu->mmio_is_write) + kvmppc_complete_mmio_load(vcpu, run); + vcpu->mmio_needed = 0; + } else if (vcpu->arch.osi_needed) { + u64 *gprs = run->osi.gprs; + int i; + + for (i = 0; i < 32; i++) + kvmppc_set_gpr(vcpu, i, gprs[i]); + vcpu->arch.osi_needed = 0; + } else if (vcpu->arch.hcall_needed) { + int i; + + kvmppc_set_gpr(vcpu, 3, run->papr_hcall.ret); + for (i = 0; i < 9; ++i) + kvmppc_set_gpr(vcpu, 4 + i, run->papr_hcall.args[i]); + vcpu->arch.hcall_needed = 0; +#ifdef CONFIG_BOOKE + } else if (vcpu->arch.epr_needed) { + kvmppc_set_epr(vcpu, run->epr.epr); + vcpu->arch.epr_needed = 0; +#endif + } + + r = kvmppc_vcpu_run(run, vcpu); + + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &sigsaved, NULL); + + return r; +} + +int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, struct kvm_interrupt *irq) +{ + if (irq->irq == KVM_INTERRUPT_UNSET) { + kvmppc_core_dequeue_external(vcpu); + return 0; + } + + kvmppc_core_queue_external(vcpu, irq); + + kvm_vcpu_kick(vcpu); + + return 0; +} + +static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, + struct kvm_enable_cap *cap) +{ + int r; + + if (cap->flags) + return -EINVAL; + + switch (cap->cap) { + case KVM_CAP_PPC_OSI: + r = 0; + vcpu->arch.osi_enabled = true; + break; + case KVM_CAP_PPC_PAPR: + r = 0; + 
vcpu->arch.papr_enabled = true; + break; + case KVM_CAP_PPC_EPR: + r = 0; + if (cap->args[0]) + vcpu->arch.epr_flags |= KVMPPC_EPR_USER; + else + vcpu->arch.epr_flags &= ~KVMPPC_EPR_USER; + break; +#ifdef CONFIG_BOOKE + case KVM_CAP_PPC_BOOKE_WATCHDOG: + r = 0; + vcpu->arch.watchdog_enabled = true; + break; +#endif +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) + case KVM_CAP_SW_TLB: { + struct kvm_config_tlb cfg; + void __user *user_ptr = (void __user *)(uintptr_t)cap->args[0]; + + r = -EFAULT; + if (copy_from_user(&cfg, user_ptr, sizeof(cfg))) + break; + + r = kvm_vcpu_ioctl_config_tlb(vcpu, &cfg); + break; + } +#endif +#ifdef CONFIG_KVM_MPIC + case KVM_CAP_IRQ_MPIC: { + struct fd f; + struct kvm_device *dev; + + r = -EBADF; + f = fdget(cap->args[0]); + if (!f.file) + break; + + r = -EPERM; + dev = kvm_device_from_filp(f.file); + if (dev) + r = kvmppc_mpic_connect_vcpu(dev, vcpu, cap->args[1]); + + fdput(f); + break; + } +#endif +#ifdef CONFIG_KVM_XICS + case KVM_CAP_IRQ_XICS: { + struct fd f; + struct kvm_device *dev; + + r = -EBADF; + f = fdget(cap->args[0]); + if (!f.file) + break; + + r = -EPERM; + dev = kvm_device_from_filp(f.file); + if (dev) + r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]); + + fdput(f); + break; + } +#endif /* CONFIG_KVM_XICS */ + default: + r = -EINVAL; + break; + } + + if (!r) + r = kvmppc_sanity_check(vcpu); + + return r; +} + +int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + return -EINVAL; +} + +int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, + struct kvm_mp_state *mp_state) +{ + return -EINVAL; +} + +long kvm_arch_vcpu_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm_vcpu *vcpu = filp->private_data; + void __user *argp = (void __user *)arg; + long r; + + switch (ioctl) { + case KVM_INTERRUPT: { + struct kvm_interrupt irq; + r = -EFAULT; + if (copy_from_user(&irq, argp, sizeof(irq))) + goto out; + r = 
kvm_vcpu_ioctl_interrupt(vcpu, &irq); + goto out; + } + + case KVM_ENABLE_CAP: + { + struct kvm_enable_cap cap; + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + goto out; + r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap); + break; + } + + case KVM_SET_ONE_REG: + case KVM_GET_ONE_REG: + { + struct kvm_one_reg reg; + r = -EFAULT; + if (copy_from_user(®, argp, sizeof(reg))) + goto out; + if (ioctl == KVM_SET_ONE_REG) + r = kvm_vcpu_ioctl_set_one_reg(vcpu, ®); + else + r = kvm_vcpu_ioctl_get_one_reg(vcpu, ®); + break; + } + +#if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) + case KVM_DIRTY_TLB: { + struct kvm_dirty_tlb dirty; + r = -EFAULT; + if (copy_from_user(&dirty, argp, sizeof(dirty))) + goto out; + r = kvm_vcpu_ioctl_dirty_tlb(vcpu, &dirty); + break; + } +#endif + default: + r = -EINVAL; + } + +out: + return r; +} + +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + +static int kvm_vm_ioctl_get_pvinfo(struct kvm_ppc_pvinfo *pvinfo) +{ + u32 inst_nop = 0x60000000; +#ifdef CONFIG_KVM_BOOKE_HV + u32 inst_sc1 = 0x44000022; + pvinfo->hcall[0] = cpu_to_be32(inst_sc1); + pvinfo->hcall[1] = cpu_to_be32(inst_nop); + pvinfo->hcall[2] = cpu_to_be32(inst_nop); + pvinfo->hcall[3] = cpu_to_be32(inst_nop); +#else + u32 inst_lis = 0x3c000000; + u32 inst_ori = 0x60000000; + u32 inst_sc = 0x44000002; + u32 inst_imm_mask = 0xffff; + + /* + * The hypercall to get into KVM from within guest context is as + * follows: + * + * lis r0, r0, KVM_SC_MAGIC_R0@h + * ori r0, KVM_SC_MAGIC_R0@l + * sc + * nop + */ + pvinfo->hcall[0] = cpu_to_be32(inst_lis | ((KVM_SC_MAGIC_R0 >> 16) & inst_imm_mask)); + pvinfo->hcall[1] = cpu_to_be32(inst_ori | (KVM_SC_MAGIC_R0 & inst_imm_mask)); + pvinfo->hcall[2] = cpu_to_be32(inst_sc); + pvinfo->hcall[3] = cpu_to_be32(inst_nop); +#endif + + pvinfo->flags = KVM_PPC_PVINFO_FLAGS_EV_IDLE; + + return 0; +} + +int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, + bool 
line_status) +{ + if (!irqchip_in_kernel(kvm)) + return -ENXIO; + + irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, + irq_event->irq, irq_event->level, + line_status); + return 0; +} + + +static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, + struct kvm_enable_cap *cap) +{ + int r; + + if (cap->flags) + return -EINVAL; + + switch (cap->cap) { +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER + case KVM_CAP_PPC_ENABLE_HCALL: { + unsigned long hcall = cap->args[0]; + + r = -EINVAL; + if (hcall > MAX_HCALL_OPCODE || (hcall & 3) || + cap->args[1] > 1) + break; + if (!kvmppc_book3s_hcall_implemented(kvm, hcall)) + break; + if (cap->args[1]) + set_bit(hcall / 4, kvm->arch.enabled_hcalls); + else + clear_bit(hcall / 4, kvm->arch.enabled_hcalls); + r = 0; + break; + } +#endif + default: + r = -EINVAL; + break; + } + + return r; +} + +long kvm_arch_vm_ioctl(struct file *filp, + unsigned int ioctl, unsigned long arg) +{ + struct kvm *kvm __maybe_unused = filp->private_data; + void __user *argp = (void __user *)arg; + long r; + + switch (ioctl) { + case KVM_PPC_GET_PVINFO: { + struct kvm_ppc_pvinfo pvinfo; + memset(&pvinfo, 0, sizeof(pvinfo)); + r = kvm_vm_ioctl_get_pvinfo(&pvinfo); + if (copy_to_user(argp, &pvinfo, sizeof(pvinfo))) { + r = -EFAULT; + goto out; + } + + break; + } + case KVM_ENABLE_CAP: + { + struct kvm_enable_cap cap; + r = -EFAULT; + if (copy_from_user(&cap, argp, sizeof(cap))) + goto out; + r = kvm_vm_ioctl_enable_cap(kvm, &cap); + break; + } +#ifdef CONFIG_PPC_BOOK3S_64 + case KVM_CREATE_SPAPR_TCE: { + struct kvm_create_spapr_tce create_tce; + + r = -EFAULT; + if (copy_from_user(&create_tce, argp, sizeof(create_tce))) + goto out; + r = kvm_vm_ioctl_create_spapr_tce(kvm, &create_tce); + goto out; + } + case KVM_PPC_GET_SMMU_INFO: { + struct kvm_ppc_smmu_info info; + struct kvm *kvm = filp->private_data; + + memset(&info, 0, sizeof(info)); + r = kvm->arch.kvm_ops->get_smmu_info(kvm, &info); + if (r >= 0 && copy_to_user(argp, &info, sizeof(info))) + r = 
-EFAULT; + break; + } + case KVM_PPC_RTAS_DEFINE_TOKEN: { + struct kvm *kvm = filp->private_data; + + r = kvm_vm_ioctl_rtas_define_token(kvm, argp); + break; + } + default: { + struct kvm *kvm = filp->private_data; + r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg); + } +#else /* CONFIG_PPC_BOOK3S_64 */ + default: + r = -ENOTTY; +#endif + } +out: + return r; +} + +static unsigned long lpid_inuse[BITS_TO_LONGS(KVMPPC_NR_LPIDS)]; +static unsigned long nr_lpids; + +long kvmppc_alloc_lpid(void) +{ + long lpid; + + do { + lpid = find_first_zero_bit(lpid_inuse, KVMPPC_NR_LPIDS); + if (lpid >= nr_lpids) { + pr_err("%s: No LPIDs free\n", __func__); + return -ENOMEM; + } + } while (test_and_set_bit(lpid, lpid_inuse)); + + return lpid; +} +EXPORT_SYMBOL_GPL(kvmppc_alloc_lpid); + +void kvmppc_claim_lpid(long lpid) +{ + set_bit(lpid, lpid_inuse); +} +EXPORT_SYMBOL_GPL(kvmppc_claim_lpid); + +void kvmppc_free_lpid(long lpid) +{ + clear_bit(lpid, lpid_inuse); +} +EXPORT_SYMBOL_GPL(kvmppc_free_lpid); + +void kvmppc_init_lpid(unsigned long nr_lpids_param) +{ + nr_lpids = min_t(unsigned long, KVMPPC_NR_LPIDS, nr_lpids_param); + memset(lpid_inuse, 0, sizeof(lpid_inuse)); +} +EXPORT_SYMBOL_GPL(kvmppc_init_lpid); + +int kvm_arch_init(void *opaque) +{ + return 0; +} + +EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ppc_instr); diff --git a/kernel/arch/powerpc/kvm/timing.c b/kernel/arch/powerpc/kvm/timing.c new file mode 100644 index 000000000..e44d2b2ea --- /dev/null +++ b/kernel/arch/powerpc/kvm/timing.c @@ -0,0 +1,245 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright IBM Corp. 2008 + * + * Authors: Hollis Blanchard <hollisb@us.ibm.com> + * Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com> + */ + +#include <linux/kvm_host.h> +#include <linux/fs.h> +#include <linux/seq_file.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/module.h> + +#include <asm/time.h> +#include <asm-generic/div64.h> + +#include "timing.h" + +void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu) +{ + int i; + + /* Take a lock to avoid concurrent updates */ + mutex_lock(&vcpu->arch.exit_timing_lock); + + vcpu->arch.last_exit_type = 0xDEAD; + for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) { + vcpu->arch.timing_count_type[i] = 0; + vcpu->arch.timing_max_duration[i] = 0; + vcpu->arch.timing_min_duration[i] = 0xFFFFFFFF; + vcpu->arch.timing_sum_duration[i] = 0; + vcpu->arch.timing_sum_quad_duration[i] = 0; + } + vcpu->arch.timing_last_exit = 0; + vcpu->arch.timing_exit.tv64 = 0; + vcpu->arch.timing_last_enter.tv64 = 0; + + mutex_unlock(&vcpu->arch.exit_timing_lock); +} + +static void add_exit_timing(struct kvm_vcpu *vcpu, u64 duration, int type) +{ + u64 old; + + mutex_lock(&vcpu->arch.exit_timing_lock); + + vcpu->arch.timing_count_type[type]++; + + /* sum */ + old = vcpu->arch.timing_sum_duration[type]; + vcpu->arch.timing_sum_duration[type] += duration; + if (unlikely(old > vcpu->arch.timing_sum_duration[type])) { + printk(KERN_ERR"%s - wrap adding sum of durations" + " old %lld new %lld type %d exit # of type %d\n", + __func__, old, vcpu->arch.timing_sum_duration[type], + type, vcpu->arch.timing_count_type[type]); + } + + /* square sum */ + old = vcpu->arch.timing_sum_quad_duration[type]; + vcpu->arch.timing_sum_quad_duration[type] += (duration*duration); + if (unlikely(old > 
vcpu->arch.timing_sum_quad_duration[type])) { + printk(KERN_ERR"%s - wrap adding sum of squared durations" + " old %lld new %lld type %d exit # of type %d\n", + __func__, old, + vcpu->arch.timing_sum_quad_duration[type], + type, vcpu->arch.timing_count_type[type]); + } + + /* set min/max */ + if (unlikely(duration < vcpu->arch.timing_min_duration[type])) + vcpu->arch.timing_min_duration[type] = duration; + if (unlikely(duration > vcpu->arch.timing_max_duration[type])) + vcpu->arch.timing_max_duration[type] = duration; + + mutex_unlock(&vcpu->arch.exit_timing_lock); +} + +void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu) +{ + u64 exit = vcpu->arch.timing_last_exit; + u64 enter = vcpu->arch.timing_last_enter.tv64; + + /* save exit time, used next exit when the reenter time is known */ + vcpu->arch.timing_last_exit = vcpu->arch.timing_exit.tv64; + + if (unlikely(vcpu->arch.last_exit_type == 0xDEAD || exit == 0)) + return; /* skip incomplete cycle (e.g. after reset) */ + + /* update statistics for average and standard deviation */ + add_exit_timing(vcpu, (enter - exit), vcpu->arch.last_exit_type); + /* enter -> timing_last_exit is time spent in guest - log this too */ + add_exit_timing(vcpu, (vcpu->arch.timing_last_exit - enter), + TIMEINGUEST); +} + +static const char *kvm_exit_names[__NUMBER_OF_KVM_EXIT_TYPES] = { + [MMIO_EXITS] = "MMIO", + [SIGNAL_EXITS] = "SIGNAL", + [ITLB_REAL_MISS_EXITS] = "ITLBREAL", + [ITLB_VIRT_MISS_EXITS] = "ITLBVIRT", + [DTLB_REAL_MISS_EXITS] = "DTLBREAL", + [DTLB_VIRT_MISS_EXITS] = "DTLBVIRT", + [SYSCALL_EXITS] = "SYSCALL", + [ISI_EXITS] = "ISI", + [DSI_EXITS] = "DSI", + [EMULATED_INST_EXITS] = "EMULINST", + [EMULATED_MTMSRWE_EXITS] = "EMUL_WAIT", + [EMULATED_WRTEE_EXITS] = "EMUL_WRTEE", + [EMULATED_MTSPR_EXITS] = "EMUL_MTSPR", + [EMULATED_MFSPR_EXITS] = "EMUL_MFSPR", + [EMULATED_MTMSR_EXITS] = "EMUL_MTMSR", + [EMULATED_MFMSR_EXITS] = "EMUL_MFMSR", + [EMULATED_TLBSX_EXITS] = "EMUL_TLBSX", + [EMULATED_TLBWE_EXITS] = "EMUL_TLBWE", + 
[EMULATED_RFI_EXITS] = "EMUL_RFI", + [DEC_EXITS] = "DEC", + [EXT_INTR_EXITS] = "EXTINT", + [HALT_WAKEUP] = "HALT", + [USR_PR_INST] = "USR_PR_INST", + [FP_UNAVAIL] = "FP_UNAVAIL", + [DEBUG_EXITS] = "DEBUG", + [TIMEINGUEST] = "TIMEINGUEST" +}; + +static int kvmppc_exit_timing_show(struct seq_file *m, void *private) +{ + struct kvm_vcpu *vcpu = m->private; + int i; + u64 min, max, sum, sum_quad; + + seq_printf(m, "%s", "type count min max sum sum_squared\n"); + + + for (i = 0; i < __NUMBER_OF_KVM_EXIT_TYPES; i++) { + + min = vcpu->arch.timing_min_duration[i]; + do_div(min, tb_ticks_per_usec); + max = vcpu->arch.timing_max_duration[i]; + do_div(max, tb_ticks_per_usec); + sum = vcpu->arch.timing_sum_duration[i]; + do_div(sum, tb_ticks_per_usec); + sum_quad = vcpu->arch.timing_sum_quad_duration[i]; + do_div(sum_quad, tb_ticks_per_usec); + + seq_printf(m, "%12s %10d %10lld %10lld %20lld %20lld\n", + kvm_exit_names[i], + vcpu->arch.timing_count_type[i], + min, + max, + sum, + sum_quad); + + } + return 0; +} + +/* Write 'c' to clear the timing statistics. */ +static ssize_t kvmppc_exit_timing_write(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int err = -EINVAL; + char c; + + if (count > 1) { + goto done; + } + + if (get_user(c, user_buf)) { + err = -EFAULT; + goto done; + } + + if (c == 'c') { + struct seq_file *seqf = file->private_data; + struct kvm_vcpu *vcpu = seqf->private; + /* Write does not affect our buffers previously generated with + * show. 
seq_file is locked here to prevent races of init with + * a show call */ + mutex_lock(&seqf->lock); + kvmppc_init_timing_stats(vcpu); + mutex_unlock(&seqf->lock); + err = count; + } + +done: + return err; +} + +static int kvmppc_exit_timing_open(struct inode *inode, struct file *file) +{ + return single_open(file, kvmppc_exit_timing_show, inode->i_private); +} + +static const struct file_operations kvmppc_exit_timing_fops = { + .owner = THIS_MODULE, + .open = kvmppc_exit_timing_open, + .read = seq_read, + .write = kvmppc_exit_timing_write, + .llseek = seq_lseek, + .release = single_release, +}; + +void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, unsigned int id) +{ + static char dbg_fname[50]; + struct dentry *debugfs_file; + + snprintf(dbg_fname, sizeof(dbg_fname), "vm%u_vcpu%u_timing", + current->pid, id); + debugfs_file = debugfs_create_file(dbg_fname, 0666, + kvm_debugfs_dir, vcpu, + &kvmppc_exit_timing_fops); + + if (!debugfs_file) { + printk(KERN_ERR"%s: error creating debugfs file %s\n", + __func__, dbg_fname); + return; + } + + vcpu->arch.debugfs_exit_timing = debugfs_file; +} + +void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.debugfs_exit_timing) { + debugfs_remove(vcpu->arch.debugfs_exit_timing); + vcpu->arch.debugfs_exit_timing = NULL; + } +} diff --git a/kernel/arch/powerpc/kvm/timing.h b/kernel/arch/powerpc/kvm/timing.h new file mode 100644 index 000000000..3123690c8 --- /dev/null +++ b/kernel/arch/powerpc/kvm/timing.h @@ -0,0 +1,109 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * Copyright IBM Corp. 2008 + * + * Authors: Christian Ehrhardt <ehrhardt@linux.vnet.ibm.com> + */ + +#ifndef __POWERPC_KVM_EXITTIMING_H__ +#define __POWERPC_KVM_EXITTIMING_H__ + +#include <linux/kvm_host.h> +#include <asm/kvm_host.h> + +#ifdef CONFIG_KVM_EXIT_TIMING +void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu); +void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu); +void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, unsigned int id); +void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu); + +static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type) +{ + vcpu->arch.last_exit_type = type; +} + +#else +/* if exit timing is not configured there is no need to build the c file */ +static inline void kvmppc_init_timing_stats(struct kvm_vcpu *vcpu) {} +static inline void kvmppc_update_timing_stats(struct kvm_vcpu *vcpu) {} +static inline void kvmppc_create_vcpu_debugfs(struct kvm_vcpu *vcpu, + unsigned int id) {} +static inline void kvmppc_remove_vcpu_debugfs(struct kvm_vcpu *vcpu) {} +static inline void kvmppc_set_exit_type(struct kvm_vcpu *vcpu, int type) {} +#endif /* CONFIG_KVM_EXIT_TIMING */ + +/* account the exit in kvm_stats */ +static inline void kvmppc_account_exit_stat(struct kvm_vcpu *vcpu, int type) +{ + /* type has to be known at build time for optimization */ + + /* The BUILD_BUG_ON below breaks in funny ways, commented out + * for now ... 
-BenH + BUILD_BUG_ON(!__builtin_constant_p(type)); + */ + switch (type) { + case EXT_INTR_EXITS: + vcpu->stat.ext_intr_exits++; + break; + case DEC_EXITS: + vcpu->stat.dec_exits++; + break; + case EMULATED_INST_EXITS: + vcpu->stat.emulated_inst_exits++; + break; + case DSI_EXITS: + vcpu->stat.dsi_exits++; + break; + case ISI_EXITS: + vcpu->stat.isi_exits++; + break; + case SYSCALL_EXITS: + vcpu->stat.syscall_exits++; + break; + case DTLB_REAL_MISS_EXITS: + vcpu->stat.dtlb_real_miss_exits++; + break; + case DTLB_VIRT_MISS_EXITS: + vcpu->stat.dtlb_virt_miss_exits++; + break; + case MMIO_EXITS: + vcpu->stat.mmio_exits++; + break; + case ITLB_REAL_MISS_EXITS: + vcpu->stat.itlb_real_miss_exits++; + break; + case ITLB_VIRT_MISS_EXITS: + vcpu->stat.itlb_virt_miss_exits++; + break; + case SIGNAL_EXITS: + vcpu->stat.signal_exits++; + break; + case DBELL_EXITS: + vcpu->stat.dbell_exits++; + break; + case GDBELL_EXITS: + vcpu->stat.gdbell_exits++; + break; + } +} + +/* wrapper to set exit time and account for it in kvm_stats */ +static inline void kvmppc_account_exit(struct kvm_vcpu *vcpu, int type) +{ + kvmppc_set_exit_type(vcpu, type); + kvmppc_account_exit_stat(vcpu, type); +} + +#endif /* __POWERPC_KVM_EXITTIMING_H__ */ diff --git a/kernel/arch/powerpc/kvm/trace.h b/kernel/arch/powerpc/kvm/trace.h new file mode 100644 index 000000000..2e0e67ef3 --- /dev/null +++ b/kernel/arch/powerpc/kvm/trace.h @@ -0,0 +1,122 @@ +#if !defined(_TRACE_KVM_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm +#define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace + +/* + * Tracepoint for guest mode entry. 
+ */ +TRACE_EVENT(kvm_ppc_instr, + TP_PROTO(unsigned int inst, unsigned long _pc, unsigned int emulate), + TP_ARGS(inst, _pc, emulate), + + TP_STRUCT__entry( + __field( unsigned int, inst ) + __field( unsigned long, pc ) + __field( unsigned int, emulate ) + ), + + TP_fast_assign( + __entry->inst = inst; + __entry->pc = _pc; + __entry->emulate = emulate; + ), + + TP_printk("inst %u pc 0x%lx emulate %u\n", + __entry->inst, __entry->pc, __entry->emulate) +); + +TRACE_EVENT(kvm_stlb_inval, + TP_PROTO(unsigned int stlb_index), + TP_ARGS(stlb_index), + + TP_STRUCT__entry( + __field( unsigned int, stlb_index ) + ), + + TP_fast_assign( + __entry->stlb_index = stlb_index; + ), + + TP_printk("stlb_index %u", __entry->stlb_index) +); + +TRACE_EVENT(kvm_stlb_write, + TP_PROTO(unsigned int victim, unsigned int tid, unsigned int word0, + unsigned int word1, unsigned int word2), + TP_ARGS(victim, tid, word0, word1, word2), + + TP_STRUCT__entry( + __field( unsigned int, victim ) + __field( unsigned int, tid ) + __field( unsigned int, word0 ) + __field( unsigned int, word1 ) + __field( unsigned int, word2 ) + ), + + TP_fast_assign( + __entry->victim = victim; + __entry->tid = tid; + __entry->word0 = word0; + __entry->word1 = word1; + __entry->word2 = word2; + ), + + TP_printk("victim %u tid %u w0 %u w1 %u w2 %u", + __entry->victim, __entry->tid, __entry->word0, + __entry->word1, __entry->word2) +); + +TRACE_EVENT(kvm_gtlb_write, + TP_PROTO(unsigned int gtlb_index, unsigned int tid, unsigned int word0, + unsigned int word1, unsigned int word2), + TP_ARGS(gtlb_index, tid, word0, word1, word2), + + TP_STRUCT__entry( + __field( unsigned int, gtlb_index ) + __field( unsigned int, tid ) + __field( unsigned int, word0 ) + __field( unsigned int, word1 ) + __field( unsigned int, word2 ) + ), + + TP_fast_assign( + __entry->gtlb_index = gtlb_index; + __entry->tid = tid; + __entry->word0 = word0; + __entry->word1 = word1; + __entry->word2 = word2; + ), + + TP_printk("gtlb_index %u tid %u w0 %u 
w1 %u w2 %u", + __entry->gtlb_index, __entry->tid, __entry->word0, + __entry->word1, __entry->word2) +); + +TRACE_EVENT(kvm_check_requests, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + + TP_STRUCT__entry( + __field( __u32, cpu_nr ) + __field( __u32, requests ) + ), + + TP_fast_assign( + __entry->cpu_nr = vcpu->vcpu_id; + __entry->requests = vcpu->requests; + ), + + TP_printk("vcpu=%x requests=%x", + __entry->cpu_nr, __entry->requests) +); + +#endif /* _TRACE_KVM_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/arch/powerpc/kvm/trace_book3s.h b/kernel/arch/powerpc/kvm/trace_book3s.h new file mode 100644 index 000000000..f647ce0f4 --- /dev/null +++ b/kernel/arch/powerpc/kvm/trace_book3s.h @@ -0,0 +1,32 @@ +#if !defined(_TRACE_KVM_BOOK3S_H) +#define _TRACE_KVM_BOOK3S_H + +/* + * Common defines used by the trace macros in trace_pr.h and trace_hv.h + */ + +#define kvm_trace_symbol_exit \ + {0x100, "SYSTEM_RESET"}, \ + {0x200, "MACHINE_CHECK"}, \ + {0x300, "DATA_STORAGE"}, \ + {0x380, "DATA_SEGMENT"}, \ + {0x400, "INST_STORAGE"}, \ + {0x480, "INST_SEGMENT"}, \ + {0x500, "EXTERNAL"}, \ + {0x501, "EXTERNAL_LEVEL"}, \ + {0x502, "EXTERNAL_HV"}, \ + {0x600, "ALIGNMENT"}, \ + {0x700, "PROGRAM"}, \ + {0x800, "FP_UNAVAIL"}, \ + {0x900, "DECREMENTER"}, \ + {0x980, "HV_DECREMENTER"}, \ + {0xc00, "SYSCALL"}, \ + {0xd00, "TRACE"}, \ + {0xe00, "H_DATA_STORAGE"}, \ + {0xe20, "H_INST_STORAGE"}, \ + {0xe40, "H_EMUL_ASSIST"}, \ + {0xf00, "PERFMON"}, \ + {0xf20, "ALTIVEC"}, \ + {0xf40, "VSX"} + +#endif diff --git a/kernel/arch/powerpc/kvm/trace_booke.h b/kernel/arch/powerpc/kvm/trace_booke.h new file mode 100644 index 000000000..7ec534d1d --- /dev/null +++ b/kernel/arch/powerpc/kvm/trace_booke.h @@ -0,0 +1,220 @@ +#if !defined(_TRACE_KVM_BOOKE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_BOOKE_H + +#include <linux/tracepoint.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm_booke +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE trace_booke + +#define kvm_trace_symbol_exit \ + {0, "CRITICAL"}, \ + {1, "MACHINE_CHECK"}, \ + {2, "DATA_STORAGE"}, \ + {3, "INST_STORAGE"}, \ + {4, "EXTERNAL"}, \ + {5, "ALIGNMENT"}, \ + {6, "PROGRAM"}, \ + {7, "FP_UNAVAIL"}, \ + {8, "SYSCALL"}, \ + {9, "AP_UNAVAIL"}, \ + {10, "DECREMENTER"}, \ + {11, "FIT"}, \ + {12, "WATCHDOG"}, \ + {13, "DTLB_MISS"}, \ + {14, "ITLB_MISS"}, \ + {15, "DEBUG"}, \ + {32, "SPE_UNAVAIL"}, \ + {33, "SPE_FP_DATA"}, \ + {34, "SPE_FP_ROUND"}, \ + {35, "PERFORMANCE_MONITOR"}, \ + {36, "DOORBELL"}, \ + {37, "DOORBELL_CRITICAL"}, \ + {38, "GUEST_DBELL"}, \ + {39, "GUEST_DBELL_CRIT"}, \ + {40, "HV_SYSCALL"}, \ + {41, "HV_PRIV"} + +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), + TP_ARGS(exit_nr, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, exit_nr ) + __field( unsigned long, pc ) + __field( unsigned long, msr ) + __field( unsigned long, dar ) + __field( unsigned long, last_inst ) + ), + + TP_fast_assign( + __entry->exit_nr = exit_nr; + __entry->pc = kvmppc_get_pc(vcpu); + __entry->dar = kvmppc_get_fault_dar(vcpu); + __entry->msr = vcpu->arch.shared->msr; + __entry->last_inst = vcpu->arch.last_inst; + ), + + TP_printk("exit=%s" + " | pc=0x%lx" + " | msr=0x%lx" + " | dar=0x%lx" + " | last_inst=0x%lx" + , + __print_symbolic(__entry->exit_nr, kvm_trace_symbol_exit), + __entry->pc, + __entry->msr, + __entry->dar, + __entry->last_inst + ) +); + +TRACE_EVENT(kvm_unmap_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("unmap hva 0x%lx\n", __entry->hva) +); + +TRACE_EVENT(kvm_booke206_stlb_write, + TP_PROTO(__u32 mas0, __u32 mas8, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas8, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas8 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), 
+ + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas8 = mas8; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas8=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas8, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +TRACE_EVENT(kvm_booke206_gtlb_write, + TP_PROTO(__u32 mas0, __u32 mas1, __u64 mas2, __u64 mas7_3), + TP_ARGS(mas0, mas1, mas2, mas7_3), + + TP_STRUCT__entry( + __field( __u32, mas0 ) + __field( __u32, mas1 ) + __field( __u64, mas2 ) + __field( __u64, mas7_3 ) + ), + + TP_fast_assign( + __entry->mas0 = mas0; + __entry->mas1 = mas1; + __entry->mas2 = mas2; + __entry->mas7_3 = mas7_3; + ), + + TP_printk("mas0=%x mas1=%x mas2=%llx mas7_3=%llx", + __entry->mas0, __entry->mas1, + __entry->mas2, __entry->mas7_3) +); + +TRACE_EVENT(kvm_booke206_ref_release, + TP_PROTO(__u64 pfn, __u32 flags), + TP_ARGS(pfn, flags), + + TP_STRUCT__entry( + __field( __u64, pfn ) + __field( __u32, flags ) + ), + + TP_fast_assign( + __entry->pfn = pfn; + __entry->flags = flags; + ), + + TP_printk("pfn=%llx flags=%x", + __entry->pfn, __entry->flags) +); + +#ifdef CONFIG_SPE_POSSIBLE +#define kvm_trace_symbol_irqprio_spe \ + {BOOKE_IRQPRIO_SPE_UNAVAIL, "SPE_UNAVAIL"}, \ + {BOOKE_IRQPRIO_SPE_FP_DATA, "SPE_FP_DATA"}, \ + {BOOKE_IRQPRIO_SPE_FP_ROUND, "SPE_FP_ROUND"}, +#else +#define kvm_trace_symbol_irqprio_spe +#endif + +#ifdef CONFIG_PPC_E500MC +#define kvm_trace_symbol_irqprio_e500mc \ + {BOOKE_IRQPRIO_ALTIVEC_UNAVAIL, "ALTIVEC_UNAVAIL"}, \ + {BOOKE_IRQPRIO_ALTIVEC_ASSIST, "ALTIVEC_ASSIST"}, +#else +#define kvm_trace_symbol_irqprio_e500mc +#endif + +#define kvm_trace_symbol_irqprio \ + kvm_trace_symbol_irqprio_spe \ + kvm_trace_symbol_irqprio_e500mc \ + {BOOKE_IRQPRIO_DATA_STORAGE, "DATA_STORAGE"}, \ + {BOOKE_IRQPRIO_INST_STORAGE, "INST_STORAGE"}, \ + {BOOKE_IRQPRIO_ALIGNMENT, "ALIGNMENT"}, \ + {BOOKE_IRQPRIO_PROGRAM, "PROGRAM"}, \ + {BOOKE_IRQPRIO_FP_UNAVAIL, "FP_UNAVAIL"}, \ + 
{BOOKE_IRQPRIO_SYSCALL, "SYSCALL"}, \ + {BOOKE_IRQPRIO_AP_UNAVAIL, "AP_UNAVAIL"}, \ + {BOOKE_IRQPRIO_DTLB_MISS, "DTLB_MISS"}, \ + {BOOKE_IRQPRIO_ITLB_MISS, "ITLB_MISS"}, \ + {BOOKE_IRQPRIO_MACHINE_CHECK, "MACHINE_CHECK"}, \ + {BOOKE_IRQPRIO_DEBUG, "DEBUG"}, \ + {BOOKE_IRQPRIO_CRITICAL, "CRITICAL"}, \ + {BOOKE_IRQPRIO_WATCHDOG, "WATCHDOG"}, \ + {BOOKE_IRQPRIO_EXTERNAL, "EXTERNAL"}, \ + {BOOKE_IRQPRIO_FIT, "FIT"}, \ + {BOOKE_IRQPRIO_DECREMENTER, "DECREMENTER"}, \ + {BOOKE_IRQPRIO_PERFORMANCE_MONITOR, "PERFORMANCE_MONITOR"}, \ + {BOOKE_IRQPRIO_EXTERNAL_LEVEL, "EXTERNAL_LEVEL"}, \ + {BOOKE_IRQPRIO_DBELL, "DBELL"}, \ + {BOOKE_IRQPRIO_DBELL_CRIT, "DBELL_CRIT"} \ + +TRACE_EVENT(kvm_booke_queue_irqprio, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int priority), + TP_ARGS(vcpu, priority), + + TP_STRUCT__entry( + __field( __u32, cpu_nr ) + __field( __u32, priority ) + __field( unsigned long, pending ) + ), + + TP_fast_assign( + __entry->cpu_nr = vcpu->vcpu_id; + __entry->priority = priority; + __entry->pending = vcpu->arch.pending_exceptions; + ), + + TP_printk("vcpu=%x prio=%s pending=%lx", + __entry->cpu_nr, + __print_symbolic(__entry->priority, kvm_trace_symbol_irqprio), + __entry->pending) +); + +#endif + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/arch/powerpc/kvm/trace_hv.h b/kernel/arch/powerpc/kvm/trace_hv.h new file mode 100644 index 000000000..33d9daff5 --- /dev/null +++ b/kernel/arch/powerpc/kvm/trace_hv.h @@ -0,0 +1,477 @@ +#if !defined(_TRACE_KVM_HV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_HV_H + +#include <linux/tracepoint.h> +#include "trace_book3s.h" +#include <asm/hvcall.h> +#include <asm/kvm_asm.h> + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm_hv +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE trace_hv + +#define kvm_trace_symbol_hcall \ + {H_REMOVE, "H_REMOVE"}, \ + {H_ENTER, "H_ENTER"}, \ + {H_READ, "H_READ"}, \ + {H_CLEAR_MOD, "H_CLEAR_MOD"}, \ + {H_CLEAR_REF, "H_CLEAR_REF"}, \ + {H_PROTECT, "H_PROTECT"}, \ + {H_GET_TCE, "H_GET_TCE"}, \ + {H_PUT_TCE, "H_PUT_TCE"}, \ + {H_SET_SPRG0, "H_SET_SPRG0"}, \ + {H_SET_DABR, "H_SET_DABR"}, \ + {H_PAGE_INIT, "H_PAGE_INIT"}, \ + {H_SET_ASR, "H_SET_ASR"}, \ + {H_ASR_ON, "H_ASR_ON"}, \ + {H_ASR_OFF, "H_ASR_OFF"}, \ + {H_LOGICAL_CI_LOAD, "H_LOGICAL_CI_LOAD"}, \ + {H_LOGICAL_CI_STORE, "H_LOGICAL_CI_STORE"}, \ + {H_LOGICAL_CACHE_LOAD, "H_LOGICAL_CACHE_LOAD"}, \ + {H_LOGICAL_CACHE_STORE, "H_LOGICAL_CACHE_STORE"}, \ + {H_LOGICAL_ICBI, "H_LOGICAL_ICBI"}, \ + {H_LOGICAL_DCBF, "H_LOGICAL_DCBF"}, \ + {H_GET_TERM_CHAR, "H_GET_TERM_CHAR"}, \ + {H_PUT_TERM_CHAR, "H_PUT_TERM_CHAR"}, \ + {H_REAL_TO_LOGICAL, "H_REAL_TO_LOGICAL"}, \ + {H_HYPERVISOR_DATA, "H_HYPERVISOR_DATA"}, \ + {H_EOI, "H_EOI"}, \ + {H_CPPR, "H_CPPR"}, \ + {H_IPI, "H_IPI"}, \ + {H_IPOLL, "H_IPOLL"}, \ + {H_XIRR, "H_XIRR"}, \ + {H_PERFMON, "H_PERFMON"}, \ + {H_MIGRATE_DMA, "H_MIGRATE_DMA"}, \ + {H_REGISTER_VPA, "H_REGISTER_VPA"}, \ + {H_CEDE, "H_CEDE"}, \ + {H_CONFER, "H_CONFER"}, \ + {H_PROD, "H_PROD"}, \ + {H_GET_PPP, "H_GET_PPP"}, \ + {H_SET_PPP, "H_SET_PPP"}, \ + {H_PURR, "H_PURR"}, \ + {H_PIC, "H_PIC"}, \ + {H_REG_CRQ, "H_REG_CRQ"}, \ + {H_FREE_CRQ, "H_FREE_CRQ"}, \ + {H_VIO_SIGNAL, "H_VIO_SIGNAL"}, \ + {H_SEND_CRQ, "H_SEND_CRQ"}, \ + {H_COPY_RDMA, "H_COPY_RDMA"}, \ + {H_REGISTER_LOGICAL_LAN, "H_REGISTER_LOGICAL_LAN"}, \ + {H_FREE_LOGICAL_LAN, "H_FREE_LOGICAL_LAN"}, \ + {H_ADD_LOGICAL_LAN_BUFFER, "H_ADD_LOGICAL_LAN_BUFFER"}, \ + {H_SEND_LOGICAL_LAN, "H_SEND_LOGICAL_LAN"}, \ + {H_BULK_REMOVE, "H_BULK_REMOVE"}, \ + {H_MULTICAST_CTRL, "H_MULTICAST_CTRL"}, \ + {H_SET_XDABR, "H_SET_XDABR"}, \ + {H_STUFF_TCE, "H_STUFF_TCE"}, \ + {H_PUT_TCE_INDIRECT, "H_PUT_TCE_INDIRECT"}, \ + {H_CHANGE_LOGICAL_LAN_MAC, "H_CHANGE_LOGICAL_LAN_MAC"}, \ + 
{H_VTERM_PARTNER_INFO, "H_VTERM_PARTNER_INFO"}, \ + {H_REGISTER_VTERM, "H_REGISTER_VTERM"}, \ + {H_FREE_VTERM, "H_FREE_VTERM"}, \ + {H_RESET_EVENTS, "H_RESET_EVENTS"}, \ + {H_ALLOC_RESOURCE, "H_ALLOC_RESOURCE"}, \ + {H_FREE_RESOURCE, "H_FREE_RESOURCE"}, \ + {H_MODIFY_QP, "H_MODIFY_QP"}, \ + {H_QUERY_QP, "H_QUERY_QP"}, \ + {H_REREGISTER_PMR, "H_REREGISTER_PMR"}, \ + {H_REGISTER_SMR, "H_REGISTER_SMR"}, \ + {H_QUERY_MR, "H_QUERY_MR"}, \ + {H_QUERY_MW, "H_QUERY_MW"}, \ + {H_QUERY_HCA, "H_QUERY_HCA"}, \ + {H_QUERY_PORT, "H_QUERY_PORT"}, \ + {H_MODIFY_PORT, "H_MODIFY_PORT"}, \ + {H_DEFINE_AQP1, "H_DEFINE_AQP1"}, \ + {H_GET_TRACE_BUFFER, "H_GET_TRACE_BUFFER"}, \ + {H_DEFINE_AQP0, "H_DEFINE_AQP0"}, \ + {H_RESIZE_MR, "H_RESIZE_MR"}, \ + {H_ATTACH_MCQP, "H_ATTACH_MCQP"}, \ + {H_DETACH_MCQP, "H_DETACH_MCQP"}, \ + {H_CREATE_RPT, "H_CREATE_RPT"}, \ + {H_REMOVE_RPT, "H_REMOVE_RPT"}, \ + {H_REGISTER_RPAGES, "H_REGISTER_RPAGES"}, \ + {H_DISABLE_AND_GETC, "H_DISABLE_AND_GETC"}, \ + {H_ERROR_DATA, "H_ERROR_DATA"}, \ + {H_GET_HCA_INFO, "H_GET_HCA_INFO"}, \ + {H_GET_PERF_COUNT, "H_GET_PERF_COUNT"}, \ + {H_MANAGE_TRACE, "H_MANAGE_TRACE"}, \ + {H_FREE_LOGICAL_LAN_BUFFER, "H_FREE_LOGICAL_LAN_BUFFER"}, \ + {H_QUERY_INT_STATE, "H_QUERY_INT_STATE"}, \ + {H_POLL_PENDING, "H_POLL_PENDING"}, \ + {H_ILLAN_ATTRIBUTES, "H_ILLAN_ATTRIBUTES"}, \ + {H_MODIFY_HEA_QP, "H_MODIFY_HEA_QP"}, \ + {H_QUERY_HEA_QP, "H_QUERY_HEA_QP"}, \ + {H_QUERY_HEA, "H_QUERY_HEA"}, \ + {H_QUERY_HEA_PORT, "H_QUERY_HEA_PORT"}, \ + {H_MODIFY_HEA_PORT, "H_MODIFY_HEA_PORT"}, \ + {H_REG_BCMC, "H_REG_BCMC"}, \ + {H_DEREG_BCMC, "H_DEREG_BCMC"}, \ + {H_REGISTER_HEA_RPAGES, "H_REGISTER_HEA_RPAGES"}, \ + {H_DISABLE_AND_GET_HEA, "H_DISABLE_AND_GET_HEA"}, \ + {H_GET_HEA_INFO, "H_GET_HEA_INFO"}, \ + {H_ALLOC_HEA_RESOURCE, "H_ALLOC_HEA_RESOURCE"}, \ + {H_ADD_CONN, "H_ADD_CONN"}, \ + {H_DEL_CONN, "H_DEL_CONN"}, \ + {H_JOIN, "H_JOIN"}, \ + {H_VASI_STATE, "H_VASI_STATE"}, \ + {H_ENABLE_CRQ, "H_ENABLE_CRQ"}, \ + {H_GET_EM_PARMS, 
"H_GET_EM_PARMS"}, \ + {H_SET_MPP, "H_SET_MPP"}, \ + {H_GET_MPP, "H_GET_MPP"}, \ + {H_HOME_NODE_ASSOCIATIVITY, "H_HOME_NODE_ASSOCIATIVITY"}, \ + {H_BEST_ENERGY, "H_BEST_ENERGY"}, \ + {H_XIRR_X, "H_XIRR_X"}, \ + {H_RANDOM, "H_RANDOM"}, \ + {H_COP, "H_COP"}, \ + {H_GET_MPP_X, "H_GET_MPP_X"}, \ + {H_SET_MODE, "H_SET_MODE"}, \ + {H_RTAS, "H_RTAS"} + +#define kvm_trace_symbol_kvmret \ + {RESUME_GUEST, "RESUME_GUEST"}, \ + {RESUME_GUEST_NV, "RESUME_GUEST_NV"}, \ + {RESUME_HOST, "RESUME_HOST"}, \ + {RESUME_HOST_NV, "RESUME_HOST_NV"} + +#define kvm_trace_symbol_hcall_rc \ + {H_SUCCESS, "H_SUCCESS"}, \ + {H_BUSY, "H_BUSY"}, \ + {H_CLOSED, "H_CLOSED"}, \ + {H_NOT_AVAILABLE, "H_NOT_AVAILABLE"}, \ + {H_CONSTRAINED, "H_CONSTRAINED"}, \ + {H_PARTIAL, "H_PARTIAL"}, \ + {H_IN_PROGRESS, "H_IN_PROGRESS"}, \ + {H_PAGE_REGISTERED, "H_PAGE_REGISTERED"}, \ + {H_PARTIAL_STORE, "H_PARTIAL_STORE"}, \ + {H_PENDING, "H_PENDING"}, \ + {H_CONTINUE, "H_CONTINUE"}, \ + {H_LONG_BUSY_START_RANGE, "H_LONG_BUSY_START_RANGE"}, \ + {H_LONG_BUSY_ORDER_1_MSEC, "H_LONG_BUSY_ORDER_1_MSEC"}, \ + {H_LONG_BUSY_ORDER_10_MSEC, "H_LONG_BUSY_ORDER_10_MSEC"}, \ + {H_LONG_BUSY_ORDER_100_MSEC, "H_LONG_BUSY_ORDER_100_MSEC"}, \ + {H_LONG_BUSY_ORDER_1_SEC, "H_LONG_BUSY_ORDER_1_SEC"}, \ + {H_LONG_BUSY_ORDER_10_SEC, "H_LONG_BUSY_ORDER_10_SEC"}, \ + {H_LONG_BUSY_ORDER_100_SEC, "H_LONG_BUSY_ORDER_100_SEC"}, \ + {H_LONG_BUSY_END_RANGE, "H_LONG_BUSY_END_RANGE"}, \ + {H_TOO_HARD, "H_TOO_HARD"}, \ + {H_HARDWARE, "H_HARDWARE"}, \ + {H_FUNCTION, "H_FUNCTION"}, \ + {H_PRIVILEGE, "H_PRIVILEGE"}, \ + {H_PARAMETER, "H_PARAMETER"}, \ + {H_BAD_MODE, "H_BAD_MODE"}, \ + {H_PTEG_FULL, "H_PTEG_FULL"}, \ + {H_NOT_FOUND, "H_NOT_FOUND"}, \ + {H_RESERVED_DABR, "H_RESERVED_DABR"}, \ + {H_NO_MEM, "H_NO_MEM"}, \ + {H_AUTHORITY, "H_AUTHORITY"}, \ + {H_PERMISSION, "H_PERMISSION"}, \ + {H_DROPPED, "H_DROPPED"}, \ + {H_SOURCE_PARM, "H_SOURCE_PARM"}, \ + {H_DEST_PARM, "H_DEST_PARM"}, \ + {H_REMOTE_PARM, "H_REMOTE_PARM"}, \ + {H_RESOURCE, 
"H_RESOURCE"}, \ + {H_ADAPTER_PARM, "H_ADAPTER_PARM"}, \ + {H_RH_PARM, "H_RH_PARM"}, \ + {H_RCQ_PARM, "H_RCQ_PARM"}, \ + {H_SCQ_PARM, "H_SCQ_PARM"}, \ + {H_EQ_PARM, "H_EQ_PARM"}, \ + {H_RT_PARM, "H_RT_PARM"}, \ + {H_ST_PARM, "H_ST_PARM"}, \ + {H_SIGT_PARM, "H_SIGT_PARM"}, \ + {H_TOKEN_PARM, "H_TOKEN_PARM"}, \ + {H_MLENGTH_PARM, "H_MLENGTH_PARM"}, \ + {H_MEM_PARM, "H_MEM_PARM"}, \ + {H_MEM_ACCESS_PARM, "H_MEM_ACCESS_PARM"}, \ + {H_ATTR_PARM, "H_ATTR_PARM"}, \ + {H_PORT_PARM, "H_PORT_PARM"}, \ + {H_MCG_PARM, "H_MCG_PARM"}, \ + {H_VL_PARM, "H_VL_PARM"}, \ + {H_TSIZE_PARM, "H_TSIZE_PARM"}, \ + {H_TRACE_PARM, "H_TRACE_PARM"}, \ + {H_MASK_PARM, "H_MASK_PARM"}, \ + {H_MCG_FULL, "H_MCG_FULL"}, \ + {H_ALIAS_EXIST, "H_ALIAS_EXIST"}, \ + {H_P_COUNTER, "H_P_COUNTER"}, \ + {H_TABLE_FULL, "H_TABLE_FULL"}, \ + {H_ALT_TABLE, "H_ALT_TABLE"}, \ + {H_MR_CONDITION, "H_MR_CONDITION"}, \ + {H_NOT_ENOUGH_RESOURCES, "H_NOT_ENOUGH_RESOURCES"}, \ + {H_R_STATE, "H_R_STATE"}, \ + {H_RESCINDED, "H_RESCINDED"}, \ + {H_P2, "H_P2"}, \ + {H_P3, "H_P3"}, \ + {H_P4, "H_P4"}, \ + {H_P5, "H_P5"}, \ + {H_P6, "H_P6"}, \ + {H_P7, "H_P7"}, \ + {H_P8, "H_P8"}, \ + {H_P9, "H_P9"}, \ + {H_TOO_BIG, "H_TOO_BIG"}, \ + {H_OVERLAP, "H_OVERLAP"}, \ + {H_INTERRUPT, "H_INTERRUPT"}, \ + {H_BAD_DATA, "H_BAD_DATA"}, \ + {H_NOT_ACTIVE, "H_NOT_ACTIVE"}, \ + {H_SG_LIST, "H_SG_LIST"}, \ + {H_OP_MODE, "H_OP_MODE"}, \ + {H_COP_HW, "H_COP_HW"}, \ + {H_UNSUPPORTED_FLAG_START, "H_UNSUPPORTED_FLAG_START"}, \ + {H_UNSUPPORTED_FLAG_END, "H_UNSUPPORTED_FLAG_END"}, \ + {H_MULTI_THREADS_ACTIVE, "H_MULTI_THREADS_ACTIVE"}, \ + {H_OUTSTANDING_COP_OPS, "H_OUTSTANDING_COP_OPS"} + +TRACE_EVENT(kvm_guest_enter, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(unsigned long, pc) + __field(unsigned long, pending_exceptions) + __field(u8, ceded) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->pc = kvmppc_get_pc(vcpu); + __entry->ceded = vcpu->arch.ceded; + 
__entry->pending_exceptions = vcpu->arch.pending_exceptions; + ), + + TP_printk("VCPU %d: pc=0x%lx pexcp=0x%lx ceded=%d", + __entry->vcpu_id, + __entry->pc, + __entry->pending_exceptions, __entry->ceded) +); + +TRACE_EVENT(kvm_guest_exit, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, trap) + __field(unsigned long, pc) + __field(unsigned long, msr) + __field(u8, ceded) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->trap = vcpu->arch.trap; + __entry->ceded = vcpu->arch.ceded; + __entry->pc = kvmppc_get_pc(vcpu); + __entry->msr = vcpu->arch.shregs.msr; + ), + + TP_printk("VCPU %d: trap=%s pc=0x%lx msr=0x%lx, ceded=%d", + __entry->vcpu_id, + __print_symbolic(__entry->trap, kvm_trace_symbol_exit), + __entry->pc, __entry->msr, __entry->ceded + ) +); + +TRACE_EVENT(kvm_page_fault_enter, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned long *hptep, + struct kvm_memory_slot *memslot, unsigned long ea, + unsigned long dsisr), + + TP_ARGS(vcpu, hptep, memslot, ea, dsisr), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(unsigned long, hpte_v) + __field(unsigned long, hpte_r) + __field(unsigned long, gpte_r) + __field(unsigned long, ea) + __field(u64, base_gfn) + __field(u32, slot_flags) + __field(u32, dsisr) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->hpte_v = hptep[0]; + __entry->hpte_r = hptep[1]; + __entry->gpte_r = hptep[2]; + __entry->ea = ea; + __entry->dsisr = dsisr; + __entry->base_gfn = memslot ? memslot->base_gfn : -1UL; + __entry->slot_flags = memslot ? 
memslot->flags : 0; + ), + + TP_printk("VCPU %d: hpte=0x%lx:0x%lx guest=0x%lx ea=0x%lx,%x slot=0x%llx,0x%x", + __entry->vcpu_id, + __entry->hpte_v, __entry->hpte_r, __entry->gpte_r, + __entry->ea, __entry->dsisr, + __entry->base_gfn, __entry->slot_flags) +); + +TRACE_EVENT(kvm_page_fault_exit, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned long *hptep, long ret), + + TP_ARGS(vcpu, hptep, ret), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(unsigned long, hpte_v) + __field(unsigned long, hpte_r) + __field(long, ret) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->hpte_v = hptep[0]; + __entry->hpte_r = hptep[1]; + __entry->ret = ret; + ), + + TP_printk("VCPU %d: hpte=0x%lx:0x%lx ret=0x%lx", + __entry->vcpu_id, + __entry->hpte_v, __entry->hpte_r, __entry->ret) +); + +TRACE_EVENT(kvm_hcall_enter, + TP_PROTO(struct kvm_vcpu *vcpu), + + TP_ARGS(vcpu), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(unsigned long, req) + __field(unsigned long, gpr4) + __field(unsigned long, gpr5) + __field(unsigned long, gpr6) + __field(unsigned long, gpr7) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->req = kvmppc_get_gpr(vcpu, 3); + __entry->gpr4 = kvmppc_get_gpr(vcpu, 4); + __entry->gpr5 = kvmppc_get_gpr(vcpu, 5); + __entry->gpr6 = kvmppc_get_gpr(vcpu, 6); + __entry->gpr7 = kvmppc_get_gpr(vcpu, 7); + ), + + TP_printk("VCPU %d: hcall=%s GPR4-7=0x%lx,0x%lx,0x%lx,0x%lx", + __entry->vcpu_id, + __print_symbolic(__entry->req, kvm_trace_symbol_hcall), + __entry->gpr4, __entry->gpr5, __entry->gpr6, __entry->gpr7) +); + +TRACE_EVENT(kvm_hcall_exit, + TP_PROTO(struct kvm_vcpu *vcpu, int ret), + + TP_ARGS(vcpu, ret), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(unsigned long, ret) + __field(unsigned long, hcall_rc) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->ret = ret; + __entry->hcall_rc = kvmppc_get_gpr(vcpu, 3); + ), + + TP_printk("VCPU %d: ret=%s hcall_rc=%s", + __entry->vcpu_id, + 
__print_symbolic(__entry->ret, kvm_trace_symbol_kvmret), + __print_symbolic(__entry->ret & RESUME_FLAG_HOST ? + H_TOO_HARD : __entry->hcall_rc, + kvm_trace_symbol_hcall_rc)) +); + +TRACE_EVENT(kvmppc_run_core, + TP_PROTO(struct kvmppc_vcore *vc, int where), + + TP_ARGS(vc, where), + + TP_STRUCT__entry( + __field(int, n_runnable) + __field(int, runner_vcpu) + __field(int, where) + __field(pid_t, tgid) + ), + + TP_fast_assign( + __entry->runner_vcpu = vc->runner->vcpu_id; + __entry->n_runnable = vc->n_runnable; + __entry->where = where; + __entry->tgid = current->tgid; + ), + + TP_printk("%s runner_vcpu==%d runnable=%d tgid=%d", + __entry->where ? "Exit" : "Enter", + __entry->runner_vcpu, __entry->n_runnable, __entry->tgid) +); + +TRACE_EVENT(kvmppc_vcore_blocked, + TP_PROTO(struct kvmppc_vcore *vc, int where), + + TP_ARGS(vc, where), + + TP_STRUCT__entry( + __field(int, n_runnable) + __field(int, runner_vcpu) + __field(int, where) + __field(pid_t, tgid) + ), + + TP_fast_assign( + __entry->runner_vcpu = vc->runner->vcpu_id; + __entry->n_runnable = vc->n_runnable; + __entry->where = where; + __entry->tgid = current->tgid; + ), + + TP_printk("%s runner_vcpu=%d runnable=%d tgid=%d", + __entry->where ? 
"Exit" : "Enter", + __entry->runner_vcpu, __entry->n_runnable, __entry->tgid) +); + +TRACE_EVENT(kvmppc_run_vcpu_enter, + TP_PROTO(struct kvm_vcpu *vcpu), + + TP_ARGS(vcpu), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(pid_t, tgid) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->tgid = current->tgid; + ), + + TP_printk("VCPU %d: tgid=%d", __entry->vcpu_id, __entry->tgid) +); + +TRACE_EVENT(kvmppc_run_vcpu_exit, + TP_PROTO(struct kvm_vcpu *vcpu, struct kvm_run *run), + + TP_ARGS(vcpu, run), + + TP_STRUCT__entry( + __field(int, vcpu_id) + __field(int, exit) + __field(int, ret) + ), + + TP_fast_assign( + __entry->vcpu_id = vcpu->vcpu_id; + __entry->exit = run->exit_reason; + __entry->ret = vcpu->arch.ret; + ), + + TP_printk("VCPU %d: exit=%d, ret=%d", + __entry->vcpu_id, __entry->exit, __entry->ret) +); + +#endif /* _TRACE_KVM_HV_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> diff --git a/kernel/arch/powerpc/kvm/trace_pr.h b/kernel/arch/powerpc/kvm/trace_pr.h new file mode 100644 index 000000000..810507cb6 --- /dev/null +++ b/kernel/arch/powerpc/kvm/trace_pr.h @@ -0,0 +1,274 @@ + +#if !defined(_TRACE_KVM_PR_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_KVM_PR_H + +#include <linux/tracepoint.h> +#include "trace_book3s.h" + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kvm_pr +#define TRACE_INCLUDE_PATH . 
+#define TRACE_INCLUDE_FILE trace_pr + +TRACE_EVENT(kvm_book3s_reenter, + TP_PROTO(int r, struct kvm_vcpu *vcpu), + TP_ARGS(r, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, r ) + __field( unsigned long, pc ) + ), + + TP_fast_assign( + __entry->r = r; + __entry->pc = kvmppc_get_pc(vcpu); + ), + + TP_printk("reentry r=%d | pc=0x%lx", __entry->r, __entry->pc) +); + +#ifdef CONFIG_PPC_BOOK3S_64 + +TRACE_EVENT(kvm_book3s_64_mmu_map, + TP_PROTO(int rflags, ulong hpteg, ulong va, pfn_t hpaddr, + struct kvmppc_pte *orig_pte), + TP_ARGS(rflags, hpteg, va, hpaddr, orig_pte), + + TP_STRUCT__entry( + __field( unsigned char, flag_w ) + __field( unsigned char, flag_x ) + __field( unsigned long, eaddr ) + __field( unsigned long, hpteg ) + __field( unsigned long, va ) + __field( unsigned long long, vpage ) + __field( unsigned long, hpaddr ) + ), + + TP_fast_assign( + __entry->flag_w = ((rflags & HPTE_R_PP) == 3) ? '-' : 'w'; + __entry->flag_x = (rflags & HPTE_R_N) ? '-' : 'x'; + __entry->eaddr = orig_pte->eaddr; + __entry->hpteg = hpteg; + __entry->va = va; + __entry->vpage = orig_pte->vpage; + __entry->hpaddr = hpaddr; + ), + + TP_printk("KVM: %c%c Map 0x%lx: [%lx] 0x%lx (0x%llx) -> %lx", + __entry->flag_w, __entry->flag_x, __entry->eaddr, + __entry->hpteg, __entry->va, __entry->vpage, __entry->hpaddr) +); + +#endif /* CONFIG_PPC_BOOK3S_64 */ + +TRACE_EVENT(kvm_book3s_mmu_map, + TP_PROTO(struct hpte_cache *pte), + TP_ARGS(pte), + + TP_STRUCT__entry( + __field( u64, host_vpn ) + __field( u64, pfn ) + __field( ulong, eaddr ) + __field( u64, vpage ) + __field( ulong, raddr ) + __field( int, flags ) + ), + + TP_fast_assign( + __entry->host_vpn = pte->host_vpn; + __entry->pfn = pte->pfn; + __entry->eaddr = pte->pte.eaddr; + __entry->vpage = pte->pte.vpage; + __entry->raddr = pte->pte.raddr; + __entry->flags = (pte->pte.may_read ? 0x4 : 0) | + (pte->pte.may_write ? 0x2 : 0) | + (pte->pte.may_execute ? 
0x1 : 0); + ), + + TP_printk("Map: hvpn=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", + __entry->host_vpn, __entry->pfn, __entry->eaddr, + __entry->vpage, __entry->raddr, __entry->flags) +); + +TRACE_EVENT(kvm_book3s_mmu_invalidate, + TP_PROTO(struct hpte_cache *pte), + TP_ARGS(pte), + + TP_STRUCT__entry( + __field( u64, host_vpn ) + __field( u64, pfn ) + __field( ulong, eaddr ) + __field( u64, vpage ) + __field( ulong, raddr ) + __field( int, flags ) + ), + + TP_fast_assign( + __entry->host_vpn = pte->host_vpn; + __entry->pfn = pte->pfn; + __entry->eaddr = pte->pte.eaddr; + __entry->vpage = pte->pte.vpage; + __entry->raddr = pte->pte.raddr; + __entry->flags = (pte->pte.may_read ? 0x4 : 0) | + (pte->pte.may_write ? 0x2 : 0) | + (pte->pte.may_execute ? 0x1 : 0); + ), + + TP_printk("Flush: hva=%llx pfn=%llx ea=%lx vp=%llx ra=%lx [%x]", + __entry->host_vpn, __entry->pfn, __entry->eaddr, + __entry->vpage, __entry->raddr, __entry->flags) +); + +TRACE_EVENT(kvm_book3s_mmu_flush, + TP_PROTO(const char *type, struct kvm_vcpu *vcpu, unsigned long long p1, + unsigned long long p2), + TP_ARGS(type, vcpu, p1, p2), + + TP_STRUCT__entry( + __field( int, count ) + __field( unsigned long long, p1 ) + __field( unsigned long long, p2 ) + __field( const char *, type ) + ), + + TP_fast_assign( + __entry->count = to_book3s(vcpu)->hpte_cache_count; + __entry->p1 = p1; + __entry->p2 = p2; + __entry->type = type; + ), + + TP_printk("Flush %d %sPTEs: %llx - %llx", + __entry->count, __entry->type, __entry->p1, __entry->p2) +); + +TRACE_EVENT(kvm_book3s_slb_found, + TP_PROTO(unsigned long long gvsid, unsigned long long hvsid), + TP_ARGS(gvsid, hvsid), + + TP_STRUCT__entry( + __field( unsigned long long, gvsid ) + __field( unsigned long long, hvsid ) + ), + + TP_fast_assign( + __entry->gvsid = gvsid; + __entry->hvsid = hvsid; + ), + + TP_printk("%llx -> %llx", __entry->gvsid, __entry->hvsid) +); + +TRACE_EVENT(kvm_book3s_slb_fail, + TP_PROTO(u16 sid_map_mask, unsigned long long gvsid), + 
TP_ARGS(sid_map_mask, gvsid), + + TP_STRUCT__entry( + __field( unsigned short, sid_map_mask ) + __field( unsigned long long, gvsid ) + ), + + TP_fast_assign( + __entry->sid_map_mask = sid_map_mask; + __entry->gvsid = gvsid; + ), + + TP_printk("%x/%x: %llx", __entry->sid_map_mask, + SID_MAP_MASK - __entry->sid_map_mask, __entry->gvsid) +); + +TRACE_EVENT(kvm_book3s_slb_map, + TP_PROTO(u16 sid_map_mask, unsigned long long gvsid, + unsigned long long hvsid), + TP_ARGS(sid_map_mask, gvsid, hvsid), + + TP_STRUCT__entry( + __field( unsigned short, sid_map_mask ) + __field( unsigned long long, guest_vsid ) + __field( unsigned long long, host_vsid ) + ), + + TP_fast_assign( + __entry->sid_map_mask = sid_map_mask; + __entry->guest_vsid = gvsid; + __entry->host_vsid = hvsid; + ), + + TP_printk("%x: %llx -> %llx", __entry->sid_map_mask, + __entry->guest_vsid, __entry->host_vsid) +); + +TRACE_EVENT(kvm_book3s_slbmte, + TP_PROTO(u64 slb_vsid, u64 slb_esid), + TP_ARGS(slb_vsid, slb_esid), + + TP_STRUCT__entry( + __field( u64, slb_vsid ) + __field( u64, slb_esid ) + ), + + TP_fast_assign( + __entry->slb_vsid = slb_vsid; + __entry->slb_esid = slb_esid; + ), + + TP_printk("%llx, %llx", __entry->slb_vsid, __entry->slb_esid) +); + +TRACE_EVENT(kvm_exit, + TP_PROTO(unsigned int exit_nr, struct kvm_vcpu *vcpu), + TP_ARGS(exit_nr, vcpu), + + TP_STRUCT__entry( + __field( unsigned int, exit_nr ) + __field( unsigned long, pc ) + __field( unsigned long, msr ) + __field( unsigned long, dar ) + __field( unsigned long, srr1 ) + __field( unsigned long, last_inst ) + ), + + TP_fast_assign( + __entry->exit_nr = exit_nr; + __entry->pc = kvmppc_get_pc(vcpu); + __entry->dar = kvmppc_get_fault_dar(vcpu); + __entry->msr = kvmppc_get_msr(vcpu); + __entry->srr1 = vcpu->arch.shadow_srr1; + __entry->last_inst = vcpu->arch.last_inst; + ), + + TP_printk("exit=%s" + " | pc=0x%lx" + " | msr=0x%lx" + " | dar=0x%lx" + " | srr1=0x%lx" + " | last_inst=0x%lx" + , + __print_symbolic(__entry->exit_nr, 
kvm_trace_symbol_exit), + __entry->pc, + __entry->msr, + __entry->dar, + __entry->srr1, + __entry->last_inst + ) +); + +TRACE_EVENT(kvm_unmap_hva, + TP_PROTO(unsigned long hva), + TP_ARGS(hva), + + TP_STRUCT__entry( + __field( unsigned long, hva ) + ), + + TP_fast_assign( + __entry->hva = hva; + ), + + TP_printk("unmap hva 0x%lx\n", __entry->hva) +); + +#endif /* _TRACE_KVM_H */ + +/* This part must be outside protection */ +#include <trace/define_trace.h> |