author     José Pekkarinen <jose.pekkarinen@nokia.com>  2016-04-11 10:41:07 +0300
committer  José Pekkarinen <jose.pekkarinen@nokia.com>  2016-04-13 08:17:18 +0300
commit     e09b41010ba33a20a87472ee821fa407a5b8da36
tree       d10dc367189862e7ca5c592f033dc3726e1df4e3  /kernel/arch/x86/kvm/mmu.c
parent     f93b97fd65072de626c074dbe099a1fff05ce060
These changes are the raw update to linux-4.4.6-rt14. The kernel sources
are taken from kernel.org, and the rt patch from the rt wiki download page.
During the rebase, the following patch collided:
Force tick interrupt and get rid of softirq magic (I70131fb85).
The conflicting hunks were dropped because their logic was already
present in the source.
Change-Id: I7f57a4081d9deaa0d9ccfc41a6c8daccdee3b769
Signed-off-by: José Pekkarinen <jose.pekkarinen@nokia.com>
Diffstat (limited to 'kernel/arch/x86/kvm/mmu.c')
-rw-r--r--  kernel/arch/x86/kvm/mmu.c | 1035
1 file changed, 567 insertions(+), 468 deletions(-)
diff --git a/kernel/arch/x86/kvm/mmu.c b/kernel/arch/x86/kvm/mmu.c index 554e877e0..8eb8a934b 100644 --- a/kernel/arch/x86/kvm/mmu.c +++ b/kernel/arch/x86/kvm/mmu.c @@ -223,15 +223,15 @@ static unsigned int get_mmio_spte_generation(u64 spte) return gen; } -static unsigned int kvm_current_mmio_generation(struct kvm *kvm) +static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu) { - return kvm_memslots(kvm)->generation & MMIO_GEN_MASK; + return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK; } -static void mark_mmio_spte(struct kvm *kvm, u64 *sptep, u64 gfn, +static void mark_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 gfn, unsigned access) { - unsigned int gen = kvm_current_mmio_generation(kvm); + unsigned int gen = kvm_current_mmio_generation(vcpu); u64 mask = generation_mmio_spte_mask(gen); access &= ACC_WRITE_MASK | ACC_USER_MASK; @@ -258,22 +258,22 @@ static unsigned get_mmio_spte_access(u64 spte) return (spte & ~mask) & ~PAGE_MASK; } -static bool set_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, +static bool set_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, pfn_t pfn, unsigned access) { if (unlikely(is_noslot_pfn(pfn))) { - mark_mmio_spte(kvm, sptep, gfn, access); + mark_mmio_spte(vcpu, sptep, gfn, access); return true; } return false; } -static bool check_mmio_spte(struct kvm *kvm, u64 spte) +static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) { unsigned int kvm_gen, spte_gen; - kvm_gen = kvm_current_mmio_generation(kvm); + kvm_gen = kvm_current_mmio_generation(vcpu); spte_gen = get_mmio_spte_generation(spte); trace_check_mmio_spte(spte, kvm_gen, spte_gen); @@ -781,30 +781,36 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } -static void account_shadowed(struct kvm *kvm, gfn_t gfn) +static void account_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { + struct kvm_memslots *slots; struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; + gfn_t gfn; int i; - slot = gfn_to_memslot(kvm, gfn); - for (i = PT_DIRECTORY_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + gfn = sp->gfn; + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, gfn); + for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->write_count += 1; } kvm->arch.indirect_shadow_pages++; } -static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) +static void unaccount_shadowed(struct kvm *kvm, struct kvm_mmu_page *sp) { + struct kvm_memslots *slots; struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; + gfn_t gfn; int i; - slot = gfn_to_memslot(kvm, gfn); - for (i = PT_DIRECTORY_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + gfn = sp->gfn; + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, gfn); + for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); linfo->write_count -= 1; WARN_ON(linfo->write_count < 0); @@ -812,14 +818,11 @@ static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) kvm->arch.indirect_shadow_pages--; } -static int has_wrprotected_page(struct kvm *kvm, - gfn_t gfn, - int level) +static int __has_wrprotected_page(gfn_t gfn, int level, + struct kvm_memory_slot *slot) { - struct kvm_memory_slot *slot; struct kvm_lpage_info *linfo; - slot = gfn_to_memslot(kvm, gfn); if (slot) { linfo = lpage_info_slot(gfn, slot, level); return linfo->write_count; @@ -828,6 +831,14 @@ static int 
has_wrprotected_page(struct kvm *kvm, return 1; } +static int has_wrprotected_page(struct kvm_vcpu *vcpu, gfn_t gfn, int level) +{ + struct kvm_memory_slot *slot; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + return __has_wrprotected_page(gfn, level, slot); +} + static int host_mapping_level(struct kvm *kvm, gfn_t gfn) { unsigned long page_size; @@ -835,8 +846,7 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) page_size = kvm_host_page_size(kvm, gfn); - for (i = PT_PAGE_TABLE_LEVEL; - i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { + for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { if (page_size >= KVM_HPAGE_SIZE(i)) ret = i; else @@ -846,28 +856,43 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn) return ret; } +static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot, + bool no_dirty_log) +{ + if (!slot || slot->flags & KVM_MEMSLOT_INVALID) + return false; + if (no_dirty_log && slot->dirty_bitmap) + return false; + + return true; +} + static struct kvm_memory_slot * gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn, bool no_dirty_log) { struct kvm_memory_slot *slot; - slot = gfn_to_memslot(vcpu->kvm, gfn); - if (!slot || slot->flags & KVM_MEMSLOT_INVALID || - (no_dirty_log && slot->dirty_bitmap)) + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + if (!memslot_valid_for_gpte(slot, no_dirty_log)) slot = NULL; return slot; } -static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn) -{ - return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true); -} - -static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) +static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn, + bool *force_pt_level) { int host_level, level, max_level; + struct kvm_memory_slot *slot; + + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; + + slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn); + *force_pt_level = !memslot_valid_for_gpte(slot, true); + if (unlikely(*force_pt_level)) + return PT_PAGE_TABLE_LEVEL; host_level = host_mapping_level(vcpu->kvm, large_gfn); @@ -877,7 +902,7 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) max_level = min(kvm_x86_ops->get_lpage_level(), host_level); for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level) - if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) + if (__has_wrprotected_page(large_gfn, level, slot)) break; return level - 1; @@ -1019,12 +1044,14 @@ static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, /* * Take gfn and return the reverse mapping to it. 
*/ -static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) +static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, struct kvm_mmu_page *sp) { + struct kvm_memslots *slots; struct kvm_memory_slot *slot; - slot = gfn_to_memslot(kvm, gfn); - return __gfn_to_rmap(gfn, level, slot); + slots = kvm_memslots_for_spte_role(kvm, sp->role); + slot = __gfn_to_memslot(slots, gfn); + return __gfn_to_rmap(gfn, sp->role.level, slot); } static bool rmap_can_add(struct kvm_vcpu *vcpu) @@ -1042,7 +1069,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); return pte_list_add(vcpu, spte, rmapp); } @@ -1054,7 +1081,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) sp = page_header(__pa(spte)); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); + rmapp = gfn_to_rmap(kvm, gfn, sp); pte_list_remove(spte, rmapp); } @@ -1119,6 +1146,11 @@ static u64 *rmap_get_next(struct rmap_iterator *iter) return NULL; } +#define for_each_rmap_spte(_rmap_, _iter_, _spte_) \ + for (_spte_ = rmap_get_first(*_rmap_, _iter_); \ + _spte_ && ({BUG_ON(!is_shadow_present_pte(*_spte_)); 1;}); \ + _spte_ = rmap_get_next(_iter_)) + static void drop_spte(struct kvm *kvm, u64 *sptep) { if (mmu_spte_clear_track_bits(sptep)) @@ -1182,12 +1214,8 @@ static bool __rmap_write_protect(struct kvm *kvm, unsigned long *rmapp, struct rmap_iterator iter; bool flush = false; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - + for_each_rmap_spte(rmapp, &iter, sptep) flush |= spte_write_protect(kvm, sptep, pt_protect); - sptep = rmap_get_next(&iter); - } return flush; } @@ -1209,12 +1237,8 @@ static bool __rmap_clear_dirty(struct kvm *kvm, unsigned long *rmapp) struct rmap_iterator iter; bool flush = false; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - + for_each_rmap_spte(rmapp, &iter, sptep) flush |= spte_clear_dirty(kvm, sptep); - sptep = rmap_get_next(&iter); - } return flush; } @@ -1236,12 +1260,8 @@ static bool __rmap_set_dirty(struct kvm *kvm, unsigned long *rmapp) struct rmap_iterator iter; bool flush = false; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - + for_each_rmap_spte(rmapp, &iter, sptep) flush |= spte_set_dirty(kvm, sptep); - sptep = rmap_get_next(&iter); - } return flush; } @@ -1319,42 +1339,45 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm, kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask); } -static bool rmap_write_protect(struct kvm *kvm, u64 gfn) +static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) { struct kvm_memory_slot *slot; unsigned long *rmapp; int i; bool write_protected = false; - slot = gfn_to_memslot(kvm, gfn); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { + for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) { rmapp = __gfn_to_rmap(gfn, i, slot); - write_protected |= __rmap_write_protect(kvm, rmapp, true); + write_protected |= __rmap_write_protect(vcpu->kvm, rmapp, true); } return write_protected; } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, - struct kvm_memory_slot *slot, gfn_t gfn, int level, - unsigned long data) +static bool kvm_zap_rmapp(struct kvm 
*kvm, unsigned long *rmapp) { u64 *sptep; struct rmap_iterator iter; - int need_tlb_flush = 0; + bool flush = false; while ((sptep = rmap_get_first(*rmapp, &iter))) { BUG_ON(!(*sptep & PT_PRESENT_MASK)); - rmap_printk("kvm_rmap_unmap_hva: spte %p %llx gfn %llx (%d)\n", - sptep, *sptep, gfn, level); + rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep); drop_spte(kvm, sptep); - need_tlb_flush = 1; + flush = true; } - return need_tlb_flush; + return flush; +} + +static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, + struct kvm_memory_slot *slot, gfn_t gfn, int level, + unsigned long data) +{ + return kvm_zap_rmapp(kvm, rmapp); } static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, @@ -1371,8 +1394,8 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!is_shadow_present_pte(*sptep)); +restart: + for_each_rmap_spte(rmapp, &iter, sptep) { rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n", sptep, *sptep, gfn, level); @@ -1380,7 +1403,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, if (pte_write(*ptep)) { drop_spte(kvm, sptep); - sptep = rmap_get_first(*rmapp, &iter); + goto restart; } else { new_spte = *sptep & ~PT64_BASE_ADDR_MASK; new_spte |= (u64)new_pfn << PAGE_SHIFT; @@ -1391,7 +1414,6 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, mmu_spte_clear_track_bits(sptep); mmu_spte_set(sptep, new_spte); - sptep = rmap_get_next(&iter); } } @@ -1401,6 +1423,74 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, return 0; } +struct slot_rmap_walk_iterator { + /* input fields. */ + struct kvm_memory_slot *slot; + gfn_t start_gfn; + gfn_t end_gfn; + int start_level; + int end_level; + + /* output fields. */ + gfn_t gfn; + unsigned long *rmap; + int level; + + /* private field. 
*/ + unsigned long *end_rmap; +}; + +static void +rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) +{ + iterator->level = level; + iterator->gfn = iterator->start_gfn; + iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); + iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, + iterator->slot); +} + +static void +slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, + struct kvm_memory_slot *slot, int start_level, + int end_level, gfn_t start_gfn, gfn_t end_gfn) +{ + iterator->slot = slot; + iterator->start_level = start_level; + iterator->end_level = end_level; + iterator->start_gfn = start_gfn; + iterator->end_gfn = end_gfn; + + rmap_walk_init_level(iterator, iterator->start_level); +} + +static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator) +{ + return !!iterator->rmap; +} + +static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator) +{ + if (++iterator->rmap <= iterator->end_rmap) { + iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level)); + return; + } + + if (++iterator->level > iterator->end_level) { + iterator->rmap = NULL; + return; + } + + rmap_walk_init_level(iterator, iterator->level); +} + +#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_, \ + _start_gfn, _end_gfn, _iter_) \ + for (slot_rmap_walk_init(_iter_, _slot_, _start_level_, \ + _end_level_, _start_gfn, _end_gfn); \ + slot_rmap_walk_okay(_iter_); \ + slot_rmap_walk_next(_iter_)) + static int kvm_handle_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, @@ -1412,48 +1502,36 @@ static int kvm_handle_hva_range(struct kvm *kvm, int level, unsigned long data)) { - int j; - int ret = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + struct slot_rmap_walk_iterator iterator; + int ret = 0; + int i; - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) { - unsigned long hva_start, hva_end; - gfn_t gfn_start, gfn_end; - - hva_start = max(start, memslot->userspace_addr); - hva_end = min(end, memslot->userspace_addr + - (memslot->npages << PAGE_SHIFT)); - if (hva_start >= hva_end) - continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. - */ - gfn_start = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - for (j = PT_PAGE_TABLE_LEVEL; - j < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++j) { - unsigned long idx, idx_end; - unsigned long *rmapp; - gfn_t gfn = gfn_start; + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + unsigned long hva_start, hva_end; + gfn_t gfn_start, gfn_end; + hva_start = max(start, memslot->userspace_addr); + hva_end = min(end, memslot->userspace_addr + + (memslot->npages << PAGE_SHIFT)); + if (hva_start >= hva_end) + continue; /* - * {idx(page_j) | page_j intersects with - * [hva_start, hva_end)} = {idx, idx+1, ..., idx_end}. + * {gfn(page) | page intersects with [hva_start, hva_end)} = + * {gfn_start, gfn_start+1, ..., gfn_end-1}. 
*/ - idx = gfn_to_index(gfn_start, memslot->base_gfn, j); - idx_end = gfn_to_index(gfn_end - 1, memslot->base_gfn, j); - - rmapp = __gfn_to_rmap(gfn_start, j, memslot); - - for (; idx <= idx_end; - ++idx, gfn += (1UL << KVM_HPAGE_GFN_SHIFT(j))) - ret |= handler(kvm, rmapp++, memslot, - gfn, j, data); + gfn_start = hva_to_gfn_memslot(hva_start, memslot); + gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); + + for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, + gfn_start, gfn_end - 1, + &iterator) + ret |= handler(kvm, iterator.rmap, memslot, + iterator.gfn, iterator.level, data); } } @@ -1495,16 +1573,13 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, BUG_ON(!shadow_accessed_mask); - for (sptep = rmap_get_first(*rmapp, &iter); sptep; - sptep = rmap_get_next(&iter)) { - BUG_ON(!is_shadow_present_pte(*sptep)); - + for_each_rmap_spte(rmapp, &iter, sptep) if (*sptep & shadow_accessed_mask) { young = 1; clear_bit((ffs(shadow_accessed_mask) - 1), (unsigned long *)sptep); } - } + trace_kvm_age_page(gfn, level, slot, young); return young; } @@ -1525,15 +1600,11 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) goto out; - for (sptep = rmap_get_first(*rmapp, &iter); sptep; - sptep = rmap_get_next(&iter)) { - BUG_ON(!is_shadow_present_pte(*sptep)); - + for_each_rmap_spte(rmapp, &iter, sptep) if (*sptep & shadow_accessed_mask) { young = 1; break; } - } out: return young; } @@ -1547,7 +1618,7 @@ static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) sp = page_header(__pa(spte)); - rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp->role.level); + rmapp = gfn_to_rmap(vcpu->kvm, gfn, sp); kvm_unmap_rmapp(vcpu->kvm, rmapp, NULL, gfn, sp->role.level, 0); kvm_flush_remote_tlbs(vcpu->kvm); @@ -1967,7 +2038,7 @@ static void mmu_sync_children(struct kvm_vcpu *vcpu, bool protected = false; for_each_sp(pages, sp, parents, i) - protected |= rmap_write_protect(vcpu->kvm, sp->gfn); + protected |= rmap_write_protect(vcpu, sp->gfn); if (protected) kvm_flush_remote_tlbs(vcpu->kvm); @@ -2065,12 +2136,12 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu, hlist_add_head(&sp->hash_link, &vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]); if (!direct) { - if (rmap_write_protect(vcpu->kvm, gfn)) + if (rmap_write_protect(vcpu, gfn)) kvm_flush_remote_tlbs(vcpu->kvm); if (level > PT_PAGE_TABLE_LEVEL && need_sync) kvm_sync_pages(vcpu, gfn); - account_shadowed(vcpu->kvm, gfn); + account_shadowed(vcpu->kvm, sp); } sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen; init_shadow_page_table(sp); @@ -2251,7 +2322,7 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, struct kvm_mmu_page *sp, kvm_mmu_unlink_parents(kvm, sp); if (!sp->role.invalid && !sp->role.direct) - unaccount_shadowed(kvm, sp->gfn); + unaccount_shadowed(kvm, sp); if (sp->unsync) kvm_unlink_unsync_page(kvm, sp); @@ -2363,111 +2434,6 @@ int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) } EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page); -/* - * The function is based on mtrr_type_lookup() in - * arch/x86/kernel/cpu/mtrr/generic.c - */ -static int get_mtrr_type(struct mtrr_state_type *mtrr_state, - u64 start, u64 end) -{ - int i; - u64 base, mask; - u8 prev_match, curr_match; - int num_var_ranges = KVM_NR_VAR_MTRR; - - if (!mtrr_state->enabled) - return 0xFF; - - /* Make end inclusive end, instead of exclusive */ - end--; - - /* Look in fixed ranges. 
Just return the type as per start */ - if (mtrr_state->have_fixed && (start < 0x100000)) { - int idx; - - if (start < 0x80000) { - idx = 0; - idx += (start >> 16); - return mtrr_state->fixed_ranges[idx]; - } else if (start < 0xC0000) { - idx = 1 * 8; - idx += ((start - 0x80000) >> 14); - return mtrr_state->fixed_ranges[idx]; - } else if (start < 0x1000000) { - idx = 3 * 8; - idx += ((start - 0xC0000) >> 12); - return mtrr_state->fixed_ranges[idx]; - } - } - - /* - * Look in variable ranges - * Look of multiple ranges matching this address and pick type - * as per MTRR precedence - */ - if (!(mtrr_state->enabled & 2)) - return mtrr_state->def_type; - - prev_match = 0xFF; - for (i = 0; i < num_var_ranges; ++i) { - unsigned short start_state, end_state; - - if (!(mtrr_state->var_ranges[i].mask_lo & (1 << 11))) - continue; - - base = (((u64)mtrr_state->var_ranges[i].base_hi) << 32) + - (mtrr_state->var_ranges[i].base_lo & PAGE_MASK); - mask = (((u64)mtrr_state->var_ranges[i].mask_hi) << 32) + - (mtrr_state->var_ranges[i].mask_lo & PAGE_MASK); - - start_state = ((start & mask) == (base & mask)); - end_state = ((end & mask) == (base & mask)); - if (start_state != end_state) - return 0xFE; - - if ((start & mask) != (base & mask)) - continue; - - curr_match = mtrr_state->var_ranges[i].base_lo & 0xff; - if (prev_match == 0xFF) { - prev_match = curr_match; - continue; - } - - if (prev_match == MTRR_TYPE_UNCACHABLE || - curr_match == MTRR_TYPE_UNCACHABLE) - return MTRR_TYPE_UNCACHABLE; - - if ((prev_match == MTRR_TYPE_WRBACK && - curr_match == MTRR_TYPE_WRTHROUGH) || - (prev_match == MTRR_TYPE_WRTHROUGH && - curr_match == MTRR_TYPE_WRBACK)) { - prev_match = MTRR_TYPE_WRTHROUGH; - curr_match = MTRR_TYPE_WRTHROUGH; - } - - if (prev_match != curr_match) - return MTRR_TYPE_UNCACHABLE; - } - - if (prev_match != 0xFF) - return prev_match; - - return mtrr_state->def_type; -} - -u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn) -{ - u8 mtrr; - - mtrr = get_mtrr_type(&vcpu->arch.mtrr_state, gfn << PAGE_SHIFT, - (gfn << PAGE_SHIFT) + PAGE_SIZE); - if (mtrr == 0xfe || mtrr == 0xff) - mtrr = MTRR_TYPE_WRBACK; - return mtrr; -} -EXPORT_SYMBOL_GPL(kvm_get_guest_memory_type); - static void __kvm_unsync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) { trace_kvm_mmu_unsync_page(sp); @@ -2510,6 +2476,14 @@ static int mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn, return 0; } +static bool kvm_is_mmio_pfn(pfn_t pfn) +{ + if (pfn_valid(pfn)) + return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)); + + return true; +} + static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, unsigned pte_access, int level, gfn_t gfn, pfn_t pfn, bool speculative, @@ -2518,7 +2492,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, u64 spte; int ret = 0; - if (set_mmio_spte(vcpu->kvm, sptep, gfn, pfn, pte_access)) + if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access)) return 0; spte = PT_PRESENT_MASK; @@ -2537,7 +2511,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, spte |= PT_PAGE_SIZE_MASK; if (tdp_enabled) spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, - kvm_is_reserved_pfn(pfn)); + kvm_is_mmio_pfn(pfn)); if (host_writable) spte |= SPTE_HOST_WRITEABLE; @@ -2555,7 +2529,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, * be fixed if guest refault. 
*/ if (level > PT_PAGE_TABLE_LEVEL && - has_wrprotected_page(vcpu->kvm, gfn, level)) + has_wrprotected_page(vcpu, gfn, level)) goto done; spte |= PT_WRITABLE_MASK | SPTE_MMU_WRITEABLE; @@ -2579,7 +2553,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep, } if (pte_access & ACC_WRITE_MASK) { - mark_page_dirty(vcpu->kvm, gfn); + kvm_vcpu_mark_page_dirty(vcpu, gfn); spte |= shadow_dirty_mask; } @@ -2669,15 +2643,17 @@ static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu, u64 *start, u64 *end) { struct page *pages[PTE_PREFETCH_NUM]; + struct kvm_memory_slot *slot; unsigned access = sp->role.access; int i, ret; gfn_t gfn; gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt); - if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK)) + slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK); + if (!slot) return -1; - ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start); + ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start); if (ret <= 0) return -1; @@ -2795,7 +2771,7 @@ static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, pfn_t pfn) return 1; if (pfn == KVM_PFN_ERR_HWPOISON) { - kvm_send_hwpoison_signal(gfn_to_hva(vcpu->kvm, gfn), current); + kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current); return 0; } @@ -2818,7 +2794,7 @@ static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu, if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) && level == PT_PAGE_TABLE_LEVEL && PageTransCompound(pfn_to_page(pfn)) && - !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) { + !has_wrprotected_page(vcpu, gfn, PT_DIRECTORY_LEVEL)) { unsigned long mask; /* * mmu_notifier_retry was successful and we hold the @@ -2910,7 +2886,7 @@ fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, * Compare with set_spte where instead shadow_dirty_mask is set. */ if (cmpxchg64(sptep, spte, spte | PT_WRITABLE_MASK) == spte) - mark_page_dirty(vcpu->kvm, gfn); + kvm_vcpu_mark_page_dirty(vcpu, gfn); return true; } @@ -3006,14 +2982,13 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, { int r; int level; - int force_pt_level; + bool force_pt_level = false; pfn_t pfn; unsigned long mmu_seq; bool map_writable, write = error_code & PFERR_WRITE_MASK; - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); /* * This path builds a PAE pagetable - so we can map * 2mb pages at maximum. 
Therefore check if the level @@ -3023,8 +2998,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code, level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, v, level, error_code)) return 0; @@ -3312,6 +3286,25 @@ static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr, return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception); } +static bool +__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level) +{ + int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f; + + return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) | + ((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0); +} + +static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level) +{ + return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level); +} + +static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level) +{ + return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level); +} + static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) { if (direct) @@ -3320,37 +3313,69 @@ static bool quickly_check_mmio_pf(struct kvm_vcpu *vcpu, u64 addr, bool direct) return vcpu_match_mmio_gva(vcpu, addr); } -static u64 walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr) +/* return true if reserved bit is detected on spte. */ +static bool +walk_shadow_page_get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) { struct kvm_shadow_walk_iterator iterator; - u64 spte = 0ull; + u64 sptes[PT64_ROOT_LEVEL], spte = 0ull; + int root, leaf; + bool reserved = false; if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return spte; + goto exit; walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) + + for (shadow_walk_init(&iterator, vcpu, addr), + leaf = root = iterator.level; + shadow_walk_okay(&iterator); + __shadow_walk_next(&iterator, spte)) { + spte = mmu_spte_get_lockless(iterator.sptep); + + sptes[leaf - 1] = spte; + leaf--; + if (!is_shadow_present_pte(spte)) break; + + reserved |= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte, + iterator.level); + } + walk_shadow_page_lockless_end(vcpu); - return spte; + if (reserved) { + pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n", + __func__, addr); + while (root > leaf) { + pr_err("------ spte 0x%llx level %d.\n", + sptes[root - 1], root); + root--; + } + } +exit: + *sptep = spte; + return reserved; } -int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) +int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; + bool reserved; if (quickly_check_mmio_pf(vcpu, addr, direct)) return RET_MMIO_PF_EMULATE; - spte = walk_shadow_page_get_mmio_spte(vcpu, addr); + reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); + if (WARN_ON(reserved)) + return RET_MMIO_PF_BUG; if (is_mmio_spte(spte)) { gfn_t gfn = get_mmio_spte_gfn(spte); unsigned access = get_mmio_spte_access(spte); - if (!check_mmio_spte(vcpu->kvm, spte)) + if (!check_mmio_spte(vcpu, spte)) return RET_MMIO_PF_INVALID; if (direct) @@ -3367,17 +3392,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) */ return RET_MMIO_PF_RETRY; } -EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); - -static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, - u32 error_code, bool direct) -{ - int ret; - - ret = handle_mmio_page_fault_common(vcpu, addr, direct); - WARN_ON(ret == 
RET_MMIO_PF_BUG); - return ret; -} +EXPORT_SYMBOL_GPL(handle_mmio_page_fault); static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, bool prefault) @@ -3388,7 +3403,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gva, error_code, true); + r = handle_mmio_page_fault(vcpu, gva, true); if (likely(r != RET_MMIO_PF_INVALID)) return r; @@ -3415,12 +3430,12 @@ static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn) arch.direct_map = vcpu->arch.mmu.direct_map; arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu); - return kvm_setup_async_pf(vcpu, gva, gfn_to_hva(vcpu->kvm, gfn), &arch); + return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } static bool can_do_async_pf(struct kvm_vcpu *vcpu) { - if (unlikely(!irqchip_in_kernel(vcpu->kvm) || + if (unlikely(!lapic_in_kernel(vcpu) || kvm_event_needs_reinjection(vcpu))) return false; @@ -3430,10 +3445,12 @@ static bool can_do_async_pf(struct kvm_vcpu *vcpu) static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gva_t gva, pfn_t *pfn, bool write, bool *writable) { + struct kvm_memory_slot *slot; bool async; - *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable); - + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + async = false; + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable); if (!async) return false; /* *pfn has correct page already */ @@ -3447,18 +3464,27 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, return true; } - *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable); - + *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable); return false; } +static bool +check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level) +{ + int page_num = KVM_PAGES_PER_HPAGE(level); + + gfn &= ~(page_num - 1); + + return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num); +} + static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, bool prefault) { pfn_t pfn; int r; int level; - int force_pt_level; + bool force_pt_level; gfn_t gfn = gpa >> PAGE_SHIFT; unsigned long mmu_seq; int write = error_code & PFERR_WRITE_MASK; @@ -3467,7 +3493,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gpa, error_code, true); + r = handle_mmio_page_fault(vcpu, gpa, true); if (likely(r != RET_MMIO_PF_INVALID)) return r; @@ -3477,12 +3503,15 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, if (r) return r; - force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn); + force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn, + PT_DIRECTORY_LEVEL); + level = mapping_level(vcpu, gfn, &force_pt_level); if (likely(!force_pt_level)) { - level = mapping_level(vcpu, gfn); + if (level > PT_DIRECTORY_LEVEL && + !check_hugepage_cache_consistency(vcpu, gfn, level)) + level = PT_DIRECTORY_LEVEL; gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); - } else - level = PT_PAGE_TABLE_LEVEL; + } if (fast_page_fault(vcpu, gpa, level, error_code)) return 0; @@ -3545,7 +3574,7 @@ static void inject_page_fault(struct kvm_vcpu *vcpu, vcpu->arch.mmu.inject_page_fault(vcpu, fault); } -static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, +static bool 
sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn, unsigned access, int *nr_present) { if (unlikely(is_mmio_spte(*sptep))) { @@ -3555,7 +3584,7 @@ static bool sync_mmio_spte(struct kvm *kvm, u64 *sptep, gfn_t gfn, } (*nr_present)++; - mark_mmio_spte(kvm, sptep, gfn, access); + mark_mmio_spte(vcpu, sptep, gfn, access); return true; } @@ -3584,111 +3613,197 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gp #include "paging_tmpl.h" #undef PTTYPE -static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context) +static void +__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct rsvd_bits_validate *rsvd_check, + int maxphyaddr, int level, bool nx, bool gbpages, + bool pse, bool amd) { - int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; u64 gbpages_bit_rsvd = 0; u64 nonleaf_bit8_rsvd = 0; - context->bad_mt_xwr = 0; + rsvd_check->bad_mt_xwr = 0; - if (!context->nx) + if (!nx) exb_bit_rsvd = rsvd_bits(63, 63); - if (!guest_cpuid_has_gbpages(vcpu)) + if (!gbpages) gbpages_bit_rsvd = rsvd_bits(7, 7); /* * Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for * leaf entries) on AMD CPUs only. */ - if (guest_cpuid_is_amd(vcpu)) + if (amd) nonleaf_bit8_rsvd = rsvd_bits(8, 8); - switch (context->root_level) { + switch (level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ - context->rsvd_bits_mask[0][1] = 0; - context->rsvd_bits_mask[0][0] = 0; - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[0][1] = 0; + rsvd_check->rsvd_bits_mask[0][0] = 0; + rsvd_check->rsvd_bits_mask[1][0] = + rsvd_check->rsvd_bits_mask[0][0]; - if (!is_pse(vcpu)) { - context->rsvd_bits_mask[1][1] = 0; + if (!pse) { + rsvd_check->rsvd_bits_mask[1][1] = 0; break; } if (is_cpuid_PSE36()) /* 36bits PSE 4MB page */ - context->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21); else /* 32 bits PSE 4MB page */ - context->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21); break; case PT32E_ROOT_LEVEL: - context->rsvd_bits_mask[0][2] = + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 63) | rsvd_bits(5, 8) | rsvd_bits(1, 2); /* PDPTE */ - context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PDE */ - context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62); /* PTE */ - context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 62) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[1][0] = + rsvd_check->rsvd_bits_mask[0][0]; break; case PT64_ROOT_LEVEL: - context->rsvd_bits_mask[0][3] = exb_bit_rsvd | - nonleaf_bit8_rsvd | rsvd_bits(7, 7) | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][2] = exb_bit_rsvd | - nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd | + nonleaf_bit8_rsvd | rsvd_bits(7, 7) | + rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd | + nonleaf_bit8_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51); - context->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51); - 
context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; - context->rsvd_bits_mask[1][2] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd | + rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[1][3] = + rsvd_check->rsvd_bits_mask[0][3]; + rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd | gbpages_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 29); - context->rsvd_bits_mask[1][1] = exb_bit_rsvd | + rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd | rsvd_bits(maxphyaddr, 51) | rsvd_bits(13, 20); /* large page */ - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; + rsvd_check->rsvd_bits_mask[1][0] = + rsvd_check->rsvd_bits_mask[0][0]; break; } } -static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, bool execonly) +static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) { - int maxphyaddr = cpuid_maxphyaddr(vcpu); - int pte; + __reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check, + cpuid_maxphyaddr(vcpu), context->root_level, + context->nx, guest_cpuid_has_gbpages(vcpu), + is_pse(vcpu), guest_cpuid_is_amd(vcpu)); +} + +static void +__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check, + int maxphyaddr, bool execonly) +{ + u64 bad_mt_xwr; - context->rsvd_bits_mask[0][3] = + rsvd_check->rsvd_bits_mask[0][3] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 7); - context->rsvd_bits_mask[0][2] = + rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - context->rsvd_bits_mask[0][1] = + rsvd_check->rsvd_bits_mask[0][1] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(3, 6); - context->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); + rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51); /* large page */ - context->rsvd_bits_mask[1][3] = context->rsvd_bits_mask[0][3]; - context->rsvd_bits_mask[1][2] = + rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3]; + rsvd_check->rsvd_bits_mask[1][2] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 29); - context->rsvd_bits_mask[1][1] = + rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(maxphyaddr, 51) | rsvd_bits(12, 20); - context->rsvd_bits_mask[1][0] = context->rsvd_bits_mask[0][0]; - - for (pte = 0; pte < 64; pte++) { - int rwx_bits = pte & 7; - int mt = pte >> 3; - if (mt == 0x2 || mt == 0x3 || mt == 0x7 || - rwx_bits == 0x2 || rwx_bits == 0x6 || - (rwx_bits == 0x4 && !execonly)) - context->bad_mt_xwr |= (1ull << pte); + rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0]; + + bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */ + bad_mt_xwr |= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */ + bad_mt_xwr |= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */ + if (!execonly) { + /* bits 0..2 must not be 100 unless VMX capabilities allow it */ + bad_mt_xwr |= REPEAT_BYTE(1ull << 4); } + rsvd_check->bad_mt_xwr = bad_mt_xwr; +} + +static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + __reset_rsvds_bits_mask_ept(&context->guest_rsvd_check, + cpuid_maxphyaddr(vcpu), execonly); +} + +/* + * the page table on host is the shadow page table for the page + * table in guest or amd nested guest, its mmu features completely + * follow the features in guest. 
+ */ +void +reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context) +{ + bool uses_nx = context->nx || context->base_role.smep_andnot_wp; + + /* + * Passing "true" to the last argument is okay; it adds a check + * on bit 8 of the SPTEs which KVM doesn't use anyway. + */ + __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, + context->shadow_root_level, uses_nx, + guest_cpuid_has_gbpages(vcpu), is_pse(vcpu), + true); +} +EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask); + +static inline bool boot_cpu_is_amd(void) +{ + WARN_ON_ONCE(!tdp_enabled); + return shadow_x_mask == 0; +} + +/* + * the direct page table on host, use as much mmu features as + * possible, however, kvm currently does not do execution-protection. + */ +static void +reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context) +{ + if (boot_cpu_is_amd()) + __reset_rsvds_bits_mask(vcpu, &context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, + context->shadow_root_level, false, + cpu_has_gbpages, true, true); + else + __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, + false); + +} + +/* + * as the comments in reset_shadow_zero_bits_mask() except it + * is the shadow page table for intel nested guest. + */ +static void +reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, + struct kvm_mmu *context, bool execonly) +{ + __reset_rsvds_bits_mask_ept(&context->shadow_zero_check, + boot_cpu_data.x86_phys_bits, execonly); } static void update_permission_bitmask(struct kvm_vcpu *vcpu, @@ -3833,6 +3948,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) struct kvm_mmu *context = &vcpu->arch.mmu; context->base_role.word = 0; + context->base_role.smm = is_smm(vcpu); context->page_fault = tdp_page_fault; context->sync_page = nonpaging_sync_page; context->invlpg = nonpaging_invlpg; @@ -3868,6 +3984,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) update_permission_bitmask(vcpu, context, false); update_last_pte_bitmap(vcpu, context); + reset_tdp_shadow_zero_bits_mask(vcpu, context); } void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) @@ -3894,6 +4011,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) = smep && !is_write_protection(vcpu); context->base_role.smap_andnot_wp = smap && !is_write_protection(vcpu); + context->base_role.smm = is_smm(vcpu); + reset_shadow_zero_bits_mask(vcpu, context); } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); @@ -3917,6 +4036,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) update_permission_bitmask(vcpu, context, true); reset_rsvds_bits_mask_ept(vcpu, context, execonly); + reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu); @@ -4065,7 +4185,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ *gpa &= ~(gpa_t)7; *bytes = 8; - r = kvm_read_guest(vcpu->kvm, *gpa, &gentry, 8); + r = kvm_vcpu_read_guest(vcpu, *gpa, &gentry, 8); if (r) gentry = 0; new = (const u8 *)&gentry; @@ -4177,6 +4297,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mask.nxe = 1; mask.smep_andnot_wp = 1; mask.smap_andnot_wp = 1; + mask.smm = 1; /* * If we don't have indirect shadow pages, it means no page is @@ -4375,36 +4496,115 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } -void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot) +/* The return value indicates if tlb flush on all vcpus is 
needed. */ +typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap); + +/* The caller should hold mmu-lock before calling this function. */ +static bool +slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) { - gfn_t last_gfn; - int i; + struct slot_rmap_walk_iterator iterator; bool flush = false; - last_gfn = memslot->base_gfn + memslot->npages - 1; + for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn, + end_gfn, &iterator) { + if (iterator.rmap) + flush |= fn(kvm, iterator.rmap); - spin_lock(&kvm->mmu_lock); + if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } + cond_resched_lock(&kvm->mmu_lock); + } + } + + if (flush && lock_flush_tlb) { + kvm_flush_remote_tlbs(kvm); + flush = false; + } - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - unsigned long *rmapp; - unsigned long last_index, index; + return flush; +} + +static bool +slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, int start_level, int end_level, + bool lock_flush_tlb) +{ + return slot_handle_level_range(kvm, memslot, fn, start_level, + end_level, memslot->base_gfn, + memslot->base_gfn + memslot->npages - 1, + lock_flush_tlb); +} + +static bool +slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static bool +slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1, + PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); +} + +static bool +slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, + slot_level_handler fn, bool lock_flush_tlb) +{ + return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL, + PT_PAGE_TABLE_LEVEL, lock_flush_tlb); +} - rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); +void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) +{ + struct kvm_memslots *slots; + struct kvm_memory_slot *memslot; + int i; - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_write_protect(kvm, rmapp, - false); + spin_lock(&kvm->mmu_lock); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + kvm_for_each_memslot(memslot, slots) { + gfn_t start, end; + + start = max(gfn_start, memslot->base_gfn); + end = min(gfn_end, memslot->base_gfn + memslot->npages); + if (start >= end) + continue; - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); + slot_handle_level_range(kvm, memslot, kvm_zap_rmapp, + PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL, + start, end - 1, true); } } spin_unlock(&kvm->mmu_lock); +} + +static bool slot_rmap_write_protect(struct kvm *kvm, unsigned long *rmapp) +{ + return __rmap_write_protect(kvm, rmapp, false); +} + +void kvm_mmu_slot_remove_write_access(struct kvm *kvm, + struct kvm_memory_slot *memslot) +{ + bool flush; + + spin_lock(&kvm->mmu_lock); + flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect, + false); + spin_unlock(&kvm->mmu_lock); /* * 
kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log() @@ -4437,9 +4637,8 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, pfn_t pfn; struct kvm_mmu_page *sp; - for (sptep = rmap_get_first(*rmapp, &iter); sptep;) { - BUG_ON(!(*sptep & PT_PRESENT_MASK)); - +restart: + for_each_rmap_spte(rmapp, &iter, sptep) { sp = page_header(__pa(sptep)); pfn = spte_to_pfn(*sptep); @@ -4454,71 +4653,31 @@ static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, !kvm_is_reserved_pfn(pfn) && PageTransCompound(pfn_to_page(pfn))) { drop_spte(kvm, sptep); - sptep = rmap_get_first(*rmapp, &iter); need_tlb_flush = 1; - } else - sptep = rmap_get_next(&iter); + goto restart; + } } return need_tlb_flush; } void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *memslot) { - bool flush = false; - unsigned long *rmapp; - unsigned long last_index, index; - + /* FIXME: const-ify all uses of struct kvm_memory_slot. */ spin_lock(&kvm->mmu_lock); - - rmapp = memslot->arch.rmap[0]; - last_index = gfn_to_index(memslot->base_gfn + memslot->npages - 1, - memslot->base_gfn, PT_PAGE_TABLE_LEVEL); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= kvm_mmu_zap_collapsible_spte(kvm, rmapp); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) { - if (flush) { - kvm_flush_remote_tlbs(kvm); - flush = false; - } - cond_resched_lock(&kvm->mmu_lock); - } - } - - if (flush) - kvm_flush_remote_tlbs(kvm); - + slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot, + kvm_mmu_zap_collapsible_spte, true); spin_unlock(&kvm->mmu_lock); } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - unsigned long *rmapp; - unsigned long last_index, index; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages - 1; + bool flush; spin_lock(&kvm->mmu_lock); - - rmapp = memslot->arch.rmap[PT_PAGE_TABLE_LEVEL - 1]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, - PT_PAGE_TABLE_LEVEL); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_clear_dirty(kvm, rmapp); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - + flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false); spin_unlock(&kvm->mmu_lock); lockdep_assert_held(&kvm->slots_lock); @@ -4537,31 +4696,11 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty); void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm, struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - int i; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages - 1; + bool flush; spin_lock(&kvm->mmu_lock); - - for (i = PT_PAGE_TABLE_LEVEL + 1; /* skip rmap for 4K page */ - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - unsigned long *rmapp; - unsigned long last_index, index; - - rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_write_protect(kvm, rmapp, - false); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - } + flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect, + false); spin_unlock(&kvm->mmu_lock); /* see kvm_mmu_slot_remove_write_access */ @@ -4575,31 +4714,10 @@ EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access); void kvm_mmu_slot_set_dirty(struct kvm *kvm, 
struct kvm_memory_slot *memslot) { - gfn_t last_gfn; - int i; - bool flush = false; - - last_gfn = memslot->base_gfn + memslot->npages - 1; + bool flush; spin_lock(&kvm->mmu_lock); - - for (i = PT_PAGE_TABLE_LEVEL; - i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - unsigned long *rmapp; - unsigned long last_index, index; - - rmapp = memslot->arch.rmap[i - PT_PAGE_TABLE_LEVEL]; - last_index = gfn_to_index(last_gfn, memslot->base_gfn, i); - - for (index = 0; index <= last_index; ++index, ++rmapp) { - if (*rmapp) - flush |= __rmap_set_dirty(kvm, rmapp); - - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - } - } - + flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false); spin_unlock(&kvm->mmu_lock); lockdep_assert_held(&kvm->slots_lock); @@ -4696,13 +4814,13 @@ static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm) return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages)); } -void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm) +void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) { /* * The very rare case: if the generation-number is round, * zap all shadow pages. */ - if (unlikely(kvm_current_mmio_generation(kvm) == 0)) { + if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } @@ -4824,41 +4942,22 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm) unsigned int nr_pages = 0; struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + int i; - slots = kvm_memslots(kvm); + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); - kvm_for_each_memslot(memslot, slots) - nr_pages += memslot->npages; + kvm_for_each_memslot(memslot, slots) + nr_pages += memslot->npages; + } nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; nr_mmu_pages = max(nr_mmu_pages, - (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); + (unsigned int) KVM_MIN_ALLOC_MMU_PAGES); return nr_mmu_pages; } -int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]) -{ - struct kvm_shadow_walk_iterator iterator; - u64 spte; - int nr_sptes = 0; - - if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) - return nr_sptes; - - walk_shadow_page_lockless_begin(vcpu); - for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) { - sptes[iterator.level-1] = spte; - nr_sptes++; - if (!is_shadow_present_pte(spte)) - break; - } - walk_shadow_page_lockless_end(vcpu); - - return nr_sptes; -} -EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); - void kvm_mmu_destroy(struct kvm_vcpu *vcpu) { kvm_mmu_unload(vcpu); |