diff options
Diffstat (limited to 'kernel/arch/x86/mm/tlb.c')
-rw-r--r-- | kernel/arch/x86/mm/tlb.c | 325 |
1 files changed, 325 insertions, 0 deletions
diff --git a/kernel/arch/x86/mm/tlb.c b/kernel/arch/x86/mm/tlb.c new file mode 100644 index 000000000..3250f2371 --- /dev/null +++ b/kernel/arch/x86/mm/tlb.c @@ -0,0 +1,325 @@ +#include <linux/init.h> + +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/cpu.h> + +#include <asm/tlbflush.h> +#include <asm/mmu_context.h> +#include <asm/cache.h> +#include <asm/apic.h> +#include <asm/uv/uv.h> +#include <linux/debugfs.h> + +/* + * Smarter SMP flushing macros. + * c/o Linus Torvalds. + * + * These mean you can really definitely utterly forget about + * writing to user space from interrupts. (Its not allowed anyway). + * + * Optimizations Manfred Spraul <manfred@colorfullife.com> + * + * More scalable flush, from Andi Kleen + * + * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi + */ + +struct flush_tlb_info { + struct mm_struct *flush_mm; + unsigned long flush_start; + unsigned long flush_end; +}; + +/* + * We cannot call mmdrop() because we are in interrupt context, + * instead update mm->cpu_vm_mask. + */ +void leave_mm(int cpu) +{ + struct mm_struct *active_mm = this_cpu_read(cpu_tlbstate.active_mm); + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) + BUG(); + if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { + cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); + load_cr3(swapper_pg_dir); + /* + * This gets called in the idle path where RCU + * functions differently. Tracing normally + * uses RCU, so we have to call the tracepoint + * specially here. + */ + trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + } +} +EXPORT_SYMBOL_GPL(leave_mm); + +/* + * The flush IPI assumes that a thread switch happens in this order: + * [cpu0: the cpu that switches] + * 1) switch_mm() either 1a) or 1b) + * 1a) thread switch to a different mm + * 1a1) set cpu_tlbstate to TLBSTATE_OK + * Now the tlb flush NMI handler flush_tlb_func won't call leave_mm + * if cpu0 was in lazy tlb mode. + * 1a2) update cpu active_mm + * Now cpu0 accepts tlb flushes for the new mm. + * 1a3) cpu_set(cpu, new_mm->cpu_vm_mask); + * Now the other cpus will send tlb flush ipis. + * 1a4) change cr3. + * 1a5) cpu_clear(cpu, old_mm->cpu_vm_mask); + * Stop ipi delivery for the old mm. This is not synchronized with + * the other cpus, but flush_tlb_func ignore flush ipis for the wrong + * mm, and in the worst case we perform a superfluous tlb flush. + * 1b) thread switch without mm change + * cpu active_mm is correct, cpu0 already handles flush ipis. + * 1b1) set cpu_tlbstate to TLBSTATE_OK + * 1b2) test_and_set the cpu bit in cpu_vm_mask. + * Atomically set the bit [other cpus will start sending flush ipis], + * and test the bit. + * 1b3) if the bit was 0: leave_mm was called, flush the tlb. + * 2) switch %%esp, ie current + * + * The interrupt must handle 2 special cases: + * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm. + * - the cpu performs speculative tlb reads, i.e. even if the cpu only + * runs in kernel space, the cpu could load tlb entries for user space + * pages. + * + * The good news is that cpu_tlbstate is local to each cpu, no + * write/read ordering problems. + */ + +/* + * TLB flush funcation: + * 1) Flush the tlb entries if the cpu uses the mm that's being flushed. + * 2) Leave the mm if we are in the lazy tlb mode. + */ +static void flush_tlb_func(void *info) +{ + struct flush_tlb_info *f = info; + + inc_irq_stat(irq_tlb_count); + + if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) + return; + if (!f->flush_end) + f->flush_end = f->flush_start + PAGE_SIZE; + + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { + if (f->flush_end == TLB_FLUSH_ALL) { + local_flush_tlb(); + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL); + } else { + unsigned long addr; + unsigned long nr_pages = + f->flush_end - f->flush_start / PAGE_SIZE; + addr = f->flush_start; + while (addr < f->flush_end) { + __flush_tlb_single(addr); + addr += PAGE_SIZE; + } + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, nr_pages); + } + } else + leave_mm(smp_processor_id()); + +} + +void native_flush_tlb_others(const struct cpumask *cpumask, + struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + struct flush_tlb_info info; + info.flush_mm = mm; + info.flush_start = start; + info.flush_end = end; + + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + if (is_uv_system()) { + unsigned int cpu; + + cpu = smp_processor_id(); + cpumask = uv_flush_tlb_others(cpumask, mm, start, end, cpu); + if (cpumask) + smp_call_function_many(cpumask, flush_tlb_func, + &info, 1); + return; + } + smp_call_function_many(cpumask, flush_tlb_func, &info, 1); +} + +void flush_tlb_current_task(void) +{ + struct mm_struct *mm = current->mm; + + preempt_disable(); + + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + local_flush_tlb(); + trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); + preempt_enable(); +} + +/* + * See Documentation/x86/tlb.txt for details. We choose 33 + * because it is large enough to cover the vast majority (at + * least 95%) of allocations, and is small enough that we are + * confident it will not cause too much overhead. Each single + * flush is about 100 ns, so this caps the maximum overhead at + * _about_ 3,000 ns. + * + * This is in units of pages. + */ +static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; + +void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + unsigned long end, unsigned long vmflag) +{ + unsigned long addr; + /* do a global flush by default */ + unsigned long base_pages_to_flush = TLB_FLUSH_ALL; + + preempt_disable(); + if (current->active_mm != mm) + goto out; + + if (!current->mm) { + leave_mm(smp_processor_id()); + goto out; + } + + if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) + base_pages_to_flush = (end - start) >> PAGE_SHIFT; + + if (base_pages_to_flush > tlb_single_page_flush_ceiling) { + base_pages_to_flush = TLB_FLUSH_ALL; + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); + local_flush_tlb(); + } else { + /* flush range by one by one 'invlpg' */ + for (addr = start; addr < end; addr += PAGE_SIZE) { + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); + __flush_tlb_single(addr); + } + } + trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush); +out: + if (base_pages_to_flush == TLB_FLUSH_ALL) { + start = 0UL; + end = TLB_FLUSH_ALL; + } + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, start, end); + preempt_enable(); +} + +void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) +{ + struct mm_struct *mm = vma->vm_mm; + + preempt_disable(); + + if (current->active_mm == mm) { + if (current->mm) + __flush_tlb_one(start); + else + leave_mm(smp_processor_id()); + } + + if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) + flush_tlb_others(mm_cpumask(mm), mm, start, 0UL); + + preempt_enable(); +} + +static void do_flush_tlb_all(void *info) +{ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + __flush_tlb_all(); + if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) + leave_mm(smp_processor_id()); +} + +void flush_tlb_all(void) +{ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + on_each_cpu(do_flush_tlb_all, NULL, 1); +} + +static void do_kernel_range_flush(void *info) +{ + struct flush_tlb_info *f = info; + unsigned long addr; + + /* flush range by one by one 'invlpg' */ + for (addr = f->flush_start; addr < f->flush_end; addr += PAGE_SIZE) + __flush_tlb_single(addr); +} + +void flush_tlb_kernel_range(unsigned long start, unsigned long end) +{ + + /* Balance as user space task's flush, a bit conservative */ + if (end == TLB_FLUSH_ALL || + (end - start) > tlb_single_page_flush_ceiling * PAGE_SIZE) { + on_each_cpu(do_flush_tlb_all, NULL, 1); + } else { + struct flush_tlb_info info; + info.flush_start = start; + info.flush_end = end; + on_each_cpu(do_kernel_range_flush, &info, 1); + } +} + +static ssize_t tlbflush_read_file(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[32]; + unsigned int len; + + len = sprintf(buf, "%ld\n", tlb_single_page_flush_ceiling); + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +static ssize_t tlbflush_write_file(struct file *file, + const char __user *user_buf, size_t count, loff_t *ppos) +{ + char buf[32]; + ssize_t len; + int ceiling; + + len = min(count, sizeof(buf) - 1); + if (copy_from_user(buf, user_buf, len)) + return -EFAULT; + + buf[len] = '\0'; + if (kstrtoint(buf, 0, &ceiling)) + return -EINVAL; + + if (ceiling < 0) + return -EINVAL; + + tlb_single_page_flush_ceiling = ceiling; + return count; +} + +static const struct file_operations fops_tlbflush = { + .read = tlbflush_read_file, + .write = tlbflush_write_file, + .llseek = default_llseek, +}; + +static int __init create_tlb_single_page_flush_ceiling(void) +{ + debugfs_create_file("tlb_single_page_flush_ceiling", S_IRUSR | S_IWUSR, + arch_debugfs_dir, NULL, &fops_tlbflush); + return 0; +} +late_initcall(create_tlb_single_page_flush_ceiling); |