From 9ca8dbcc65cfc63d6f5ef3312a33184e1d726e00 Mon Sep 17 00:00:00 2001 From: Yunhong Jiang Date: Tue, 4 Aug 2015 12:17:53 -0700 Subject: Add the rt linux 4.1.3-rt3 as base Import the rt linux 4.1.3-rt3 as OPNFV kvm base. It's from git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git linux-4.1.y-rt and the base is: commit 0917f823c59692d751951bf5ea699a2d1e2f26a2 Author: Sebastian Andrzej Siewior Date: Sat Jul 25 12:13:34 2015 +0200 Prepare v4.1.3-rt3 Signed-off-by: Sebastian Andrzej Siewior We lose all the git history this way and it's not good. We should apply another opnfv project repo in future. Change-Id: I87543d81c9df70d99c5001fbdf646b202c19f423 Signed-off-by: Yunhong Jiang --- kernel/arch/alpha/mm/Makefile | 9 ++ kernel/arch/alpha/mm/extable.c | 92 ++++++++++++ kernel/arch/alpha/mm/fault.c | 259 +++++++++++++++++++++++++++++++++ kernel/arch/alpha/mm/init.c | 300 ++++++++++++++++++++++++++++++++++++++ kernel/arch/alpha/mm/numa.c | 321 +++++++++++++++++++++++++++++++++++++++++ 5 files changed, 981 insertions(+) create mode 100644 kernel/arch/alpha/mm/Makefile create mode 100644 kernel/arch/alpha/mm/extable.c create mode 100644 kernel/arch/alpha/mm/fault.c create mode 100644 kernel/arch/alpha/mm/init.c create mode 100644 kernel/arch/alpha/mm/numa.c (limited to 'kernel/arch/alpha/mm') diff --git a/kernel/arch/alpha/mm/Makefile b/kernel/arch/alpha/mm/Makefile new file mode 100644 index 000000000..c993d3f93 --- /dev/null +++ b/kernel/arch/alpha/mm/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for the linux alpha-specific parts of the memory manager. +# + +ccflags-y := -Werror + +obj-y := init.o fault.o extable.o + +obj-$(CONFIG_DISCONTIGMEM) += numa.o diff --git a/kernel/arch/alpha/mm/extable.c b/kernel/arch/alpha/mm/extable.c new file mode 100644 index 000000000..813c9b63c --- /dev/null +++ b/kernel/arch/alpha/mm/extable.c @@ -0,0 +1,92 @@ +/* + * linux/arch/alpha/mm/extable.c + */ + +#include +#include +#include + +static inline unsigned long ex_to_addr(const struct exception_table_entry *x) +{ + return (unsigned long)&x->insn + x->insn; +} + +static void swap_ex(void *a, void *b, int size) +{ + struct exception_table_entry *ex_a = a, *ex_b = b; + unsigned long addr_a = ex_to_addr(ex_a), addr_b = ex_to_addr(ex_b); + unsigned int t = ex_a->fixup.unit; + + ex_a->fixup.unit = ex_b->fixup.unit; + ex_b->fixup.unit = t; + ex_a->insn = (int)(addr_b - (unsigned long)&ex_a->insn); + ex_b->insn = (int)(addr_a - (unsigned long)&ex_b->insn); +} + +/* + * The exception table needs to be sorted so that the binary + * search that we use to find entries in it works properly. + * This is used both for the kernel exception table and for + * the exception tables of modules that get loaded. + */ +static int cmp_ex(const void *a, const void *b) +{ + const struct exception_table_entry *x = a, *y = b; + + /* avoid overflow */ + if (ex_to_addr(x) > ex_to_addr(y)) + return 1; + if (ex_to_addr(x) < ex_to_addr(y)) + return -1; + return 0; +} + +void sort_extable(struct exception_table_entry *start, + struct exception_table_entry *finish) +{ + sort(start, finish - start, sizeof(struct exception_table_entry), + cmp_ex, swap_ex); +} + +#ifdef CONFIG_MODULES +/* + * Any entry referring to the module init will be at the beginning or + * the end. + */ +void trim_init_extable(struct module *m) +{ + /*trim the beginning*/ + while (m->num_exentries && + within_module_init(ex_to_addr(&m->extable[0]), m)) { + m->extable++; + m->num_exentries--; + } + /*trim the end*/ + while (m->num_exentries && + within_module_init(ex_to_addr(&m->extable[m->num_exentries-1]), + m)) + m->num_exentries--; +} +#endif /* CONFIG_MODULES */ + +const struct exception_table_entry * +search_extable(const struct exception_table_entry *first, + const struct exception_table_entry *last, + unsigned long value) +{ + while (first <= last) { + const struct exception_table_entry *mid; + unsigned long mid_value; + + mid = (last - first) / 2 + first; + mid_value = ex_to_addr(mid); + if (mid_value == value) + return mid; + else if (mid_value < value) + first = mid+1; + else + last = mid-1; + } + + return NULL; +} diff --git a/kernel/arch/alpha/mm/fault.c b/kernel/arch/alpha/mm/fault.c new file mode 100644 index 000000000..4a905bd66 --- /dev/null +++ b/kernel/arch/alpha/mm/fault.c @@ -0,0 +1,259 @@ +/* + * linux/arch/alpha/mm/fault.c + * + * Copyright (C) 1995 Linus Torvalds + */ + +#include +#include +#include +#include + +#define __EXTERN_INLINE inline +#include +#include +#undef __EXTERN_INLINE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern void die_if_kernel(char *,struct pt_regs *,long, unsigned long *); + + +/* + * Force a new ASN for a task. + */ + +#ifndef CONFIG_SMP +unsigned long last_asn = ASN_FIRST_VERSION; +#endif + +void +__load_new_mm_context(struct mm_struct *next_mm) +{ + unsigned long mmc; + struct pcb_struct *pcb; + + mmc = __get_new_mm_context(next_mm, smp_processor_id()); + next_mm->context[smp_processor_id()] = mmc; + + pcb = ¤t_thread_info()->pcb; + pcb->asn = mmc & HARDWARE_ASN_MASK; + pcb->ptbr = ((unsigned long) next_mm->pgd - IDENT_ADDR) >> PAGE_SHIFT; + + __reload_thread(pcb); +} + + +/* + * This routine handles page faults. It determines the address, + * and the problem, and then passes it off to handle_mm_fault(). + * + * mmcsr: + * 0 = translation not valid + * 1 = access violation + * 2 = fault-on-read + * 3 = fault-on-execute + * 4 = fault-on-write + * + * cause: + * -1 = instruction fetch + * 0 = load + * 1 = store + * + * Registers $9 through $15 are saved in a block just prior to `regs' and + * are saved and restored around the call to allow exception code to + * modify them. + */ + +/* Macro for exception fixup code to access integer registers. */ +#define dpf_reg(r) \ + (((unsigned long *)regs)[(r) <= 8 ? (r) : (r) <= 15 ? (r)-16 : \ + (r) <= 18 ? (r)+8 : (r)-10]) + +asmlinkage void +do_page_fault(unsigned long address, unsigned long mmcsr, + long cause, struct pt_regs *regs) +{ + struct vm_area_struct * vma; + struct mm_struct *mm = current->mm; + const struct exception_table_entry *fixup; + int fault, si_code = SEGV_MAPERR; + siginfo_t info; + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE; + + /* As of EV6, a load into $31/$f31 is a prefetch, and never faults + (or is suppressed by the PALcode). Support that for older CPUs + by ignoring such an instruction. */ + if (cause == 0) { + unsigned int insn; + __get_user(insn, (unsigned int __user *)regs->pc); + if ((insn >> 21 & 0x1f) == 0x1f && + /* ldq ldl ldt lds ldg ldf ldwu ldbu */ + (1ul << (insn >> 26) & 0x30f00001400ul)) { + regs->pc += 4; + return; + } + } + + /* If we're in an interrupt context, or have no user context, + we must not take the fault. */ + if (!mm || faulthandler_disabled()) + goto no_context; + +#ifdef CONFIG_ALPHA_LARGE_VMALLOC + if (address >= TASK_SIZE) + goto vmalloc_fault; +#endif + if (user_mode(regs)) + flags |= FAULT_FLAG_USER; +retry: + down_read(&mm->mmap_sem); + vma = find_vma(mm, address); + if (!vma) + goto bad_area; + if (vma->vm_start <= address) + goto good_area; + if (!(vma->vm_flags & VM_GROWSDOWN)) + goto bad_area; + if (expand_stack(vma, address)) + goto bad_area; + + /* Ok, we have a good vm_area for this memory access, so + we can handle it. */ + good_area: + si_code = SEGV_ACCERR; + if (cause < 0) { + if (!(vma->vm_flags & VM_EXEC)) + goto bad_area; + } else if (!cause) { + /* Allow reads even for write-only mappings */ + if (!(vma->vm_flags & (VM_READ | VM_WRITE))) + goto bad_area; + } else { + if (!(vma->vm_flags & VM_WRITE)) + goto bad_area; + flags |= FAULT_FLAG_WRITE; + } + + /* If for any reason at all we couldn't handle the fault, + make sure we exit gracefully rather than endlessly redo + the fault. */ + fault = handle_mm_fault(mm, vma, address, flags); + + if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) + return; + + if (unlikely(fault & VM_FAULT_ERROR)) { + if (fault & VM_FAULT_OOM) + goto out_of_memory; + else if (fault & VM_FAULT_SIGSEGV) + goto bad_area; + else if (fault & VM_FAULT_SIGBUS) + goto do_sigbus; + BUG(); + } + + if (flags & FAULT_FLAG_ALLOW_RETRY) { + if (fault & VM_FAULT_MAJOR) + current->maj_flt++; + else + current->min_flt++; + if (fault & VM_FAULT_RETRY) { + flags &= ~FAULT_FLAG_ALLOW_RETRY; + + /* No need to up_read(&mm->mmap_sem) as we would + * have already released it in __lock_page_or_retry + * in mm/filemap.c. + */ + + goto retry; + } + } + + up_read(&mm->mmap_sem); + + return; + + /* Something tried to access memory that isn't in our memory map. + Fix it, but check if it's kernel or user first. */ + bad_area: + up_read(&mm->mmap_sem); + + if (user_mode(regs)) + goto do_sigsegv; + + no_context: + /* Are we prepared to handle this fault as an exception? */ + if ((fixup = search_exception_tables(regs->pc)) != 0) { + unsigned long newpc; + newpc = fixup_exception(dpf_reg, fixup, regs->pc); + regs->pc = newpc; + return; + } + + /* Oops. The kernel tried to access some bad page. We'll have to + terminate things with extreme prejudice. */ + printk(KERN_ALERT "Unable to handle kernel paging request at " + "virtual address %016lx\n", address); + die_if_kernel("Oops", regs, cause, (unsigned long*)regs - 16); + do_exit(SIGKILL); + + /* We ran out of memory, or some other thing happened to us that + made us unable to handle the page fault gracefully. */ + out_of_memory: + up_read(&mm->mmap_sem); + if (!user_mode(regs)) + goto no_context; + pagefault_out_of_memory(); + return; + + do_sigbus: + up_read(&mm->mmap_sem); + /* Send a sigbus, regardless of whether we were in kernel + or user mode. */ + info.si_signo = SIGBUS; + info.si_errno = 0; + info.si_code = BUS_ADRERR; + info.si_addr = (void __user *) address; + force_sig_info(SIGBUS, &info, current); + if (!user_mode(regs)) + goto no_context; + return; + + do_sigsegv: + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = si_code; + info.si_addr = (void __user *) address; + force_sig_info(SIGSEGV, &info, current); + return; + +#ifdef CONFIG_ALPHA_LARGE_VMALLOC + vmalloc_fault: + if (user_mode(regs)) + goto do_sigsegv; + else { + /* Synchronize this task's top level page-table + with the "reference" page table from init. */ + long index = pgd_index(address); + pgd_t *pgd, *pgd_k; + + pgd = current->active_mm->pgd + index; + pgd_k = swapper_pg_dir + index; + if (!pgd_present(*pgd) && pgd_present(*pgd_k)) { + pgd_val(*pgd) = pgd_val(*pgd_k); + return; + } + goto no_context; + } +#endif +} diff --git a/kernel/arch/alpha/mm/init.c b/kernel/arch/alpha/mm/init.c new file mode 100644 index 000000000..a1bea91df --- /dev/null +++ b/kernel/arch/alpha/mm/init.c @@ -0,0 +1,300 @@ +/* + * linux/arch/alpha/mm/init.c + * + * Copyright (C) 1995 Linus Torvalds + */ + +/* 2.3.x zone allocator, 1999 Andrea Arcangeli */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* max_low_pfn */ +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern void die_if_kernel(char *,struct pt_regs *,long); + +static struct pcb_struct original_pcb; + +pgd_t * +pgd_alloc(struct mm_struct *mm) +{ + pgd_t *ret, *init; + + ret = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); + init = pgd_offset(&init_mm, 0UL); + if (ret) { +#ifdef CONFIG_ALPHA_LARGE_VMALLOC + memcpy (ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD, + (PTRS_PER_PGD - USER_PTRS_PER_PGD - 1)*sizeof(pgd_t)); +#else + pgd_val(ret[PTRS_PER_PGD-2]) = pgd_val(init[PTRS_PER_PGD-2]); +#endif + + /* The last PGD entry is the VPTB self-map. */ + pgd_val(ret[PTRS_PER_PGD-1]) + = pte_val(mk_pte(virt_to_page(ret), PAGE_KERNEL)); + } + return ret; +} + + +/* + * BAD_PAGE is the page that is used for page faults when linux + * is out-of-memory. Older versions of linux just did a + * do_exit(), but using this instead means there is less risk + * for a process dying in kernel mode, possibly leaving an inode + * unused etc.. + * + * BAD_PAGETABLE is the accompanying page-table: it is initialized + * to point to BAD_PAGE entries. + * + * ZERO_PAGE is a special page that is used for zero-initialized + * data and COW. + */ +pmd_t * +__bad_pagetable(void) +{ + memset((void *) EMPTY_PGT, 0, PAGE_SIZE); + return (pmd_t *) EMPTY_PGT; +} + +pte_t +__bad_page(void) +{ + memset((void *) EMPTY_PGE, 0, PAGE_SIZE); + return pte_mkdirty(mk_pte(virt_to_page(EMPTY_PGE), PAGE_SHARED)); +} + +static inline unsigned long +load_PCB(struct pcb_struct *pcb) +{ + register unsigned long sp __asm__("$30"); + pcb->ksp = sp; + return __reload_thread(pcb); +} + +/* Set up initial PCB, VPTB, and other such nicities. */ + +static inline void +switch_to_system_map(void) +{ + unsigned long newptbr; + unsigned long original_pcb_ptr; + + /* Initialize the kernel's page tables. Linux puts the vptb in + the last slot of the L1 page table. */ + memset(swapper_pg_dir, 0, PAGE_SIZE); + newptbr = ((unsigned long) swapper_pg_dir - PAGE_OFFSET) >> PAGE_SHIFT; + pgd_val(swapper_pg_dir[1023]) = + (newptbr << 32) | pgprot_val(PAGE_KERNEL); + + /* Set the vptb. This is often done by the bootloader, but + shouldn't be required. */ + if (hwrpb->vptb != 0xfffffffe00000000UL) { + wrvptptr(0xfffffffe00000000UL); + hwrpb->vptb = 0xfffffffe00000000UL; + hwrpb_update_checksum(hwrpb); + } + + /* Also set up the real kernel PCB while we're at it. */ + init_thread_info.pcb.ptbr = newptbr; + init_thread_info.pcb.flags = 1; /* set FEN, clear everything else */ + original_pcb_ptr = load_PCB(&init_thread_info.pcb); + tbia(); + + /* Save off the contents of the original PCB so that we can + restore the original console's page tables for a clean reboot. + + Note that the PCB is supposed to be a physical address, but + since KSEG values also happen to work, folks get confused. + Check this here. */ + + if (original_pcb_ptr < PAGE_OFFSET) { + original_pcb_ptr = (unsigned long) + phys_to_virt(original_pcb_ptr); + } + original_pcb = *(struct pcb_struct *) original_pcb_ptr; +} + +int callback_init_done; + +void * __init +callback_init(void * kernel_end) +{ + struct crb_struct * crb; + pgd_t *pgd; + pmd_t *pmd; + void *two_pages; + + /* Starting at the HWRPB, locate the CRB. */ + crb = (struct crb_struct *)((char *)hwrpb + hwrpb->crb_offset); + + if (alpha_using_srm) { + /* Tell the console whither it is to be remapped. */ + if (srm_fixup(VMALLOC_START, (unsigned long)hwrpb)) + __halt(); /* "We're boned." --Bender */ + + /* Edit the procedure descriptors for DISPATCH and FIXUP. */ + crb->dispatch_va = (struct procdesc_struct *) + (VMALLOC_START + (unsigned long)crb->dispatch_va + - crb->map[0].va); + crb->fixup_va = (struct procdesc_struct *) + (VMALLOC_START + (unsigned long)crb->fixup_va + - crb->map[0].va); + } + + switch_to_system_map(); + + /* Allocate one PGD and one PMD. In the case of SRM, we'll need + these to actually remap the console. There is an assumption + here that only one of each is needed, and this allows for 8MB. + On systems with larger consoles, additional pages will be + allocated as needed during the mapping process. + + In the case of not SRM, but not CONFIG_ALPHA_LARGE_VMALLOC, + we need to allocate the PGD we use for vmalloc before we start + forking other tasks. */ + + two_pages = (void *) + (((unsigned long)kernel_end + ~PAGE_MASK) & PAGE_MASK); + kernel_end = two_pages + 2*PAGE_SIZE; + memset(two_pages, 0, 2*PAGE_SIZE); + + pgd = pgd_offset_k(VMALLOC_START); + pgd_set(pgd, (pmd_t *)two_pages); + pmd = pmd_offset(pgd, VMALLOC_START); + pmd_set(pmd, (pte_t *)(two_pages + PAGE_SIZE)); + + if (alpha_using_srm) { + static struct vm_struct console_remap_vm; + unsigned long nr_pages = 0; + unsigned long vaddr; + unsigned long i, j; + + /* calculate needed size */ + for (i = 0; i < crb->map_entries; ++i) + nr_pages += crb->map[i].count; + + /* register the vm area */ + console_remap_vm.flags = VM_ALLOC; + console_remap_vm.size = nr_pages << PAGE_SHIFT; + vm_area_register_early(&console_remap_vm, PAGE_SIZE); + + vaddr = (unsigned long)console_remap_vm.addr; + + /* Set up the third level PTEs and update the virtual + addresses of the CRB entries. */ + for (i = 0; i < crb->map_entries; ++i) { + unsigned long pfn = crb->map[i].pa >> PAGE_SHIFT; + crb->map[i].va = vaddr; + for (j = 0; j < crb->map[i].count; ++j) { + /* Newer consoles (especially on larger + systems) may require more pages of + PTEs. Grab additional pages as needed. */ + if (pmd != pmd_offset(pgd, vaddr)) { + memset(kernel_end, 0, PAGE_SIZE); + pmd = pmd_offset(pgd, vaddr); + pmd_set(pmd, (pte_t *)kernel_end); + kernel_end += PAGE_SIZE; + } + set_pte(pte_offset_kernel(pmd, vaddr), + pfn_pte(pfn, PAGE_KERNEL)); + pfn++; + vaddr += PAGE_SIZE; + } + } + } + + callback_init_done = 1; + return kernel_end; +} + + +#ifndef CONFIG_DISCONTIGMEM +/* + * paging_init() sets up the memory map. + */ +void __init paging_init(void) +{ + unsigned long zones_size[MAX_NR_ZONES] = {0, }; + unsigned long dma_pfn, high_pfn; + + dma_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + high_pfn = max_pfn = max_low_pfn; + + if (dma_pfn >= high_pfn) + zones_size[ZONE_DMA] = high_pfn; + else { + zones_size[ZONE_DMA] = dma_pfn; + zones_size[ZONE_NORMAL] = high_pfn - dma_pfn; + } + + /* Initialize mem_map[]. */ + free_area_init(zones_size); + + /* Initialize the kernel's ZERO_PGE. */ + memset((void *)ZERO_PGE, 0, PAGE_SIZE); +} +#endif /* CONFIG_DISCONTIGMEM */ + +#if defined(CONFIG_ALPHA_GENERIC) || defined(CONFIG_ALPHA_SRM) +void +srm_paging_stop (void) +{ + /* Move the vptb back to where the SRM console expects it. */ + swapper_pg_dir[1] = swapper_pg_dir[1023]; + tbia(); + wrvptptr(0x200000000UL); + hwrpb->vptb = 0x200000000UL; + hwrpb_update_checksum(hwrpb); + + /* Reload the page tables that the console had in use. */ + load_PCB(&original_pcb); + tbia(); +} +#endif + +void __init +mem_init(void) +{ + set_max_mapnr(max_low_pfn); + high_memory = (void *) __va(max_low_pfn * PAGE_SIZE); + free_all_bootmem(); + mem_init_print_info(NULL); +} + +void +free_initmem(void) +{ + free_initmem_default(-1); +} + +#ifdef CONFIG_BLK_DEV_INITRD +void +free_initrd_mem(unsigned long start, unsigned long end) +{ + free_reserved_area((void *)start, (void *)end, -1, "initrd"); +} +#endif diff --git a/kernel/arch/alpha/mm/numa.c b/kernel/arch/alpha/mm/numa.c new file mode 100644 index 000000000..d543d71c2 --- /dev/null +++ b/kernel/arch/alpha/mm/numa.c @@ -0,0 +1,321 @@ +/* + * linux/arch/alpha/mm/numa.c + * + * DISCONTIGMEM NUMA alpha support. + * + * Copyright (C) 2001 Andrea Arcangeli SuSE + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +pg_data_t node_data[MAX_NUMNODES]; +EXPORT_SYMBOL(node_data); + +#undef DEBUG_DISCONTIG +#ifdef DEBUG_DISCONTIG +#define DBGDCONT(args...) printk(args) +#else +#define DBGDCONT(args...) +#endif + +#define for_each_mem_cluster(memdesc, _cluster, i) \ + for ((_cluster) = (memdesc)->cluster, (i) = 0; \ + (i) < (memdesc)->numclusters; (i)++, (_cluster)++) + +static void __init show_mem_layout(void) +{ + struct memclust_struct * cluster; + struct memdesc_struct * memdesc; + int i; + + /* Find free clusters, and init and free the bootmem accordingly. */ + memdesc = (struct memdesc_struct *) + (hwrpb->mddt_offset + (unsigned long) hwrpb); + + printk("Raw memory layout:\n"); + for_each_mem_cluster(memdesc, cluster, i) { + printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n", + i, cluster->usage, cluster->start_pfn, + cluster->start_pfn + cluster->numpages); + } +} + +static void __init +setup_memory_node(int nid, void *kernel_end) +{ + extern unsigned long mem_size_limit; + struct memclust_struct * cluster; + struct memdesc_struct * memdesc; + unsigned long start_kernel_pfn, end_kernel_pfn; + unsigned long bootmap_size, bootmap_pages, bootmap_start; + unsigned long start, end; + unsigned long node_pfn_start, node_pfn_end; + unsigned long node_min_pfn, node_max_pfn; + int i; + unsigned long node_datasz = PFN_UP(sizeof(pg_data_t)); + int show_init = 0; + + /* Find the bounds of current node */ + node_pfn_start = (node_mem_start(nid)) >> PAGE_SHIFT; + node_pfn_end = node_pfn_start + (node_mem_size(nid) >> PAGE_SHIFT); + + /* Find free clusters, and init and free the bootmem accordingly. */ + memdesc = (struct memdesc_struct *) + (hwrpb->mddt_offset + (unsigned long) hwrpb); + + /* find the bounds of this node (node_min_pfn/node_max_pfn) */ + node_min_pfn = ~0UL; + node_max_pfn = 0UL; + for_each_mem_cluster(memdesc, cluster, i) { + /* Bit 0 is console/PALcode reserved. Bit 1 is + non-volatile memory -- we might want to mark + this for later. */ + if (cluster->usage & 3) + continue; + + start = cluster->start_pfn; + end = start + cluster->numpages; + + if (start >= node_pfn_end || end <= node_pfn_start) + continue; + + if (!show_init) { + show_init = 1; + printk("Initializing bootmem allocator on Node ID %d\n", nid); + } + printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n", + i, cluster->usage, cluster->start_pfn, + cluster->start_pfn + cluster->numpages); + + if (start < node_pfn_start) + start = node_pfn_start; + if (end > node_pfn_end) + end = node_pfn_end; + + if (start < node_min_pfn) + node_min_pfn = start; + if (end > node_max_pfn) + node_max_pfn = end; + } + + if (mem_size_limit && node_max_pfn > mem_size_limit) { + static int msg_shown = 0; + if (!msg_shown) { + msg_shown = 1; + printk("setup: forcing memory size to %ldK (from %ldK).\n", + mem_size_limit << (PAGE_SHIFT - 10), + node_max_pfn << (PAGE_SHIFT - 10)); + } + node_max_pfn = mem_size_limit; + } + + if (node_min_pfn >= node_max_pfn) + return; + + /* Update global {min,max}_low_pfn from node information. */ + if (node_min_pfn < min_low_pfn) + min_low_pfn = node_min_pfn; + if (node_max_pfn > max_low_pfn) + max_pfn = max_low_pfn = node_max_pfn; + +#if 0 /* we'll try this one again in a little while */ + /* Cute trick to make sure our local node data is on local memory */ + node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT)); +#endif + /* Quasi-mark the pg_data_t as in-use */ + node_min_pfn += node_datasz; + if (node_min_pfn >= node_max_pfn) { + printk(" not enough mem to reserve NODE_DATA"); + return; + } + NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; + + printk(" Detected node memory: start %8lu, end %8lu\n", + node_min_pfn, node_max_pfn); + + DBGDCONT(" DISCONTIG: node_data[%d] is at 0x%p\n", nid, NODE_DATA(nid)); + DBGDCONT(" DISCONTIG: NODE_DATA(%d)->bdata is at 0x%p\n", nid, NODE_DATA(nid)->bdata); + + /* Find the bounds of kernel memory. */ + start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS); + end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end)); + bootmap_start = -1; + + if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn)) + panic("kernel loaded out of ram"); + + /* Zone start phys-addr must be 2^(MAX_ORDER-1) aligned. + Note that we round this down, not up - node memory + has much larger alignment than 8Mb, so it's safe. */ + node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1); + + /* We need to know how many physically contiguous pages + we'll need for the bootmap. */ + bootmap_pages = bootmem_bootmap_pages(node_max_pfn-node_min_pfn); + + /* Now find a good region where to allocate the bootmap. */ + for_each_mem_cluster(memdesc, cluster, i) { + if (cluster->usage & 3) + continue; + + start = cluster->start_pfn; + end = start + cluster->numpages; + + if (start >= node_max_pfn || end <= node_min_pfn) + continue; + + if (end > node_max_pfn) + end = node_max_pfn; + if (start < node_min_pfn) + start = node_min_pfn; + + if (start < start_kernel_pfn) { + if (end > end_kernel_pfn + && end - end_kernel_pfn >= bootmap_pages) { + bootmap_start = end_kernel_pfn; + break; + } else if (end > start_kernel_pfn) + end = start_kernel_pfn; + } else if (start < end_kernel_pfn) + start = end_kernel_pfn; + if (end - start >= bootmap_pages) { + bootmap_start = start; + break; + } + } + + if (bootmap_start == -1) + panic("couldn't find a contiguous place for the bootmap"); + + /* Allocate the bootmap and mark the whole MM as reserved. */ + bootmap_size = init_bootmem_node(NODE_DATA(nid), bootmap_start, + node_min_pfn, node_max_pfn); + DBGDCONT(" bootmap_start %lu, bootmap_size %lu, bootmap_pages %lu\n", + bootmap_start, bootmap_size, bootmap_pages); + + /* Mark the free regions. */ + for_each_mem_cluster(memdesc, cluster, i) { + if (cluster->usage & 3) + continue; + + start = cluster->start_pfn; + end = cluster->start_pfn + cluster->numpages; + + if (start >= node_max_pfn || end <= node_min_pfn) + continue; + + if (end > node_max_pfn) + end = node_max_pfn; + if (start < node_min_pfn) + start = node_min_pfn; + + if (start < start_kernel_pfn) { + if (end > end_kernel_pfn) { + free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), + (PFN_PHYS(start_kernel_pfn) + - PFN_PHYS(start))); + printk(" freeing pages %ld:%ld\n", + start, start_kernel_pfn); + start = end_kernel_pfn; + } else if (end > start_kernel_pfn) + end = start_kernel_pfn; + } else if (start < end_kernel_pfn) + start = end_kernel_pfn; + if (start >= end) + continue; + + free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start)); + printk(" freeing pages %ld:%ld\n", start, end); + } + + /* Reserve the bootmap memory. */ + reserve_bootmem_node(NODE_DATA(nid), PFN_PHYS(bootmap_start), + bootmap_size, BOOTMEM_DEFAULT); + printk(" reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size)); + + node_set_online(nid); +} + +void __init +setup_memory(void *kernel_end) +{ + int nid; + + show_mem_layout(); + + nodes_clear(node_online_map); + + min_low_pfn = ~0UL; + max_low_pfn = 0UL; + for (nid = 0; nid < MAX_NUMNODES; nid++) + setup_memory_node(nid, kernel_end); + +#ifdef CONFIG_BLK_DEV_INITRD + initrd_start = INITRD_START; + if (initrd_start) { + extern void *move_initrd(unsigned long); + + initrd_end = initrd_start+INITRD_SIZE; + printk("Initial ramdisk at: 0x%p (%lu bytes)\n", + (void *) initrd_start, INITRD_SIZE); + + if ((void *)initrd_end > phys_to_virt(PFN_PHYS(max_low_pfn))) { + if (!move_initrd(PFN_PHYS(max_low_pfn))) + printk("initrd extends beyond end of memory " + "(0x%08lx > 0x%p)\ndisabling initrd\n", + initrd_end, + phys_to_virt(PFN_PHYS(max_low_pfn))); + } else { + nid = kvaddr_to_nid(initrd_start); + reserve_bootmem_node(NODE_DATA(nid), + virt_to_phys((void *)initrd_start), + INITRD_SIZE, BOOTMEM_DEFAULT); + } + } +#endif /* CONFIG_BLK_DEV_INITRD */ +} + +void __init paging_init(void) +{ + unsigned int nid; + unsigned long zones_size[MAX_NR_ZONES] = {0, }; + unsigned long dma_local_pfn; + + /* + * The old global MAX_DMA_ADDRESS per-arch API doesn't fit + * in the NUMA model, for now we convert it to a pfn and + * we interpret this pfn as a local per-node information. + * This issue isn't very important since none of these machines + * have legacy ISA slots anyways. + */ + dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; + + for_each_online_node(nid) { + bootmem_data_t *bdata = &bootmem_node_data[nid]; + unsigned long start_pfn = bdata->node_min_pfn; + unsigned long end_pfn = bdata->node_low_pfn; + + if (dma_local_pfn >= end_pfn - start_pfn) + zones_size[ZONE_DMA] = end_pfn - start_pfn; + else { + zones_size[ZONE_DMA] = dma_local_pfn; + zones_size[ZONE_NORMAL] = (end_pfn - start_pfn) - dma_local_pfn; + } + node_set_state(nid, N_NORMAL_MEMORY); + free_area_init_node(nid, zones_size, start_pfn, NULL); + } + + /* Initialize the kernel's ZERO_PGE. */ + memset((void *)ZERO_PGE, 0, PAGE_SIZE); +} -- cgit 1.2.3-korg