summaryrefslogtreecommitdiffstats
path: root/kernel/arch/alpha/mm
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/arch/alpha/mm')
-rw-r--r--kernel/arch/alpha/mm/Makefile9
-rw-r--r--kernel/arch/alpha/mm/extable.c92
-rw-r--r--kernel/arch/alpha/mm/fault.c259
-rw-r--r--kernel/arch/alpha/mm/init.c300
-rw-r--r--kernel/arch/alpha/mm/numa.c321
5 files changed, 981 insertions, 0 deletions
diff --git a/kernel/arch/alpha/mm/Makefile b/kernel/arch/alpha/mm/Makefile
new file mode 100644
index 000000000..c993d3f93
--- /dev/null
+++ b/kernel/arch/alpha/mm/Makefile
@@ -0,0 +1,9 @@
+#
+# Makefile for the linux alpha-specific parts of the memory manager.
+#
+# ccflags-y adds -Werror to the C compiler flags used for this directory.
+ccflags-y := -Werror
+# Objects that are always listed for the build.
+obj-y := init.o fault.o extable.o
+# numa.o is only added through the list selected by CONFIG_DISCONTIGMEM.
+obj-$(CONFIG_DISCONTIGMEM) += numa.o
diff --git a/kernel/arch/alpha/mm/extable.c b/kernel/arch/alpha/mm/extable.c
new file mode 100644
index 000000000..813c9b63c
--- /dev/null
+++ b/kernel/arch/alpha/mm/extable.c
@@ -0,0 +1,92 @@
+/*
+ * linux/arch/alpha/mm/extable.c
+ */
+
+#include <linux/module.h>
+#include <linux/sort.h>
+#include <asm/uaccess.h>
+
+static inline unsigned long ex_to_addr(const struct exception_table_entry *x)
+{ /* Decode an entry's instruction address: insn holds an offset that
+ * is relative to the address of the insn field itself, so the
+ * absolute address is &x->insn + x->insn. */
+ return (unsigned long)&x->insn + x->insn;
+}
+
+static void swap_ex(void *a, void *b, int size)
+{ /* Swap callback passed to sort() by sort_extable(): exchanges two
+ * exception table entries in place. The size argument is not used. */
+ struct exception_table_entry *ex_a = a, *ex_b = b;
+ unsigned long addr_a = ex_to_addr(ex_a), addr_b = ex_to_addr(ex_b);
+ unsigned int t = ex_a->fixup.unit;
+ /* The fixup word can be exchanged as-is, but insn is relative to its
+ * own location, so each slot is re-encoded from the other entry's
+ * absolute address (computed above, before anything is overwritten). */
+ ex_a->fixup.unit = ex_b->fixup.unit;
+ ex_b->fixup.unit = t;
+ ex_a->insn = (int)(addr_b - (unsigned long)&ex_a->insn);
+ ex_b->insn = (int)(addr_a - (unsigned long)&ex_b->insn);
+}
+
+/*
+ * The exception table needs to be sorted so that the binary
+ * search that we use to find entries in it works properly.
+ * This is used both for the kernel exception table and for
+ * the exception tables of modules that get loaded.
+ *
+ * cmp_ex() is the comparison callback given to sort(): it orders two
+ * entries by the absolute instruction address that ex_to_addr() decodes
+ * and returns 1, -1 or 0.
+ */
+static int cmp_ex(const void *a, const void *b)
+{
+ const struct exception_table_entry *x = a, *y = b;
+
+ /* avoid overflow: compare explicitly instead of returning a difference */
+ if (ex_to_addr(x) > ex_to_addr(y))
+ return 1;
+ if (ex_to_addr(x) < ex_to_addr(y))
+ return -1;
+ return 0;
+}
+
+void sort_extable(struct exception_table_entry *start,
+ struct exception_table_entry *finish)
+{ /* Sort the entries from start up to (not including) finish by
+ * instruction address, using sort() with the cmp_ex/swap_ex
+ * callbacks; finish - start is passed as the number of entries. */
+ sort(start, finish - start, sizeof(struct exception_table_entry),
+ cmp_ex, swap_ex);
+}
+
+#ifdef CONFIG_MODULES
+/*
+ * Any entry referring to the module init will be at the beginning or
+ * the end.
+ *
+ * Drop those entries from module m's table: leading ones by advancing
+ * m->extable, trailing ones by only shrinking m->num_exentries. An entry
+ * is dropped when within_module_init() accepts its decoded address.
+ */
+void trim_init_extable(struct module *m)
+{
+ /*trim the beginning*/
+ while (m->num_exentries &&
+ within_module_init(ex_to_addr(&m->extable[0]), m)) {
+ m->extable++;
+ m->num_exentries--;
+ }
+ /*trim the end*/
+ while (m->num_exentries &&
+ within_module_init(ex_to_addr(&m->extable[m->num_exentries-1]),
+ m))
+ m->num_exentries--;
+}
+#endif /* CONFIG_MODULES */
+
+const struct exception_table_entry *
+search_extable(const struct exception_table_entry *first,
+ const struct exception_table_entry *last,
+ unsigned long value)
+{ /* Binary search of the inclusive range [first, last] for the entry
+ * whose decoded instruction address equals value. The search relies
+ * on the entries being ordered by that address.
+ * Returns the matching entry, or NULL when there is none. */
+ while (first <= last) {
+ const struct exception_table_entry *mid;
+ unsigned long mid_value;
+ /* midpoint computed as an offset from first */
+ mid = (last - first) / 2 + first;
+ mid_value = ex_to_addr(mid);
+ if (mid_value == value)
+ return mid;
+ else if (mid_value < value)
+ first = mid+1;
+ else
+ last = mid-1;
+ }
+
+ return NULL;
+}
diff --git a/kernel/arch/alpha/mm/fault.c b/kernel/arch/alpha/mm/fault.c
new file mode 100644
index 000000000..4a905bd66
--- /dev/null
+++ b/kernel/arch/alpha/mm/fault.c
@@ -0,0 +1,259 @@
+/*
+ * linux/arch/alpha/mm/fault.c
+ *
+ * Copyright (C) 1995 Linus Torvalds
+ */
+
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <asm/io.h>
+
+#define __EXTERN_INLINE inline
+#include <asm/mmu_context.h>
+#include <asm/tlbflush.h>
+#undef __EXTERN_INLINE
+
+#include <linux/signal.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/smp.h>
+#include <linux/interrupt.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+
+extern void die_if_kernel(char *,struct pt_regs *,long, unsigned long *);
+
+
+/*
+ * Force a new ASN for a task.
+ *
+ * __load_new_mm_context() obtains a fresh context value for next_mm on
+ * the current CPU from __get_new_mm_context(), records it in
+ * next_mm->context[], then fills the current thread's PCB with the ASN
+ * (the context value masked with HARDWARE_ASN_MASK) and the page-table
+ * base (the pgd address minus IDENT_ADDR, shifted down by PAGE_SHIFT)
+ * and hands the PCB to __reload_thread().
+ */
+ /* last_asn is only defined here for non-SMP builds. */
+#ifndef CONFIG_SMP
+unsigned long last_asn = ASN_FIRST_VERSION;
+#endif
+
+void
+__load_new_mm_context(struct mm_struct *next_mm)
+{
+ unsigned long mmc;
+ struct pcb_struct *pcb;
+
+ mmc = __get_new_mm_context(next_mm, smp_processor_id());
+ next_mm->context[smp_processor_id()] = mmc;
+
+ pcb = &current_thread_info()->pcb;
+ pcb->asn = mmc & HARDWARE_ASN_MASK;
+ pcb->ptbr = ((unsigned long) next_mm->pgd - IDENT_ADDR) >> PAGE_SHIFT;
+
+ __reload_thread(pcb);
+}
+
+
+/*
+ * This routine handles page faults. It determines the address,
+ * and the problem, and then passes it off to handle_mm_fault().
+ *
+ * mmcsr:
+ * 0 = translation not valid
+ * 1 = access violation
+ * 2 = fault-on-read
+ * 3 = fault-on-execute
+ * 4 = fault-on-write
+ *
+ * cause:
+ * -1 = instruction fetch
+ * 0 = load
+ * 1 = store
+ *
+ * Registers $9 through $15 are saved in a block just prior to `regs' and
+ * are saved and restored around the call to allow exception code to
+ * modify them.
+ */
+
+/* Macro for exception fixup code to access integer registers.
+ Maps register number r to an index, counted in unsigned longs,
+ relative to `regs': 0..8 -> r, 9..15 -> r-16 (negative, i.e. in
+ front of regs), 16..18 -> r+8, anything higher -> r-10. */
+#define dpf_reg(r) \
+ (((unsigned long *)regs)[(r) <= 8 ? (r) : (r) <= 15 ? (r)-16 : \
+ (r) <= 18 ? (r)+8 : (r)-10])
+
+asmlinkage void
+do_page_fault(unsigned long address, unsigned long mmcsr,
+ long cause, struct pt_regs *regs)
+{ /* address is the faulting virtual address, cause selects the access
+ type (see the table above) and regs is the saved register frame.
+ mmcsr is not examined in this function.
+ The fault ends in one of these ways: handle_mm_fault() resolves
+ it, a SIGSEGV or SIGBUS is forced on current, an exception-table
+ fixup rewrites regs->pc, or the kernel oopses via die_if_kernel()
+ followed by do_exit(SIGKILL). */
+ struct vm_area_struct * vma;
+ struct mm_struct *mm = current->mm;
+ const struct exception_table_entry *fixup;
+ int fault, si_code = SEGV_MAPERR;
+ siginfo_t info;
+ unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
+
+ /* As of EV6, a load into $31/$f31 is a prefetch, and never faults
+ (or is suppressed by the PALcode). Support that for older CPUs
+ by ignoring such an instruction. */
+ if (cause == 0) {
+ unsigned int insn;
+ __get_user(insn, (unsigned int __user *)regs->pc);
+ if ((insn >> 21 & 0x1f) == 0x1f &&
+ /* ldq ldl ldt lds ldg ldf ldwu ldbu */
+ (1ul << (insn >> 26) & 0x30f00001400ul)) {
+ regs->pc += 4; /* step over the 4-byte instruction */
+ return;
+ }
+ }
+
+ /* If we're in an interrupt context, or have no user context,
+ we must not take the fault. */
+ if (!mm || faulthandler_disabled())
+ goto no_context;
+
+#ifdef CONFIG_ALPHA_LARGE_VMALLOC
+ if (address >= TASK_SIZE)
+ goto vmalloc_fault;
+#endif
+ if (user_mode(regs))
+ flags |= FAULT_FLAG_USER;
+retry:
+ down_read(&mm->mmap_sem);
+ vma = find_vma(mm, address);
+ if (!vma)
+ goto bad_area;
+ if (vma->vm_start <= address)
+ goto good_area;
+ if (!(vma->vm_flags & VM_GROWSDOWN))
+ goto bad_area;
+ if (expand_stack(vma, address))
+ goto bad_area;
+
+ /* Ok, we have a good vm_area for this memory access, so
+ we can handle it. */
+ good_area:
+ si_code = SEGV_ACCERR;
+ if (cause < 0) {
+ if (!(vma->vm_flags & VM_EXEC))
+ goto bad_area;
+ } else if (!cause) {
+ /* Allow reads even for write-only mappings */
+ if (!(vma->vm_flags & (VM_READ | VM_WRITE)))
+ goto bad_area;
+ } else {
+ if (!(vma->vm_flags & VM_WRITE))
+ goto bad_area;
+ flags |= FAULT_FLAG_WRITE;
+ }
+
+ /* If for any reason at all we couldn't handle the fault,
+ make sure we exit gracefully rather than endlessly redo
+ the fault. */
+ fault = handle_mm_fault(mm, vma, address, flags);
+ /* Retry requested while a fatal signal is pending: return at once,
+ without up_read(); the comment in the retry branch below says
+ mmap_sem has already been released in that case. */
+ if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current))
+ return;
+
+ if (unlikely(fault & VM_FAULT_ERROR)) {
+ if (fault & VM_FAULT_OOM)
+ goto out_of_memory;
+ else if (fault & VM_FAULT_SIGSEGV)
+ goto bad_area;
+ else if (fault & VM_FAULT_SIGBUS)
+ goto do_sigbus;
+ BUG();
+ }
+ /* Fault accounting happens only while ALLOW_RETRY is still set,
+ i.e. on the first pass; the flag is cleared before retrying. */
+ if (flags & FAULT_FLAG_ALLOW_RETRY) {
+ if (fault & VM_FAULT_MAJOR)
+ current->maj_flt++;
+ else
+ current->min_flt++;
+ if (fault & VM_FAULT_RETRY) {
+ flags &= ~FAULT_FLAG_ALLOW_RETRY;
+
+ /* No need to up_read(&mm->mmap_sem) as we would
+ * have already released it in __lock_page_or_retry
+ * in mm/filemap.c.
+ */
+
+ goto retry;
+ }
+ }
+
+ up_read(&mm->mmap_sem);
+
+ return;
+
+ /* Something tried to access memory that isn't in our memory map.
+ Fix it, but check if it's kernel or user first. */
+ bad_area:
+ up_read(&mm->mmap_sem);
+
+ if (user_mode(regs))
+ goto do_sigsegv;
+
+ no_context:
+ /* Are we prepared to handle this fault as an exception? */
+ if ((fixup = search_exception_tables(regs->pc)) != 0) {
+ unsigned long newpc;
+ newpc = fixup_exception(dpf_reg, fixup, regs->pc);
+ regs->pc = newpc;
+ return;
+ }
+
+ /* Oops. The kernel tried to access some bad page. We'll have to
+ terminate things with extreme prejudice. */
+ printk(KERN_ALERT "Unable to handle kernel paging request at "
+ "virtual address %016lx\n", address);
+ die_if_kernel("Oops", regs, cause, (unsigned long*)regs - 16);
+ do_exit(SIGKILL);
+
+ /* We ran out of memory, or some other thing happened to us that
+ made us unable to handle the page fault gracefully. */
+ out_of_memory:
+ up_read(&mm->mmap_sem);
+ if (!user_mode(regs))
+ goto no_context;
+ pagefault_out_of_memory();
+ return;
+
+ do_sigbus:
+ up_read(&mm->mmap_sem);
+ /* Send a sigbus, regardless of whether we were in kernel
+ or user mode. */
+ info.si_signo = SIGBUS;
+ info.si_errno = 0;
+ info.si_code = BUS_ADRERR;
+ info.si_addr = (void __user *) address;
+ force_sig_info(SIGBUS, &info, current);
+ if (!user_mode(regs))
+ goto no_context;
+ return;
+
+ do_sigsegv:
+ info.si_signo = SIGSEGV;
+ info.si_errno = 0;
+ info.si_code = si_code; /* SEGV_MAPERR unless good_area was reached, then SEGV_ACCERR */
+ info.si_addr = (void __user *) address;
+ force_sig_info(SIGSEGV, &info, current);
+ return;
+
+#ifdef CONFIG_ALPHA_LARGE_VMALLOC
+ vmalloc_fault:
+ if (user_mode(regs))
+ goto do_sigsegv;
+ else {
+ /* Synchronize this task's top level page-table
+ with the "reference" page table from init. */
+ long index = pgd_index(address);
+ pgd_t *pgd, *pgd_k;
+
+ pgd = current->active_mm->pgd + index;
+ pgd_k = swapper_pg_dir + index;
+ if (!pgd_present(*pgd) && pgd_present(*pgd_k)) {
+ pgd_val(*pgd) = pgd_val(*pgd_k);
+ return;
+ }
+ goto no_context;
+ }
+#endif
+}
diff --git a/kernel/arch/alpha/mm/init.c b/kernel/arch/alpha/mm/init.c
new file mode 100644
index 000000000..a1bea91df
--- /dev/null
+++ b/kernel/arch/alpha/mm/init.c
@@ -0,0 +1,300 @@
+/*
+ * linux/arch/alpha/mm/init.c
+ *
+ * Copyright (C) 1995 Linus Torvalds
+ */
+
+/* 2.3.x zone allocator, 1999 Andrea Arcangeli <andrea@suse.de> */
+
+#include <linux/pagemap.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/ptrace.h>
+#include <linux/mman.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/init.h>
+#include <linux/bootmem.h> /* max_low_pfn */
+#include <linux/vmalloc.h>
+#include <linux/gfp.h>
+
+#include <asm/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/hwrpb.h>
+#include <asm/dma.h>
+#include <asm/mmu_context.h>
+#include <asm/console.h>
+#include <asm/tlb.h>
+#include <asm/setup.h>
+#include <asm/sections.h>
+
+extern void die_if_kernel(char *,struct pt_regs *,long);
+
+static struct pcb_struct original_pcb;
+
+pgd_t *
+pgd_alloc(struct mm_struct *mm)
+{ /* Allocate and seed a top-level page table. Returns the new,
+ * zero-filled page, or NULL if __get_free_page() failed. The mm
+ * argument is not used. Kernel entries are taken from init_mm's
+ * PGD: with CONFIG_ALPHA_LARGE_VMALLOC every slot from
+ * USER_PTRS_PER_PGD up to (not including) the last one is copied,
+ * otherwise only slot PTRS_PER_PGD-2. */
+ pgd_t *ret, *init;
+
+ ret = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
+ init = pgd_offset(&init_mm, 0UL);
+ if (ret) {
+#ifdef CONFIG_ALPHA_LARGE_VMALLOC
+ memcpy (ret + USER_PTRS_PER_PGD, init + USER_PTRS_PER_PGD,
+ (PTRS_PER_PGD - USER_PTRS_PER_PGD - 1)*sizeof(pgd_t));
+#else
+ pgd_val(ret[PTRS_PER_PGD-2]) = pgd_val(init[PTRS_PER_PGD-2]);
+#endif
+
+ /* The last PGD entry is the VPTB self-map. */
+ pgd_val(ret[PTRS_PER_PGD-1])
+ = pte_val(mk_pte(virt_to_page(ret), PAGE_KERNEL));
+ }
+ return ret;
+}
+
+
+/*
+ * BAD_PAGE is the page that is used for page faults when linux
+ * is out-of-memory. Older versions of linux just did a
+ * do_exit(), but using this instead means there is less risk
+ * for a process dying in kernel mode, possibly leaving an inode
+ * unused etc..
+ *
+ * BAD_PAGETABLE is the accompanying page-table: it is initialized
+ * to point to BAD_PAGE entries.
+ *
+ * ZERO_PAGE is a special page that is used for zero-initialized
+ * data and COW.
+ *
+ * __bad_pagetable() zero-fills the page at EMPTY_PGT and returns its
+ * address as a pmd_t pointer.
+ */
+pmd_t *
+__bad_pagetable(void)
+{
+ memset((void *) EMPTY_PGT, 0, PAGE_SIZE);
+ return (pmd_t *) EMPTY_PGT;
+}
+
+pte_t
+__bad_page(void)
+{ /* Zero-fill the page at EMPTY_PGE and return a PAGE_SHARED pte for
+ * it with the dirty bit set through pte_mkdirty(). */
+ memset((void *) EMPTY_PGE, 0, PAGE_SIZE);
+ return pte_mkdirty(mk_pte(virt_to_page(EMPTY_PGE), PAGE_SHARED));
+}
+
+static inline unsigned long
+load_PCB(struct pcb_struct *pcb)
+{ /* Store the current stack pointer (register $30) in pcb->ksp, then
+ * pass pcb to __reload_thread() and hand back its return value.
+ * switch_to_system_map() treats that value as the address of the
+ * PCB that was in use before. */
+ register unsigned long sp __asm__("$30");
+ pcb->ksp = sp;
+ return __reload_thread(pcb);
+}
+
+/* Set up initial PCB, VPTB, and other such niceties.
+ Clears swapper_pg_dir, installs the self-map in slot 1023, makes sure
+ the VPTB is 0xfffffffe00000000, loads init_thread_info's PCB and
+ saves a copy of the previously active PCB in original_pcb. */
+
+static inline void
+switch_to_system_map(void)
+{
+ unsigned long newptbr;
+ unsigned long original_pcb_ptr;
+
+ /* Initialize the kernel's page tables. Linux puts the vptb in
+ the last slot of the L1 page table. */
+ memset(swapper_pg_dir, 0, PAGE_SIZE);
+ newptbr = ((unsigned long) swapper_pg_dir - PAGE_OFFSET) >> PAGE_SHIFT;
+ pgd_val(swapper_pg_dir[1023]) =
+ (newptbr << 32) | pgprot_val(PAGE_KERNEL);
+
+ /* Set the vptb. This is often done by the bootloader, but
+ shouldn't be required. */
+ if (hwrpb->vptb != 0xfffffffe00000000UL) {
+ wrvptptr(0xfffffffe00000000UL);
+ hwrpb->vptb = 0xfffffffe00000000UL;
+ hwrpb_update_checksum(hwrpb);
+ }
+
+ /* Also set up the real kernel PCB while we're at it. */
+ init_thread_info.pcb.ptbr = newptbr;
+ init_thread_info.pcb.flags = 1; /* set FEN, clear everything else */
+ original_pcb_ptr = load_PCB(&init_thread_info.pcb);
+ tbia();
+
+ /* Save off the contents of the original PCB so that we can
+ restore the original console's page tables for a clean reboot.
+
+ Note that the PCB is supposed to be a physical address, but
+ since KSEG values also happen to work, folks get confused.
+ Check this here. */
+
+ if (original_pcb_ptr < PAGE_OFFSET) {
+ original_pcb_ptr = (unsigned long)
+ phys_to_virt(original_pcb_ptr);
+ }
+ original_pcb = *(struct pcb_struct *) original_pcb_ptr;
+}
+
+int callback_init_done;
+/* callback_init() prepares the console callbacks for use under the
+ kernel's own page tables: when alpha_using_srm is set it relocates the
+ CRB's DISPATCH/FIXUP descriptors to VMALLOC_START, it calls
+ switch_to_system_map(), and it maps the console pages listed in the
+ CRB into an early-registered vm area. kernel_end is rounded up to a
+ page boundary and page-table pages are carved out starting there;
+ the return value is kernel_end advanced past those pages.
+ callback_init_done is set to 1 just before returning. */
+void * __init
+callback_init(void * kernel_end)
+{
+ struct crb_struct * crb;
+ pgd_t *pgd;
+ pmd_t *pmd;
+ void *two_pages;
+
+ /* Starting at the HWRPB, locate the CRB. */
+ crb = (struct crb_struct *)((char *)hwrpb + hwrpb->crb_offset);
+
+ if (alpha_using_srm) {
+ /* Tell the console whither it is to be remapped. */
+ if (srm_fixup(VMALLOC_START, (unsigned long)hwrpb))
+ __halt(); /* "We're boned." --Bender */
+
+ /* Edit the procedure descriptors for DISPATCH and FIXUP. */
+ crb->dispatch_va = (struct procdesc_struct *)
+ (VMALLOC_START + (unsigned long)crb->dispatch_va
+ - crb->map[0].va);
+ crb->fixup_va = (struct procdesc_struct *)
+ (VMALLOC_START + (unsigned long)crb->fixup_va
+ - crb->map[0].va);
+ }
+
+ switch_to_system_map();
+
+ /* Allocate one PGD and one PMD. In the case of SRM, we'll need
+ these to actually remap the console. There is an assumption
+ here that only one of each is needed, and this allows for 8MB.
+ On systems with larger consoles, additional pages will be
+ allocated as needed during the mapping process.
+
+ In the case of not SRM, but not CONFIG_ALPHA_LARGE_VMALLOC,
+ we need to allocate the PGD we use for vmalloc before we start
+ forking other tasks. */
+
+ two_pages = (void *)
+ (((unsigned long)kernel_end + ~PAGE_MASK) & PAGE_MASK);
+ kernel_end = two_pages + 2*PAGE_SIZE;
+ memset(two_pages, 0, 2*PAGE_SIZE);
+ /* First page backs the PGD entry for VMALLOC_START, second page the
+ PMD entry beneath it. */
+ pgd = pgd_offset_k(VMALLOC_START);
+ pgd_set(pgd, (pmd_t *)two_pages);
+ pmd = pmd_offset(pgd, VMALLOC_START);
+ pmd_set(pmd, (pte_t *)(two_pages + PAGE_SIZE));
+
+ if (alpha_using_srm) {
+ static struct vm_struct console_remap_vm;
+ unsigned long nr_pages = 0;
+ unsigned long vaddr;
+ unsigned long i, j;
+
+ /* calculate needed size */
+ for (i = 0; i < crb->map_entries; ++i)
+ nr_pages += crb->map[i].count;
+
+ /* register the vm area */
+ console_remap_vm.flags = VM_ALLOC;
+ console_remap_vm.size = nr_pages << PAGE_SHIFT;
+ vm_area_register_early(&console_remap_vm, PAGE_SIZE);
+
+ vaddr = (unsigned long)console_remap_vm.addr;
+
+ /* Set up the third level PTEs and update the virtual
+ addresses of the CRB entries. */
+ for (i = 0; i < crb->map_entries; ++i) {
+ unsigned long pfn = crb->map[i].pa >> PAGE_SHIFT;
+ crb->map[i].va = vaddr;
+ for (j = 0; j < crb->map[i].count; ++j) {
+ /* Newer consoles (especially on larger
+ systems) may require more pages of
+ PTEs. Grab additional pages as needed. */
+ if (pmd != pmd_offset(pgd, vaddr)) {
+ memset(kernel_end, 0, PAGE_SIZE);
+ pmd = pmd_offset(pgd, vaddr);
+ pmd_set(pmd, (pte_t *)kernel_end);
+ kernel_end += PAGE_SIZE;
+ }
+ set_pte(pte_offset_kernel(pmd, vaddr),
+ pfn_pte(pfn, PAGE_KERNEL));
+ pfn++;
+ vaddr += PAGE_SIZE;
+ }
+ }
+ }
+
+ callback_init_done = 1;
+ return kernel_end;
+}
+
+
+#ifndef CONFIG_DISCONTIGMEM
+/*
+ * paging_init() sets up the memory map.
+ *
+ * Pages below the PFN of MAX_DMA_ADDRESS are counted in ZONE_DMA and
+ * the remainder, up to max_low_pfn, in ZONE_NORMAL; the sizes are
+ * passed to free_area_init(). max_pfn is set to max_low_pfn along
+ * the way, and the page at ZERO_PGE is zero-filled.
+ */
+void __init paging_init(void)
+{
+ unsigned long zones_size[MAX_NR_ZONES] = {0, };
+ unsigned long dma_pfn, high_pfn;
+
+ dma_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+ high_pfn = max_pfn = max_low_pfn;
+ /* All memory fits under the DMA limit: a single DMA zone. */
+ if (dma_pfn >= high_pfn)
+ zones_size[ZONE_DMA] = high_pfn;
+ else {
+ zones_size[ZONE_DMA] = dma_pfn;
+ zones_size[ZONE_NORMAL] = high_pfn - dma_pfn;
+ }
+
+ /* Initialize mem_map[]. */
+ free_area_init(zones_size);
+
+ /* Initialize the kernel's ZERO_PGE. */
+ memset((void *)ZERO_PGE, 0, PAGE_SIZE);
+}
+#endif /* CONFIG_DISCONTIGMEM */
+
+#if defined(CONFIG_ALPHA_GENERIC) || defined(CONFIG_ALPHA_SRM)
+void
+srm_paging_stop (void)
+{ /* Put back the state the console had before the kernel took over:
+ copy the self-map entry from slot 1023 to slot 1, point the VPTB
+ at 0x200000000 and reload the PCB saved in original_pcb. */
+ /* Move the vptb back to where the SRM console expects it. */
+ swapper_pg_dir[1] = swapper_pg_dir[1023];
+ tbia();
+ wrvptptr(0x200000000UL);
+ hwrpb->vptb = 0x200000000UL;
+ hwrpb_update_checksum(hwrpb);
+
+ /* Reload the page tables that the console had in use. */
+ load_PCB(&original_pcb);
+ tbia();
+}
+#endif
+
+void __init
+mem_init(void)
+{ /* Derive max_mapnr and high_memory from max_low_pfn, then call
+ * free_all_bootmem() and mem_init_print_info(). */
+ set_max_mapnr(max_low_pfn);
+ high_memory = (void *) __va(max_low_pfn * PAGE_SIZE);
+ free_all_bootmem();
+ mem_init_print_info(NULL);
+}
+
+void
+free_initmem(void)
+{ /* Delegates to free_initmem_default(). NOTE(review): -1 presumably
+ * means "do not poison the freed pages" -- confirm against the
+ * helper's definition. */
+ free_initmem_default(-1);
+}
+
+#ifdef CONFIG_BLK_DEV_INITRD
+void
+free_initrd_mem(unsigned long start, unsigned long end)
+{ /* Hands the initrd address range from start to end to
+ * free_reserved_area(), labelled "initrd". NOTE(review): the -1
+ * argument is presumably the "no poison" value -- confirm. */
+ free_reserved_area((void *)start, (void *)end, -1, "initrd");
+}
+#endif
diff --git a/kernel/arch/alpha/mm/numa.c b/kernel/arch/alpha/mm/numa.c
new file mode 100644
index 000000000..d543d71c2
--- /dev/null
+++ b/kernel/arch/alpha/mm/numa.c
@@ -0,0 +1,321 @@
+/*
+ * linux/arch/alpha/mm/numa.c
+ *
+ * DISCONTIGMEM NUMA alpha support.
+ *
+ * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ */
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/bootmem.h>
+#include <linux/swap.h>
+#include <linux/initrd.h>
+#include <linux/pfn.h>
+#include <linux/module.h>
+
+#include <asm/hwrpb.h>
+#include <asm/pgalloc.h>
+#include <asm/sections.h>
+
+pg_data_t node_data[MAX_NUMNODES];
+EXPORT_SYMBOL(node_data);
+/* DBGDCONT() is printk() when DEBUG_DISCONTIG is defined and expands to
+ nothing otherwise; the #undef below keeps it disabled. */
+#undef DEBUG_DISCONTIG
+#ifdef DEBUG_DISCONTIG
+#define DBGDCONT(args...) printk(args)
+#else
+#define DBGDCONT(args...)
+#endif
+/* Walk every cluster of memdesc: _cluster points at the current
+ entry and i counts from 0 up to memdesc->numclusters - 1. */
+#define for_each_mem_cluster(memdesc, _cluster, i) \
+ for ((_cluster) = (memdesc)->cluster, (i) = 0; \
+ (i) < (memdesc)->numclusters; (i)++, (_cluster)++)
+
+static void __init show_mem_layout(void)
+{ /* Print the usage word, start PFN and end PFN of every memory
+ * cluster described by the HWRPB; nothing is modified. */
+ struct memclust_struct * cluster;
+ struct memdesc_struct * memdesc;
+ int i;
+
+ /* Locate the memory descriptor table at hwrpb + mddt_offset. */
+ memdesc = (struct memdesc_struct *)
+ (hwrpb->mddt_offset + (unsigned long) hwrpb);
+
+ printk("Raw memory layout:\n");
+ for_each_mem_cluster(memdesc, cluster, i) {
+ printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n",
+ i, cluster->usage, cluster->start_pfn,
+ cluster->start_pfn + cluster->numpages);
+ }
+}
+
+static void __init
+setup_memory_node(int nid, void *kernel_end)
+{ /* Set up the bootmem allocator for node nid. The steps are:
+ 1. intersect the usable HWRPB clusters with the node's address
+ window to get node_min_pfn/node_max_pfn (optionally capped by
+ mem_size_limit) and widen the global min/max_low_pfn;
+ 2. find room for the bootmem bitmap outside the kernel image;
+ 3. initialise the node's bootmem, free the usable ranges (minus
+ the kernel image) and reserve the bitmap again;
+ 4. mark the node online.
+ Returns early, leaving the node offline, when the node has no
+ usable memory. kernel_end marks the end of the kernel image. */
+ extern unsigned long mem_size_limit;
+ struct memclust_struct * cluster;
+ struct memdesc_struct * memdesc;
+ unsigned long start_kernel_pfn, end_kernel_pfn;
+ unsigned long bootmap_size, bootmap_pages, bootmap_start;
+ unsigned long start, end;
+ unsigned long node_pfn_start, node_pfn_end;
+ unsigned long node_min_pfn, node_max_pfn;
+ int i;
+ unsigned long node_datasz = PFN_UP(sizeof(pg_data_t));
+ int show_init = 0;
+
+ /* Find the bounds of current node */
+ node_pfn_start = (node_mem_start(nid)) >> PAGE_SHIFT;
+ node_pfn_end = node_pfn_start + (node_mem_size(nid) >> PAGE_SHIFT);
+
+ /* Find free clusters, and init and free the bootmem accordingly. */
+ memdesc = (struct memdesc_struct *)
+ (hwrpb->mddt_offset + (unsigned long) hwrpb);
+
+ /* find the bounds of this node (node_min_pfn/node_max_pfn) */
+ node_min_pfn = ~0UL;
+ node_max_pfn = 0UL;
+ for_each_mem_cluster(memdesc, cluster, i) {
+ /* Bit 0 is console/PALcode reserved. Bit 1 is
+ non-volatile memory -- we might want to mark
+ this for later. */
+ if (cluster->usage & 3)
+ continue;
+
+ start = cluster->start_pfn;
+ end = start + cluster->numpages;
+
+ if (start >= node_pfn_end || end <= node_pfn_start)
+ continue;
+
+ if (!show_init) {
+ show_init = 1;
+ printk("Initializing bootmem allocator on Node ID %d\n", nid);
+ }
+ printk(" memcluster %2d, usage %1lx, start %8lu, end %8lu\n",
+ i, cluster->usage, cluster->start_pfn,
+ cluster->start_pfn + cluster->numpages);
+
+ if (start < node_pfn_start)
+ start = node_pfn_start;
+ if (end > node_pfn_end)
+ end = node_pfn_end;
+
+ if (start < node_min_pfn)
+ node_min_pfn = start;
+ if (end > node_max_pfn)
+ node_max_pfn = end;
+ }
+
+ if (mem_size_limit && node_max_pfn > mem_size_limit) {
+ static int msg_shown = 0;
+ if (!msg_shown) {
+ msg_shown = 1;
+ printk("setup: forcing memory size to %ldK (from %ldK).\n",
+ mem_size_limit << (PAGE_SHIFT - 10),
+ node_max_pfn << (PAGE_SHIFT - 10));
+ }
+ node_max_pfn = mem_size_limit;
+ }
+ /* No usable cluster overlapped this node (or the limit cut it off). */
+ if (node_min_pfn >= node_max_pfn)
+ return;
+
+ /* Update global {min,max}_low_pfn from node information. */
+ if (node_min_pfn < min_low_pfn)
+ min_low_pfn = node_min_pfn;
+ if (node_max_pfn > max_low_pfn)
+ max_pfn = max_low_pfn = node_max_pfn;
+
+#if 0 /* we'll try this one again in a little while */
+ /* Cute trick to make sure our local node data is on local memory */
+ node_data[nid] = (pg_data_t *)(__va(node_min_pfn << PAGE_SHIFT));
+#endif
+ /* Quasi-mark the pg_data_t as in-use */
+ node_min_pfn += node_datasz;
+ if (node_min_pfn >= node_max_pfn) {
+ printk(" not enough mem to reserve NODE_DATA"); /* NOTE(review): unlike the other messages, no trailing newline */
+ return;
+ }
+ NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
+
+ printk(" Detected node memory: start %8lu, end %8lu\n",
+ node_min_pfn, node_max_pfn);
+
+ DBGDCONT(" DISCONTIG: node_data[%d] is at 0x%p\n", nid, NODE_DATA(nid));
+ DBGDCONT(" DISCONTIG: NODE_DATA(%d)->bdata is at 0x%p\n", nid, NODE_DATA(nid)->bdata);
+
+ /* Find the bounds of kernel memory. */
+ start_kernel_pfn = PFN_DOWN(KERNEL_START_PHYS);
+ end_kernel_pfn = PFN_UP(virt_to_phys(kernel_end));
+ bootmap_start = -1; /* sentinel, checked after the search below */
+
+ if (!nid && (node_max_pfn < end_kernel_pfn || node_min_pfn > start_kernel_pfn))
+ panic("kernel loaded out of ram");
+
+ /* Zone start phys-addr must be 2^(MAX_ORDER-1) aligned.
+ Note that we round this down, not up - node memory
+ has much larger alignment than 8Mb, so it's safe. */
+ node_min_pfn &= ~((1UL << (MAX_ORDER-1))-1);
+
+ /* We need to know how many physically contiguous pages
+ we'll need for the bootmap. */
+ bootmap_pages = bootmem_bootmap_pages(node_max_pfn-node_min_pfn);
+
+ /* Now find a good region where to allocate the bootmap. */
+ for_each_mem_cluster(memdesc, cluster, i) {
+ if (cluster->usage & 3)
+ continue;
+
+ start = cluster->start_pfn;
+ end = start + cluster->numpages;
+
+ if (start >= node_max_pfn || end <= node_min_pfn)
+ continue;
+
+ if (end > node_max_pfn)
+ end = node_max_pfn;
+ if (start < node_min_pfn)
+ start = node_min_pfn;
+ /* Keep the candidate range clear of the kernel image. */
+ if (start < start_kernel_pfn) {
+ if (end > end_kernel_pfn
+ && end - end_kernel_pfn >= bootmap_pages) {
+ bootmap_start = end_kernel_pfn;
+ break;
+ } else if (end > start_kernel_pfn)
+ end = start_kernel_pfn;
+ } else if (start < end_kernel_pfn)
+ start = end_kernel_pfn;
+ if (end - start >= bootmap_pages) {
+ bootmap_start = start;
+ break;
+ }
+ }
+
+ if (bootmap_start == -1)
+ panic("couldn't find a contiguous place for the bootmap");
+
+ /* Allocate the bootmap and mark the whole MM as reserved. */
+ bootmap_size = init_bootmem_node(NODE_DATA(nid), bootmap_start,
+ node_min_pfn, node_max_pfn);
+ DBGDCONT(" bootmap_start %lu, bootmap_size %lu, bootmap_pages %lu\n",
+ bootmap_start, bootmap_size, bootmap_pages);
+
+ /* Mark the free regions. */
+ for_each_mem_cluster(memdesc, cluster, i) {
+ if (cluster->usage & 3)
+ continue;
+
+ start = cluster->start_pfn;
+ end = cluster->start_pfn + cluster->numpages;
+
+ if (start >= node_max_pfn || end <= node_min_pfn)
+ continue;
+
+ if (end > node_max_pfn)
+ end = node_max_pfn;
+ if (start < node_min_pfn)
+ start = node_min_pfn;
+ /* A cluster that straddles the kernel image is freed in two
+ pieces: the part below it here, the part above it further down. */
+ if (start < start_kernel_pfn) {
+ if (end > end_kernel_pfn) {
+ free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start),
+ (PFN_PHYS(start_kernel_pfn)
+ - PFN_PHYS(start)));
+ printk(" freeing pages %ld:%ld\n",
+ start, start_kernel_pfn);
+ start = end_kernel_pfn;
+ } else if (end > start_kernel_pfn)
+ end = start_kernel_pfn;
+ } else if (start < end_kernel_pfn)
+ start = end_kernel_pfn;
+ if (start >= end)
+ continue;
+
+ free_bootmem_node(NODE_DATA(nid), PFN_PHYS(start), PFN_PHYS(end) - PFN_PHYS(start));
+ printk(" freeing pages %ld:%ld\n", start, end);
+ }
+
+ /* Reserve the bootmap memory. */
+ reserve_bootmem_node(NODE_DATA(nid), PFN_PHYS(bootmap_start),
+ bootmap_size, BOOTMEM_DEFAULT);
+ printk(" reserving pages %ld:%ld\n", bootmap_start, bootmap_start+PFN_UP(bootmap_size));
+
+ node_set_online(nid);
+}
+
+void __init
+setup_memory(void *kernel_end)
+{ /* Print the raw cluster layout, clear the online-node map, reset
+ min_low_pfn/max_low_pfn and run setup_memory_node() for every
+ node id below MAX_NUMNODES. With CONFIG_BLK_DEV_INITRD the
+ initrd is then either reserved in its node's bootmem or, if it
+ ends beyond max_low_pfn, handed to move_initrd(). */
+ int nid;
+
+ show_mem_layout();
+
+ nodes_clear(node_online_map);
+
+ min_low_pfn = ~0UL;
+ max_low_pfn = 0UL;
+ for (nid = 0; nid < MAX_NUMNODES; nid++)
+ setup_memory_node(nid, kernel_end);
+
+#ifdef CONFIG_BLK_DEV_INITRD
+ initrd_start = INITRD_START;
+ if (initrd_start) {
+ extern void *move_initrd(unsigned long);
+
+ initrd_end = initrd_start+INITRD_SIZE;
+ printk("Initial ramdisk at: 0x%p (%lu bytes)\n",
+ (void *) initrd_start, INITRD_SIZE);
+ /* The message is printed only when move_initrd() returns NULL. */
+ if ((void *)initrd_end > phys_to_virt(PFN_PHYS(max_low_pfn))) {
+ if (!move_initrd(PFN_PHYS(max_low_pfn)))
+ printk("initrd extends beyond end of memory "
+ "(0x%08lx > 0x%p)\ndisabling initrd\n",
+ initrd_end,
+ phys_to_virt(PFN_PHYS(max_low_pfn)));
+ } else {
+ nid = kvaddr_to_nid(initrd_start);
+ reserve_bootmem_node(NODE_DATA(nid),
+ virt_to_phys((void *)initrd_start),
+ INITRD_SIZE, BOOTMEM_DEFAULT);
+ }
+ }
+#endif /* CONFIG_BLK_DEV_INITRD */
+}
+
+/*
+ * paging_init() for the DISCONTIGMEM configuration: sets up the zones of
+ * every online node from that node's bootmem bounds and clears ZERO_PGE.
+ */
+void __init paging_init(void)
+{
+ unsigned int nid;
+ unsigned long zones_size[MAX_NR_ZONES] = {0, };
+ unsigned long dma_local_pfn;
+
+ /*
+ * The old global MAX_DMA_ADDRESS per-arch API doesn't fit
+ * in the NUMA model, for now we convert it to a pfn and
+ * we interpret this pfn as a local per-node information.
+ * This issue isn't very important since none of these machines
+ * have legacy ISA slots anyways.
+ */
+ dma_local_pfn = virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
+
+ for_each_online_node(nid) {
+ bootmem_data_t *bdata = &bootmem_node_data[nid];
+ unsigned long start_pfn = bdata->node_min_pfn;
+ unsigned long end_pfn = bdata->node_low_pfn;
+ unsigned long node_pages = end_pfn - start_pfn;
+
+ /*
+ * zones_size[] is shared by all iterations, so both entries
+ * are written for every node. Otherwise a node that fits
+ * entirely below the DMA limit would inherit the ZONE_NORMAL
+ * size computed for the previously processed node.
+ */
+ if (dma_local_pfn >= node_pages) {
+ zones_size[ZONE_DMA] = node_pages;
+ zones_size[ZONE_NORMAL] = 0;
+ } else {
+ zones_size[ZONE_DMA] = dma_local_pfn;
+ zones_size[ZONE_NORMAL] = node_pages - dma_local_pfn;
+ }
+ node_set_state(nid, N_NORMAL_MEMORY);
+ free_area_init_node(nid, zones_size, start_pfn, NULL);
+ }
+
+ /* Initialize the kernel's ZERO_PGE. */
+ memset((void *)ZERO_PGE, 0, PAGE_SIZE);
+}