diff options
Diffstat (limited to 'kernel/arch/s390/numa')
-rw-r--r-- | kernel/arch/s390/numa/Makefile | 3 | ||||
-rw-r--r-- | kernel/arch/s390/numa/mode_emu.c | 536 | ||||
-rw-r--r-- | kernel/arch/s390/numa/numa.c | 184 | ||||
-rw-r--r-- | kernel/arch/s390/numa/numa_mode.h | 24 | ||||
-rw-r--r-- | kernel/arch/s390/numa/toptree.c | 342 | ||||
-rw-r--r-- | kernel/arch/s390/numa/toptree.h | 60 |
6 files changed, 1149 insertions, 0 deletions
diff --git a/kernel/arch/s390/numa/Makefile b/kernel/arch/s390/numa/Makefile new file mode 100644 index 000000000..f94ecaffa --- /dev/null +++ b/kernel/arch/s390/numa/Makefile @@ -0,0 +1,3 @@ +obj-y += numa.o +obj-y += toptree.o +obj-$(CONFIG_NUMA_EMU) += mode_emu.o diff --git a/kernel/arch/s390/numa/mode_emu.c b/kernel/arch/s390/numa/mode_emu.c new file mode 100644 index 000000000..828d0695d --- /dev/null +++ b/kernel/arch/s390/numa/mode_emu.c @@ -0,0 +1,536 @@ +/* + * NUMA support for s390 + * + * NUMA emulation (aka fake NUMA) distributes the available memory to nodes + * without using real topology information about the physical memory of the + * machine. + * + * It distributes the available CPUs to nodes while respecting the original + * machine topology information. This is done by trying to avoid to separate + * CPUs which reside on the same book or even on the same MC. + * + * Because the current Linux scheduler code requires a stable cpu to node + * mapping, cores are pinned to nodes when the first CPU thread is set online. + * + * Copyright IBM Corp. 2015 + */ + +#define KMSG_COMPONENT "numa_emu" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/cpumask.h> +#include <linux/memblock.h> +#include <linux/node.h> +#include <linux/memory.h> +#include <linux/slab.h> +#include <asm/smp.h> +#include <asm/topology.h> +#include "numa_mode.h" +#include "toptree.h" + +/* Distances between the different system components */ +#define DIST_EMPTY 0 +#define DIST_CORE 1 +#define DIST_MC 2 +#define DIST_BOOK 3 +#define DIST_MAX 4 + +/* Node distance reported to common code */ +#define EMU_NODE_DIST 10 + +/* Node ID for free (not yet pinned) cores */ +#define NODE_ID_FREE -1 + +/* Different levels of toptree */ +enum toptree_level {CORE, MC, BOOK, NODE, TOPOLOGY}; + +/* The two toptree IDs */ +enum {TOPTREE_ID_PHYS, TOPTREE_ID_NUMA}; + +/* Number of NUMA nodes */ +static int emu_nodes = 1; +/* NUMA stripe size */ +static unsigned long emu_size; + +/* + * Node to core pinning information updates are protected by + * "sched_domains_mutex". + */ +static struct { + s32 to_node_id[CONFIG_NR_CPUS]; /* Pinned core to node mapping */ + int total; /* Total number of pinned cores */ + int per_node_target; /* Cores per node without extra cores */ + int per_node[MAX_NUMNODES]; /* Number of cores pinned to node */ +} *emu_cores; + +/* + * Pin a core to a node + */ +static void pin_core_to_node(int core_id, int node_id) +{ + if (emu_cores->to_node_id[core_id] == NODE_ID_FREE) { + emu_cores->per_node[node_id]++; + emu_cores->to_node_id[core_id] = node_id; + emu_cores->total++; + } else { + WARN_ON(emu_cores->to_node_id[core_id] != node_id); + } +} + +/* + * Number of pinned cores of a node + */ +static int cores_pinned(struct toptree *node) +{ + return emu_cores->per_node[node->id]; +} + +/* + * ID of the node where the core is pinned (or NODE_ID_FREE) + */ +static int core_pinned_to_node_id(struct toptree *core) +{ + return emu_cores->to_node_id[core->id]; +} + +/* + * Number of cores in the tree that are not yet pinned + */ +static int cores_free(struct toptree *tree) +{ + struct toptree *core; + int count = 0; + + toptree_for_each(core, tree, CORE) { + if (core_pinned_to_node_id(core) == NODE_ID_FREE) + count++; + } + return count; +} + +/* + * Return node of core + */ +static struct toptree *core_node(struct toptree *core) +{ + return core->parent->parent->parent; +} + +/* + * Return book of core + */ +static struct toptree *core_book(struct toptree *core) +{ + return core->parent->parent; +} + +/* + * Return mc of core + */ +static struct toptree *core_mc(struct toptree *core) +{ + return core->parent; +} + +/* + * Distance between two cores + */ +static int dist_core_to_core(struct toptree *core1, struct toptree *core2) +{ + if (core_book(core1)->id != core_book(core2)->id) + return DIST_BOOK; + if (core_mc(core1)->id != core_mc(core2)->id) + return DIST_MC; + /* Same core or sibling on same MC */ + return DIST_CORE; +} + +/* + * Distance of a node to a core + */ +static int dist_node_to_core(struct toptree *node, struct toptree *core) +{ + struct toptree *core_node; + int dist_min = DIST_MAX; + + toptree_for_each(core_node, node, CORE) + dist_min = min(dist_min, dist_core_to_core(core_node, core)); + return dist_min == DIST_MAX ? DIST_EMPTY : dist_min; +} + +/* + * Unify will delete empty nodes, therefore recreate nodes. + */ +static void toptree_unify_tree(struct toptree *tree) +{ + int nid; + + toptree_unify(tree); + for (nid = 0; nid < emu_nodes; nid++) + toptree_get_child(tree, nid); +} + +/* + * Find the best/nearest node for a given core and ensure that no node + * gets more than "emu_cores->per_node_target + extra" cores. + */ +static struct toptree *node_for_core(struct toptree *numa, struct toptree *core, + int extra) +{ + struct toptree *node, *node_best = NULL; + int dist_cur, dist_best, cores_target; + + cores_target = emu_cores->per_node_target + extra; + dist_best = DIST_MAX; + node_best = NULL; + toptree_for_each(node, numa, NODE) { + /* Already pinned cores must use their nodes */ + if (core_pinned_to_node_id(core) == node->id) { + node_best = node; + break; + } + /* Skip nodes that already have enough cores */ + if (cores_pinned(node) >= cores_target) + continue; + dist_cur = dist_node_to_core(node, core); + if (dist_cur < dist_best) { + dist_best = dist_cur; + node_best = node; + } + } + return node_best; +} + +/* + * Find the best node for each core with respect to "extra" core count + */ +static void toptree_to_numa_single(struct toptree *numa, struct toptree *phys, + int extra) +{ + struct toptree *node, *core, *tmp; + + toptree_for_each_safe(core, tmp, phys, CORE) { + node = node_for_core(numa, core, extra); + if (!node) + return; + toptree_move(core, node); + pin_core_to_node(core->id, node->id); + } +} + +/* + * Move structures of given level to specified NUMA node + */ +static void move_level_to_numa_node(struct toptree *node, struct toptree *phys, + enum toptree_level level, bool perfect) +{ + int cores_free, cores_target = emu_cores->per_node_target; + struct toptree *cur, *tmp; + + toptree_for_each_safe(cur, tmp, phys, level) { + cores_free = cores_target - toptree_count(node, CORE); + if (perfect) { + if (cores_free == toptree_count(cur, CORE)) + toptree_move(cur, node); + } else { + if (cores_free >= toptree_count(cur, CORE)) + toptree_move(cur, node); + } + } +} + +/* + * Move structures of a given level to NUMA nodes. If "perfect" is specified + * move only perfectly fitting structures. Otherwise move also smaller + * than needed structures. + */ +static void move_level_to_numa(struct toptree *numa, struct toptree *phys, + enum toptree_level level, bool perfect) +{ + struct toptree *node; + + toptree_for_each(node, numa, NODE) + move_level_to_numa_node(node, phys, level, perfect); +} + +/* + * For the first run try to move the big structures + */ +static void toptree_to_numa_first(struct toptree *numa, struct toptree *phys) +{ + struct toptree *core; + + /* Always try to move perfectly fitting structures first */ + move_level_to_numa(numa, phys, BOOK, true); + move_level_to_numa(numa, phys, BOOK, false); + move_level_to_numa(numa, phys, MC, true); + move_level_to_numa(numa, phys, MC, false); + /* Now pin all the moved cores */ + toptree_for_each(core, numa, CORE) + pin_core_to_node(core->id, core_node(core)->id); +} + +/* + * Allocate new topology and create required nodes + */ +static struct toptree *toptree_new(int id, int nodes) +{ + struct toptree *tree; + int nid; + + tree = toptree_alloc(TOPOLOGY, id); + if (!tree) + goto fail; + for (nid = 0; nid < nodes; nid++) { + if (!toptree_get_child(tree, nid)) + goto fail; + } + return tree; +fail: + panic("NUMA emulation could not allocate topology"); +} + +/* + * Allocate and initialize core to node mapping + */ +static void create_core_to_node_map(void) +{ + int i; + + emu_cores = kzalloc(sizeof(*emu_cores), GFP_KERNEL); + if (emu_cores == NULL) + panic("Could not allocate cores to node memory"); + for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++) + emu_cores->to_node_id[i] = NODE_ID_FREE; +} + +/* + * Move cores from physical topology into NUMA target topology + * and try to keep as much of the physical topology as possible. + */ +static struct toptree *toptree_to_numa(struct toptree *phys) +{ + static int first = 1; + struct toptree *numa; + int cores_total; + + cores_total = emu_cores->total + cores_free(phys); + emu_cores->per_node_target = cores_total / emu_nodes; + numa = toptree_new(TOPTREE_ID_NUMA, emu_nodes); + if (first) { + toptree_to_numa_first(numa, phys); + first = 0; + } + toptree_to_numa_single(numa, phys, 0); + toptree_to_numa_single(numa, phys, 1); + toptree_unify_tree(numa); + + WARN_ON(cpumask_weight(&phys->mask)); + return numa; +} + +/* + * Create a toptree out of the physical topology that we got from the hypervisor + */ +static struct toptree *toptree_from_topology(void) +{ + struct toptree *phys, *node, *book, *mc, *core; + struct cpu_topology_s390 *top; + int cpu; + + phys = toptree_new(TOPTREE_ID_PHYS, 1); + + for_each_online_cpu(cpu) { + top = &per_cpu(cpu_topology, cpu); + node = toptree_get_child(phys, 0); + book = toptree_get_child(node, top->book_id); + mc = toptree_get_child(book, top->socket_id); + core = toptree_get_child(mc, top->core_id); + if (!book || !mc || !core) + panic("NUMA emulation could not allocate memory"); + cpumask_set_cpu(cpu, &core->mask); + toptree_update_mask(mc); + } + return phys; +} + +/* + * Add toptree core to topology and create correct CPU masks + */ +static void topology_add_core(struct toptree *core) +{ + struct cpu_topology_s390 *top; + int cpu; + + for_each_cpu(cpu, &core->mask) { + top = &per_cpu(cpu_topology, cpu); + cpumask_copy(&top->thread_mask, &core->mask); + cpumask_copy(&top->core_mask, &core_mc(core)->mask); + cpumask_copy(&top->book_mask, &core_book(core)->mask); + cpumask_set_cpu(cpu, &node_to_cpumask_map[core_node(core)->id]); + top->node_id = core_node(core)->id; + } +} + +/* + * Apply toptree to topology and create CPU masks + */ +static void toptree_to_topology(struct toptree *numa) +{ + struct toptree *core; + int i; + + /* Clear all node masks */ + for (i = 0; i < MAX_NUMNODES; i++) + cpumask_clear(&node_to_cpumask_map[i]); + + /* Rebuild all masks */ + toptree_for_each(core, numa, CORE) + topology_add_core(core); +} + +/* + * Show the node to core mapping + */ +static void print_node_to_core_map(void) +{ + int nid, cid; + + if (!numa_debug_enabled) + return; + printk(KERN_DEBUG "NUMA node to core mapping\n"); + for (nid = 0; nid < emu_nodes; nid++) { + printk(KERN_DEBUG " node %3d: ", nid); + for (cid = 0; cid < ARRAY_SIZE(emu_cores->to_node_id); cid++) { + if (emu_cores->to_node_id[cid] == nid) + printk(KERN_CONT "%d ", cid); + } + printk(KERN_CONT "\n"); + } +} + +/* + * Transfer physical topology into a NUMA topology and modify CPU masks + * according to the NUMA topology. + * + * Must be called with "sched_domains_mutex" lock held. + */ +static void emu_update_cpu_topology(void) +{ + struct toptree *phys, *numa; + + if (emu_cores == NULL) + create_core_to_node_map(); + phys = toptree_from_topology(); + numa = toptree_to_numa(phys); + toptree_free(phys); + toptree_to_topology(numa); + toptree_free(numa); + print_node_to_core_map(); +} + +/* + * If emu_size is not set, use CONFIG_EMU_SIZE. Then round to minimum + * alignment (needed for memory hotplug). + */ +static unsigned long emu_setup_size_adjust(unsigned long size) +{ + unsigned long size_new; + + size = size ? : CONFIG_EMU_SIZE; + size_new = roundup(size, memory_block_size_bytes()); + if (size_new == size) + return size; + pr_warn("Increasing memory stripe size from %ld MB to %ld MB\n", + size >> 20, size_new >> 20); + return size_new; +} + +/* + * If we have not enough memory for the specified nodes, reduce the node count. + */ +static int emu_setup_nodes_adjust(int nodes) +{ + int nodes_max; + + nodes_max = memblock.memory.total_size / emu_size; + nodes_max = max(nodes_max, 1); + if (nodes_max >= nodes) + return nodes; + pr_warn("Not enough memory for %d nodes, reducing node count\n", nodes); + return nodes_max; +} + +/* + * Early emu setup + */ +static void emu_setup(void) +{ + emu_size = emu_setup_size_adjust(emu_size); + emu_nodes = emu_setup_nodes_adjust(emu_nodes); + pr_info("Creating %d nodes with memory stripe size %ld MB\n", + emu_nodes, emu_size >> 20); +} + +/* + * Return node id for given page number + */ +static int emu_pfn_to_nid(unsigned long pfn) +{ + return (pfn / (emu_size >> PAGE_SHIFT)) % emu_nodes; +} + +/* + * Return stripe size + */ +static unsigned long emu_align(void) +{ + return emu_size; +} + +/* + * Return distance between two nodes + */ +static int emu_distance(int node1, int node2) +{ + return (node1 != node2) * EMU_NODE_DIST; +} + +/* + * Define callbacks for generic s390 NUMA infrastructure + */ +const struct numa_mode numa_mode_emu = { + .name = "emu", + .setup = emu_setup, + .update_cpu_topology = emu_update_cpu_topology, + .__pfn_to_nid = emu_pfn_to_nid, + .align = emu_align, + .distance = emu_distance, +}; + +/* + * Kernel parameter: emu_nodes=<n> + */ +static int __init early_parse_emu_nodes(char *p) +{ + int count; + + if (kstrtoint(p, 0, &count) != 0 || count <= 0) + return 0; + if (count <= 0) + return 0; + emu_nodes = min(count, MAX_NUMNODES); + return 0; +} +early_param("emu_nodes", early_parse_emu_nodes); + +/* + * Kernel parameter: emu_size=[<n>[k|M|G|T]] + */ +static int __init early_parse_emu_size(char *p) +{ + emu_size = memparse(p, NULL); + return 0; +} +early_param("emu_size", early_parse_emu_size); diff --git a/kernel/arch/s390/numa/numa.c b/kernel/arch/s390/numa/numa.c new file mode 100644 index 000000000..43f32ce60 --- /dev/null +++ b/kernel/arch/s390/numa/numa.c @@ -0,0 +1,184 @@ +/* + * NUMA support for s390 + * + * Implement NUMA core code. + * + * Copyright IBM Corp. 2015 + */ + +#define KMSG_COMPONENT "numa" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include <linux/kernel.h> +#include <linux/mmzone.h> +#include <linux/cpumask.h> +#include <linux/bootmem.h> +#include <linux/memblock.h> +#include <linux/slab.h> +#include <linux/node.h> + +#include <asm/numa.h> +#include "numa_mode.h" + +pg_data_t *node_data[MAX_NUMNODES]; +EXPORT_SYMBOL(node_data); + +cpumask_t node_to_cpumask_map[MAX_NUMNODES]; +EXPORT_SYMBOL(node_to_cpumask_map); + +const struct numa_mode numa_mode_plain = { + .name = "plain", +}; + +static const struct numa_mode *mode = &numa_mode_plain; + +int numa_pfn_to_nid(unsigned long pfn) +{ + return mode->__pfn_to_nid ? mode->__pfn_to_nid(pfn) : 0; +} + +void numa_update_cpu_topology(void) +{ + if (mode->update_cpu_topology) + mode->update_cpu_topology(); +} + +int __node_distance(int a, int b) +{ + return mode->distance ? mode->distance(a, b) : 0; +} + +int numa_debug_enabled; + +/* + * alloc_node_data() - Allocate node data + */ +static __init pg_data_t *alloc_node_data(void) +{ + pg_data_t *res; + + res = (pg_data_t *) memblock_alloc(sizeof(pg_data_t), 1); + if (!res) + panic("Could not allocate memory for node data!\n"); + memset(res, 0, sizeof(pg_data_t)); + return res; +} + +/* + * numa_setup_memory() - Assign bootmem to nodes + * + * The memory is first added to memblock without any respect to nodes. + * This is fixed before remaining memblock memory is handed over to the + * buddy allocator. + * An important side effect is that large bootmem allocations might easily + * cross node boundaries, which can be needed for large allocations with + * smaller memory stripes in each node (i.e. when using NUMA emulation). + * + * Memory defines nodes: + * Therefore this routine also sets the nodes online with memory. + */ +static void __init numa_setup_memory(void) +{ + unsigned long cur_base, align, end_of_dram; + int nid = 0; + + end_of_dram = memblock_end_of_DRAM(); + align = mode->align ? mode->align() : ULONG_MAX; + + /* + * Step through all available memory and assign it to the nodes + * indicated by the mode implementation. + * All nodes which are seen here will be set online. + */ + cur_base = 0; + do { + nid = numa_pfn_to_nid(PFN_DOWN(cur_base)); + node_set_online(nid); + memblock_set_node(cur_base, align, &memblock.memory, nid); + cur_base += align; + } while (cur_base < end_of_dram); + + /* Allocate and fill out node_data */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + NODE_DATA(nid) = alloc_node_data(); + + for_each_online_node(nid) { + unsigned long start_pfn, end_pfn; + unsigned long t_start, t_end; + int i; + + start_pfn = ULONG_MAX; + end_pfn = 0; + for_each_mem_pfn_range(i, nid, &t_start, &t_end, NULL) { + if (t_start < start_pfn) + start_pfn = t_start; + if (t_end > end_pfn) + end_pfn = t_end; + } + NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn; + NODE_DATA(nid)->node_id = nid; + } +} + +/* + * numa_setup() - Earliest initialization + * + * Assign the mode and call the mode's setup routine. + */ +void __init numa_setup(void) +{ + pr_info("NUMA mode: %s\n", mode->name); + if (mode->setup) + mode->setup(); + numa_setup_memory(); + memblock_dump_all(); +} + + +/* + * numa_init_early() - Initialization initcall + * + * This runs when only one CPU is online and before the first + * topology update is called for by the scheduler. + */ +static int __init numa_init_early(void) +{ + /* Attach all possible CPUs to node 0 for now. */ + cpumask_copy(&node_to_cpumask_map[0], cpu_possible_mask); + return 0; +} +early_initcall(numa_init_early); + +/* + * numa_init_late() - Initialization initcall + * + * Register NUMA nodes. + */ +static int __init numa_init_late(void) +{ + int nid; + + for_each_online_node(nid) + register_one_node(nid); + return 0; +} +device_initcall(numa_init_late); + +static int __init parse_debug(char *parm) +{ + numa_debug_enabled = 1; + return 0; +} +early_param("numa_debug", parse_debug); + +static int __init parse_numa(char *parm) +{ + if (strcmp(parm, numa_mode_plain.name) == 0) + mode = &numa_mode_plain; +#ifdef CONFIG_NUMA_EMU + if (strcmp(parm, numa_mode_emu.name) == 0) + mode = &numa_mode_emu; +#endif + return 0; +} +early_param("numa", parse_numa); diff --git a/kernel/arch/s390/numa/numa_mode.h b/kernel/arch/s390/numa/numa_mode.h new file mode 100644 index 000000000..08953b0b1 --- /dev/null +++ b/kernel/arch/s390/numa/numa_mode.h @@ -0,0 +1,24 @@ +/* + * NUMA support for s390 + * + * Define declarations used for communication between NUMA mode + * implementations and NUMA core functionality. + * + * Copyright IBM Corp. 2015 + */ +#ifndef __S390_NUMA_MODE_H +#define __S390_NUMA_MODE_H + +struct numa_mode { + char *name; /* Name of mode */ + void (*setup)(void); /* Initizalize mode */ + void (*update_cpu_topology)(void); /* Called by topology code */ + int (*__pfn_to_nid)(unsigned long pfn); /* PFN to node ID */ + unsigned long (*align)(void); /* Minimum node alignment */ + int (*distance)(int a, int b); /* Distance between two nodes */ +}; + +extern const struct numa_mode numa_mode_plain; +extern const struct numa_mode numa_mode_emu; + +#endif /* __S390_NUMA_MODE_H */ diff --git a/kernel/arch/s390/numa/toptree.c b/kernel/arch/s390/numa/toptree.c new file mode 100644 index 000000000..902d350d8 --- /dev/null +++ b/kernel/arch/s390/numa/toptree.c @@ -0,0 +1,342 @@ +/* + * NUMA support for s390 + * + * A tree structure used for machine topology mangling + * + * Copyright IBM Corp. 2015 + */ + +#include <linux/kernel.h> +#include <linux/cpumask.h> +#include <linux/list.h> +#include <linux/list_sort.h> +#include <linux/slab.h> +#include <asm/numa.h> + +#include "toptree.h" + +/** + * toptree_alloc - Allocate and initialize a new tree node. + * @level: The node's vertical level; level 0 contains the leaves. + * @id: ID number, explicitly not unique beyond scope of node's siblings + * + * Allocate a new tree node and initialize it. + * + * RETURNS: + * Pointer to the new tree node or NULL on error + */ +struct toptree *toptree_alloc(int level, int id) +{ + struct toptree *res = kzalloc(sizeof(struct toptree), GFP_KERNEL); + + if (!res) + return res; + + INIT_LIST_HEAD(&res->children); + INIT_LIST_HEAD(&res->sibling); + cpumask_clear(&res->mask); + res->level = level; + res->id = id; + return res; +} + +/** + * toptree_remove - Remove a tree node from a tree + * @cand: Pointer to the node to remove + * + * The node is detached from its parent node. The parent node's + * masks will be updated to reflect the loss of the child. + */ +static void toptree_remove(struct toptree *cand) +{ + struct toptree *oldparent; + + list_del_init(&cand->sibling); + oldparent = cand->parent; + cand->parent = NULL; + toptree_update_mask(oldparent); +} + +/** + * toptree_free - discard a tree node + * @cand: Pointer to the tree node to discard + * + * Checks if @cand is attached to a parent node. Detaches it + * cleanly using toptree_remove. Possible children are freed + * recursively. In the end @cand itself is freed. + */ +void toptree_free(struct toptree *cand) +{ + struct toptree *child, *tmp; + + if (cand->parent) + toptree_remove(cand); + toptree_for_each_child_safe(child, tmp, cand) + toptree_free(child); + kfree(cand); +} + +/** + * toptree_update_mask - Update node bitmasks + * @cand: Pointer to a tree node + * + * The node's cpumask will be updated by combining all children's + * masks. Then toptree_update_mask is called recursively for the + * parent if applicable. + * + * NOTE: + * This must not be called on leaves. If called on a leaf, its + * CPU mask is cleared and lost. + */ +void toptree_update_mask(struct toptree *cand) +{ + struct toptree *child; + + cpumask_clear(&cand->mask); + list_for_each_entry(child, &cand->children, sibling) + cpumask_or(&cand->mask, &cand->mask, &child->mask); + if (cand->parent) + toptree_update_mask(cand->parent); +} + +/** + * toptree_insert - Insert a tree node into tree + * @cand: Pointer to the node to insert + * @target: Pointer to the node to which @cand will added as a child + * + * Insert a tree node into a tree. Masks will be updated automatically. + * + * RETURNS: + * 0 on success, -1 if NULL is passed as argument or the node levels + * don't fit. + */ +static int toptree_insert(struct toptree *cand, struct toptree *target) +{ + if (!cand || !target) + return -1; + if (target->level != (cand->level + 1)) + return -1; + list_add_tail(&cand->sibling, &target->children); + cand->parent = target; + toptree_update_mask(target); + return 0; +} + +/** + * toptree_move_children - Move all child nodes of a node to a new place + * @cand: Pointer to the node whose children are to be moved + * @target: Pointer to the node to which @cand's children will be attached + * + * Take all child nodes of @cand and move them using toptree_move. + */ +static void toptree_move_children(struct toptree *cand, struct toptree *target) +{ + struct toptree *child, *tmp; + + toptree_for_each_child_safe(child, tmp, cand) + toptree_move(child, target); +} + +/** + * toptree_unify - Merge children with same ID + * @cand: Pointer to node whose direct children should be made unique + * + * When mangling the tree it is possible that a node has two or more children + * which have the same ID. This routine merges these children into one and + * moves all children of the merged nodes into the unified node. + */ +void toptree_unify(struct toptree *cand) +{ + struct toptree *child, *tmp, *cand_copy; + + /* Threads cannot be split, cores are not split */ + if (cand->level < 2) + return; + + cand_copy = toptree_alloc(cand->level, 0); + toptree_for_each_child_safe(child, tmp, cand) { + struct toptree *tmpchild; + + if (!cpumask_empty(&child->mask)) { + tmpchild = toptree_get_child(cand_copy, child->id); + toptree_move_children(child, tmpchild); + } + toptree_free(child); + } + toptree_move_children(cand_copy, cand); + toptree_free(cand_copy); + + toptree_for_each_child(child, cand) + toptree_unify(child); +} + +/** + * toptree_move - Move a node to another context + * @cand: Pointer to the node to move + * @target: Pointer to the node where @cand should go + * + * In the easiest case @cand is exactly on the level below @target + * and will be immediately moved to the target. + * + * If @target's level is not the direct parent level of @cand, + * nodes for the missing levels are created and put between + * @cand and @target. The "stacking" nodes' IDs are taken from + * @cand's parents. + * + * After this it is likely to have redundant nodes in the tree + * which are addressed by means of toptree_unify. + */ +void toptree_move(struct toptree *cand, struct toptree *target) +{ + struct toptree *stack_target, *real_insert_point, *ptr, *tmp; + + if (cand->level + 1 == target->level) { + toptree_remove(cand); + toptree_insert(cand, target); + return; + } + + real_insert_point = NULL; + ptr = cand; + stack_target = NULL; + + do { + tmp = stack_target; + stack_target = toptree_alloc(ptr->level + 1, + ptr->parent->id); + toptree_insert(tmp, stack_target); + if (!real_insert_point) + real_insert_point = stack_target; + ptr = ptr->parent; + } while (stack_target->level < (target->level - 1)); + + toptree_remove(cand); + toptree_insert(cand, real_insert_point); + toptree_insert(stack_target, target); +} + +/** + * toptree_get_child - Access a tree node's child by its ID + * @cand: Pointer to tree node whose child is to access + * @id: The desired child's ID + * + * @cand's children are searched for a child with matching ID. + * If no match can be found, a new child with the desired ID + * is created and returned. + */ +struct toptree *toptree_get_child(struct toptree *cand, int id) +{ + struct toptree *child; + + toptree_for_each_child(child, cand) + if (child->id == id) + return child; + child = toptree_alloc(cand->level-1, id); + toptree_insert(child, cand); + return child; +} + +/** + * toptree_first - Find the first descendant on specified level + * @context: Pointer to tree node whose descendants are to be used + * @level: The level of interest + * + * RETURNS: + * @context's first descendant on the specified level, or NULL + * if there is no matching descendant + */ +struct toptree *toptree_first(struct toptree *context, int level) +{ + struct toptree *child, *tmp; + + if (context->level == level) + return context; + + if (!list_empty(&context->children)) { + list_for_each_entry(child, &context->children, sibling) { + tmp = toptree_first(child, level); + if (tmp) + return tmp; + } + } + return NULL; +} + +/** + * toptree_next_sibling - Return next sibling + * @cur: Pointer to a tree node + * + * RETURNS: + * If @cur has a parent and is not the last in the parent's children list, + * the next sibling is returned. Or NULL when there are no siblings left. + */ +static struct toptree *toptree_next_sibling(struct toptree *cur) +{ + if (cur->parent == NULL) + return NULL; + + if (cur == list_last_entry(&cur->parent->children, + struct toptree, sibling)) + return NULL; + return (struct toptree *) list_next_entry(cur, sibling); +} + +/** + * toptree_next - Tree traversal function + * @cur: Pointer to current element + * @context: Pointer to the root node of the tree or subtree to + * be traversed. + * @level: The level of interest. + * + * RETURNS: + * Pointer to the next node on level @level + * or NULL when there is no next node. + */ +struct toptree *toptree_next(struct toptree *cur, struct toptree *context, + int level) +{ + struct toptree *cur_context, *tmp; + + if (!cur) + return NULL; + + if (context->level == level) + return NULL; + + tmp = toptree_next_sibling(cur); + if (tmp != NULL) + return tmp; + + cur_context = cur; + while (cur_context->level < context->level - 1) { + /* Step up */ + cur_context = cur_context->parent; + /* Step aside */ + tmp = toptree_next_sibling(cur_context); + if (tmp != NULL) { + /* Step down */ + tmp = toptree_first(tmp, level); + if (tmp != NULL) + return tmp; + } + } + return NULL; +} + +/** + * toptree_count - Count descendants on specified level + * @context: Pointer to node whose descendants are to be considered + * @level: Only descendants on the specified level will be counted + * + * RETURNS: + * Number of descendants on the specified level + */ +int toptree_count(struct toptree *context, int level) +{ + struct toptree *cur; + int cnt = 0; + + toptree_for_each(cur, context, level) + cnt++; + return cnt; +} diff --git a/kernel/arch/s390/numa/toptree.h b/kernel/arch/s390/numa/toptree.h new file mode 100644 index 000000000..bdf502027 --- /dev/null +++ b/kernel/arch/s390/numa/toptree.h @@ -0,0 +1,60 @@ +/* + * NUMA support for s390 + * + * A tree structure used for machine topology mangling + * + * Copyright IBM Corp. 2015 + */ +#ifndef S390_TOPTREE_H +#define S390_TOPTREE_H + +#include <linux/cpumask.h> +#include <linux/list.h> + +struct toptree { + int level; + int id; + cpumask_t mask; + struct toptree *parent; + struct list_head sibling; + struct list_head children; +}; + +struct toptree *toptree_alloc(int level, int id); +void toptree_free(struct toptree *cand); +void toptree_update_mask(struct toptree *cand); +void toptree_unify(struct toptree *cand); +struct toptree *toptree_get_child(struct toptree *cand, int id); +void toptree_move(struct toptree *cand, struct toptree *target); +int toptree_count(struct toptree *context, int level); + +struct toptree *toptree_first(struct toptree *context, int level); +struct toptree *toptree_next(struct toptree *cur, struct toptree *context, + int level); + +#define toptree_for_each_child(child, ptree) \ + list_for_each_entry(child, &ptree->children, sibling) + +#define toptree_for_each_child_safe(child, ptmp, ptree) \ + list_for_each_entry_safe(child, ptmp, &ptree->children, sibling) + +#define toptree_is_last(ptree) \ + ((ptree->parent == NULL) || \ + (ptree->parent->children.prev == &ptree->sibling)) + +#define toptree_for_each(ptree, cont, ttype) \ + for (ptree = toptree_first(cont, ttype); \ + ptree != NULL; \ + ptree = toptree_next(ptree, cont, ttype)) + +#define toptree_for_each_safe(ptree, tmp, cont, ttype) \ + for (ptree = toptree_first(cont, ttype), \ + tmp = toptree_next(ptree, cont, ttype); \ + ptree != NULL; \ + ptree = tmp, \ + tmp = toptree_next(ptree, cont, ttype)) + +#define toptree_for_each_sibling(ptree, start) \ + toptree_for_each(ptree, start->parent, start->level) + +#endif /* S390_TOPTREE_H */ |