summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSteve Capper <steve.capper@arm.com>2012-12-26 11:12:13 +0530
committerTushar Behera <tushar.behera@linaro.org>2013-01-22 11:41:48 +0530
commit51d454d6e0614ab199f75db70f2f6519003bc285 (patch)
tree6b87e282c6be97e226da2e9ec78afb7142fb00f7
parent2e73c5afa9dc518eb1f4e60682592bab009a66f9 (diff)
downloadlinux-topics-51d454d6e0614ab199f75db70f2f6519003bc285.tar.gz
ARM: mm: Add NUMA support.
This patch adds support for NUMA (running on either discontiguous or sparse memory). At the moment, the number of nodes has to be specified on the commandline. One can also, optionally, specify the memory size of each node. (Otherwise the memory range is split roughly equally between nodes). CPUs can be striped across nodes (cpu number modulo the number of nodes), or assigned to a node based on their topology_physical_package_id. So for instance on a TC2, the A7 cores can be grouped together in one node and the A15s grouped together in another node. Signed-off-by: Steve Capper <steve.capper@arm.com>
-rw-r--r--arch/arm/Kconfig28
-rw-r--r--arch/arm/include/asm/mmzone.h14
-rw-r--r--arch/arm/include/asm/topology.h15
-rw-r--r--arch/arm/kernel/setup.c6
-rw-r--r--arch/arm/mm/init.c53
-rw-r--r--arch/arm/mm/numa.c228
6 files changed, 303 insertions, 41 deletions
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index f688535114d..98d87f02ae6 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -56,6 +56,7 @@ config ARM
select HAVE_MOD_ARCH_SPECIFIC if ARM_UNWIND
select MODULES_USE_ELF_REL
select CLONE_BACKWARDS
+ select HAVE_MEMBLOCK_NODE_MAP
help
The ARM series is a line of low-power-consumption RISC chip designs
licensed by ARM Ltd and targeted at embedded applications and
@@ -1191,9 +1192,34 @@ config ARCH_DISCONTIGMEM_ENABLE
source arch/arm/mm/Kconfig
+config NUMA
+ bool "NUMA Support (EXPERIMENTAL)"
+ depends on MMU && !FLATMEM && EXPERIMENTAL
+ help
+ Say Y to compile the kernel to support NUMA (Non-Uniform Memory
+ Access). At the moment, one has to specify the number of nodes using
+ the commandline:
+ numa=fake=x,[size0],[size1],...,[sizeN-1],[usetopology]
+ where x is the number of nodes, and sizeY is the size of node Y in
+ bytes (one can suffix m or g for megabytes or gigabytes). If no sizes
+ are specified, the memory is distributed roughly evenly between nodes.
+ If "usetopology" is specified, the "topology_physical_package_id" is
+ used to assign CPUs to nodes (so for instance on the TC2, the A7s are
+ grouped together in one node and the A15s are grouped together in
+ another node).
+
+config NODES_SHIFT
+ int "Maximum NUMA Nodes (as a power of 2)" if NUMA
+ range 1 10
+ default "1"
+ depends on NEED_MULTIPLE_NODES
+ ---help---
+ Specify the maximum number of NUMA Nodes available on the target
+ system. Increases memory reserved to accommodate various tables.
+
config NUMA_ALLOC_NODES
bool
- depends on DISCONTIGMEM
+ depends on DISCONTIGMEM || NUMA
default y
config ARM_NR_BANKS
diff --git a/arch/arm/include/asm/mmzone.h b/arch/arm/include/asm/mmzone.h
index f6d733796dd..628e5035659 100644
--- a/arch/arm/include/asm/mmzone.h
+++ b/arch/arm/include/asm/mmzone.h
@@ -31,7 +31,19 @@ extern struct pglist_data *node_data[];
#define arm_numa_alloc_nodes(_mlow) do {} while (0)
#endif
-#define pfn_to_nid(pfn) (0)
+#ifdef CONFIG_NUMA
+extern cpumask_var_t *node_to_cpumask_map;
+extern int numa_cpu_lookup_table[];
+extern int pfn_to_nid(unsigned long pfn);
+extern void __init arm_setup_nodes(unsigned long min, unsigned long max_high);
+extern void __init arm_numa_alloc_cpumask(unsigned long max_low);
+#else
+#define pfn_to_nid(pfn) (0)
+#define arm_setup_nodes(min, max_high) memblock_set_node( \
+ __pfn_to_phys(min), \
+ __pfn_to_phys(max_high - min), 0)
+#define arm_numa_alloc_cpumask(max_low) do {} while (0)
+#endif /* CONFIG_NUMA */
#endif /* __KERNEL__ */
#endif /* __ASM_ARM_MMZONE_H_ */
diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h
index 983fa7c153a..5357eb195d1 100644
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -68,6 +68,21 @@ static inline int cluster_to_logical_mask(unsigned int socket_id,
#endif
+#ifdef CONFIG_NUMA
+
+static inline int cpu_to_node(int cpu)
+{
+ return numa_cpu_lookup_table[cpu];
+}
+
+#define cpumask_of_node(node) ((node) == -1 ? \
+ cpu_all_mask : \
+ node_to_cpumask_map[node])
+
+#define parent_node(node) (node)
+
+#endif /* CONFIG_NUMA */
+
#include <asm-generic/topology.h>
#endif /* _ASM_ARM_TOPOLOGY_H */
diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c
index dca8bc3f22e..d48cda46ea0 100644
--- a/arch/arm/kernel/setup.c
+++ b/arch/arm/kernel/setup.c
@@ -817,6 +817,12 @@ static int __init topology_init(void)
{
int cpu;
+#ifdef CONFIG_NUMA
+ int node;
+ for_each_online_node(node)
+ register_one_node(node);
+#endif
+
for_each_possible_cpu(cpu) {
struct cpuinfo_arm *cpuinfo = &per_cpu(cpu_data, cpu);
cpuinfo->cpu.hotpluggable = 1;
diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c
index 67c6c31bcb8..e5c80b6a4c8 100644
--- a/arch/arm/mm/init.c
+++ b/arch/arm/mm/init.c
@@ -269,56 +269,31 @@ void __init setup_dma_zone(struct machine_desc *mdesc)
static void __init arm_bootmem_free(unsigned long min, unsigned long max_low,
unsigned long max_high)
{
- unsigned long zone_size[MAX_NR_ZONES], zhole_size[MAX_NR_ZONES];
- struct memblock_region *reg;
+ unsigned long max_zone_pfns[MAX_NR_ZONES];
/*
- * initialise the zones.
+ * On NUMA systems we register a CPU notifier, split the memory between
+ * the nodes and bring them online before free_area_init_nodes().
+ *
+ * Otherwise, we put all memory into node 0.
*/
- memset(zone_size, 0, sizeof(zone_size));
-
+ arm_setup_nodes(min, max_high);
+
/*
- * The memory size has already been determined. If we need
- * to do anything fancy with the allocation of this memory
- * to the zones, now is the time to do it.
+ * initialise the zones.
*/
- zone_size[0] = max_low - min;
-#ifdef CONFIG_HIGHMEM
- zone_size[ZONE_HIGHMEM] = max_high - max_low;
-#endif
+ memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
+ max_zone_pfns[ZONE_NORMAL] = max_low;
- /*
- * Calculate the size of the holes.
- * holes = node_size - sum(bank_sizes)
- */
- memcpy(zhole_size, zone_size, sizeof(zhole_size));
- for_each_memblock(memory, reg) {
- unsigned long start = memblock_region_memory_base_pfn(reg);
- unsigned long end = memblock_region_memory_end_pfn(reg);
-
- if (start < max_low) {
- unsigned long low_end = min(end, max_low);
- zhole_size[0] -= low_end - start;
- }
#ifdef CONFIG_HIGHMEM
- if (end > max_low) {
- unsigned long high_start = max(start, max_low);
- zhole_size[ZONE_HIGHMEM] -= end - high_start;
- }
+ max_zone_pfns[ZONE_HIGHMEM] = max_high;
#endif
- }
-#ifdef CONFIG_ZONE_DMA
- /*
- * Adjust the sizes according to any special requirements for
- * this machine type.
- */
- if (arm_dma_zone_size)
- arm_adjust_dma_zone(zone_size, zhole_size,
- arm_dma_zone_size >> PAGE_SHIFT);
+#ifdef CONFIG_ZONE_DMA
+ max_zone_pfns[ZONE_DMA] = __phys_to_pfn(arm_dma_limit);
#endif
- free_area_init_node(0, zone_size, min, zhole_size);
+ free_area_init_nodes(max_zone_pfns);
}
#ifdef CONFIG_HAVE_ARCH_PFN_VALID
diff --git a/arch/arm/mm/numa.c b/arch/arm/mm/numa.c
index 51411349ea2..5933e2caf2d 100644
--- a/arch/arm/mm/numa.c
+++ b/arch/arm/mm/numa.c
@@ -35,10 +35,15 @@ EXPORT_SYMBOL(node_data);
static unsigned int numa_node_count = 1;
+cpumask_var_t *node_to_cpumask_map;
+EXPORT_SYMBOL(node_to_cpumask_map);
+
void __init arm_numa_alloc_nodes(unsigned long max_low)
{
int node;
+ arm_numa_alloc_cpumask(max_low);
+
for (node = 0; node < numa_node_count; node++) {
phys_addr_t pa = memblock_alloc_base(sizeof(pg_data_t),
L1_CACHE_BYTES, __pfn_to_phys(max_low));
@@ -48,3 +53,226 @@ void __init arm_numa_alloc_nodes(unsigned long max_low)
NODE_DATA(node)->bdata = &bootmem_node_data[node];
}
}
+
+#ifdef CONFIG_NUMA
+
+static unsigned int numa_use_topology;
+
+static char *memcmdline __initdata;
+
+int numa_cpu_lookup_table[NR_CPUS];
+EXPORT_SYMBOL(numa_cpu_lookup_table);
+
+static unsigned long pfn_starts[MAX_NUMNODES];
+
+#ifdef CONFIG_DISCONTIGMEM
+int pfn_to_nid(unsigned long pfn)
+{
+ int node;
+
+ for (node = numa_node_count - 1; node >= 0; node--)
+ if (pfn >= pfn_starts[node])
+ return node;
+
+ panic("NUMA: Unable to locate nid for %lX\n", pfn);
+ return 0;
+}
+#endif
+
+void __init arm_numa_alloc_cpumask(unsigned long max_low)
+{
+ size_t size = sizeof(cpumask_var_t) * numa_node_count;
+ node_to_cpumask_map = __va(memblock_alloc_base(size,
+ L1_CACHE_BYTES, __pfn_to_phys(max_low)));
+ memset(node_to_cpumask_map, 0, size);
+}
+
+/*
+ * Add a CPU to a NUMA node.
+ * Default assignment policy is the cpu number modulo the number of nodes.
+ *
+ * We can also group CPUs via the topology_physical_package_id.
+ * (if the user adds "usetopology" to the command line).
+ * When we add CPU 0 (the boot CPU), it is always to node 0, as we don't have
+ * the topology information at that time.
+ * Subsequent CPUs get added based on the topology_physical_package_id.
+ * To stop CPU0 being added to the same node as CPUs on a different cluster,
+ * we subtract the topology_physical_package_id of node 0.
+ *
+ * This ensures that the TC2 has equivalent node configurations when booted
+ * off the A15s or the A7s.
+ */
+static void add_cpu_to_node(int cpu)
+{
+ unsigned int node;
+ unsigned int n0 = topology_physical_package_id(0);
+ unsigned int nc = topology_physical_package_id(cpu);
+
+ if (numa_use_topology)
+ node = cpu ? (numa_node_count + nc - n0) % numa_node_count : 0;
+ else
+ node = cpu % numa_node_count;
+
+ cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
+ numa_cpu_lookup_table[cpu] = node;
+ pr_info("NUMA: Adding CPU %d to node %d\n", cpu, node);
+}
+
+static int __cpuinit numa_add_cpu(struct notifier_block *self,
+ unsigned long action, void *cpu)
+{
+ if (action == CPU_ONLINE)
+ add_cpu_to_node((int)cpu);
+
+ return NOTIFY_OK;
+
+}
+
+static struct notifier_block __cpuinitdata numa_node_nb = {
+ .notifier_call = numa_add_cpu,
+ .priority = 1, /* Must run before sched domains notifier. */
+};
+
+/*
+ * Split the available memory between the NUMA nodes.
+ * We want all the pages mapped by a pmd to belong to the same node; as code,
+ * such as the THP splitting code, assumes pmds are backed by contiguous
+ * struct page *s. So we mask off the sizes with "rmask".
+ *
+ * By default, the memory is distributed roughly evenly between nodes.
+ *
+ * One can also specify requested node sizes on the command line, if
+ * "memcmdline" is not NULL, we try to parse it as a size.
+ *
+ * We traverse memory blocks rather than the pfn addressable range to allow for
+ * sparse memory configurations and memory holes.
+ */
+static void __init arm_numa_split_memblocks(void)
+{
+ const unsigned long rmask = ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);
+ unsigned int node;
+ unsigned long pfnsrem = 0, pfnsblock, pfncurr, pfnend = 0;
+ struct memblock_region *reg;
+
+ for_each_memblock(memory, reg) {
+ pfnend = memblock_region_memory_end_pfn(reg);
+ pfnsrem += pfnend - memblock_region_memory_base_pfn(reg);
+ }
+
+ reg = memblock.memory.regions;
+ pfnsblock = memblock_region_memory_end_pfn(reg)
+ - memblock_region_memory_base_pfn(reg);
+
+ pfncurr = memblock_region_memory_base_pfn(reg);
+ pfn_starts[0] = pfncurr;
+
+ for (node = 0; node < numa_node_count - 1; node++) {
+ unsigned long pfnsnode = pfnsrem / (numa_node_count - node)
+ & rmask;
+
+ if (memcmdline) {
+ unsigned long nsize = __phys_to_pfn(
+ memparse(memcmdline, &memcmdline))
+ & rmask;
+ if (*memcmdline == ',')
+ ++memcmdline;
+
+ if ((nsize > 0) && (nsize < pfnsrem))
+ pfnsnode = nsize;
+ else
+ memcmdline = NULL;
+ }
+
+ while (pfnsnode > 0) {
+ unsigned long pfnsset = min(pfnsnode, pfnsblock);
+
+ pfncurr += pfnsset;
+
+ pfnsblock -= pfnsset;
+ pfnsrem -= pfnsset;
+ pfnsnode -= pfnsset;
+
+ if (pfnsblock == 0) {
+ reg++;
+ pfnsblock = memblock_region_memory_end_pfn(reg)
+ - memblock_region_memory_base_pfn(reg);
+ pfncurr = memblock_region_memory_base_pfn(reg);
+ }
+ }
+
+ pfn_starts[node + 1] = pfncurr;
+ }
+
+ for (node = 0; node < numa_node_count - 1; node++)
+ memblock_set_node(__pfn_to_phys(pfn_starts[node]),
+ __pfn_to_phys(pfn_starts[node + 1] - pfn_starts[node]),
+ node);
+
+ memblock_set_node(__pfn_to_phys(pfn_starts[node]),
+ __pfn_to_phys(pfnend - pfn_starts[node]), node);
+
+}
+
+void __init arm_setup_nodes(unsigned long min, unsigned long max_high)
+{
+ int node;
+
+ register_cpu_notifier(&numa_node_nb);
+ arm_numa_split_memblocks();
+
+
+ for (node = 0; node < numa_node_count; node++) {
+ alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
+ node_set_online(node);
+ }
+
+ add_cpu_to_node(0);
+
+}
+
+static int __init early_numa(char *p)
+{
+ if (!p)
+ return 0;
+
+ p = strstr(p, "fake=");
+ if (p) {
+ int num_nodes = 0;
+ int optres;
+
+ p += strlen("fake=");
+ optres = get_option(&p, &num_nodes);
+ if ((optres == 0) || (optres == 3))
+ return -EINVAL;
+
+ if ((num_nodes > 0) && (num_nodes <= MAX_NUMNODES)) {
+ pr_info("NUMA: setting up fake NUMA with %d nodes.\n",
+ num_nodes);
+
+ numa_node_count = num_nodes;
+ } else {
+ pr_info("NUMA: can't set up %d nodes for NUMA (MAX_NUMNODES = %d)\n",
+ num_nodes, MAX_NUMNODES);
+ return -EINVAL;
+ }
+
+ /*
+ * If a comma was specified after the number of nodes then subsequent
+ * numbers should be regarded as memory sizes for each node for as
+ * many nodes as are supplied.
+ */
+ if (optres == 2)
+ memcmdline = p;
+
+ if (strstr(p, "usetopology")) {
+ numa_use_topology = 1;
+ pr_info("NUMA: using CPU topology to assign nodes.\n");
+ } else
+ pr_info("NUMA: NOT using CPU topology.\n");
+ }
+
+ return 0;
+}
+early_param("numa", early_numa);
+
+#endif /* CONFIG_NUMA */