From 8b8ca80e192b10eecc01fc44a2902510af86f73b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH] x86-64: configurable fake numa node sizes Extends the numa=fake x86_64 command-line option to allow for configurable node sizes. These nodes can be used in conjunction with cpusets for coarse memory resource management. The old command-line option is still supported: numa=fake=32 gives 32 fake NUMA nodes, ignoring the NUMA setup of the actual machine. But now you may configure your system for the node sizes of your choice: numa=fake=2*512,1024,2*256 gives two 512M nodes, one 1024M node, two 256M nodes, and the rest of system memory to a sixth node. The existing hash function is maintained to support the various node sizes that are possible with this implementation. Each node of the same size receives roughly the same amount of available pages, regardless of any reserved memory with its address range. The total available pages on the system is calculated and divided by the number of equal nodes to allocate. These nodes are then dynamically allocated and their borders extended until such time as their number of available pages reaches the required size. Configurable node sizes are recommended when used in conjunction with cpusets for memory control because it eliminates the overhead associated with scanning the zonelists of many smaller full nodes on page_alloc(). Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Andi Kleen Cc: Paul Jackson Cc: Christoph Lameter Signed-off-by: Andrew Morton --- Documentation/x86_64/boot-options.txt | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'Documentation/x86_64/boot-options.txt') diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 85f51e5a749f..7500aad95f3c 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -149,7 +149,13 @@ NUMA numa=noacpi Don't parse the SRAT table for NUMA setup - numa=fake=X Fake X nodes and ignore NUMA setup of the actual machine. + numa=fake=CMDLINE + If a number, fakes CMDLINE nodes and ignores NUMA setup of the + actual machine. Otherwise, system memory is configured + depending on the sizes and coefficients listed. For example: + numa=fake=2*512,1024,4*256 + gives two 512M nodes, a 1024M node, and four 256M nodes. The + remaining system RAM is allocated to an additional node. numa=hotadd=percent Only allow hotadd memory to preallocate page structures upto -- cgit v1.2.3 From 14694d736bb66d0ec250d05c81c6e98a19c229c6 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH] x86-64: split remaining fake nodes equally Extends the numa=fake x86_64 command-line option to split the remaining system memory into equal-sized nodes. For example: numa=fake=2*512,4* gives two 512M nodes and the remaining system memory is split into four approximately equal chunks. This is beneficial for systems where the exact size of RAM is unknown or not necessarily relevant, but the granularity with which nodes shall be allocated is known. Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Andi Kleen Cc: Paul Jackson Cc: Christoph Lameter Signed-off-by: Andrew Morton --- Documentation/x86_64/boot-options.txt | 4 +++- arch/x86_64/mm/numa.c | 22 ++++++++++++++++++---- 2 files changed, 21 insertions(+), 5 deletions(-) (limited to 'Documentation/x86_64/boot-options.txt') diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 7500aad95f3c..12a9aacecaae 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -155,7 +155,9 @@ NUMA depending on the sizes and coefficients listed. For example: numa=fake=2*512,1024,4*256 gives two 512M nodes, a 1024M node, and four 256M nodes. The - remaining system RAM is allocated to an additional node. + remaining system RAM is allocated to an additional node. If + the last character of CMDLINE is a *, the remaining system RAM + is instead divided up equally among its coefficient. numa=hotadd=percent Only allow hotadd memory to preallocate page structures upto diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index c55936bc6be6..0ae2d9d5d7ea 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -418,11 +418,25 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) done: if (!num_nodes) return -1; - /* Fill remainder of system RAM with a final node, if appropriate. */ + /* Fill remainder of system RAM, if appropriate. */ if (addr < max_addr) { - setup_node_range(num_nodes, nodes, &addr, max_addr - addr, - max_addr); - num_nodes++; + switch (*(cmdline - 1)) { + case '*': + /* Split remaining nodes into coeff chunks */ + if (coeff <= 0) + break; + num_nodes += split_nodes_equally(nodes, &addr, max_addr, + num_nodes, coeff); + break; + case ',': + /* Do not allocate remaining system RAM */ + break; + default: + /* Give one final node */ + setup_node_range(num_nodes, nodes, &addr, + max_addr - addr, max_addr); + num_nodes++; + } } out: memnode_shift = compute_hash_shift(nodes, num_nodes); -- cgit v1.2.3 From 382591d500bbcd20a44416c5e0e292708468587c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 2 May 2007 19:27:09 +0200 Subject: [PATCH] x86-64: fixed size remaining fake nodes Extends the numa=fake x86_64 command-line option to split the remaining system memory into nodes of fixed size. Any leftover memory is allocated to a final node unless the command-line ends with a comma. For example: numa=fake=2*512,*128 gives two 512M nodes and the remaining system memory is split into nodes of 128M each. This is beneficial for systems where the exact size of RAM is unknown or not necessarily relevant, but the size of the remaining nodes to be allocated is known based on their capacity for resource management. Cc: Andi Kleen Signed-off-by: David Rientjes Signed-off-by: Andi Kleen Cc: Paul Jackson Cc: Christoph Lameter Signed-off-by: Andrew Morton --- Documentation/x86_64/boot-options.txt | 14 +++++++---- arch/x86_64/mm/numa.c | 47 +++++++++++++++++++++++++++-------- 2 files changed, 46 insertions(+), 15 deletions(-) (limited to 'Documentation/x86_64/boot-options.txt') diff --git a/Documentation/x86_64/boot-options.txt b/Documentation/x86_64/boot-options.txt index 12a9aacecaae..6177d881983f 100644 --- a/Documentation/x86_64/boot-options.txt +++ b/Documentation/x86_64/boot-options.txt @@ -153,11 +153,15 @@ NUMA If a number, fakes CMDLINE nodes and ignores NUMA setup of the actual machine. Otherwise, system memory is configured depending on the sizes and coefficients listed. For example: - numa=fake=2*512,1024,4*256 - gives two 512M nodes, a 1024M node, and four 256M nodes. The - remaining system RAM is allocated to an additional node. If - the last character of CMDLINE is a *, the remaining system RAM - is instead divided up equally among its coefficient. + numa=fake=2*512,1024,4*256,*128 + gives two 512M nodes, a 1024M node, four 256M nodes, and the + rest split into 128M chunks. If the last character of CMDLINE + is a *, the remaining memory is divided up equally among its + coefficient: + numa=fake=2*512,2* + gives two 512M nodes and the rest split into two nodes. + Otherwise, the remaining system RAM is allocated to an + additional node. numa=hotadd=percent Only allow hotadd memory to preallocate page structures upto diff --git a/arch/x86_64/mm/numa.c b/arch/x86_64/mm/numa.c index 0ae2d9d5d7ea..5ee07bc41eb5 100644 --- a/arch/x86_64/mm/numa.c +++ b/arch/x86_64/mm/numa.c @@ -361,6 +361,21 @@ static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, return i - node_start + 1; } +/* + * Splits the remaining system RAM into chunks of size. The remaining memory is + * always assigned to a final node and can be asymmetric. Returns the number of + * nodes split. + */ +static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, + u64 max_addr, int node_start, u64 size) +{ + int i = node_start; + size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; + while (!setup_node_range(i++, nodes, addr, size, max_addr)) + ; + return i - node_start; +} + /* * Sets up the system RAM area from start_pfn to end_pfn according to the * numa=fake command-line option. @@ -370,9 +385,10 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) struct bootnode nodes[MAX_NUMNODES]; u64 addr = start_pfn << PAGE_SHIFT; u64 max_addr = end_pfn << PAGE_SHIFT; - unsigned int coeff; - unsigned int num = 0; int num_nodes = 0; + int coeff_flag; + int coeff = -1; + int num = 0; u64 size; int i; @@ -390,29 +406,34 @@ static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn) } /* Parse the command line. */ - for (coeff = 1; ; cmdline++) { + for (coeff_flag = 0; ; cmdline++) { if (*cmdline && isdigit(*cmdline)) { num = num * 10 + *cmdline - '0'; continue; } - if (*cmdline == '*') - coeff = num; + if (*cmdline == '*') { + if (num > 0) + coeff = num; + coeff_flag = 1; + } if (!*cmdline || *cmdline == ',') { + if (!coeff_flag) + coeff = 1; /* * Round down to the nearest FAKE_NODE_MIN_SIZE. * Command-line coefficients are in megabytes. */ size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; - if (size) { + if (size) for (i = 0; i < coeff; i++, num_nodes++) if (setup_node_range(num_nodes, nodes, &addr, size, max_addr) < 0) goto done; - coeff = 1; - } + if (!*cmdline) + break; + coeff_flag = 0; + coeff = -1; } - if (!*cmdline) - break; num = 0; } done: @@ -420,6 +441,12 @@ done: return -1; /* Fill remainder of system RAM, if appropriate. */ if (addr < max_addr) { + if (coeff_flag && coeff < 0) { + /* Split remaining nodes into num-sized chunks */ + num_nodes += split_nodes_by_size(nodes, &addr, max_addr, + num_nodes, num); + goto out; + } switch (*(cmdline - 1)) { case '*': /* Split remaining nodes into coeff chunks */ -- cgit v1.2.3