[xen-unstable] x86: fix NUMA handling (c/s 20599:e5a757ce7845)
# HG changeset patch
# User Keir Fraser <keir.fraser@citrix.com>
# Date 1262949761 0
# Node ID 5e8b6ecd045e827f3229f3a2fb15621946c50a6b
# Parent cba56c13ca3eba67a3b56e78256418fd62445a95
x86: fix NUMA handling (c/s 20599:e5a757ce7845)

c/s 20599 caused the hash shift to become significantly smaller on
systems with an SRAT like this:

(XEN) SRAT: Node 0 PXM 0 0-a0000
(XEN) SRAT: Node 0 PXM 0 100000-80000000
(XEN) SRAT: Node 1 PXM 1 80000000-d0000000
(XEN) SRAT: Node 1 PXM 1 100000000-130000000

Combined with the static size of the memnodemap[] array, NUMA
therefore got disabled on such systems. The backport from Linux was
incomplete, as Linux had much earlier already introduced a dynamically
allocated memnodemap[].
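
As a rough worked example (a sketch only: it assumes paddr_to_pdx() is
a plain right shift by PAGE_SHIFT, i.e. no pdx compression in effect),
this is what the logic of extract_lsb_from_nodes() in the diff below
yields for the SRAT above:

    /* Sketch of the hash-shift arithmetic for the SRAT above, assuming
     * paddr_to_pdx(p) == p >> PAGE_SHIFT (no pdx compression). */
    #include <stdio.h>

    #define PAGE_SHIFT 12

    int main(void)
    {
        static const unsigned long long node[][2] = {
            { 0x0ULL,         0xa0000ULL },
            { 0x100000ULL,    0x80000000ULL },
            { 0x80000000ULL,  0xd0000000ULL },
            { 0x100000000ULL, 0x130000000ULL },
        };
        unsigned long bitfield = 0, memtop = 0;
        unsigned int i, shift;

        for (i = 0; i < sizeof(node) / sizeof(node[0]); i++) {
            unsigned long spdx = node[i][0] >> PAGE_SHIFT;
            unsigned long epdx = ((node[i][1] - 1) >> PAGE_SHIFT) + 1;

            bitfield |= spdx;             /* as extract_lsb_from_nodes() does */
            if (epdx > memtop)
                memtop = epdx;
        }
        shift = __builtin_ctzl(bitfield); /* lowest set bit, cf. find_first_bit() */

        printf("hash shift         : %u\n", shift);                  /* 8    */
        printf("map entries needed : %lu\n", (memtop >> shift) + 1); /* 4865 */
        printf("old NODEMAPSIZE    : %d\n", 0xfff);                  /* 4095 */
        return 0;
    }

With 4865 map entries needed but only NODEMAPSIZE (0xfff, i.e. 4095)
available in the static array, the old populate_memnodemap() check
bailed out and NUMA was left disabled.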

Further, doing to/from pdx translations on addresses just past a
valid range is not correct, as doing so may strip, or fail to insert,
non-zero bits.
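
To illustrate with a toy model (purely for illustration; this is not
Xen's real pfn/pdx compression, and the squeezed bit is made up): if
the compression drops an address bit that happens to be set in the
address just past a node, translating the exclusive end collapses it,
whereas translating the last valid byte and stepping past it does not:

    /* Toy model of pdx compression (not Xen's real implementation):
     * pretend address bit 31 is zero for all valid RAM, so the
     * pfn <-> pdx translation squeezes that bit out. */
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SHIFT  12
    #define HOLE_BIT    (31 - PAGE_SHIFT)        /* address bit 31, as a pfn bit */
    #define BOTTOM_MASK ((1UL << HOLE_BIT) - 1)

    static unsigned long toy_paddr_to_pdx(uint64_t pa)
    {
        unsigned long pfn = pa >> PAGE_SHIFT;
        return (pfn & BOTTOM_MASK) | ((pfn >> 1) & ~BOTTOM_MASK);
    }

    int main(void)
    {
        uint64_t end = 0x80000000ULL;   /* exclusive end of a hypothetical node */

        /* Translating the exclusive end strips the squeezed (non-zero) bit,
         * collapsing the end of the range ... */
        printf("pdx(end)     = %#lx\n", toy_paddr_to_pdx(end));         /* 0 */
        /* ... while translating the last valid byte and stepping past it
         * stays inside the covered range and gives the right answer. */
        printf("pdx(end-1)+1 = %#lx\n", toy_paddr_to_pdx(end - 1) + 1); /* 0x80000 */
        return 0;
    }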

Finally, using 63 as the cover-it-all shift value is invalid on
32-bit, since pdx values are unsigned long.
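
A minimal standalone sketch of the type-width point (BITS_PER_LONG is
redefined locally just for the example):

    /* Shifting an unsigned long by 63 is undefined behaviour in C when
     * the type is only 32 bits wide (shift count >= type width);
     * shifting by BITS_PER_LONG - 1 is always valid for the type. */
    #include <limits.h>
    #include <stdio.h>

    #define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

    int main(void)
    {
        unsigned long pdx = 0x12345;   /* arbitrary example pdx */

        printf("%lu\n", pdx >> (BITS_PER_LONG - 1));   /* 0 on 32- and 64-bit */
        /* pdx >> 63 would be undefined here on a 32-bit build */
        return 0;
    }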

Signed-off-by: Jan Beulich <jbeulich@novell.com>
---
xen/arch/x86/numa.c | 45 +++++++++++++++++++++++++++++++++++++--------
xen/include/asm-x86/numa.h | 6 +++---
2 files changed, 40 insertions(+), 11 deletions(-)

diff -r cba56c13ca3e -r 5e8b6ecd045e xen/arch/x86/numa.c
--- a/xen/arch/x86/numa.c Wed Jan 06 12:45:23 2010 +0000
+++ b/xen/arch/x86/numa.c Fri Jan 08 11:22:41 2010 +0000
@@ -30,7 +30,9 @@ struct node_data node_data[MAX_NUMNODES]

/* Mapping from pdx to node id */
int memnode_shift;
-u8 memnodemap[NODEMAPSIZE];
+static typeof(*memnodemap) _memnodemap[2];
+unsigned long memnodemapsize;
+u8 *memnodemap;

unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
[0 ... NR_CPUS-1] = NUMA_NO_NODE
@@ -62,13 +64,13 @@ static int __init populate_memnodemap(co
unsigned long spdx, epdx;
int i, res = -1;

- memset(memnodemap, NUMA_NO_NODE, sizeof(memnodemap));
+ memset(memnodemap, NUMA_NO_NODE, memnodemapsize * sizeof(*memnodemap));
for (i = 0; i < numnodes; i++) {
spdx = paddr_to_pdx(nodes[i].start);
- epdx = paddr_to_pdx(nodes[i].end);
+ epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
if (spdx >= epdx)
continue;
- if ((epdx >> shift) >= NODEMAPSIZE)
+ if ((epdx >> shift) >= memnodemapsize)
return 0;
do {
if (memnodemap[spdx >> shift] != NUMA_NO_NODE)
@@ -84,6 +86,28 @@ static int __init populate_memnodemap(co
res = 1;
}
return res;
+}
+
+static int __init allocate_cachealigned_memnodemap(void)
+{
+ unsigned long size = PFN_UP(memnodemapsize * sizeof(*memnodemap));
+ unsigned long mfn = alloc_boot_pages(size, 1);
+
+ if (!mfn) {
+ printk(KERN_ERR
+ "NUMA: Unable to allocate Memory to Node hash map\n");
+ memnodemapsize = 0;
+ return -1;
+ }
+
+ memnodemap = mfn_to_virt(mfn);
+ mfn <<= PAGE_SHIFT;
+ size <<= PAGE_SHIFT;
+ printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
+ mfn, mfn + size);
+ memnodemapsize = size / sizeof(*memnodemap);
+
+ return 0;
}

/*
@@ -99,7 +123,7 @@ static int __init extract_lsb_from_nodes

for (i = 0; i < numnodes; i++) {
spdx = paddr_to_pdx(nodes[i].start);
- epdx = paddr_to_pdx(nodes[i].end);
+ epdx = paddr_to_pdx(nodes[i].end - 1) + 1;
if (spdx >= epdx)
continue;
bitfield |= spdx;
@@ -108,9 +132,10 @@ static int __init extract_lsb_from_nodes
memtop = epdx;
}
if (nodes_used <= 1)
- i = 63;
+ i = BITS_PER_LONG - 1;
else
i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
+ memnodemapsize = (memtop >> i) + 1;
return i;
}

@@ -120,6 +145,10 @@ int __init compute_hash_shift(struct nod
int shift;

shift = extract_lsb_from_nodes(nodes, numnodes);
+ if (memnodemapsize <= ARRAY_SIZE(_memnodemap))
+ memnodemap = _memnodemap;
+ else if (allocate_cachealigned_memnodemap())
+ return -1;
printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
shift);

@@ -233,8 +262,8 @@ void __init numa_initmem_init(unsigned l
(u64)start_pfn << PAGE_SHIFT,
(u64)end_pfn << PAGE_SHIFT);
/* setup dummy node covering all memory */
- memnode_shift = 63;
- memnodemap[0] = 0;
+ memnode_shift = BITS_PER_LONG - 1;
+ memnodemap = _memnodemap;
nodes_clear(node_online_map);
node_set_online(0);
for (i = 0; i < NR_CPUS; i++)
diff -r cba56c13ca3e -r 5e8b6ecd045e xen/include/asm-x86/numa.h
--- a/xen/include/asm-x86/numa.h Wed Jan 06 12:45:23 2010 +0000
+++ b/xen/include/asm-x86/numa.h Fri Jan 08 11:22:41 2010 +0000
@@ -25,7 +25,6 @@ extern int pxm_to_node(int nid);

#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
#define VIRTUAL_BUG_ON(x)
-#define NODEMAPSIZE 0xfff

extern void numa_add_cpu(int cpu);
extern void numa_init_array(void);
@@ -51,7 +50,8 @@ static inline void clear_node_cpumask(in

/* Simple perfect hash to map pdx to node numbers */
extern int memnode_shift;
-extern u8 memnodemap[NODEMAPSIZE];
+extern unsigned long memnodemapsize;
+extern u8 *memnodemap;

struct node_data {
unsigned long node_start_pfn;
@@ -64,7 +64,7 @@ static inline __attribute__((pure)) int
static inline __attribute__((pure)) int phys_to_nid(paddr_t addr)
{
unsigned nid;
- VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= NODEMAPSIZE);
+ VIRTUAL_BUG_ON((paddr_to_pdx(addr) >> memnode_shift) >= memnodemapsize);
nid = memnodemap[paddr_to_pdx(addr) >> memnode_shift];
VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
return nid;
