[xen-unstable] Host Numa information in dom0
# HG changeset patch
# User Keir Fraser <keir.fraser@citrix.com>
# Date 1270653725 -3600
# Node ID 28e5409e3fb377830a5f4346fd414d3d158f3483
# Parent f0ef396d8c334100293fcba75ee89f311811b9f2
Host Numa information in dom0

The 'xm info' command now also reports the CPU topology and host NUMA
information. This will later be used to build guest NUMA support. The
patch reworks the physinfo sysctl, adds the topologyinfo and numainfo
sysctls, and updates the python and libxc code accordingly.

Signed-off-by: Nitin A Kamble <nitin.a.kamble@intel.com>
---
tools/libxc/xc_misc.c | 37 ++++++
tools/libxc/xenctrl.h | 14 ++
tools/python/xen/lowlevel/xc/xc.c | 215 ++++++++++++++++++++++++++------------
tools/python/xen/xend/XendNode.py | 63 ++++++-----
tools/python/xen/xend/balloon.py | 14 --
xen/arch/x86/sysctl.c | 140 ++++++++++++++++++++++--
xen/common/page_alloc.c | 6 +
xen/include/asm-x86/numa.h | 1
xen/include/public/sysctl.h | 90 ++++++++++++---
xen/include/xen/mm.h | 1
10 files changed, 447 insertions(+), 134 deletions(-)
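
For reference, a minimal caller-side sketch (not part of the patch) of how the
new libxc wrapper xc_topologyinfo() is meant to be used; it simply mirrors
what pyxc_topologyinfo() does below. The pre-4.1 int-style xc_handle, the
array size and the printf formatting are illustrative assumptions.

    #include <stdio.h>
    #include <stdint.h>
    #include <xenctrl.h>

    #define MAX_CPU_INDEX 255

    /* Sketch: query the per-cpu core/socket/node maps added by this patch. */
    static int dump_cpu_topology(int xc_handle)
    {
        xc_topologyinfo_t tinfo;
        xc_cpu_to_core_t coremap[MAX_CPU_INDEX + 1];
        xc_cpu_to_socket_t socketmap[MAX_CPU_INDEX + 1];
        xc_cpu_to_node_t nodemap[MAX_CPU_INDEX + 1];
        uint32_t i;

        /* The caller supplies the arrays and advertises their size. */
        set_xen_guest_handle(tinfo.cpu_to_core, coremap);
        set_xen_guest_handle(tinfo.cpu_to_socket, socketmap);
        set_xen_guest_handle(tinfo.cpu_to_node, nodemap);
        tinfo.max_cpu_index = MAX_CPU_INDEX;

        if ( xc_topologyinfo(xc_handle, &tinfo) != 0 )
            return -1;

        /* Entries of ~0u mark cpus that are not present/online. */
        for ( i = 0; i < tinfo.max_cpu_index && i <= MAX_CPU_INDEX; i++ )
            printf("cpu%u: core %u socket %u node %u\n",
                   i, coremap[i], socketmap[i], nodemap[i]);
        return 0;
    }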

diff -r f0ef396d8c33 -r 28e5409e3fb3 tools/libxc/xc_misc.c
--- a/tools/libxc/xc_misc.c Wed Apr 07 15:44:29 2010 +0100
+++ b/tools/libxc/xc_misc.c Wed Apr 07 16:22:05 2010 +0100
@@ -79,6 +79,43 @@ int xc_physinfo(int xc_handle,

return 0;
}
+
+int xc_topologyinfo(int xc_handle,
+ xc_topologyinfo_t *put_info)
+{
+ int ret;
+ DECLARE_SYSCTL;
+
+ sysctl.cmd = XEN_SYSCTL_topologyinfo;
+
+ memcpy(&sysctl.u.topologyinfo, put_info, sizeof(*put_info));
+
+ if ( (ret = do_sysctl(xc_handle, &sysctl)) != 0 )
+ return ret;
+
+ memcpy(put_info, &sysctl.u.topologyinfo, sizeof(*put_info));
+
+ return 0;
+}
+
+int xc_numainfo(int xc_handle,
+ xc_numainfo_t *put_info)
+{
+ int ret;
+ DECLARE_SYSCTL;
+
+ sysctl.cmd = XEN_SYSCTL_numainfo;
+
+ memcpy(&sysctl.u.numainfo, put_info, sizeof(*put_info));
+
+ if ((ret = do_sysctl(xc_handle, &sysctl)) != 0)
+ return ret;
+
+ memcpy(put_info, &sysctl.u.numainfo, sizeof(*put_info));
+
+ return 0;
+}
+

int xc_sched_id(int xc_handle,
int *sched_id)
diff -r f0ef396d8c33 -r 28e5409e3fb3 tools/libxc/xenctrl.h
--- a/tools/libxc/xenctrl.h Wed Apr 07 15:44:29 2010 +0100
+++ b/tools/libxc/xenctrl.h Wed Apr 07 16:22:05 2010 +0100
@@ -612,9 +612,19 @@ int xc_send_debug_keys(int xc_handle, ch
int xc_send_debug_keys(int xc_handle, char *keys);

typedef xen_sysctl_physinfo_t xc_physinfo_t;
+typedef xen_sysctl_topologyinfo_t xc_topologyinfo_t;
+typedef xen_sysctl_numainfo_t xc_numainfo_t;
+
typedef uint32_t xc_cpu_to_node_t;
-int xc_physinfo(int xc_handle,
- xc_physinfo_t *info);
+typedef uint32_t xc_cpu_to_socket_t;
+typedef uint32_t xc_cpu_to_core_t;
+typedef uint64_t xc_node_to_memsize_t;
+typedef uint64_t xc_node_to_memfree_t;
+typedef uint32_t xc_node_to_node_dist_t;
+
+int xc_physinfo(int xc_handle, xc_physinfo_t *info);
+int xc_topologyinfo(int xc_handle, xc_topologyinfo_t *info);
+int xc_numainfo(int xc_handle, xc_numainfo_t *info);

int xc_sched_id(int xc_handle,
int *sched_id);
diff -r f0ef396d8c33 -r 28e5409e3fb3 tools/python/xen/lowlevel/xc/xc.c
--- a/tools/python/xen/lowlevel/xc/xc.c Wed Apr 07 15:44:29 2010 +0100
+++ b/tools/python/xen/lowlevel/xc/xc.c Wed Apr 07 16:22:05 2010 +0100
@@ -1151,105 +1151,178 @@ static PyObject *pyxc_pages_to_kib(XcObj
return PyLong_FromUnsignedLong(pages_to_kib(pages));
}

-
static PyObject *pyxc_physinfo(XcObject *self)
{
-#define MAX_CPU_ID 255
- xc_physinfo_t info;
+ xc_physinfo_t pinfo;
char cpu_cap[128], virt_caps[128], *p;
- int i, j, max_cpu_id, nr_nodes = 0;
- uint64_t free_heap;
- PyObject *ret_obj, *node_to_cpu_obj, *node_to_memory_obj;
- PyObject *node_to_dma32_mem_obj;
- xc_cpu_to_node_t map[MAX_CPU_ID + 1];
+ int i;
const char *virtcap_names[] = { "hvm", "hvm_directio" };

- set_xen_guest_handle(info.cpu_to_node, map);
- info.max_cpu_id = MAX_CPU_ID;
-
- if ( xc_physinfo(self->xc_handle, &info) != 0 )
+ if ( xc_physinfo(self->xc_handle, &pinfo) != 0 )
return pyxc_error_to_exception();

p = cpu_cap;
*p = '\0';
- for ( i = 0; i < sizeof(info.hw_cap)/4; i++ )
- p += sprintf(p, "%08x:", info.hw_cap[i]);
+ for ( i = 0; i < sizeof(pinfo.hw_cap)/4; i++ )
+ p += sprintf(p, "%08x:", pinfo.hw_cap[i]);
*(p-1) = 0;

p = virt_caps;
*p = '\0';
for ( i = 0; i < 2; i++ )
- if ( (info.capabilities >> i) & 1 )
+ if ( (pinfo.capabilities >> i) & 1 )
p += sprintf(p, "%s ", virtcap_names[i]);
if ( p != virt_caps )
*(p-1) = '\0';

- max_cpu_id = info.max_cpu_id;
- if ( max_cpu_id > MAX_CPU_ID )
- max_cpu_id = MAX_CPU_ID;
+ return Py_BuildValue("{s:i,s:i,s:i,s:i,s:i,s:l,s:l,s:l,s:i,s:s,s:s}",
+ "nr_nodes", pinfo.nr_nodes,
+ "threads_per_core", pinfo.threads_per_core,
+ "cores_per_socket", pinfo.cores_per_socket,
+ "sockets_per_node", pinfo.sockets_per_node,
+ "nr_cpus", pinfo.nr_cpus,
+ "total_memory", pages_to_kib(pinfo.total_pages),
+ "free_memory", pages_to_kib(pinfo.free_pages),
+ "scrub_memory", pages_to_kib(pinfo.scrub_pages),
+ "cpu_khz", pinfo.cpu_khz,
+ "hw_caps", cpu_cap,
+ "virt_caps", virt_caps);
+}
+
+static PyObject *pyxc_topologyinfo(XcObject *self)
+{
+#define MAX_CPU_INDEX 255
+ xc_topologyinfo_t tinfo;
+ int i, max_cpu_index;
+ PyObject *ret_obj;
+ PyObject *cpu_to_core_obj, *cpu_to_socket_obj, *cpu_to_node_obj;
+ xc_cpu_to_core_t coremap[MAX_CPU_INDEX + 1];
+ xc_cpu_to_socket_t socketmap[MAX_CPU_INDEX + 1];
+ xc_cpu_to_node_t nodemap[MAX_CPU_INDEX + 1];
+
+
+ set_xen_guest_handle(tinfo.cpu_to_core, coremap);
+ set_xen_guest_handle(tinfo.cpu_to_socket, socketmap);
+ set_xen_guest_handle(tinfo.cpu_to_node, nodemap);
+ tinfo.max_cpu_index = MAX_CPU_INDEX;
+
+ if ( xc_topologyinfo(self->xc_handle, &tinfo) != 0 )
+ return pyxc_error_to_exception();
+
+ max_cpu_index = tinfo.max_cpu_index;
+ if ( max_cpu_index > MAX_CPU_INDEX )
+ max_cpu_index = MAX_CPU_INDEX;
+
+ /* Construct cpu-to-* lists. */
+ cpu_to_core_obj = PyList_New(0);
+ cpu_to_socket_obj = PyList_New(0);
+ cpu_to_node_obj = PyList_New(0);
+ for ( i = 0; i < max_cpu_index; i++ )
+ {
+ PyObject *pyint;
+
+ pyint = PyInt_FromLong(coremap[i]);
+ PyList_Append(cpu_to_core_obj, pyint);
+ Py_DECREF(pyint);
+
+ pyint = PyInt_FromLong(socketmap[i]);
+ PyList_Append(cpu_to_socket_obj, pyint);
+ Py_DECREF(pyint);
+
+ pyint = PyInt_FromLong(nodemap[i]);
+ PyList_Append(cpu_to_node_obj, pyint);
+ Py_DECREF(pyint);
+ }
+
+ ret_obj = Py_BuildValue("{s:i}", "max_cpu_index", max_cpu_index);
+
+ PyDict_SetItemString(ret_obj, "cpu_to_core", cpu_to_core_obj);
+ Py_DECREF(cpu_to_core_obj);
+
+ PyDict_SetItemString(ret_obj, "cpu_to_socket", cpu_to_socket_obj);
+ Py_DECREF(cpu_to_socket_obj);
+
+ PyDict_SetItemString(ret_obj, "cpu_to_node", cpu_to_node_obj);
+ Py_DECREF(cpu_to_node_obj);
+
+ return ret_obj;
+#undef MAX_CPU_INDEX
+}
+
+static PyObject *pyxc_numainfo(XcObject *self)
+{
+#define MAX_NODE_INDEX 31
+ xc_numainfo_t ninfo;
+ int i, j, max_node_index;
+ uint64_t free_heap;
+ PyObject *ret_obj;
+ PyObject *node_to_memsize_obj, *node_to_memfree_obj;
+ PyObject *node_to_dma32_mem_obj, *node_to_node_dist_obj;
+ xc_node_to_memsize_t node_memsize[MAX_NODE_INDEX + 1];
+ xc_node_to_memfree_t node_memfree[MAX_NODE_INDEX + 1];
+ xc_node_to_node_dist_t nodes_dist[(MAX_NODE_INDEX * MAX_NODE_INDEX) + 1];
+
+ set_xen_guest_handle(ninfo.node_to_memsize, node_memsize);
+ set_xen_guest_handle(ninfo.node_to_memfree, node_memfree);
+ set_xen_guest_handle(ninfo.node_to_node_distance, nodes_dist);
+ ninfo.max_node_index = MAX_NODE_INDEX;
+ if( xc_numainfo(self->xc_handle, &ninfo) != 0 )
+ return pyxc_error_to_exception();
+
+ max_node_index = ninfo.max_node_index;
+ if ( max_node_index > MAX_NODE_INDEX )
+ max_node_index = MAX_NODE_INDEX;

/* Construct node-to-* lists. */
- node_to_cpu_obj = PyList_New(0);
- node_to_memory_obj = PyList_New(0);
+ node_to_memsize_obj = PyList_New(0);
+ node_to_memfree_obj = PyList_New(0);
node_to_dma32_mem_obj = PyList_New(0);
- for ( i = 0; i <= info.max_node_id; i++ )
+ node_to_node_dist_obj = PyList_New(0);
+ for ( i = 0; i < max_node_index; i++ )
{
- int node_exists = 0;
PyObject *pyint;

- /* CPUs. */
- PyObject *cpus = PyList_New(0);
- for ( j = 0; j <= max_cpu_id; j++ )
- {
- if ( i != map[j] )
- continue;
- pyint = PyInt_FromLong(j);
- PyList_Append(cpus, pyint);
- Py_DECREF(pyint);
- node_exists = 1;
- }
- PyList_Append(node_to_cpu_obj, cpus);
- Py_DECREF(cpus);
-
- /* Memory. */
- xc_availheap(self->xc_handle, 0, 0, i, &free_heap);
- node_exists = node_exists || (free_heap != 0);
- pyint = PyInt_FromLong(free_heap / 1024);
- PyList_Append(node_to_memory_obj, pyint);
+ /* Total Memory */
+ pyint = PyInt_FromLong(node_memsize[i] >> 20); /* MB */
+ PyList_Append(node_to_memsize_obj, pyint);
+ Py_DECREF(pyint);
+
+ /* Free Memory */
+ pyint = PyInt_FromLong(node_memfree[i] >> 20); /* MB */
+ PyList_Append(node_to_memfree_obj, pyint);
Py_DECREF(pyint);

/* DMA memory. */
xc_availheap(self->xc_handle, 0, 32, i, &free_heap);
- pyint = PyInt_FromLong(free_heap / 1024);
+ pyint = PyInt_FromLong(free_heap >> 20); /* MB */
PyList_Append(node_to_dma32_mem_obj, pyint);
Py_DECREF(pyint);

- if ( node_exists )
- nr_nodes++;
- }
-
- ret_obj = Py_BuildValue("{s:i,s:i,s:i,s:i,s:i,s:i,s:l,s:l,s:l,s:i,s:s:s:s}",
- "nr_nodes", nr_nodes,
- "max_node_id", info.max_node_id,
- "max_cpu_id", info.max_cpu_id,
- "threads_per_core", info.threads_per_core,
- "cores_per_socket", info.cores_per_socket,
- "nr_cpus", info.nr_cpus,
- "total_memory", pages_to_kib(info.total_pages),
- "free_memory", pages_to_kib(info.free_pages),
- "scrub_memory", pages_to_kib(info.scrub_pages),
- "cpu_khz", info.cpu_khz,
- "hw_caps", cpu_cap,
- "virt_caps", virt_caps);
- PyDict_SetItemString(ret_obj, "node_to_cpu", node_to_cpu_obj);
- Py_DECREF(node_to_cpu_obj);
- PyDict_SetItemString(ret_obj, "node_to_memory", node_to_memory_obj);
- Py_DECREF(node_to_memory_obj);
+ /* Node to Node Distance */
+ for ( j = 0; j < ninfo.max_node_index; j++ )
+ {
+ pyint = PyInt_FromLong(nodes_dist[(i * ninfo.max_node_index) + j]);
+ PyList_Append(node_to_node_dist_obj, pyint);
+ Py_DECREF(pyint);
+ }
+ }
+
+ ret_obj = Py_BuildValue("{s:i}", "max_node_index", max_node_index);
+
+ PyDict_SetItemString(ret_obj, "node_memsize", node_to_memsize_obj);
+ Py_DECREF(node_to_memsize_obj);
+
+ PyDict_SetItemString(ret_obj, "node_memfree", node_to_memfree_obj);
+ Py_DECREF(node_to_memfree_obj);
+
PyDict_SetItemString(ret_obj, "node_to_dma32_mem", node_to_dma32_mem_obj);
Py_DECREF(node_to_dma32_mem_obj);
+
+ PyDict_SetItemString(ret_obj, "node_to_node_dist", node_to_node_dist_obj);
+ Py_DECREF(node_to_node_dist_obj);

return ret_obj;
-#undef MAX_CPU_ID
+#undef MAX_NODE_INDEX
}

static PyObject *pyxc_xeninfo(XcObject *self)
@@ -2056,6 +2129,20 @@ static PyMethodDef pyxc_methods[] = {
METH_NOARGS, "\n"
"Get information about the physical host machine\n"
"Returns [dict]: information about the hardware"
+ " [None]: on failure.\n" },
+
+ { "topologyinfo",
+ (PyCFunction)pyxc_topologyinfo,
+ METH_NOARGS, "\n"
+ "Get information about the cpu topology on the host machine\n"
+ "Returns [dict]: information about the cpu topology on host"
+ " [None]: on failure.\n" },
+
+ { "numainfo",
+ (PyCFunction)pyxc_numainfo,
+ METH_NOARGS, "\n"
+ "Get NUMA information on the host machine\n"
+ "Returns [dict]: NUMA information on host"
" [None]: on failure.\n" },

{ "xeninfo",
diff -r f0ef396d8c33 -r 28e5409e3fb3 tools/python/xen/xend/XendNode.py
--- a/tools/python/xen/xend/XendNode.py Wed Apr 07 15:44:29 2010 +0100
+++ b/tools/python/xen/xend/XendNode.py Wed Apr 07 16:22:05 2010 +0100
@@ -878,65 +878,70 @@ class XendNode:
def list_to_strrange(self,list):
return self.format_pairs(self.list_to_rangepairs(list))

- def format_node_to_cpu(self, pinfo):
- str=''
- whitespace=''
+ def format_cpu_to_core_socket_node(self, tinfo):
try:
- node_to_cpu=pinfo['node_to_cpu']
- for i in range(0, pinfo['max_node_id']+1):
- str+='%snode%d:%s\n' % (whitespace,
- i,
- self.list_to_strrange(node_to_cpu[i]))
- whitespace='%25s' % ''
+ nr_cpus=tinfo['max_cpu_index']
+ str='\ncpu: core socket node\n'
+ for i in range(0, nr_cpus):
+ str+='%3d:%8d %8d %8d\n' % (i,
+ tinfo['cpu_to_core'][i],
+ tinfo['cpu_to_socket'][i],
+ tinfo['cpu_to_node'][i])
except:
str='none\n'
return str[:-1];
- def format_node_to_memory(self, pinfo, key):
- str=''
- whitespace=''
+
+ def format_numa_info(self, ninfo):
try:
- node_to_memory=pinfo[key]
- for i in range(0, pinfo['max_node_id']+1):
- str+='%snode%d:%d\n' % (whitespace,
- i,
- node_to_memory[i] / 1024)
- whitespace='%25s' % ''
+ nr_nodes=ninfo['max_node_index']
+ str='\nnode: TotalMemory FreeMemory dma32Memory NodeDist:'
+ for i in range(0, nr_nodes):
+ str+='%4d ' % i
+ str+='\n'
+ for i in range(0, nr_nodes):
+ str+='%4d: %8dMB %8dMB %8dMB :' % (i,
+ ninfo['node_memsize'][i],
+ ninfo['node_memfree'][i],
+ ninfo['node_to_dma32_mem'][i])
+ for j in range(0, nr_nodes):
+ str+='%4d ' % ninfo['node_to_node_dist'][(i*nr_nodes)+j]
+ str+='\n'
except:
str='none\n'
return str[:-1];

-
def physinfo(self):
info = self.xc.physinfo()
+ tinfo = self.xc.topologyinfo()
+ ninfo = self.xc.numainfo()

info['cpu_mhz'] = info['cpu_khz'] / 1000

# physinfo is in KiB, need it in MiB
info['total_memory'] = info['total_memory'] / 1024
info['free_memory'] = info['free_memory'] / 1024
- info['node_to_cpu'] = self.format_node_to_cpu(info)
- info['node_to_memory'] = \
- self.format_node_to_memory(info, 'node_to_memory')
- info['node_to_dma32_mem'] = \
- self.format_node_to_memory(info, 'node_to_dma32_mem')
+
+ info['cpu_topology'] = \
+ self.format_cpu_to_core_socket_node(tinfo)
+
+ info['numa_info'] = \
+ self.format_numa_info(ninfo)

ITEM_ORDER = ['nr_cpus',
'nr_nodes',
'cores_per_socket',
'threads_per_core',
+ 'sockets_per_node',
'cpu_mhz',
'hw_caps',
'virt_caps',
'total_memory',
'free_memory',
- 'node_to_cpu',
- 'node_to_memory',
- 'node_to_dma32_mem',
- 'max_node_id'
+ 'cpu_topology',
+ 'numa_info',
]

return [[k, info[k]] for k in ITEM_ORDER]
-

def pciinfo(self):
from xen.xend.server.pciif import get_all_assigned_pci_devices
diff -r f0ef396d8c33 -r 28e5409e3fb3 tools/python/xen/xend/balloon.py
--- a/tools/python/xen/xend/balloon.py Wed Apr 07 15:44:29 2010 +0100
+++ b/tools/python/xen/xend/balloon.py Wed Apr 07 16:22:05 2010 +0100
@@ -184,15 +184,11 @@ def free(need_mem, dominfo):
waitscrub = 1
vcpus = dominfo.info['cpus'][0]
for vcpu in vcpus:
- nodenum = 0
- for node in physinfo['node_to_cpu']:
- for cpu in node:
- if vcpu == cpu:
- if oldnode == -1:
- oldnode = nodenum
- elif oldnode != nodenum:
- waitscrub = 0
- nodenum = nodenum + 1
+ nodenum = xc.topologyinfo()['cpu_to_node'][vcpu]
+ if oldnode == -1:
+ oldnode = nodenum
+ elif oldnode != nodenum:
+ waitscrub = 0

if waitscrub == 1 and scrub_mem > 0:
log.debug("wait for scrub %s", scrub_mem)
diff -r f0ef396d8c33 -r 28e5409e3fb3 xen/arch/x86/sysctl.c
--- a/xen/arch/x86/sysctl.c Wed Apr 07 15:44:29 2010 +0100
+++ b/xen/arch/x86/sysctl.c Wed Apr 07 16:22:05 2010 +0100
@@ -35,6 +35,8 @@ static long cpu_down_helper(void *data)
return cpu_down(cpu);
}

+extern int __node_distance(int a, int b);
+
long arch_do_sysctl(
struct xen_sysctl *sysctl, XEN_GUEST_HANDLE(xen_sysctl_t) u_sysctl)
{
@@ -45,25 +47,22 @@ long arch_do_sysctl(

case XEN_SYSCTL_physinfo:
{
- uint32_t i, max_array_ent;
- XEN_GUEST_HANDLE_64(uint32) cpu_to_node_arr;
-
xen_sysctl_physinfo_t *pi = &sysctl->u.physinfo;

ret = xsm_physinfo();
if ( ret )
break;

- max_array_ent = pi->max_cpu_id;
- cpu_to_node_arr = pi->cpu_to_node;

memset(pi, 0, sizeof(*pi));
- pi->cpu_to_node = cpu_to_node_arr;
pi->threads_per_core =
cpus_weight(per_cpu(cpu_sibling_map, 0));
pi->cores_per_socket =
cpus_weight(per_cpu(cpu_core_map, 0)) / pi->threads_per_core;
pi->nr_cpus = (u32)num_online_cpus();
+ pi->nr_nodes = (u32)num_online_nodes();
+ pi->sockets_per_node = pi->nr_cpus /
+ (pi->nr_nodes * pi->cores_per_socket * pi->threads_per_core);
pi->total_pages = total_pages;
pi->free_pages = avail_domheap_pages();
pi->scrub_pages = 0;
@@ -74,15 +73,56 @@ long arch_do_sysctl(
if ( iommu_enabled )
pi->capabilities |= XEN_SYSCTL_PHYSCAP_hvm_directio;

- pi->max_node_id = last_node(node_online_map);
- pi->max_cpu_id = last_cpu(cpu_online_map);
- max_array_ent = min_t(uint32_t, max_array_ent, pi->max_cpu_id);
+ if ( copy_to_guest(u_sysctl, sysctl, 1) )
+ ret = -EFAULT;
+ }
+ break;
+
+ case XEN_SYSCTL_topologyinfo:
+ {
+ uint32_t i, max_cpu_index;
+ XEN_GUEST_HANDLE_64(uint32) cpu_to_core_arr;
+ XEN_GUEST_HANDLE_64(uint32) cpu_to_socket_arr;
+ XEN_GUEST_HANDLE_64(uint32) cpu_to_node_arr;
+
+ xen_sysctl_topologyinfo_t *ti = &sysctl->u.topologyinfo;
+
+ max_cpu_index = ti->max_cpu_index;
+ cpu_to_core_arr = ti->cpu_to_core;
+ cpu_to_socket_arr = ti->cpu_to_socket;
+ cpu_to_node_arr = ti->cpu_to_node;
+
+ memset(ti, 0, sizeof(*ti));
+ ti->cpu_to_core = cpu_to_core_arr;
+ ti->cpu_to_socket = cpu_to_socket_arr;
+ ti->cpu_to_node = cpu_to_node_arr;
+
+ max_cpu_index = min_t(uint32_t, max_cpu_index, num_online_cpus());
+ ti->max_cpu_index = max_cpu_index;

ret = 0;

- if ( !guest_handle_is_null(cpu_to_node_arr) )
- {
- for ( i = 0; i <= max_array_ent; i++ )
+ for ( i = 0; i < max_cpu_index; i++ )
+ {
+ if ( !guest_handle_is_null(cpu_to_core_arr) )
+ {
+ uint32_t core = cpu_online(i) ? cpu_to_core(i) : ~0u;
+ if ( copy_to_guest_offset(cpu_to_core_arr, i, &core, 1) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+ }
+ if ( !guest_handle_is_null(cpu_to_socket_arr) )
+ {
+ uint32_t socket = cpu_online(i) ? cpu_to_socket(i) : ~0u;
+ if ( copy_to_guest_offset(cpu_to_socket_arr, i, &socket, 1) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+ }
+ if ( !guest_handle_is_null(cpu_to_node_arr) )
{
uint32_t node = cpu_online(i) ? cpu_to_node(i) : ~0u;
if ( copy_to_guest_offset(cpu_to_node_arr, i, &node, 1) )
@@ -92,6 +132,82 @@ long arch_do_sysctl(
}
}
}
+
+ if (ret)
+ break;
+
+ if ( copy_to_guest(u_sysctl, sysctl, 1) )
+ ret = -EFAULT;
+ }
+ break;
+
+ case XEN_SYSCTL_numainfo:
+ {
+ uint32_t i, max_node_index;
+ XEN_GUEST_HANDLE_64(uint64) node_to_memsize_arr;
+ XEN_GUEST_HANDLE_64(uint64) node_to_memfree_arr;
+ XEN_GUEST_HANDLE_64(uint32) node_to_node_distance_arr;
+
+ xen_sysctl_numainfo_t *ni = &sysctl->u.numainfo;
+
+ max_node_index = ni->max_node_index;
+ node_to_memsize_arr = ni->node_to_memsize;
+ node_to_memfree_arr = ni->node_to_memfree;
+ node_to_node_distance_arr = ni->node_to_node_distance;
+
+ memset(ni, 0, sizeof(*ni));
+ ni->node_to_memsize = node_to_memsize_arr;
+ ni->node_to_memfree = node_to_memfree_arr;
+ ni->node_to_node_distance = node_to_node_distance_arr;
+
+ max_node_index = min_t(uint32_t, max_node_index, num_online_nodes());
+ ni->max_node_index = max_node_index;
+
+ ret = 0;
+
+ for ( i = 0; i < max_node_index; i++ )
+ {
+ if ( !guest_handle_is_null(node_to_memsize_arr) )
+ {
+ uint64_t memsize = node_online(i) ?
+ node_spanned_pages(i) << PAGE_SHIFT : 0ul;
+ if ( copy_to_guest_offset(node_to_memsize_arr, i, &memsize, 1) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+ }
+ if ( !guest_handle_is_null(node_to_memfree_arr) )
+ {
+ uint64_t memfree = node_online(i) ?
+ avail_node_heap_pages(i) << PAGE_SHIFT : 0ul;
+ if ( copy_to_guest_offset(node_to_memfree_arr, i, &memfree, 1) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+ }
+
+ if ( !guest_handle_is_null(node_to_node_distance_arr) )
+ {
+ int j;
+ for ( j = 0; j < max_node_index; j++)
+ {
+ uint32_t distance = ~0u;
+ if (node_online(i) && node_online (j))
+ distance = __node_distance(i, j);
+
+ if ( copy_to_guest_offset(node_to_node_distance_arr,
+ (i * max_node_index + j), &distance, 1) )
+ {
+ ret = -EFAULT;
+ break;
+ }
+ }
+ }
+ }
+ if (ret)
+ break;

if ( copy_to_guest(u_sysctl, sysctl, 1) )
ret = -EFAULT;
diff -r f0ef396d8c33 -r 28e5409e3fb3 xen/common/page_alloc.c
--- a/xen/common/page_alloc.c Wed Apr 07 15:44:29 2010 +0100
+++ b/xen/common/page_alloc.c Wed Apr 07 16:22:05 2010 +0100
@@ -1256,6 +1256,12 @@ unsigned long avail_domheap_pages(void)
-1);
}

+unsigned long avail_node_heap_pages(unsigned int nodeid)
+{
+ return avail_heap_pages(MEMZONE_XEN, NR_ZONES -1, nodeid);
+}
+
+
static void pagealloc_info(unsigned char key)
{
unsigned int zone = MEMZONE_XEN;
diff -r f0ef396d8c33 -r 28e5409e3fb3 xen/include/asm-x86/numa.h
--- a/xen/include/asm-x86/numa.h Wed Apr 07 15:44:29 2010 +0100
+++ b/xen/include/asm-x86/numa.h Wed Apr 07 16:22:05 2010 +0100
@@ -73,6 +73,7 @@ static inline __attribute__((pure)) int
#define NODE_DATA(nid) (&(node_data[nid]))

#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
+#define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages)
#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
NODE_DATA(nid)->node_spanned_pages)

diff -r f0ef396d8c33 -r 28e5409e3fb3 xen/include/public/sysctl.h
--- a/xen/include/public/sysctl.h Wed Apr 07 15:44:29 2010 +0100
+++ b/xen/include/public/sysctl.h Wed Apr 07 16:22:05 2010 +0100
@@ -34,7 +34,7 @@
#include "xen.h"
#include "domctl.h"

-#define XEN_SYSCTL_INTERFACE_VERSION 0x00000007
+#define XEN_SYSCTL_INTERFACE_VERSION 0x00000008

/*
* Read console content from Xen buffer ring.
@@ -93,29 +93,14 @@ struct xen_sysctl_physinfo {
struct xen_sysctl_physinfo {
uint32_t threads_per_core;
uint32_t cores_per_socket;
+ uint32_t sockets_per_node;
uint32_t nr_cpus;
- uint32_t max_node_id;
+ uint32_t nr_nodes;
uint32_t cpu_khz;
uint64_aligned_t total_pages;
uint64_aligned_t free_pages;
uint64_aligned_t scrub_pages;
uint32_t hw_cap[8];
-
- /*
- * IN: maximum addressable entry in the caller-provided cpu_to_node array.
- * OUT: largest cpu identifier in the system.
- * If OUT is greater than IN then the cpu_to_node array is truncated!
- */
- uint32_t max_cpu_id;
- /*
- * If not NULL, this array is filled with node identifier for each cpu.
- * If a cpu has no node information (e.g., cpu not present) then the
- * sentinel value ~0u is written.
- * The size of this array is specified by the caller in @max_cpu_id.
- * If the actual @max_cpu_id is smaller than the array then the trailing
- * elements of the array will not be written by the sysctl.
- */
- XEN_GUEST_HANDLE_64(uint32) cpu_to_node;

/* XEN_SYSCTL_PHYSCAP_??? */
uint32_t capabilities;
@@ -491,6 +476,73 @@ typedef struct xen_sysctl_lockprof_op xe
typedef struct xen_sysctl_lockprof_op xen_sysctl_lockprof_op_t;
DEFINE_XEN_GUEST_HANDLE(xen_sysctl_lockprof_op_t);

+#define XEN_SYSCTL_topologyinfo 16
+struct xen_sysctl_topologyinfo {
+
+ /*
+ * IN: maximum addressable entry in the caller-provided cpu_to_core,
+ * cpu_to_socket & cpu_to_node arrays.
+ * OUT: largest cpu identifier in the system.
+ * If OUT is greater than IN then the arrays are truncated!
+ */
+ uint32_t max_cpu_index;
+
+ /*
+ * If not NULL, these arrays are filled with the core/socket/node identifier
+ * for each cpu.
+ * If a cpu has no core/socket/node information (e.g., cpu not present)
+ * then the sentinel value ~0u is written.
+ * The size of these arrays is specified by the caller in @max_cpu_index.
+ * If the actual @max_cpu_index is smaller than the arrays then the trailing
+ * elements of the arrays will not be written by the sysctl.
+ */
+ XEN_GUEST_HANDLE_64(uint32) cpu_to_core;
+ XEN_GUEST_HANDLE_64(uint32) cpu_to_socket;
+ XEN_GUEST_HANDLE_64(uint32) cpu_to_node; /* node_number */
+
+};
+typedef struct xen_sysctl_topologyinfo xen_sysctl_topologyinfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_topologyinfo_t);
+
+#define XEN_SYSCTL_numainfo 17
+struct xen_sysctl_numainfo {
+ /*
+ * IN: maximum addressable entry in the caller-provided node_to_memsize
+ * & node_to_memfree arrays.
+ * OUT: largest possible node index for the system.
+ * If OUT is greater than IN then these arrays are truncated!
+ */
+ uint32_t max_node_index;
+
+ /* For the node_to_memsize & node_to_memfree arrays, the
+ * entry with the same index corresponds to the same node.
+ * If an entry has no node information (e.g., node not present) then the
+ * value 0u is written for node_to_memsize & node_to_memfree.
+ * The size of these arrays is specified by the caller in @max_node_index.
+ * If the actual @max_node_index is smaller than the array then the
+ * trailing elements of the array will not be written by the sysctl.
+ */
+ XEN_GUEST_HANDLE_64(uint64) node_to_memsize;
+ XEN_GUEST_HANDLE_64(uint64) node_to_memfree;
+
+
+ /* node_to_node_distance is an array of size (nr_nodes * nr_nodes) listing
+ * the memory access distances between nodes. The i'th entry in the array
+ * specifies the distance between node (i / nr_nodes) & node (i % nr_nodes).
+ * If an entry has no node distance information (e.g., node not present)
+ * then the sentinel value ~0u is written.
+ * The size of this array is implied by the caller's @max_node_index: it
+ * must hold max_node_index * max_node_index entries. If the actual
+ * number of nodes is smaller than @max_node_index then the trailing
+ * elements of the array will not be written by the sysctl.
+ */
+ XEN_GUEST_HANDLE_64(uint32) node_to_node_distance;
+};
+typedef struct xen_sysctl_numainfo xen_sysctl_numainfo_t;
+DEFINE_XEN_GUEST_HANDLE(xen_sysctl_numainfo_t);
+
+
struct xen_sysctl {
uint32_t cmd;
uint32_t interface_version; /* XEN_SYSCTL_INTERFACE_VERSION */
@@ -498,6 +550,8 @@ struct xen_sysctl {
struct xen_sysctl_readconsole readconsole;
struct xen_sysctl_tbuf_op tbuf_op;
struct xen_sysctl_physinfo physinfo;
+ struct xen_sysctl_topologyinfo topologyinfo;
+ struct xen_sysctl_numainfo numainfo;
struct xen_sysctl_sched_id sched_id;
struct xen_sysctl_perfc_op perfc_op;
struct xen_sysctl_getdomaininfolist getdomaininfolist;
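
A corresponding caller-side sketch (again, not part of the patch) for the new
XEN_SYSCTL_numainfo interface defined above, mirroring pyxc_numainfo(); the
array sizes and output formatting are illustrative assumptions. Note that the
distance matrix is row-major: the distance from node i to node j lives at
index (i * max_node_index + j).

    #include <stdio.h>
    #include <inttypes.h>
    #include <xenctrl.h>

    #define MAX_NODE_INDEX 31

    /* Sketch: query per-node memory totals and the node distance matrix. */
    static int dump_numa_info(int xc_handle)
    {
        xc_numainfo_t ninfo;
        xc_node_to_memsize_t memsize[MAX_NODE_INDEX + 1];
        xc_node_to_memfree_t memfree[MAX_NODE_INDEX + 1];
        xc_node_to_node_dist_t dist[(MAX_NODE_INDEX + 1) * (MAX_NODE_INDEX + 1)];
        uint32_t i, j, nr;

        set_xen_guest_handle(ninfo.node_to_memsize, memsize);
        set_xen_guest_handle(ninfo.node_to_memfree, memfree);
        set_xen_guest_handle(ninfo.node_to_node_distance, dist);
        ninfo.max_node_index = MAX_NODE_INDEX;

        if ( xc_numainfo(xc_handle, &ninfo) != 0 )
            return -1;

        nr = ninfo.max_node_index;   /* already clamped by the hypervisor */
        for ( i = 0; i < nr; i++ )
        {
            printf("node%u: %"PRIu64"MB total, %"PRIu64"MB free, dist:",
                   i, memsize[i] >> 20, memfree[i] >> 20);
            for ( j = 0; j < nr; j++ )
                printf(" %u", dist[i * nr + j]);
            printf("\n");
        }
        return 0;
    }
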
diff -r f0ef396d8c33 -r 28e5409e3fb3 xen/include/xen/mm.h
--- a/xen/include/xen/mm.h Wed Apr 07 15:44:29 2010 +0100
+++ b/xen/include/xen/mm.h Wed Apr 07 16:22:05 2010 +0100
@@ -57,6 +57,7 @@ unsigned long avail_domheap_pages_region
unsigned long avail_domheap_pages_region(
unsigned int node, unsigned int min_width, unsigned int max_width);
unsigned long avail_domheap_pages(void);
+unsigned long avail_node_heap_pages(unsigned int);
#define alloc_domheap_page(d,f) (alloc_domheap_pages(d,0,f))
#define free_domheap_page(p) (free_domheap_pages(p,0))
unsigned int online_page(unsigned long mfn, uint32_t *status);

_______________________________________________
Xen-changelog mailing list
Xen-changelog@lists.xensource.com
http://lists.xensource.com/xen-changelog