Mailing List Archive

[PATCH] Reduce overhead in find_domain_by_id() [2/2]
This is a set of patches to improve performance of find_domain_by_id().
find_domain_by_id shows up high in profiles for network I/O intensive
workloads.
Most of the cost for this function comes from 3 main functions (of
approximately equal costs): 1) read_lock(), 2) read_unlock() and
3)get_domain().
These patches replace the lock used for accessing domain_list and
domain_hash with a lock free RCU scheme. Experiments confirm that the
cost of find_domain_by_id() is in fact reduced by 2/3.
The patches apply cleanly to changeset 12732.

Renato

Patch:
2/2 - replace domlist_lock operations by RCU operations

=======================================================
Signed-off-by: Jose Renato Santos <jsantos@hpl.hp.com>

# HG changeset patch
# User jsantos@hpl.hp.com
# Node ID 8d07c0a68759ab97cb7917bdf65144f20a5568ca
# Parent 555277cafc3576a3a96bd7ec345c3871b18aa867
Modify accesses to domain_list and domain_hash to use RCU functions
instead of rwlock

diff -r 555277cafc35 -r 8d07c0a68759 xen/acm/acm_chinesewall_hooks.c
--- a/xen/acm/acm_chinesewall_hooks.c Tue Dec 5 14:47:26 2006 -0800
+++ b/xen/acm/acm_chinesewall_hooks.c Tue Dec 5 15:31:44 2006 -0800
@@ -196,7 +196,7 @@ chwall_init_state(struct acm_chwall_poli
ssidref_t chwall_ssidref;
struct domain **pd;

- write_lock(&domlist_lock);
+ spin_lock(&domlist_lock);
/* go through all domains and adjust policy as if this domain was
started now */
pd = &domain_list;
for (pd = &domain_list; *pd != NULL; pd = &(*pd)->next_in_list)
@@ -247,7 +247,7 @@ chwall_init_state(struct acm_chwall_poli
}
}
out:
- write_unlock(&domlist_lock);
+ spin_unlock(&domlist_lock);
return violation;
/* returning "violation != 0" means that the currently running set
of domains would
* not be possible if the new policy had been enforced before
starting them; for chinese
diff -r 555277cafc35 -r 8d07c0a68759
xen/acm/acm_simple_type_enforcement_hooks.c
--- a/xen/acm/acm_simple_type_enforcement_hooks.c Tue Dec 5
14:47:26 2006 -0800
+++ b/xen/acm/acm_simple_type_enforcement_hooks.c Tue Dec 5
15:31:44 2006 -0800
@@ -175,36 +175,37 @@ ste_init_state(struct acm_ste_policy_buf
int violation = 1;
struct ste_ssid *ste_ssid, *ste_rssid;
ssidref_t ste_ssidref, ste_rssidref;
- struct domain **pd, *rdom;
+ struct domain *d, *rdom;
domid_t rdomid;
grant_entry_t sha_copy;
int port, i;

- read_lock(&domlist_lock); /* go by domain? or directly by global?
event/grant list */
+ rcu_read_lock(); /* go by domain? or directly by global?
event/grant list */
/* go through all domains and adjust policy as if this domain was
started now */
- pd = &domain_list;
- for ( pd = &domain_list; *pd != NULL; pd = &(*pd)->next_in_list ) {
+ for ( d = rcu_dereference(domain_list);
+ d != NULL;
+ d = rcu_dereference(d->next_in_list) ) {
ste_ssid = GET_SSIDP(ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY,
- (struct acm_ssid_domain *)(*pd)->ssid);
+ (struct acm_ssid_domain *)d->ssid);
ste_ssidref = ste_ssid->ste_ssidref;
traceprintk("%s: validating policy for eventch domain %x
(ste-Ref=%x).\n",
- __func__, (*pd)->domain_id, ste_ssidref);
+ __func__, d->domain_id, ste_ssidref);
/* a) check for event channel conflicts */
for (port=0; port < NR_EVTCHN_BUCKETS; port++) {
- spin_lock(&(*pd)->evtchn_lock);
- if ((*pd)->evtchn[port] == NULL) {
- spin_unlock(&(*pd)->evtchn_lock);
+ spin_lock(&d->evtchn_lock);
+ if (d->evtchn[port] == NULL) {
+ spin_unlock(&d->evtchn_lock);
continue;
}
- if ((*pd)->evtchn[port]->state == ECS_INTERDOMAIN) {
- rdom = (*pd)->evtchn[port]->u.interdomain.remote_dom;
+ if (d->evtchn[port]->state == ECS_INTERDOMAIN) {
+ rdom = d->evtchn[port]->u.interdomain.remote_dom;
rdomid = rdom->domain_id;
/* rdom now has remote domain */
ste_rssid =
GET_SSIDP(ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY,
(struct acm_ssid_domain
*)(rdom->ssid));
ste_rssidref = ste_rssid->ste_ssidref;
- } else if ((*pd)->evtchn[port]->state == ECS_UNBOUND) {
- rdomid = (*pd)->evtchn[port]->u.unbound.remote_domid;
+ } else if (d->evtchn[port]->state == ECS_UNBOUND) {
+ rdomid = d->evtchn[port]->u.unbound.remote_domid;
if ((rdom = find_domain_by_id(rdomid)) == NULL) {
printk("%s: Error finding domain to id %x!\n",
__func__, rdomid);
goto out;
@@ -215,34 +216,34 @@ ste_init_state(struct acm_ste_policy_buf
ste_rssidref = ste_rssid->ste_ssidref;
put_domain(rdom);
} else {
- spin_unlock(&(*pd)->evtchn_lock);
+ spin_unlock(&d->evtchn_lock);
continue; /* port unused */
}
- spin_unlock(&(*pd)->evtchn_lock);
+ spin_unlock(&d->evtchn_lock);

/* rdom now has remote domain */
ste_rssid = GET_SSIDP(ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY,
(struct acm_ssid_domain
*)(rdom->ssid));
ste_rssidref = ste_rssid->ste_ssidref;
traceprintk("%s: eventch: domain %x (ssidref %x) --> domain
%x (rssidref %x) used (port %x).\n",
- __func__, (*pd)->domain_id, ste_ssidref,
rdom->domain_id, ste_rssidref, port);
+ __func__, d->domain_id, ste_ssidref,
rdom->domain_id, ste_rssidref, port);
/* check whether on subj->ssid, obj->ssid share a common
type*/
if (!have_common_type(ste_ssidref, ste_rssidref)) {
printkd("%s: Policy violation in event channel domain
%x -> domain %x.\n",
- __func__, (*pd)->domain_id, rdomid);
+ __func__, d->domain_id, rdomid);
goto out;
}
}
/* b) check for grant table conflicts on shared pages */
- if ((*pd)->grant_table->shared == NULL) {
- printkd("%s: Grant ... sharing for domain %x not setup!\n",
__func__, (*pd)->domain_id);
+ if (d->grant_table->shared == NULL) {
+ printkd("%s: Grant ... sharing for domain %x not setup!\n",
__func__, d->domain_id);
continue;
}
for ( i = 0; i < NR_GRANT_ENTRIES; i++ ) {
- sha_copy = (*pd)->grant_table->shared[i];
+ sha_copy = d->grant_table->shared[i];
if ( sha_copy.flags ) {
printkd("%s: grant dom (%hu) SHARED (%d) flags:(%hx)
dom:(%hu) frame:(%lx)\n",
- __func__, (*pd)->domain_id, i, sha_copy.flags,
sha_copy.domid,
+ __func__, d->domain_id, i, sha_copy.flags,
sha_copy.domid,
(unsigned long)sha_copy.frame);
rdomid = sha_copy.domid;
if ((rdom = find_domain_by_id(rdomid)) == NULL) {
@@ -256,7 +257,7 @@ ste_init_state(struct acm_ste_policy_buf
put_domain(rdom);
if (!have_common_type(ste_ssidref, ste_rssidref)) {
printkd("%s: Policy violation in grant table
sharing domain %x -> domain %x.\n",
- __func__, (*pd)->domain_id, rdomid);
+ __func__, d->domain_id, rdomid);
goto out;
}
}
@@ -264,7 +265,7 @@ ste_init_state(struct acm_ste_policy_buf
}
violation = 0;
out:
- read_unlock(&domlist_lock);
+ rcu_read_unlock();
return violation;
/* returning "violation != 0" means that existing sharing between
domains would not
* have been allowed if the new policy had been enforced before the
sharing; for ste,
@@ -280,7 +281,7 @@ ste_set_policy(u8 *buf, u32 buf_size)
struct acm_ste_policy_buffer *ste_buf = (struct
acm_ste_policy_buffer *)buf;
void *ssidrefsbuf;
struct ste_ssid *ste_ssid;
- struct domain **pd;
+ struct domain *d;
int i;

if (buf_size < sizeof(struct acm_ste_policy_buffer))
@@ -325,15 +326,16 @@ ste_set_policy(u8 *buf, u32 buf_size)
ste_bin_pol.ssidrefs = (domaintype_t *)ssidrefsbuf;

/* clear all ste caches */
- read_lock(&domlist_lock);
- pd = &domain_list;
- for ( pd = &domain_list; *pd != NULL; pd = &(*pd)->next_in_list ) {
+ rcu_read_lock();
+ for ( d = rcu_dereference(domain_list);
+ d != NULL;
+ d = rcu_dereference(d->next_in_list) ) {
ste_ssid = GET_SSIDP(ACM_SIMPLE_TYPE_ENFORCEMENT_POLICY,
- (struct acm_ssid_domain *)(*pd)->ssid);
+ (struct acm_ssid_domain *)(d)->ssid);
for (i=0; i<ACM_TE_CACHE_SIZE; i++)
ste_ssid->ste_cache[i].valid = ACM_STE_free;
}
- read_unlock(&domlist_lock);
+ rcu_read_unlock();
return ACM_OK;

error_free:
@@ -435,14 +437,15 @@ clean_id_from_cache(domid_t id)
{
struct ste_ssid *ste_ssid;
int i;
- struct domain **pd;
+ struct domain *d;
struct acm_ssid_domain *ssid;

printkd("deleting cache for dom %x.\n", id);
- read_lock(&domlist_lock); /* look through caches of all domains */
- pd = &domain_list;
- for ( pd = &domain_list; *pd != NULL; pd = &(*pd)->next_in_list ) {
- ssid = (struct acm_ssid_domain *)((*pd)->ssid);
+ rcu_read_lock(); /* look through caches of all domains */
+ for ( d = rcu_dereference(domain_list);
+ d != NULL;
+ d = rcu_dereference(d->next_in_list) ) {
+ ssid = (struct acm_ssid_domain *)(d->ssid);

if (ssid == NULL)
continue; /* hanging domain structure, no ssid any more ...
*/
@@ -458,7 +461,7 @@ clean_id_from_cache(domid_t id)
ste_ssid->ste_cache[i].valid = ACM_STE_free;
}
out:
- read_unlock(&domlist_lock);
+ rcu_read_unlock();
}

/***************************
diff -r 555277cafc35 -r 8d07c0a68759 xen/arch/ia64/xen/flushtlb.c
--- a/xen/arch/ia64/xen/flushtlb.c Tue Dec 5 14:47:26 2006 -0800
+++ b/xen/arch/ia64/xen/flushtlb.c Tue Dec 5 15:31:44 2006 -0800
@@ -78,7 +78,7 @@ new_tlbflush_clock_period(void)
struct domain* d;
struct vcpu* v;
/* flush all vhpt of vcpu of all existing domain. */
- read_lock(&domlist_lock);
+ rcu_read_lock();
for_each_domain(d) {
for_each_vcpu(d, v) {
vcpu_purge_tr_entry(&PSCBX(v,dtlb));
@@ -92,7 +92,7 @@ new_tlbflush_clock_period(void)
vcpu_vhpt_flush(v);
}
}
- read_unlock(&domlist_lock);
+ rcu_read_unlock();
/* unlock has release semantics */

/* flush all vhpt of physical cpu and mTLB */
diff -r 555277cafc35 -r 8d07c0a68759 xen/arch/x86/time.c
--- a/xen/arch/x86/time.c Tue Dec 5 14:47:26 2006 -0800
+++ b/xen/arch/x86/time.c Tue Dec 5 15:31:44 2006 -0800
@@ -720,10 +720,10 @@ void do_settime(unsigned long secs, unsi
wc_nsec = _wc_nsec = (u32)y;
spin_unlock(&wc_lock);

- read_lock(&domlist_lock);
+ rcu_read_lock();
for_each_domain ( d )
update_domain_wallclock_time(d);
- read_unlock(&domlist_lock);
+ rcu_read_unlock();
}

static void local_time_calibration(void *unused)
diff -r 555277cafc35 -r 8d07c0a68759 xen/common/domain.c
--- a/xen/common/domain.c Tue Dec 5 14:47:26 2006 -0800
+++ b/xen/common/domain.c Tue Dec 5 15:31:44 2006 -0800
@@ -23,14 +23,19 @@
#include <xen/shutdown.h>
#include <xen/percpu.h>
#include <xen/multicall.h>
+#include <xen/rcupdate.h>
#include <asm/debugger.h>
#include <public/sched.h>
#include <public/vcpu.h>

-/* Both these structures are protected by the domlist_lock. */
-DEFINE_RWLOCK(domlist_lock);
+/* domain_list and domain_hash are protected by the same RCU functions
+ * they are not modified atomically and may be inconsistent with each
other
+ * thus, only one of these can be used on each RCU reader-side
+ * critical section */
struct domain *domain_hash[DOMAIN_HASH_SIZE];
struct domain *domain_list;
+/* spin lock for domain_list and domain_hash (RCU) */
+DEFINE_SPINLOCK(domlist_lock);

struct domain *dom0;

@@ -170,16 +175,20 @@ struct domain *domain_create(domid_t dom

if ( !is_idle_domain(d) )
{
- write_lock(&domlist_lock);
+ spin_lock(&domlist_lock);
pd = &domain_list; /* NB. domain_list maintained in order of
domid. */
for ( pd = &domain_list; *pd != NULL; pd = &(*pd)->next_in_list
)
if ( (*pd)->domain_id > d->domain_id )
break;
d->next_in_list = *pd;
- *pd = d;
d->next_in_hashbucket = domain_hash[DOMAIN_HASH(domid)];
- domain_hash[DOMAIN_HASH(domid)] = d;
- write_unlock(&domlist_lock);
+ /* Two rcu assignments are not atomic
+ * Readers may see inconsistent domlist and hash table
+ * That is OK as long as each RCU reader-side critical section
uses
+ * only one of them */
+ rcu_assign_pointer(*pd, d);
+ rcu_assign_pointer(domain_hash[DOMAIN_HASH(domid)], d);
+ spin_unlock(&domlist_lock);
}

return d;
@@ -203,8 +212,8 @@ struct domain *find_domain_by_id(domid_t
{
struct domain *d;

- read_lock(&domlist_lock);
- d = domain_hash[DOMAIN_HASH(dom)];
+ rcu_read_lock();
+ d = rcu_dereference(domain_hash[DOMAIN_HASH(dom)]);
while ( d != NULL )
{
if ( d->domain_id == dom )
@@ -213,9 +222,9 @@ struct domain *find_domain_by_id(domid_t
d = NULL;
break;
}
- d = d->next_in_hashbucket;
- }
- read_unlock(&domlist_lock);
+ d = rcu_dereference(d->next_in_hashbucket);
+ }
+ rcu_read_unlock();

return d;
}
@@ -306,6 +315,23 @@ void domain_pause_for_debugger(void)
send_guest_global_virq(dom0, VIRQ_DEBUGGER);
}

+/* Complete domain destroy after RCU readers are not holding
+ old references */
+static void complete_domain_destroy(struct rcu_head *head)
+{
+ struct domain *d = container_of(head, struct domain, rcu);
+
+ rangeset_domain_destroy(d);
+
+ evtchn_destroy(d);
+ grant_table_destroy(d);
+
+ arch_domain_destroy(d);
+
+ free_domain(d);
+
+ send_guest_global_virq(dom0, VIRQ_DOM_EXC);
+}

/* Release resources belonging to task @p. */
void domain_destroy(struct domain *d)
@@ -323,27 +349,19 @@ void domain_destroy(struct domain *d)
return;

/* Delete from task list and task hashtable. */
- write_lock(&domlist_lock);
+ spin_lock(&domlist_lock);
pd = &domain_list;
while ( *pd != d )
pd = &(*pd)->next_in_list;
- *pd = d->next_in_list;
+ rcu_assign_pointer(*pd, d->next_in_list);
pd = &domain_hash[DOMAIN_HASH(d->domain_id)];
while ( *pd != d )
pd = &(*pd)->next_in_hashbucket;
- *pd = d->next_in_hashbucket;
- write_unlock(&domlist_lock);
-
- rangeset_domain_destroy(d);
-
- evtchn_destroy(d);
- grant_table_destroy(d);
-
- arch_domain_destroy(d);
-
- free_domain(d);
-
- send_guest_global_virq(dom0, VIRQ_DOM_EXC);
+ rcu_assign_pointer(*pd, d->next_in_hashbucket);
+ spin_unlock(&domlist_lock);
+
+ /* schedule RCU asynchronous completion of domain destroy */
+ call_rcu(&d->rcu, complete_domain_destroy);
}

void vcpu_pause(struct vcpu *v)
diff -r 555277cafc35 -r 8d07c0a68759 xen/common/domctl.c
--- a/xen/common/domctl.c Tue Dec 5 14:47:26 2006 -0800
+++ b/xen/common/domctl.c Tue Dec 5 15:31:44 2006 -0800
@@ -17,6 +17,7 @@
#include <xen/trace.h>
#include <xen/console.h>
#include <xen/iocap.h>
+#include <xen/rcupdate.h>
#include <xen/guest_access.h>
#include <asm/current.h>
#include <public/domctl.h>
@@ -139,12 +140,12 @@ static unsigned int default_vcpu0_locati
cpumask_t cpu_exclude_map;

/* Do an initial CPU placement. Pick the least-populated CPU. */
- read_lock(&domlist_lock);
+ rcu_read_lock();
for_each_domain ( d )
for_each_vcpu ( d, v )
if ( !test_bit(_VCPUF_down, &v->vcpu_flags) )
cnt[v->processor]++;
- read_unlock(&domlist_lock);
+ rcu_read_unlock();

/*
* If we're on a HT system, we only auto-allocate to a non-primary
HT. We
@@ -417,7 +418,7 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc
if ( dom == DOMID_SELF )
dom = current->domain->domain_id;

- read_lock(&domlist_lock);
+ rcu_read_lock();

for_each_domain ( d )
{
@@ -427,12 +428,12 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domc

if ( (d == NULL) || !get_domain(d) )
{
- read_unlock(&domlist_lock);
+ rcu_read_unlock();
ret = -ESRCH;
break;
}

- read_unlock(&domlist_lock);
+ rcu_read_unlock();

getdomaininfo(d, &op->u.getdomaininfo);

diff -r 555277cafc35 -r 8d07c0a68759 xen/common/keyhandler.c
--- a/xen/common/keyhandler.c Tue Dec 5 14:47:26 2006 -0800
+++ b/xen/common/keyhandler.c Tue Dec 5 15:31:44 2006 -0800
@@ -139,7 +139,7 @@ static void dump_domains(unsigned char k
printk("'%c' pressed -> dumping domain info (now=0x%X:%08X)\n",
key,
(u32)(now>>32), (u32)now);

- read_lock(&domlist_lock);
+ rcu_read_lock();

for_each_domain ( d )
{
@@ -190,7 +190,7 @@ static void dump_domains(unsigned char k
}
}

- read_unlock(&domlist_lock);
+ rcu_read_unlock();
}

static cpumask_t read_clocks_cpumask = CPU_MASK_NONE;
diff -r 555277cafc35 -r 8d07c0a68759 xen/common/sysctl.c
--- a/xen/common/sysctl.c Tue Dec 5 14:47:26 2006 -0800
+++ b/xen/common/sysctl.c Tue Dec 5 15:31:44 2006 -0800
@@ -80,7 +80,7 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc
struct xen_domctl_getdomaininfo info;
u32 num_domains = 0;

- read_lock(&domlist_lock);
+ rcu_read_lock();

for_each_domain ( d )
{
@@ -108,7 +108,7 @@ long do_sysctl(XEN_GUEST_HANDLE(xen_sysc
num_domains++;
}

- read_unlock(&domlist_lock);
+ rcu_read_unlock();

if ( ret != 0 )
break;
diff -r 555277cafc35 -r 8d07c0a68759 xen/include/xen/sched.h
--- a/xen/include/xen/sched.h Tue Dec 5 14:47:26 2006 -0800
+++ b/xen/include/xen/sched.h Tue Dec 5 15:31:44 2006 -0800
@@ -15,10 +15,11 @@
#include <xen/rangeset.h>
#include <asm/domain.h>
#include <xen/xenoprof.h>
+#include <xen/rcupdate.h>
#include <xen/irq.h>

extern unsigned long volatile jiffies;
-extern rwlock_t domlist_lock;
+extern spinlock_t domlist_lock;

/* A global pointer to the initial domain (DOM0). */
extern struct domain *dom0;
@@ -172,6 +173,8 @@ struct domain
/* OProfile support. */
struct xenoprof *xenoprof;
int32_t time_offset_seconds;
+
+ struct rcu_head rcu;
};

struct domain_setup_info
@@ -344,16 +347,19 @@ unsigned long hypercall_create_continuat
local_events_need_delivery() \
))

-/* This domain_hash and domain_list are protected by the domlist_lock.
*/
+/* domain_list and domain_hash are protected by the same RCU functions
+ * they are not modified atomically and may be inconsistent with each
other
+ * thus, only one of these can be used on each RCU reader-side
+ * critical section */
#define DOMAIN_HASH_SIZE 256
#define DOMAIN_HASH(_id) ((int)(_id)&(DOMAIN_HASH_SIZE-1))
extern struct domain *domain_hash[DOMAIN_HASH_SIZE];
extern struct domain *domain_list;

#define for_each_domain(_d) \
- for ( (_d) = domain_list; \
+ for ( (_d) = rcu_dereference(domain_list); \
(_d) != NULL; \
- (_d) = (_d)->next_in_list )
+ (_d) = rcu_dereference((_d)->next_in_list) )

#define for_each_vcpu(_d,_v) \
for ( (_v) = (_d)->vcpu[0]; \