As vcpu-affinity tells where vcpus can run, node-affinity tells
where a domain's vcpus prefer to run. Respecting vcpu-affinity is
the primary concern, but honouring node-affinity will likely
result in some performance benefit.
This change modifies the vcpu load balancing algorithm (for the
credit scheduler only), introducing a two-step logic.
During the first step, we use the node-affinity mask. The aim is
to give precedence to the CPUs where it is known to be preferable
for the domain to run. If that step fails to find a valid CPU,
node-affinity is just ignored and, in the second step, we fall
back to using cpu-affinity only.
Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
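To make the two-step logic concrete, here is a minimal standalone
sketch (illustrative only, not part of the patch): plain uint64_t
bitmasks stand in for Xen's cpumask_t, and pick_cpu() stands in for
the scheduler paths that use the for_each_csched_balance_step() /
csched_balance_cpumask() machinery introduced below. The node step
runs first, filtered through cpu-affinity; if it yields no idle CPU,
the loop falls back to plain cpu-affinity.

#include <stdint.h>
#include <stdio.h>

#define BALANCE_CPU_AFFINITY   0
#define BALANCE_NODE_AFFINITY  1
#define BALANCE_LAST           BALANCE_NODE_AFFINITY

/* Walk the balancing steps from NODE down to CPU, as the patch does. */
#define for_each_balance_step(step) \
    for ( (step) = BALANCE_LAST; (step) >= 0; (step)-- )

static int pick_cpu(uint64_t cpu_aff, uint64_t node_aff, uint64_t idlers)
{
    int step;

    for_each_balance_step(step)
    {
        /*
         * The node step filters node-affinity through cpu-affinity, so
         * we never pick a CPU the vcpu is not allowed to run on.
         */
        uint64_t mask = (step == BALANCE_NODE_AFFINITY)
                            ? (node_aff & cpu_aff) : cpu_aff;
        uint64_t candidates = mask & idlers;

        if ( candidates )
            return __builtin_ctzll(candidates); /* lowest-numbered idler */
        /* Nothing suitable at this step: fall back to the next one. */
    }

    return -1; /* no idle CPU in either mask */
}

int main(void)
{
    /*
     * vcpu-affinity allows CPUs 0-7, node-affinity prefers CPUs 4-7,
     * but only CPUs 1 and 2 are idle: the node step finds nothing and
     * the cpu-affinity step picks CPU 1.
     */
    printf("picked CPU %d\n", pick_cpu(0xff, 0xf0, 0x06));
    return 0;
}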
diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -101,6 +101,13 @@
/*
+ * Node Balancing
+ */
+#define CSCHED_BALANCE_CPU_AFFINITY 0
+#define CSCHED_BALANCE_NODE_AFFINITY 1
+#define CSCHED_BALANCE_LAST CSCHED_BALANCE_NODE_AFFINITY
+
+/*
* Boot parameters
*/
static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS;
@@ -148,6 +155,9 @@ struct csched_dom {
struct list_head active_vcpu;
struct list_head active_sdom_elem;
struct domain *dom;
+ /* cpumask translated from the domain's node-affinity
+ * mask. Basically, the CPUs we prefer to be scheduled on. */
+ cpumask_var_t node_affinity_cpumask;
uint16_t active_vcpu_count;
uint16_t weight;
uint16_t cap;
@@ -228,6 +238,39 @@ static inline void
list_del_init(&svc->runq_elem);
}
+#define for_each_csched_balance_step(__step) \
+ for ( (__step) = CSCHED_BALANCE_LAST; (__step) >= 0; (__step)-- )
+
+/*
+ * Each csched-balance step should use its own cpumask. This function
+ * determines which one, given the step, and copies it in mask. Notice
+ * that, in the case of a node balancing step, it also filters out from
+ * the node-affinity mask the cpus that are not part of vc's cpu-affinity,
+ * as we do not want to end up running a vcpu where it is not allowed to!
+ *
+ * As an optimization, if a domain does not have any specific node-affinity
+ * (namely, its node-affinity is automatically computed), we inform the
+ * caller, by returning -1, that it can skip the first step.
+ */
+static int
+csched_balance_cpumask(const struct vcpu *vc, int step, cpumask_t *mask)
+{
+ if ( step == CSCHED_BALANCE_NODE_AFFINITY )
+ {
+ struct domain *d = vc->domain;
+ struct csched_dom *sdom = CSCHED_DOM(d);
+
+ if ( cpumask_full(sdom->node_affinity_cpumask) )
+ return -1;
+
+ cpumask_and(mask, sdom->node_affinity_cpumask, vc->cpu_affinity);
+ }
+ else /* step == CSCHED_BALANCE_CPU_AFFINITY */
+ cpumask_copy(mask, vc->cpu_affinity);
+
+ return 0;
+}
+
static void burn_credits(struct csched_vcpu *svc, s_time_t now)
{
s_time_t delta;
@@ -250,6 +293,20 @@ boolean_param("tickle_one_idle_cpu", opt
DEFINE_PER_CPU(unsigned int, last_tickle_cpu);
static inline void
+__cpumask_tickle(cpumask_t *mask, const cpumask_t *idle_mask)
+{
+ CSCHED_STAT_CRANK(tickle_idlers_some);
+ if ( opt_tickle_one_idle )
+ {
+ this_cpu(last_tickle_cpu) =
+ cpumask_cycle(this_cpu(last_tickle_cpu), idle_mask);
+ cpumask_set_cpu(this_cpu(last_tickle_cpu), mask);
+ }
+ else
+ cpumask_or(mask, mask, idle_mask);
+}
+
+static inline void
__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
struct csched_vcpu * const cur =
@@ -287,22 +344,26 @@ static inline void
}
else
{
- cpumask_t idle_mask;
+ cpumask_t idle_mask, balance_mask;
+ int balance_step;
- cpumask_and(&idle_mask, prv->idlers, new->vcpu->cpu_affinity);
- if ( !cpumask_empty(&idle_mask) )
+ for_each_csched_balance_step(balance_step)
{
- CSCHED_STAT_CRANK(tickle_idlers_some);
- if ( opt_tickle_one_idle )
- {
- this_cpu(last_tickle_cpu) =
- cpumask_cycle(this_cpu(last_tickle_cpu), &idle_mask);
- cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask);
- }
- else
- cpumask_or(&mask, &mask, &idle_mask);
+ if ( csched_balance_cpumask(new->vcpu, balance_step,
+ &balance_mask) )
+ continue;
+
+ /* Look for idlers in the step's cpumask */
+ cpumask_and(&idle_mask, prv->idlers, &balance_mask);
+ if ( !cpumask_empty(&idle_mask) )
+ __cpumask_tickle(&mask, &idle_mask);
+
+ cpumask_and(&mask, &mask, &balance_mask);
+
+ /* We can quit balancing if we found someone to tickle */
+ if ( !cpumask_empty(&mask) )
+ break;
}
- cpumask_and(&mask, &mask, new->vcpu->cpu_affinity);
}
}
@@ -443,35 +504,42 @@ static inline int
}
static inline int
-__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
+__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu, cpumask_t *mask)
{
/*
* Don't pick up work that's in the peer's scheduling tail or hot on
- * peer PCPU. Only pick up work that's allowed to run on our CPU.
+ * peer PCPU. Only pick up work that prefers and/or is allowed to run
+ * on our CPU.
*/
return !vc->is_running &&
!__csched_vcpu_is_cache_hot(vc) &&
- cpumask_test_cpu(dest_cpu, vc->cpu_affinity);
+ cpumask_test_cpu(dest_cpu, mask);
}
static int
_csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc, bool_t commit)
{
- cpumask_t cpus;
+ cpumask_t cpus, start_cpus;
cpumask_t idlers;
cpumask_t *online;
+ struct csched_dom *sdom = CSCHED_DOM(vc->domain);
struct csched_pcpu *spc = NULL;
int cpu;
/*
- * Pick from online CPUs in VCPU's affinity mask, giving a
- * preference to its current processor if it's in there.
+ * Pick an online CPU from the intersection of the vcpu-affinity and
+ * node-affinity masks. If that intersection is empty, fall back to
+ * the vcpu-affinity mask alone. In either case, try to give preference
+ * to the vcpu's current processor if it is in the resulting mask.
*/
online = cpupool_scheduler_cpumask(vc->domain->cpupool);
cpumask_and(&cpus, online, vc->cpu_affinity);
- cpu = cpumask_test_cpu(vc->processor, &cpus)
+ cpumask_and(&start_cpus, &cpus, sdom->node_affinity_cpumask);
+ if ( unlikely(cpumask_empty(&start_cpus)) )
+ cpumask_copy(&start_cpus, &cpus);
+ cpu = cpumask_test_cpu(vc->processor, &start_cpus)
? vc->processor
- : cpumask_cycle(vc->processor, &cpus);
+ : cpumask_cycle(vc->processor, &start_cpus);
ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );
/*
@@ -867,6 +935,13 @@ csched_alloc_domdata(const struct schedu
if ( sdom == NULL )
return NULL;
+ if ( !alloc_cpumask_var(&sdom->node_affinity_cpumask) )
+ {
+ xfree(sdom);
+ return NULL;
+ }
+ cpumask_setall(sdom->node_affinity_cpumask);
+
/* Initialize credit and weight */
INIT_LIST_HEAD(&sdom->active_vcpu);
sdom->active_vcpu_count = 0;
@@ -900,6 +975,9 @@ csched_dom_init(const struct scheduler *
static void
csched_free_domdata(const struct scheduler *ops, void *data)
{
+ struct csched_dom *sdom = data;
+
+ free_cpumask_var(sdom->node_affinity_cpumask);
xfree(data);
}
@@ -1211,30 +1289,48 @@ csched_runq_steal(int peer_cpu, int cpu,
*/
if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
{
- list_for_each( iter, &peer_pcpu->runq )
+ int balance_step;
+
+ /*
+ * Take node-affinity into account. That means, for all the vcpus
+ * in peer_pcpu's runq, check _first_ if their node-affinity allows
+ * them to run on cpu. If not, retry the loop considering plain
+ * vcpu-affinity. Also, notice that as soon as one vcpu is found,
+ * balancing is considered done, and the vcpu is returned to the
+ * caller.
+ */
+ for_each_csched_balance_step(balance_step)
{
- speer = __runq_elem(iter);
+ list_for_each( iter, &peer_pcpu->runq )
+ {
+ cpumask_t balance_mask;
- /*
- * If next available VCPU here is not of strictly higher
- * priority than ours, this PCPU is useless to us.
- */
- if ( speer->pri <= pri )
- break;
+ speer = __runq_elem(iter);
- /* Is this VCPU is runnable on our PCPU? */
- vc = speer->vcpu;
- BUG_ON( is_idle_vcpu(vc) );
+ /*
+ * If next available VCPU here is not of strictly higher
+ * priority than ours, this PCPU is useless to us.
+ */
+ if ( speer->pri <= pri )
+ break;
- if (__csched_vcpu_is_migrateable(vc, cpu))
- {
- /* We got a candidate. Grab it! */
- CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
- CSCHED_STAT_CRANK(migrate_queued);
- WARN_ON(vc->is_urgent);
- __runq_remove(speer);
- vc->processor = cpu;
- return speer;
+ /* Is this VCPU runnable on our PCPU? */
+ vc = speer->vcpu;
+ BUG_ON( is_idle_vcpu(vc) );
+
+ if ( csched_balance_cpumask(vc, balance_step, &balance_mask) )
+ continue;
+
+ if (__csched_vcpu_is_migrateable(vc, cpu, &balance_mask))
+ {
+ /* We got a candidate. Grab it! */
+ CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
+ CSCHED_STAT_CRANK(migrate_queued);
+ WARN_ON(vc->is_urgent);
+ __runq_remove(speer);
+ vc->processor = cpu;
+ return speer;
+ }
}
}
}
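The same prefer-then-fall-back shape drives the starting mask in
_csched_cpu_pick() above. A standalone rendering of just that
computation (again illustrative, with uint64_t bitmasks in place of
cpumask_t; pick_start_mask() is a made-up name, not the patch's API):

#include <stdint.h>
#include <stdio.h>

static uint64_t pick_start_mask(uint64_t online, uint64_t cpu_aff,
                                uint64_t node_aff)
{
    uint64_t cpus  = online & cpu_aff;  /* CPUs the vcpu may run on */
    uint64_t start = cpus & node_aff;   /* restricted to preferred CPUs */

    /* Empty intersection: fall back to plain cpu-affinity. */
    return start ? start : cpus;
}

int main(void)
{
    /* 0x0f & 0xf0 == 0, so this falls back to online & cpu_aff = 0x0f. */
    printf("%#llx\n", (unsigned long long)pick_start_mask(0xff, 0x0f, 0xf0));
    return 0;
}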