Mailing List Archive

[PATCH 3 of 8] xen: let the (credit) scheduler know about `node affinity`
As vcpu-affinity tells where vcpus can run, node-affinity tells
where a domain's vcpus prefer to run. Respecting vcpu-affinity is
the primary concern, but honouring node-affinity will likely
result in some performance benefit.

This change modifies the vcpu load balancing algorithm (for the
credit scheduler only), introducing a two-step logic.
During the first step, we use the node-affinity mask. The aim is
to give precedence to the CPUs where it is known to be preferable
for the domain to run. If that fails to find a valid CPU, the
node-affinity is just ignored and, in the second step, we fall
back to using cpu-affinity only.
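
In rough pseudo-C, the balancing callers end up doing something like this
(an illustrative sketch only; pick_cpu_from() just stands in for whatever
each caller actually does with the mask):

    for_each_csched_balance_step( step )
    {
        /* 1st iteration: node-affinity (filtered by vcpu-affinity);
         * 2nd iteration: plain vcpu-affinity. */
        if ( csched_balance_cpumask(vc, step, &mask) )
            continue;  /* no specific node-affinity: go straight to 2nd step */
        if ( pick_cpu_from(&mask) >= 0 )
            break;     /* a suitable CPU was found, balancing is done */
    }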

Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>

diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
--- a/xen/common/sched_credit.c
+++ b/xen/common/sched_credit.c
@@ -101,6 +101,13 @@


/*
+ * Node Balancing
+ */
+#define CSCHED_BALANCE_CPU_AFFINITY 0
+#define CSCHED_BALANCE_NODE_AFFINITY 1
+#define CSCHED_BALANCE_LAST CSCHED_BALANCE_NODE_AFFINITY
+
+/*
* Boot parameters
*/
static int __read_mostly sched_credit_tslice_ms = CSCHED_DEFAULT_TSLICE_MS;
@@ -148,6 +155,9 @@ struct csched_dom {
struct list_head active_vcpu;
struct list_head active_sdom_elem;
struct domain *dom;
+ /* cpumask translated from the domain's node-affinity
+ * mask. Basically, the CPUs we prefer to be scheduled on. */
+ cpumask_var_t node_affinity_cpumask;
uint16_t active_vcpu_count;
uint16_t weight;
uint16_t cap;
@@ -228,6 +238,39 @@ static inline void
list_del_init(&svc->runq_elem);
}

+#define for_each_csched_balance_step(__step) \
+ for ( (__step) = CSCHED_BALANCE_LAST; (__step) >= 0; (__step)-- )
+
+/*
+ * Each csched-balance step should use its own cpumask. This function
+ * determines which one, given the step, and copies it in mask. Notice
+ * that, in the case of a node balancing step, it also filters out from
+ * the node-affinity mask the cpus that are not part of vc's cpu-affinity,
+ * as we do not want to end up running a vcpu where it is not allowed to!
+ *
+ * As an optimization, if a domain does not have any specific node-affinity
+ * (namely, its node affinity is automatically computed), we inform the
+ * caller that he can skip the first step by returning -1.
+ */
+static int
+csched_balance_cpumask(const struct vcpu *vc, int step, cpumask_t *mask)
+{
+ if ( step == CSCHED_BALANCE_NODE_AFFINITY )
+ {
+ struct domain *d = vc->domain;
+ struct csched_dom *sdom = CSCHED_DOM(d);
+
+ if ( cpumask_full(sdom->node_affinity_cpumask) )
+ return -1;
+
+ cpumask_and(mask, sdom->node_affinity_cpumask, vc->cpu_affinity);
+ }
+ else /* step == CSCHED_BALANCE_CPU_AFFINITY */
+ cpumask_copy(mask, vc->cpu_affinity);
+
+ return 0;
+}
+
static void burn_credits(struct csched_vcpu *svc, s_time_t now)
{
s_time_t delta;
@@ -250,6 +293,20 @@ boolean_param("tickle_one_idle_cpu", opt
DEFINE_PER_CPU(unsigned int, last_tickle_cpu);

static inline void
+__cpumask_tickle(cpumask_t *mask, const cpumask_t *idle_mask)
+{
+ CSCHED_STAT_CRANK(tickle_idlers_some);
+ if ( opt_tickle_one_idle )
+ {
+ this_cpu(last_tickle_cpu) =
+ cpumask_cycle(this_cpu(last_tickle_cpu), idle_mask);
+ cpumask_set_cpu(this_cpu(last_tickle_cpu), mask);
+ }
+ else
+ cpumask_or(mask, mask, idle_mask);
+}
+
+static inline void
__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
{
struct csched_vcpu * const cur =
@@ -287,22 +344,26 @@ static inline void
}
else
{
- cpumask_t idle_mask;
+ cpumask_t idle_mask, balance_mask;
+ int balance_step;

- cpumask_and(&idle_mask, prv->idlers, new->vcpu->cpu_affinity);
- if ( !cpumask_empty(&idle_mask) )
+ for_each_csched_balance_step(balance_step)
{
- CSCHED_STAT_CRANK(tickle_idlers_some);
- if ( opt_tickle_one_idle )
- {
- this_cpu(last_tickle_cpu) =
- cpumask_cycle(this_cpu(last_tickle_cpu), &idle_mask);
- cpumask_set_cpu(this_cpu(last_tickle_cpu), &mask);
- }
- else
- cpumask_or(&mask, &mask, &idle_mask);
+ if ( csched_balance_cpumask(new->vcpu, balance_step,
+ &balance_mask) )
+ continue;
+
+ /* Look for idlers in the step's cpumask */
+ cpumask_and(&idle_mask, prv->idlers, &balance_mask);
+ if ( !cpumask_empty(&idle_mask) )
+ __cpumask_tickle(&mask, &idle_mask);
+
+ cpumask_and(&mask, &mask, &balance_mask);
+
+ /* We can quit balancing if we found someone to tickle */
+ if ( !cpumask_empty(&mask) )
+ break;
}
- cpumask_and(&mask, &mask, new->vcpu->cpu_affinity);
}
}

@@ -443,35 +504,42 @@ static inline int
}

static inline int
-__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu)
+__csched_vcpu_is_migrateable(struct vcpu *vc, int dest_cpu, cpumask_t *mask)
{
/*
* Don't pick up work that's in the peer's scheduling tail or hot on
- * peer PCPU. Only pick up work that's allowed to run on our CPU.
+ * peer PCPU. Only pick up work that prefers and/or is allowed to run
+ * on our CPU.
*/
return !vc->is_running &&
!__csched_vcpu_is_cache_hot(vc) &&
- cpumask_test_cpu(dest_cpu, vc->cpu_affinity);
+ cpumask_test_cpu(dest_cpu, mask);
}

static int
_csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc, bool_t commit)
{
- cpumask_t cpus;
+ cpumask_t cpus, start_cpus;
cpumask_t idlers;
cpumask_t *online;
+ struct csched_dom *sdom = CSCHED_DOM(vc->domain);
struct csched_pcpu *spc = NULL;
int cpu;

/*
- * Pick from online CPUs in VCPU's affinity mask, giving a
- * preference to its current processor if it's in there.
+ * Pick an online CPU from the && of vcpu-affinity and node-affinity
+ * masks (if not empty, in which case only the vcpu-affinity mask is
+ * used). Also, try to give a preference to its current processor if
+ * it's in there.
*/
online = cpupool_scheduler_cpumask(vc->domain->cpupool);
cpumask_and(&cpus, online, vc->cpu_affinity);
- cpu = cpumask_test_cpu(vc->processor, &cpus)
+ cpumask_and(&start_cpus, &cpus, sdom->node_affinity_cpumask);
+ if ( unlikely(cpumask_empty(&start_cpus)) )
+ cpumask_copy(&start_cpus, &cpus);
+ cpu = cpumask_test_cpu(vc->processor, &start_cpus)
? vc->processor
- : cpumask_cycle(vc->processor, &cpus);
+ : cpumask_cycle(vc->processor, &start_cpus);
ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );

/*
@@ -867,6 +935,13 @@ csched_alloc_domdata(const struct schedu
if ( sdom == NULL )
return NULL;

+ if ( !alloc_cpumask_var(&sdom->node_affinity_cpumask) )
+ {
+ xfree(sdom);
+ return NULL;
+ }
+ cpumask_setall(sdom->node_affinity_cpumask);
+
/* Initialize credit and weight */
INIT_LIST_HEAD(&sdom->active_vcpu);
sdom->active_vcpu_count = 0;
@@ -900,6 +975,9 @@ csched_dom_init(const struct scheduler *
static void
csched_free_domdata(const struct scheduler *ops, void *data)
{
+ struct csched_dom *sdom = data;
+
+ free_cpumask_var(sdom->node_affinity_cpumask);
xfree(data);
}

@@ -1211,30 +1289,48 @@ csched_runq_steal(int peer_cpu, int cpu,
*/
if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
{
- list_for_each( iter, &peer_pcpu->runq )
+ int balance_step;
+
+ /*
+ * Take node-affinity into account. That means, for all the vcpus
+ * in peer_pcpu's runq, check _first_ if their node-affinity allows
+ * them to run on cpu. If not, retry the loop considering plain
+ * vcpu-affinity. Also, notice that as soon as one vcpu is found,
+ * balancing is considered done, and the vcpu is returned to the
+ * caller.
+ */
+ for_each_csched_balance_step(balance_step)
{
- speer = __runq_elem(iter);
+ list_for_each( iter, &peer_pcpu->runq )
+ {
+ cpumask_t balance_mask;

- /*
- * If next available VCPU here is not of strictly higher
- * priority than ours, this PCPU is useless to us.
- */
- if ( speer->pri <= pri )
- break;
+ speer = __runq_elem(iter);

- /* Is this VCPU is runnable on our PCPU? */
- vc = speer->vcpu;
- BUG_ON( is_idle_vcpu(vc) );
+ /*
+ * If next available VCPU here is not of strictly higher
+ * priority than ours, this PCPU is useless to us.
+ */
+ if ( speer->pri <= pri )
+ break;

- if (__csched_vcpu_is_migrateable(vc, cpu))
- {
- /* We got a candidate. Grab it! */
- CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
- CSCHED_STAT_CRANK(migrate_queued);
- WARN_ON(vc->is_urgent);
- __runq_remove(speer);
- vc->processor = cpu;
- return speer;
+ /* Is this VCPU runnable on our PCPU? */
+ vc = speer->vcpu;
+ BUG_ON( is_idle_vcpu(vc) );
+
+ if ( csched_balance_cpumask(vc, balance_step, &balance_mask) )
+ continue;
+
+ if (__csched_vcpu_is_migrateable(vc, cpu, &balance_mask))
+ {
+ /* We got a candidate. Grab it! */
+ CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
+ CSCHED_STAT_CRANK(migrate_queued);
+ WARN_ON(vc->is_urgent);
+ __runq_remove(speer);
+ vc->processor = cpu;
+ return speer;
+ }
}
}
}

Re: [PATCH 3 of 8] xen: let the (credit) scheduler know about `node affinity`
>>> On 05.10.12 at 16:08, Dario Faggioli <dario.faggioli@citrix.com> wrote:
> @@ -287,22 +344,26 @@ static inline void
> }
> else
> {
> - cpumask_t idle_mask;
> + cpumask_t idle_mask, balance_mask;

Be _very_ careful about adding on-stack CPU mask variables
(also further below): each one of them grows the stack frame
by 512 bytes (when building for the current maximum of 4095
CPUs), which is generally too much; you may want to consider
pre-allocated scratch space instead.
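
To illustrate one possible shape of such scratch space, a per-CPU mask could
be set up once and reused instead of declaring cpumask_t variables on the
stack (the variable name is invented; this is only a sketch of the idea, not
part of the patch):

    /* one scratch mask per pCPU, allocated statically rather than on-stack */
    static DEFINE_PER_CPU(cpumask_t, csched_balance_scratch);

    /* at each former on-stack use: */
    cpumask_t *balance_mask = &this_cpu(csched_balance_scratch);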

Jan


Re: [PATCH 3 of 8] xen: let the (credit) scheduler know about `node affinity`
On 05.10.2012 16:08, Dario Faggioli wrote:
> As vcpu-affinity tells where vcpus can run, node-affinity tells
> where a domain's vcpus prefer to run. Respecting vcpu-affinity is
> the primary concern, but honouring node-affinity will likely
> result in some performance benefit.
>
> This change modifies the vcpu load balancing algorithm (for the
> credit scheduler only), introducing a two-step logic.
> During the first step, we use the node-affinity mask. The aim is
> to give precedence to the CPUs where it is known to be preferable
> for the domain to run. If that fails to find a valid CPU, the
> node-affinity is just ignored and, in the second step, we fall
> back to using cpu-affinity only.
>
> Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>
>
> diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
> --- a/xen/common/sched_credit.c
> +++ b/xen/common/sched_credit.c
...
> static int
> _csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc, bool_t commit)
> {
> - cpumask_t cpus;
> + cpumask_t cpus, start_cpus;
> cpumask_t idlers;
> cpumask_t *online;
> + struct csched_dom *sdom = CSCHED_DOM(vc->domain);
> struct csched_pcpu *spc = NULL;
> int cpu;
>
> /*
> - * Pick from online CPUs in VCPU's affinity mask, giving a
> - * preference to its current processor if it's in there.
> + * Pick an online CPU from the && of vcpu-affinity and node-affinity
> + * masks (if not empty, in which case only the vcpu-affinity mask is
> + * used). Also, try to give a preference to its current processor if
> + * it's in there.
> */
> online = cpupool_scheduler_cpumask(vc->domain->cpupool);
> cpumask_and(&cpus, online, vc->cpu_affinity);
> - cpu = cpumask_test_cpu(vc->processor, &cpus)
> + cpumask_and(&start_cpus, &cpus, sdom->node_affinity_cpumask);
> + if ( unlikely(cpumask_empty(&start_cpus)) )
> + cpumask_copy(&start_cpus, &cpus);
> + cpu = cpumask_test_cpu(vc->processor, &start_cpus)
> ? vc->processor
> - : cpumask_cycle(vc->processor, &cpus);
> + : cpumask_cycle(vc->processor, &start_cpus);
> ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );

Shouldn't the ASSERT be changed to start_cpus, too?
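
That is, presumably something along the lines of (a sketch of the suggested
change, not actual patch code):

    ASSERT( !cpumask_empty(&start_cpus) && cpumask_test_cpu(cpu, &start_cpus) );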


Juergen

--
Juergen Gross Principal Developer Operating Systems
PBG PDG ES&S SWE OS6 Telephone: +49 (0) 89 3222 2967
Fujitsu Technology Solutions e-mail: juergen.gross@ts.fujitsu.com
Domagkstr. 28 Internet: ts.fujitsu.com
D-80807 Muenchen Company details: ts.fujitsu.com/imprint.html

Re: [PATCH 3 of 8] xen: let the (credit) scheduler know about `node affinity`
On Tue, 2012-10-09 at 11:53 +0200, Juergen Gross wrote:
> > diff --git a/xen/common/sched_credit.c b/xen/common/sched_credit.c
> > --- a/xen/common/sched_credit.c
> > +++ b/xen/common/sched_credit.c
> ...
> > static int
> > _csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc, bool_t commit)
> > {
> > - cpumask_t cpus;
> > + cpumask_t cpus, start_cpus;
> > cpumask_t idlers;
> > cpumask_t *online;
> > + struct csched_dom *sdom = CSCHED_DOM(vc->domain);
> > struct csched_pcpu *spc = NULL;
> > int cpu;
> >
> > /*
> > - * Pick from online CPUs in VCPU's affinity mask, giving a
> > - * preference to its current processor if it's in there.
> > + * Pick an online CPU from the && of vcpu-affinity and node-affinity
> > + * masks (if not empty, in which case only the vcpu-affinity mask is
> > + * used). Also, try to give a preference to its current processor if
> > + * it's in there.
> > */
> > online = cpupool_scheduler_cpumask(vc->domain->cpupool);
> > cpumask_and(&cpus, online, vc->cpu_affinity);
> > - cpu = cpumask_test_cpu(vc->processor, &cpus)
> > + cpumask_and(&start_cpus, &cpus, sdom->node_affinity_cpumask);
> > + if ( unlikely(cpumask_empty(&start_cpus)) )
> > + cpumask_copy(&start_cpus, &cpus);
> > + cpu = cpumask_test_cpu(vc->processor, &start_cpus)
> > ? vc->processor
> > - : cpumask_cycle(vc->processor, &cpus);
> > + : cpumask_cycle(vc->processor, &start_cpus);
> > ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );
>
> Shouldn't the ASSERT be changed to start_cpus, too?
>
Well, it seems it definitely should, and I seem to have missed that!

Thanks a lot,
Dario

--
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://retis.sssup.it/people/faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
Re: [PATCH 3 of 8] xen: let the (credit) scheduler know about `node affinity`
On Fri, 2012-10-05 at 15:25 +0100, Jan Beulich wrote:
> >>> On 05.10.12 at 16:08, Dario Faggioli <dario.faggioli@citrix.com> wrote:
> > @@ -287,22 +344,26 @@ static inline void
> > }
> > else
> > {
> > - cpumask_t idle_mask;
> > + cpumask_t idle_mask, balance_mask;
>
> Be _very_ careful about adding on-stack CPU mask variables
> (also further below): each one of them grows the stack frame
> by 512 bytes (when building for the current maximum of 4095
> CPUs), which is generally too much; you may want to consider
> pre-allocated scratch space instead.
>
I see your point, and I think you're right... I wasn't "thinking that
big". :-)

I'll look into all of these situations and see if I can move the masks
off the stack. Any preference between global variables and members of
one of the scheduler's data structures?

Thanks and Regards,
Dario

--
<<This happens because I choose it to happen!>> (Raistlin Majere)
-----------------------------------------------------------------
Dario Faggioli, Ph.D, http://retis.sssup.it/people/faggioli
Senior Software Engineer, Citrix Systems R&D Ltd., Cambridge (UK)
Re: [PATCH 3 of 8] xen: let the (credit) scheduler know about `node affinity`
On 09/10/2012 11:29, "Dario Faggioli" <dario.faggioli@citrix.com> wrote:

> On Fri, 2012-10-05 at 15:25 +0100, Jan Beulich wrote:
>>>>> On 05.10.12 at 16:08, Dario Faggioli <dario.faggioli@citrix.com> wrote:
>>> @@ -287,22 +344,26 @@ static inline void
>>> }
>>> else
>>> {
>>> - cpumask_t idle_mask;
>>> + cpumask_t idle_mask, balance_mask;
>>
>> Be _very_ careful about adding on-stack CPU mask variables
>> (also further below): each one of them grows the stack frame
>> by 512 bytes (when building for the current maximum of 4095
>> CPUs), which is generally too much; you may want to consider
>> pre-allocated scratch space instead.
>>
> I see your point, and I think you're right... I wasn't "thinking that
> big". :-)
>
> I'll look into all of these situations and see if I can move the masks
> off the stack. Any preference between global variables and members of
> one of the scheduler's data structures?

Since multiple instances of the scheduler can be active, across multiple cpu
pools, surely they have to be allocated in the per-scheduler-instance
structures? Or dynamically xmalloc'ed just in the scope they are needed.
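
As a rough sketch of the first option (field placement and names invented for
the example, not actual patch code; serialisation of the shared mask between
pCPUs is left out of the sketch):

    struct csched_private {
        /* ... existing fields ... */
        cpumask_var_t balance_scratch;  /* pre-allocated scratch for balancing */
    };

    /* in csched_init(): */
    if ( !alloc_cpumask_var(&prv->balance_scratch) )
        return -ENOMEM;

    /* in csched_deinit(): */
    free_cpumask_var(prv->balance_scratch);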

-- Keir

> Thanks and Regards,
> Dario



Re: [PATCH 3 of 8] xen: let the (credit) scheduler know about `node affinity`
On Fri, Oct 5, 2012 at 3:08 PM, Dario Faggioli
<dario.faggioli@citrix.com> wrote:
> As vcpu-affinity tells where vcpus can run, node-affinity tells
> where a domain's vcpus prefer to run. Respecting vcpu-affinity is
> the primary concern, but honouring node-affinity will likely
> result in some performance benefit.
>
> This change modifies the vcpu load balancing algorithm (for the
> credit scheduler only), introducing a two-step logic.
> During the first step, we use the node-affinity mask. The aim is
> to give precedence to the CPUs where it is known to be preferable
> for the domain to run. If that fails to find a valid CPU, the
> node-affinity is just ignored and, in the second step, we fall
> back to using cpu-affinity only.
>
> Signed-off-by: Dario Faggioli <dario.faggioli@citrix.com>

Looking at the load-balancing code makes me think that there is
probably some interesting work to do there in the future; but I think
this patch can go in as it is for now. So

(Once others' comments are addressed)
Acked-by: George Dunlap <george.dunlap@eu.citrix.com>

Do scroll down to read my comments on load balancing...

> @@ -1211,30 +1289,48 @@ csched_runq_steal(int peer_cpu, int cpu,
> */
> if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
> {
> - list_for_each( iter, &peer_pcpu->runq )
> + int balance_step;
> +
> + /*
> + * Take node-affinity into account. That means, for all the vcpus
> + * in peer_pcpu's runq, check _first_ if their node-affinity allows
> + * them to run on cpu. If not, retry the loop considering plain
> + * vcpu-affinity. Also, notice that as soon as one vcpu is found,
> + * balancing is considered done, and the vcpu is returned to the
> + * caller.
> + */
> + for_each_csched_balance_step(balance_step)
> {
> - speer = __runq_elem(iter);
> + list_for_each( iter, &peer_pcpu->runq )
> + {
> + cpumask_t balance_mask;
>
> - /*
> - * If next available VCPU here is not of strictly higher
> - * priority than ours, this PCPU is useless to us.
> - */
> - if ( speer->pri <= pri )
> - break;
> + speer = __runq_elem(iter);
>
> - /* Is this VCPU is runnable on our PCPU? */
> - vc = speer->vcpu;
> - BUG_ON( is_idle_vcpu(vc) );
> + /*
> + * If next available VCPU here is not of strictly higher
> + * priority than ours, this PCPU is useless to us.
> + */
> + if ( speer->pri <= pri )
> + break;
>
> - if (__csched_vcpu_is_migrateable(vc, cpu))
> - {
> - /* We got a candidate. Grab it! */
> - CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
> - CSCHED_STAT_CRANK(migrate_queued);
> - WARN_ON(vc->is_urgent);
> - __runq_remove(speer);
> - vc->processor = cpu;
> - return speer;
> + /* Is this VCPU runnable on our PCPU? */
> + vc = speer->vcpu;
> + BUG_ON( is_idle_vcpu(vc) );
> +
> + if ( csched_balance_cpumask(vc, balance_step, &balance_mask) )
> + continue;

This will have the effect that a vcpu with any node affinity at all
will be stolen before a vcpu with no node affinity: i.e., if you have
a system with 4 nodes, and one vcpu has an affinity to nodes 1-2-3,
another has affinity with only 1, and another has an affinity to all
4, the one which has an affinity to all 4 will be passed over in the
first round, while either of the first ones might be nabbed (depending
on what pcpu they're on).

Furthermore, the effect of this whole thing (if I'm reading it right)
will be to go through *each runqueue* twice, rather than checking all
cpus for vcpus with good node affinity, and then all cpus for vcpus
with good cpumasks.

It seems like it would be better to check:
* This node for node-affine work to steal
* Other nodes for node-affine work to steal
* All nodes for cpu-affine work to steal.

Ideally, the search would terminate fairly quickly with the first set,
meaning that in the common case we never even check other nodes.

Going through the cpu list twice means trying to grab the scheduler
lock for each cpu twice; but hopefully that would be made up for by
having a shorter list.
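
In pseudocode, the suggested search order would look roughly like this (the
helpers are made up, purely to illustrate the idea):

    /* 1) node-affine work on the CPUs of our own node */
    speer = steal_work(cpus_of(our_node), CSCHED_BALANCE_NODE_AFFINITY);
    /* 2) node-affine work on the CPUs of the other nodes */
    if ( speer == NULL )
        speer = steal_work(cpus_of_other_nodes(our_node),
                           CSCHED_BALANCE_NODE_AFFINITY);
    /* 3) finally, anything we are merely allowed to run, wherever it is */
    if ( speer == NULL )
        speer = steal_work(all_online_cpus(), CSCHED_BALANCE_CPU_AFFINITY);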

Thoughts?

Like I said, I think this is something to put on our to-do list; this
patch should go in so we can start testing it as soon as possible.

-George

> +
> + if (__csched_vcpu_is_migrateable(vc, cpu, &balance_mask))
> + {
> + /* We got a candidate. Grab it! */
> + CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
> + CSCHED_STAT_CRANK(migrate_queued);
> + WARN_ON(vc->is_urgent);
> + __runq_remove(speer);
> + vc->processor = cpu;
> + return speer;
> + }
> }
> }
> }
>
