Mailing List Archive

[PATCH 05/10] viridian: use softirq batching in hvcall_ipi()
From: Paul Durrant <pdurrant@amazon.com>

vlapic_ipi() uses a softirq batching mechanism to improve the efficiency of
sending a IPIs to large number of processors. This patch modifies send_ipi()
(the worker function called by hvcall_ipi()) to also make use of the
mechanism when there multiple bits set the hypercall_vpmask. Hence a `nr`
field is added to the structure to track the number of set bits.

Signed-off-by: Paul Durrant <pdurrant@amazon.com>
---
Cc: Wei Liu <wl@xen.org>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: "Roger Pau Monné" <roger.pau@citrix.com>
---
xen/arch/x86/hvm/viridian/viridian.c | 18 ++++++++++++++++--
1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/xen/arch/x86/hvm/viridian/viridian.c b/xen/arch/x86/hvm/viridian/viridian.c
index 63f63093a513..765d53016c02 100644
--- a/xen/arch/x86/hvm/viridian/viridian.c
+++ b/xen/arch/x86/hvm/viridian/viridian.c
@@ -11,6 +11,7 @@
#include <xen/hypercall.h>
#include <xen/domain_page.h>
#include <xen/param.h>
+#include <xen/softirq.h>
#include <asm/guest/hyperv-tlfs.h>
#include <asm/paging.h>
#include <asm/p2m.h>
@@ -509,6 +510,7 @@ void viridian_domain_deinit(struct domain *d)

struct hypercall_vpmask {
DECLARE_BITMAP(mask, HVM_MAX_VCPUS);
+ unsigned int nr;
};

static DEFINE_PER_CPU(struct hypercall_vpmask, hypercall_vpmask);
@@ -516,21 +518,24 @@ static DEFINE_PER_CPU(struct hypercall_vpmask, hypercall_vpmask);
static void vpmask_empty(struct hypercall_vpmask *vpmask)
{
bitmap_zero(vpmask->mask, HVM_MAX_VCPUS);
+ vpmask->nr = 0;
}

static void vpmask_set(struct hypercall_vpmask *vpmask, unsigned int vp)
{
- __set_bit(vp, vpmask->mask);
+ if ( !test_and_set_bit(vp, vpmask->mask) )
+ vpmask->nr++;
}

static void vpmask_fill(struct hypercall_vpmask *vpmask)
{
bitmap_fill(vpmask->mask, HVM_MAX_VCPUS);
+ vpmask->nr = HVM_MAX_VCPUS;
}

static bool vpmask_test(struct hypercall_vpmask *vpmask, unsigned int vp)
{
- return test_bit(vp, vpmask->mask);
+ return vpmask->nr && test_bit(vp, vpmask->mask);
}

static unsigned int vpmask_first(struct hypercall_vpmask *vpmask)
@@ -644,8 +649,17 @@ static void send_ipi(struct hypercall_vpmask *vpmask, uint8_t vector)
struct domain *currd = current->domain;
unsigned int vp;

+ if ( !vpmask->nr )
+ return;
+
+ if ( vpmask->nr > 1 )
+ cpu_raise_softirq_batch_begin();
+
for_each_vp ( vpmask, vp )
vlapic_set_irq(vcpu_vlapic(currd->vcpu[vp]), vector, 0);
+
+ if ( vpmask->nr > 1 )
+ cpu_raise_softirq_batch_finish();
}

static int hvcall_ipi(union hypercall_input *input,
--
2.20.1
Re: [PATCH 05/10] viridian: use softirq batching in hvcall_ipi() [ In reply to ]
On 11.11.2020 21:07, Paul Durrant wrote:
> From: Paul Durrant <pdurrant@amazon.com>
>
> vlapic_ipi() uses a softirq batching mechanism to improve the efficiency of
> sending a IPIs to large number of processors. This patch modifies send_ipi()
> (the worker function called by hvcall_ipi()) to also make use of the
> mechanism when there multiple bits set the hypercall_vpmask. Hence a `nr`
> field is added to the structure to track the number of set bits.

This is kind of unusual, i.e. we don't do so elsewhere. I take it the
assumption is that using bitmap_weight() is too much overhead?

> @@ -509,6 +510,7 @@ void viridian_domain_deinit(struct domain *d)
>
> struct hypercall_vpmask {
> DECLARE_BITMAP(mask, HVM_MAX_VCPUS);
> + unsigned int nr;
> };
>
> static DEFINE_PER_CPU(struct hypercall_vpmask, hypercall_vpmask);
> @@ -516,21 +518,24 @@ static DEFINE_PER_CPU(struct hypercall_vpmask, hypercall_vpmask);
> static void vpmask_empty(struct hypercall_vpmask *vpmask)
> {
> bitmap_zero(vpmask->mask, HVM_MAX_VCPUS);
> + vpmask->nr = 0;
> }
>
> static void vpmask_set(struct hypercall_vpmask *vpmask, unsigned int vp)
> {
> - __set_bit(vp, vpmask->mask);
> + if ( !test_and_set_bit(vp, vpmask->mask) )
> + vpmask->nr++;

If test_and_set_bit() is the correct thing to use here (rather
than __test_and_set_bit()), the counter also needs updating
atomically.

> }
>
> static void vpmask_fill(struct hypercall_vpmask *vpmask)
> {
> bitmap_fill(vpmask->mask, HVM_MAX_VCPUS);
> + vpmask->nr = HVM_MAX_VCPUS;
> }
>
> static bool vpmask_test(struct hypercall_vpmask *vpmask, unsigned int vp)
> {
> - return test_bit(vp, vpmask->mask);
> + return vpmask->nr && test_bit(vp, vpmask->mask);

Is this in fact an improvement?

Jan
RE: [PATCH 05/10] viridian: use softirq batching in hvcall_ipi() [ In reply to ]
> -----Original Message-----
> From: Jan Beulich <jbeulich@suse.com>
> Sent: 12 November 2020 08:53
> To: Paul Durrant <paul@xen.org>
> Cc: Paul Durrant <pdurrant@amazon.com>; Wei Liu <wl@xen.org>; Andrew Cooper
> <andrew.cooper3@citrix.com>; Roger Pau Monné <roger.pau@citrix.com>; xen-devel@lists.xenproject.org
> Subject: Re: [PATCH 05/10] viridian: use softirq batching in hvcall_ipi()
>
> On 11.11.2020 21:07, Paul Durrant wrote:
> > From: Paul Durrant <pdurrant@amazon.com>
> >
> > vlapic_ipi() uses a softirq batching mechanism to improve the efficiency of
> > sending a IPIs to large number of processors. This patch modifies send_ipi()
> > (the worker function called by hvcall_ipi()) to also make use of the
> > mechanism when there multiple bits set the hypercall_vpmask. Hence a `nr`
> > field is added to the structure to track the number of set bits.
>
> This is kind of unusual, i.e. we don't do so elsewhere. I take it the
> assumption is that using bitmap_weight() is too much overhead?

It just seemed wasteful in the circumstances. If I move to bitmap copy OTOH then I'll have to use bitmap_weight().

>
> > @@ -509,6 +510,7 @@ void viridian_domain_deinit(struct domain *d)
> >
> > struct hypercall_vpmask {
> > DECLARE_BITMAP(mask, HVM_MAX_VCPUS);
> > + unsigned int nr;
> > };
> >
> > static DEFINE_PER_CPU(struct hypercall_vpmask, hypercall_vpmask);
> > @@ -516,21 +518,24 @@ static DEFINE_PER_CPU(struct hypercall_vpmask, hypercall_vpmask);
> > static void vpmask_empty(struct hypercall_vpmask *vpmask)
> > {
> > bitmap_zero(vpmask->mask, HVM_MAX_VCPUS);
> > + vpmask->nr = 0;
> > }
> >
> > static void vpmask_set(struct hypercall_vpmask *vpmask, unsigned int vp)
> > {
> > - __set_bit(vp, vpmask->mask);
> > + if ( !test_and_set_bit(vp, vpmask->mask) )
> > + vpmask->nr++;
>
> If test_and_set_bit() is the correct thing to use here (rather
> than __test_and_set_bit()), the counter also needs updating
> atomically.
>

It doesn't need to be atomic, but I'll probably drop it.

> > }
> >
> > static void vpmask_fill(struct hypercall_vpmask *vpmask)
> > {
> > bitmap_fill(vpmask->mask, HVM_MAX_VCPUS);
> > + vpmask->nr = HVM_MAX_VCPUS;
> > }
> >
> > static bool vpmask_test(struct hypercall_vpmask *vpmask, unsigned int vp)
> > {
> > - return test_bit(vp, vpmask->mask);
> > + return vpmask->nr && test_bit(vp, vpmask->mask);
>
> Is this in fact an improvement?
>

I think so, but it's a moot point if I drop 'nr'.

Paul

> Jan