Mailing List Archive

[xen stable-4.11] vtd: optimize CPU cache sync
commit f2bc74c120bdb25446684c5375d1cc51d4702da4
Author: Roger Pau Monné <roger.pau@citrix.com>
AuthorDate: Tue Jul 7 15:24:30 2020 +0200
Commit: Jan Beulich <jbeulich@suse.com>
CommitDate: Tue Jul 7 15:24:30 2020 +0200

vtd: optimize CPU cache sync

Some VT-d IOMMUs are non-coherent, which requires a cache write back
in order for the changes made by the CPU to be visible to the IOMMU.
This cache write back was unconditionally done using clflush, but there
are more efficient instructions for this purpose (clflushopt and clwb).
Implement support for them using the alternative framework, keyed on
X86_FEATURE_CLFLUSHOPT and X86_FEATURE_CLWB, and issue an sfence after
the flush loop when either of them is in use.

This is part of XSA-321.

Signed-off-by: Roger Pau Monné <roger.pau@citrix.com>
Reviewed-by: Jan Beulich <jbeulich@suse.com>
master commit: a64ea16522a73a13a0d66cfa4b66a9d3b95dd9d6
master date: 2020-07-07 14:39:54 +0200
---
xen/drivers/passthrough/vtd/extern.h | 1 -
xen/drivers/passthrough/vtd/iommu.c | 38 ++++++++++++++++++++++++++++++++++-
xen/drivers/passthrough/vtd/x86/vtd.c | 5 -----
3 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/xen/drivers/passthrough/vtd/extern.h b/xen/drivers/passthrough/vtd/extern.h
index 00a73d63ea..065d768b52 100644
--- a/xen/drivers/passthrough/vtd/extern.h
+++ b/xen/drivers/passthrough/vtd/extern.h
@@ -63,7 +63,6 @@ int __must_check qinval_device_iotlb_sync(struct iommu *iommu,
u16 did, u16 size, u64 addr);

unsigned int get_cache_line_size(void);
-void cacheline_flush(char *);
void flush_all_cache(void);

u64 alloc_pgtable_maddr(struct acpi_drhd_unit *drhd, unsigned long npages);
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index 270eb1c31a..05fe61f69e 100644
--- a/xen/drivers/passthrough/vtd/iommu.c
+++ b/xen/drivers/passthrough/vtd/iommu.c
@@ -31,6 +31,7 @@
#include <xen/pci_regs.h>
#include <xen/keyhandler.h>
#include <asm/msi.h>
+#include <asm/nops.h>
#include <asm/irq.h>
#include <asm/hvm/vmx/vmx.h>
#include <asm/p2m.h>
@@ -172,7 +173,42 @@ static void sync_cache(const void *addr, unsigned int size)

addr -= (unsigned long)addr & (clflush_size - 1);
for ( ; addr < end; addr += clflush_size )
- cacheline_flush((char *)addr);
+/*
+ * The arguments to a macro must not include preprocessor directives. Doing so
+ * results in undefined behavior, so we have to create some defines here in
+ * order to avoid it.
+ */
+#if defined(HAVE_AS_CLWB)
+# define CLWB_ENCODING "clwb %[p]"
+#elif defined(HAVE_AS_XSAVEOPT)
+# define CLWB_ENCODING "data16 xsaveopt %[p]" /* clwb */
+#else
+# define CLWB_ENCODING ".byte 0x66, 0x0f, 0xae, 0x30" /* clwb (%%rax) */
+#endif
+
+#define BASE_INPUT(addr) [p] "m" (*(const char *)(addr))
+#if defined(HAVE_AS_CLWB) || defined(HAVE_AS_XSAVEOPT)
+# define INPUT BASE_INPUT
+#else
+# define INPUT(addr) "a" (addr), BASE_INPUT(addr)
+#endif
+ /*
+ * Note regarding the use of NOP_DS_PREFIX: it's faster to do a clflush
+ * + prefix than a clflush + nop, and hence the prefix is added instead
+ * of letting the alternative framework fill the gap by appending nops.
+ */
+ alternative_io_2(".byte " __stringify(NOP_DS_PREFIX) "; clflush %[p]",
+ "data16 clflush %[p]", /* clflushopt */
+ X86_FEATURE_CLFLUSHOPT,
+ CLWB_ENCODING,
+ X86_FEATURE_CLWB, /* no outputs */,
+ INPUT(addr));
+#undef INPUT
+#undef BASE_INPUT
+#undef CLWB_ENCODING
+
+ alternative_2("", "sfence", X86_FEATURE_CLFLUSHOPT,
+ "sfence", X86_FEATURE_CLWB);
}

/* Allocate page table, return its machine address */
diff --git a/xen/drivers/passthrough/vtd/x86/vtd.c b/xen/drivers/passthrough/vtd/x86/vtd.c
index 88a60b3307..7f96b91dd4 100644
--- a/xen/drivers/passthrough/vtd/x86/vtd.c
+++ b/xen/drivers/passthrough/vtd/x86/vtd.c
@@ -53,11 +53,6 @@ unsigned int get_cache_line_size(void)
return ((cpuid_ebx(1) >> 8) & 0xff) * 8;
}

-void cacheline_flush(char * addr)
-{
- clflush(addr);
-}
-
void flush_all_cache()
{
wbinvd();
--
generated by git-patchbot for /home/xen/git/xen.git#stable-4.11