[PATCH 6 of 9] Protect superpage splitting in implementation-dependent traversals
xen/arch/x86/mm/p2m-ept.c | 15 +++++++-
xen/arch/x86/mm/p2m-lock.h | 11 +++++-
xen/arch/x86/mm/p2m-pt.c | 82 ++++++++++++++++++++++++++++++++++-----------
3 files changed, 84 insertions(+), 24 deletions(-)


In both pt and ept modes, the p2m trees can map 1GB superpages.
Because our locks work at 2MB-superpage granularity, two simultaneous
traversals of different 2MB ranges within the same 1GB superpage
could, without additional locking, race to split that superpage.

Fix this with the alloc_lock that already exists in the fine-grained
p2m lock structure. The lock_p2m_1G/unlock_p2m_1G interface allows
for truly 1GB-grained locking in a future implementation -- for now
we just default to a coarser lock in all cases.

Signed-off-by: Andres Lagar-Cavilla <andres@lagarcavilla.org>
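
For reference, the pattern the hunks below apply around 1GB splits is the
usual check/lock/re-check one. The following is a minimal standalone sketch
(not Xen code: pte_t, PTE_PSE, is_1g_superpage() and do_split_1g() are
illustrative stand-ins, and a pthread mutex stands in for lock_p2m_1G):

#include <pthread.h>
#include <stdbool.h>

typedef struct { unsigned long flags; } pte_t;

#define PTE_PSE 0x80UL  /* "this entry maps a superpage" */

static pthread_mutex_t p2m_1g_lock = PTHREAD_MUTEX_INITIALIZER;

static bool is_1g_superpage(const pte_t *e)
{
    return e->flags & PTE_PSE;
}

static void do_split_1g(pte_t *e)
{
    /* Stand-in for allocating the lower-level table and rewriting
     * the entry to point at it. */
    e->flags &= ~PTE_PSE;
}

/* Split a 1GB superpage at most once, even with concurrent callers. */
static void split_1g_locked(pte_t *e)
{
    if ( !is_1g_superpage(e) )
        return;                          /* fast path: already split */

    pthread_mutex_lock(&p2m_1g_lock);    /* lock_p2m_1G() in the patch */
    /* Re-check under the lock: another traversal may have split the
     * entry between our first test and acquiring the lock. */
    if ( is_1g_superpage(e) )
        do_split_1g(e);
    pthread_mutex_unlock(&p2m_1g_lock);  /* unlock_p2m_1G() */
}

int main(void)
{
    pte_t e = { .flags = PTE_PSE };
    split_1g_locked(&e);   /* performs the split */
    split_1g_locked(&e);   /* second call is a no-op */
    return 0;
}

The pt-mode p2m_gfn_to_mfn and p2m_change_type_global hunks below add
exactly this kind of re-check of _PAGE_PSE after taking the lock.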

diff -r a23e1262b124 -r 0a97d62c2d41 xen/arch/x86/mm/p2m-ept.c
--- a/xen/arch/x86/mm/p2m-ept.c
+++ b/xen/arch/x86/mm/p2m-ept.c
@@ -163,6 +163,7 @@ static int ept_set_middle_entry(struct p
}

/* free ept sub tree behind an entry */
+/* Lock on this superpage (if any) held on entry */
static void ept_free_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry, int level)
{
/* End if the entry is a leaf entry. */
@@ -181,6 +182,7 @@ static void ept_free_entry(struct p2m_do
p2m_free_ptp(p2m, mfn_to_page(ept_entry->mfn));
}

+/* Lock on this superpage held on entry */
static int ept_split_super_page(struct p2m_domain *p2m, ept_entry_t *ept_entry,
int level, int target)
{
@@ -315,6 +317,7 @@ ept_set_entry(struct p2m_domain *p2m, un
int needs_sync = 1;
struct domain *d = p2m->domain;
ept_entry_t old_entry = { .epte = 0 };
+ p2m_lock_t *p2ml = p2m->lock;

/*
* the caller must make sure:
@@ -361,6 +364,8 @@ ept_set_entry(struct p2m_domain *p2m, un
* with a leaf entry (a 1GiB or 2MiB page), and handle things appropriately.
*/

+ if ( target == 2 )
+ lock_p2m_1G(p2ml, index);
if ( i == target )
{
/* We reached the target level. */
@@ -373,7 +378,7 @@ ept_set_entry(struct p2m_domain *p2m, un
/* If we're replacing a non-leaf entry with a leaf entry (1GiB or 2MiB),
* the intermediate tables will be freed below after the ept flush
*
- * Read-then-write is OK because we hold the p2m lock. */
+ * Read-then-write is OK because we hold the 1G or 2M lock. */
old_entry = *ept_entry;

if ( mfn_valid(mfn_x(mfn)) || direct_mmio || p2m_is_paged(p2mt) ||
@@ -412,6 +417,8 @@ ept_set_entry(struct p2m_domain *p2m, un
if ( !ept_split_super_page(p2m, &split_ept_entry, i, target) )
{
ept_free_entry(p2m, &split_ept_entry, i);
+ if ( target == 2 )
+ unlock_p2m_1G(p2ml, index);
goto out;
}

@@ -440,7 +447,7 @@ ept_set_entry(struct p2m_domain *p2m, un
/* the caller should take care of the previous page */
new_entry.mfn = mfn_x(mfn);

- /* Safe to read-then-write because we hold the p2m lock */
+ /* Safe to read-then-write because we hold the 1G or 2M lock */
if ( ept_entry->mfn == new_entry.mfn )
need_modify_vtd_table = 0;

@@ -448,6 +455,8 @@ ept_set_entry(struct p2m_domain *p2m, un

atomic_write_ept_entry(ept_entry, new_entry);
}
+ if ( target == 2 )
+ unlock_p2m_1G(p2ml, index);

/* Track the highest gfn for which we have ever had a valid mapping */
if ( mfn_valid(mfn_x(mfn)) &&
@@ -642,6 +651,8 @@ static ept_entry_t ept_get_entry_content
return content;
}

+/* This is called before crashing a domain, so we're not particularly
+ * concerned with locking */
void ept_walk_table(struct domain *d, unsigned long gfn)
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
diff -r a23e1262b124 -r 0a97d62c2d41 xen/arch/x86/mm/p2m-lock.h
--- a/xen/arch/x86/mm/p2m-lock.h
+++ b/xen/arch/x86/mm/p2m-lock.h
@@ -109,6 +109,11 @@ typedef struct __p2m_lock {
mm_lock_t lock;
} p2m_lock_t;

+/* We do not need sub-locking on 1G superpages because we have a
+ * global lock */
+#define lock_p2m_1G(l, gfn) ((void)l)
+#define unlock_p2m_1G(l, gfn) ((void)l)
+
static inline int p2m_lock_init(struct p2m_domain *p2m)
{
p2m_lock_t *p2ml = xmalloc(p2m_lock_t);
@@ -271,7 +276,8 @@ typedef struct __p2m_lock
/* To enforce ordering in mm-locks */
int unlock_level;
/* To protect on-demand allocation of locks
- * (yeah you heard that right) */
+ * (yeah you heard that right)
+ * Also protects 1GB superpage splitting. */
spinlock_t alloc_lock;
/* Global lock */
p2m_inner_lock_t global;
@@ -429,6 +435,9 @@ static inline void put_p2m_2m(struct p2m
unlock_p2m_leaf(__get_2m_lock(p2m->lock, gfn_1g, gfn_2m));
}

+#define lock_p2m_1G(l, gfn) spin_lock(&l->alloc_lock)
+#define unlock_p2m_1G(l, gfn) spin_unlock(&l->alloc_lock)
+
/* Allocate 2M locks we may not have allocated yet for this 1G superpage */
static inline int alloc_locks_2m(struct p2m_domain *p2m, unsigned long gfn_1g)
{
diff -r a23e1262b124 -r 0a97d62c2d41 xen/arch/x86/mm/p2m-pt.c
--- a/xen/arch/x86/mm/p2m-pt.c
+++ b/xen/arch/x86/mm/p2m-pt.c
@@ -159,6 +159,7 @@ p2m_next_level(struct p2m_domain *p2m, m
unsigned long *gfn_remainder, unsigned long gfn, u32 shift,
u32 max, unsigned long type)
{
+ p2m_lock_t *p2ml = p2m->lock;
l1_pgentry_t *l1_entry;
l1_pgentry_t *p2m_entry;
l1_pgentry_t new_entry;
@@ -207,6 +208,7 @@ p2m_next_level(struct p2m_domain *p2m, m
ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));

/* split 1GB pages into 2MB pages */
+ lock_p2m_1G(p2ml, *gfn_remainder >> shift);
if ( type == PGT_l2_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
{
unsigned long flags, pfn;
@@ -214,7 +216,10 @@ p2m_next_level(struct p2m_domain *p2m, m

pg = p2m_alloc_ptp(p2m, PGT_l2_page_table);
if ( pg == NULL )
+ {
+ unlock_p2m_1G(p2ml, *gfn_remainder >> shift);
return 0;
+ }

flags = l1e_get_flags(*p2m_entry);
pfn = l1e_get_pfn(*p2m_entry);
@@ -233,9 +238,11 @@ p2m_next_level(struct p2m_domain *p2m, m
p2m_add_iommu_flags(&new_entry, 2, IOMMUF_readable|IOMMUF_writable);
p2m->write_p2m_entry(p2m, gfn, p2m_entry, *table_mfn, new_entry, 3);
}
-
+ unlock_p2m_1G(p2ml, *gfn_remainder >> shift);

/* split single 2MB large page into 4KB page in P2M table */
+ /* This does not necessitate locking because 2MB regions are locked
+ * exclusively */
if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
{
unsigned long flags, pfn;
@@ -297,6 +304,7 @@ p2m_set_entry(struct p2m_domain *p2m, un
IOMMUF_readable|IOMMUF_writable:
0;
unsigned long old_mfn = 0;
+ p2m_lock_t *p2ml = p2m->lock;

if ( tb_init_done )
{
@@ -326,7 +334,10 @@ p2m_set_entry(struct p2m_domain *p2m, un
*/
if ( page_order == PAGE_ORDER_1G )
{
- l1_pgentry_t old_entry = l1e_empty();
+ l1_pgentry_t old_entry;
+ lock_p2m_1G(p2ml, l3_table_offset(gfn));
+
+ old_entry = l1e_empty();
p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
L3_PAGETABLE_SHIFT - PAGE_SHIFT,
L3_PAGETABLE_ENTRIES);
@@ -358,7 +369,9 @@ p2m_set_entry(struct p2m_domain *p2m, un
/* Free old intermediate tables if necessary */
if ( l1e_get_flags(old_entry) & _PAGE_PRESENT )
p2m_free_entry(p2m, &old_entry, page_order);
+ unlock_p2m_1G(p2ml, l3_table_offset(gfn));
}
+
/*
* When using PAE Xen, we only allow 33 bits of pseudo-physical
* address in translated guests (i.e. 8 GBytes). This restriction
@@ -515,6 +528,7 @@ static mfn_t p2m_gfn_to_mfn_current(stru
* XXX Once we start explicitly registering MMIO regions in the p2m
* XXX we will return p2m_invalid for unmapped gfns */

+ p2m_lock_t *p2ml = p2m->lock;
l1_pgentry_t l1e = l1e_empty(), *p2m_entry;
l2_pgentry_t l2e = l2e_empty();
int ret;
@@ -543,6 +557,8 @@ pod_retry_l3:
/* The read has succeeded, so we know that mapping exists */
if ( q != p2m_query )
{
+ /* We do not need to lock the 1G superpage here because PoD
+ * will do it by splitting */
if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_1G, q) )
goto pod_retry_l3;
p2mt = p2m_invalid;
@@ -558,6 +574,7 @@ pod_retry_l3:
goto pod_retry_l2;
}

+ lock_p2m_1G(p2ml, l3_table_offset(addr));
if ( l3e_get_flags(l3e) & _PAGE_PSE )
{
p2mt = p2m_flags_to_type(l3e_get_flags(l3e));
@@ -571,8 +588,12 @@ pod_retry_l3:

if ( page_order )
*page_order = PAGE_ORDER_1G;
+ unlock_p2m_1G(p2ml, l3_table_offset(addr));
goto out;
}
+ unlock_p2m_1G(p2ml, l3_table_offset(addr));
+#else
+ (void)p2ml; /* gcc ... */
#endif
/*
* Read & process L2
@@ -691,6 +712,7 @@ p2m_gfn_to_mfn(struct p2m_domain *p2m, u
paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
l2_pgentry_t *l2e;
l1_pgentry_t *l1e;
+ p2m_lock_t *p2ml = p2m->lock;

ASSERT(paging_mode_translate(p2m->domain));

@@ -744,6 +766,8 @@ pod_retry_l3:
{
if ( q != p2m_query )
{
+ /* See previous comments on why there is no need to lock
+ * 1GB superpage here */
if ( !p2m_pod_demand_populate(p2m, gfn, PAGE_ORDER_1G, q) )
goto pod_retry_l3;
}
@@ -755,16 +779,23 @@ pod_retry_l3:
}
else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
{
- mfn = _mfn(l3e_get_pfn(*l3e) +
- l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
- l1_table_offset(addr));
- *t = p2m_flags_to_type(l3e_get_flags(*l3e));
- unmap_domain_page(l3e);
+ lock_p2m_1G(p2ml, l3_table_offset(addr));
+ /* Retry to be sure */
+ if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
+ {
+ mfn = _mfn(l3e_get_pfn(*l3e) +
+ l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
+ l1_table_offset(addr));
+ *t = p2m_flags_to_type(l3e_get_flags(*l3e));
+ unmap_domain_page(l3e);

- ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
- if ( page_order )
- *page_order = PAGE_ORDER_1G;
- return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+ ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+ if ( page_order )
+ *page_order = PAGE_ORDER_1G;
+ unlock_p2m_1G(p2ml, l3_table_offset(addr));
+ return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+ }
+ unlock_p2m_1G(p2ml, l3_table_offset(addr));
}

mfn = _mfn(l3e_get_pfn(*l3e));
@@ -852,6 +883,7 @@ static void p2m_change_type_global(struc
l4_pgentry_t *l4e;
unsigned long i4;
#endif /* CONFIG_PAGING_LEVELS == 4 */
+ p2m_lock_t *p2ml = p2m->lock;

BUG_ON(p2m_is_grant(ot) || p2m_is_grant(nt));
BUG_ON(ot != nt && (ot == p2m_mmio_direct || nt == p2m_mmio_direct));
@@ -891,17 +923,25 @@ static void p2m_change_type_global(struc
}
if ( (l3e_get_flags(l3e[i3]) & _PAGE_PSE) )
{
- flags = l3e_get_flags(l3e[i3]);
- if ( p2m_flags_to_type(flags) != ot )
+ lock_p2m_1G(p2ml, i3);
+ if ( (l3e_get_flags(l3e[i3]) & _PAGE_PSE) )
+ {
+ flags = l3e_get_flags(l3e[i3]);
+ if ( p2m_flags_to_type(flags) != ot )
+ {
+ unlock_p2m_1G(p2ml, i3);
+ continue;
+ }
+ mfn = l3e_get_pfn(l3e[i3]);
+ gfn = get_gpfn_from_mfn(mfn);
+ flags = p2m_type_to_flags(nt, _mfn(mfn));
+ l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+ p2m->write_p2m_entry(p2m, gfn,
+ (l1_pgentry_t *)&l3e[i3],
+ l3mfn, l1e_content, 3);
+ unlock_p2m_1G(p2ml, i3);
continue;
- mfn = l3e_get_pfn(l3e[i3]);
- gfn = get_gpfn_from_mfn(mfn);
- flags = p2m_type_to_flags(nt, _mfn(mfn));
- l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
- p2m->write_p2m_entry(p2m, gfn,
- (l1_pgentry_t *)&l3e[i3],
- l3mfn, l1e_content, 3);
- continue;
+ }
}

l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
