[xen-unstable] 1GB Page Table Support for HVM Guest 2/3
# HG changeset patch
# User Keir Fraser <keir.fraser@citrix.com>
# Date 1270534057 -3600
# Node ID 1ea7b73b30618ad4712a4475a1a40b72f78d7441
# Parent 6868816898bda40a319445ffdcb0bdef207dfb3a
1GB Page Table Support for HVM Guest 2/3

This patch changes the P2M code to work with 1GB pages.

Signed-off-by: Wei Huang <wei.huang2@amd.com>
Acked-by: Dongxiao Xu <dongxiao.xu@intel.com>
Acked-by: Tim Deegan <tim.deegan@citrix.com>
---
xen/arch/x86/mm/p2m.c | 211 +++++++++++++++++++++++++++++++++++++++++++++++---
1 files changed, 200 insertions(+), 11 deletions(-)
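
A note on the magic orders used throughout (editor's illustration, not part of the patch): orders are relative to 4KB frames, so order 9 covers 2^9 pages (a 2MB superpage) and order 18 covers 2^18 pages (a 1GB superpage). Below is a minimal sketch of the order-selection logic that the set_p2m_entry() hunk implements; the helper name and the hap_1gb_ok flag are illustrative, not taken from the Xen tree:

static unsigned int pick_mapping_order(unsigned long gfn, unsigned long mfn,
                                       unsigned long todo, int hap_1gb_ok)
{
    /* A superpage order is only usable when gfn, mfn and the remaining
     * length are all aligned to it. */
    if ( hap_1gb_ok && (((gfn | mfn | todo) & ((1UL << 18) - 1)) == 0) )
        return 18;                      /* 1GB superpage */
    if ( ((gfn | mfn | todo) & ((1UL << 9) - 1)) == 0 )
        return 9;                       /* 2MB superpage */
    return 0;                           /* plain 4KB page */
}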

diff -r 6868816898bd -r 1ea7b73b3061 xen/arch/x86/mm/p2m.c
--- a/xen/arch/x86/mm/p2m.c Tue Apr 06 07:02:17 2010 +0100
+++ b/xen/arch/x86/mm/p2m.c Tue Apr 06 07:07:37 2010 +0100
@@ -187,7 +187,36 @@ p2m_next_level(struct domain *d, mfn_t *

ASSERT(l1e_get_flags(*p2m_entry) & (_PAGE_PRESENT|_PAGE_PSE));

- /* split single large page into 4KB page in P2M table */
+ /* split 1GB pages into 2MB pages */
+ if ( type == PGT_l2_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+ {
+ unsigned long flags, pfn;
+ struct page_info *pg = d->arch.p2m->alloc_page(d);
+ if ( pg == NULL )
+ return 0;
+ page_list_add_tail(pg, &d->arch.p2m->pages);
+ pg->u.inuse.type_info = PGT_l2_page_table | 1 | PGT_validated;
+ pg->count_info = 1;
+
+ flags = l1e_get_flags(*p2m_entry);
+ pfn = l1e_get_pfn(*p2m_entry);
+
+ l1_entry = map_domain_page(mfn_x(page_to_mfn(pg)));
+ for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
+ {
+ new_entry = l1e_from_pfn(pfn + (i * L1_PAGETABLE_ENTRIES), flags);
+ paging_write_p2m_entry(d, gfn, l1_entry+i, *table_mfn, new_entry,
+ 2);
+ }
+ unmap_domain_page(l1_entry);
+ new_entry = l1e_from_pfn(mfn_x(page_to_mfn(pg)),
+ __PAGE_HYPERVISOR|_PAGE_USER); /* disable PSE */
+ paging_write_p2m_entry(d, gfn,
+ p2m_entry, *table_mfn, new_entry, 3);
+ }
+
+
+ /* split a single 2MB large page into 4KB pages in the P2M table */
if ( type == PGT_l1_page_table && (l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
{
unsigned long flags, pfn;
@@ -1064,6 +1093,23 @@ p2m_pod_demand_populate(struct domain *d
if ( unlikely(d->is_dying) )
goto out_fail;

+ /* Because PoD does not keep a cache list for 1GB pages, it has to remap
+ * the 1GB region as 2MB chunks and retry. */
+ if ( order == 18 )
+ {
+ gfn_aligned = (gfn >> order) << order;
+ /* Note that we would normally call set_p2m_entry() 512 times here to
+ * split the 1GB region into 512 2MB pages. We only call it once because
+ * set_p2m_entry() automatically shatters the 1GB page into
+ * 512 2MB pages; the other 511 calls are unnecessary.
+ */
+ set_p2m_entry(d, gfn_aligned, _mfn(POPULATE_ON_DEMAND_MFN), 9,
+ p2m_populate_on_demand);
+ audit_p2m(d);
+ p2m_unlock(p2md);
+ return 0;
+ }
+
/* If we're low, start a sweep */
if ( order == 9 && page_list_empty(&p2md->pod.super) )
p2m_pod_emergency_sweep_super(d);
@@ -1196,6 +1242,7 @@ p2m_set_entry(struct domain *d, unsigned
l1_pgentry_t *p2m_entry;
l1_pgentry_t entry_content;
l2_pgentry_t l2e_content;
+ l3_pgentry_t l3e_content;
int rv=0;

if ( tb_init_done )
@@ -1222,18 +1269,41 @@ p2m_set_entry(struct domain *d, unsigned
goto out;
#endif
/*
+ * Try to allocate 1GB page table if this feature is supported.
+ */
+ if ( page_order == 18 )
+ {
+ p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
+ L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+ L3_PAGETABLE_ENTRIES);
+ ASSERT(p2m_entry);
+ if ( (l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) &&
+ !(l1e_get_flags(*p2m_entry) & _PAGE_PSE) )
+ {
+ P2M_ERROR("configure P2M table L3 entry with large page\n");
+ domain_crash(d);
+ goto out;
+ }
+ l3e_content = mfn_valid(mfn)
+ ? l3e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt) | _PAGE_PSE)
+ : l3e_empty();
+ entry_content.l1 = l3e_content.l3;
+ paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 3);
+
+ }
+ /*
* When using PAE Xen, we only allow 33 bits of pseudo-physical
* address in translated guests (i.e. 8 GBytes). This restriction
* comes from wanting to map the P2M table into the 16MB RO_MPT hole
* in Xen's address space for translated PV guests.
* When using AMD's NPT on PAE Xen, we are restricted to 4GB.
*/
- if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
- L3_PAGETABLE_SHIFT - PAGE_SHIFT,
- ((CONFIG_PAGING_LEVELS == 3)
- ? (paging_mode_hap(d) ? 4 : 8)
- : L3_PAGETABLE_ENTRIES),
- PGT_l2_page_table) )
+ else if ( !p2m_next_level(d, &table_mfn, &table, &gfn_remainder, gfn,
+ L3_PAGETABLE_SHIFT - PAGE_SHIFT,
+ ((CONFIG_PAGING_LEVELS == 3)
+ ? (paging_mode_hap(d) ? 4 : 8)
+ : L3_PAGETABLE_ENTRIES),
+ PGT_l2_page_table) )
goto out;

if ( page_order == 0 )
@@ -1255,7 +1325,7 @@ p2m_set_entry(struct domain *d, unsigned
/* level 1 entry */
paging_write_p2m_entry(d, gfn, p2m_entry, table_mfn, entry_content, 1);
}
- else
+ else if ( page_order == 9 )
{
p2m_entry = p2m_find_entry(table, &gfn_remainder, gfn,
L2_PAGETABLE_SHIFT - PAGE_SHIFT,
@@ -1352,11 +1422,34 @@ p2m_gfn_to_mfn(struct domain *d, unsigne
#else
l3e += l3_table_offset(addr);
#endif
+pod_retry_l3:
if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
{
+ if ( p2m_flags_to_type(l3e_get_flags(*l3e)) == p2m_populate_on_demand )
+ {
+ if ( q != p2m_query )
+ {
+ if ( !p2m_pod_demand_populate(d, gfn, 18, q) )
+ goto pod_retry_l3;
+ }
+ else
+ *t = p2m_populate_on_demand;
+ }
unmap_domain_page(l3e);
return _mfn(INVALID_MFN);
}
+ else if ( (l3e_get_flags(*l3e) & _PAGE_PSE) )
+ {
+ mfn = _mfn(l3e_get_pfn(*l3e) +
+ l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
+ l1_table_offset(addr));
+ *t = p2m_flags_to_type(l3e_get_flags(*l3e));
+ unmap_domain_page(l3e);
+
+ ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+ return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+ }
+
mfn = _mfn(l3e_get_pfn(*l3e));
unmap_domain_page(l3e);
}
@@ -1437,10 +1530,57 @@ static mfn_t p2m_gfn_to_mfn_current(unsi
{
l1_pgentry_t l1e = l1e_empty(), *p2m_entry;
l2_pgentry_t l2e = l2e_empty();
+ l3_pgentry_t l3e = l3e_empty();
int ret;

ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START)
/ sizeof(l1_pgentry_t));
+
+ /*
+ * Read & process L3
+ */
+ p2m_entry = (l1_pgentry_t *)
+ &__linear_l2_table[l2_linear_offset(RO_MPT_VIRT_START)
+ + l3_linear_offset(addr)];
+ pod_retry_l3:
+ ret = __copy_from_user(&l3e, p2m_entry, sizeof(l3e));
+
+ if ( ret != 0 || !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
+ {
+ if ( (l3e_get_flags(l3e) & _PAGE_PSE) &&
+ (p2m_flags_to_type(l3e_get_flags(l3e)) == p2m_populate_on_demand) )
+ {
+ /* The read has succeeded, so we know that mapping exists */
+ if ( q != p2m_query )
+ {
+ if ( !p2m_pod_demand_populate(current->domain, gfn, 18, q) )
+ goto pod_retry_l3;
+ p2mt = p2m_invalid;
+ printk("%s: Allocate 1GB failed!\n", __func__);
+ goto out;
+ }
+ else
+ {
+ p2mt = p2m_populate_on_demand;
+ goto out;
+ }
+ }
+ goto pod_retry_l2;
+ }
+
+ if ( l3e_get_flags(l3e) & _PAGE_PSE )
+ {
+ p2mt = p2m_flags_to_type(l3e_get_flags(l3e));
+ ASSERT(l3e_get_pfn(l3e) != INVALID_MFN || !p2m_is_ram(p2mt));
+ if (p2m_is_valid(p2mt) )
+ mfn = _mfn(l3e_get_pfn(l3e) +
+ l2_table_offset(addr) * L1_PAGETABLE_ENTRIES +
+ l1_table_offset(addr));
+ else
+ p2mt = p2m_mmio_dm;
+
+ goto out;
+ }

/*
* Read & process L2
@@ -1596,10 +1736,18 @@ int set_p2m_entry(struct domain *d, unsi
while ( todo )
{
if ( is_hvm_domain(d) && paging_mode_hap(d) )
- order = (((gfn | mfn_x(mfn) | todo) & (SUPERPAGE_PAGES - 1)) == 0) ?
- 9 : 0;
+ order = ( (((gfn | mfn_x(mfn) | todo) & ((1ul << 18) - 1)) == 0) ) ?
+ 18 :
+ (((gfn | mfn_x(mfn) | todo) & ((1ul << 9) - 1)) == 0) ? 9 : 0;
else
order = 0;
+
+ /* Note that we only enable hap_1gb_pgtb when CONFIG_PAGING_LEVELS==4,
+ * so 1GB pages should never be enabled in 32-bit or PAE modes. For
+ * safety's sake, we double-check the page order here anyway.
+ */
+ BUG_ON(order == 18 && CONFIG_PAGING_LEVELS < 4);
+
if ( !d->arch.p2m->set_entry(d, gfn, mfn, order, p2mt) )
rc = 0;
gfn += 1ul << order;
@@ -1867,6 +2015,31 @@ static void audit_p2m(struct domain *d)
gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
continue;
}
+
+ /* check for 1GB super page */
+ if ( l3e_get_flags(l3e[i3]) & _PAGE_PSE )
+ {
+ mfn = l3e_get_pfn(l3e[i3]);
+ ASSERT(mfn_valid(_mfn(mfn)));
+ /* we have to cover 512x512 4K pages */
+ for ( i2 = 0;
+ i2 < (L2_PAGETABLE_ENTRIES * L1_PAGETABLE_ENTRIES);
+ i2++)
+ {
+ m2pfn = get_gpfn_from_mfn(mfn+i2);
+ if ( m2pfn != (gfn + i2) )
+ {
+ pmbad++;
+ P2M_PRINTK("mismatch: gfn %#lx -> mfn %#lx"
+ " -> gfn %#lx\n", gfn+i2, mfn+i2,
+ m2pfn);
+ BUG();
+ }
+ gfn += 1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
+ continue;
+ }
+ }
+
l2e = map_domain_page(mfn_x(_mfn(l3e_get_pfn(l3e[i3]))));
for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
{
@@ -2224,7 +2397,7 @@ void p2m_change_type_global(struct domai
l1_pgentry_t l1e_content;
l1_pgentry_t *l1e;
l2_pgentry_t *l2e;
- mfn_t l1mfn, l2mfn;
+ mfn_t l1mfn, l2mfn, l3mfn;
unsigned long i1, i2, i3;
l3_pgentry_t *l3e;
#if CONFIG_PAGING_LEVELS == 4
@@ -2245,6 +2418,7 @@ void p2m_change_type_global(struct domai
#if CONFIG_PAGING_LEVELS == 4
l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
#else /* CONFIG_PAGING_LEVELS == 3 */
+ l3mfn = _mfn(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
l3e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
#endif

@@ -2255,6 +2429,7 @@ void p2m_change_type_global(struct domai
{
continue;
}
+ l3mfn = _mfn(l4e_get_pfn(l4e[i4]));
l3e = map_domain_page(l4e_get_pfn(l4e[i4]));
#endif
for ( i3 = 0;
@@ -2265,6 +2440,20 @@ void p2m_change_type_global(struct domai
{
continue;
}
+ if ( (l3e_get_flags(l3e[i3]) & _PAGE_PSE) )
+ {
+ flags = l3e_get_flags(l3e[i3]);
+ if ( p2m_flags_to_type(flags) != ot )
+ continue;
+ mfn = l3e_get_pfn(l3e[i3]);
+ gfn = get_gpfn_from_mfn(mfn);
+ flags = p2m_type_to_flags(nt);
+ l1e_content = l1e_from_pfn(mfn, flags | _PAGE_PSE);
+ paging_write_p2m_entry(d, gfn, (l1_pgentry_t *)&l3e[i3],
+ l3mfn, l1e_content, 3);
+ continue;
+ }
+
l2mfn = _mfn(l3e_get_pfn(l3e[i3]));
l2e = map_domain_page(l3e_get_pfn(l3e[i3]));
for ( i2 = 0; i2 < L2_PAGETABLE_ENTRIES; i2++ )
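
A second editor's sketch, not part of the patch: the p2m_gfn_to_mfn() and p2m_gfn_to_mfn_current() hunks above both resolve an address that falls inside a 1GB PSE mapping by adding the address's L2/L1 index bits to the base frame stored in the L3 entry. The function name and constants below are hypothetical, assuming 4KB frames and 512-entry tables:

#include <stdint.h>

#define SKETCH_PAGE_SHIFT  12        /* 4KB frames */
#define SKETCH_PT_ENTRIES  512ULL    /* entries per page-table level */

/* Frame number for a guest-physical address inside a 1GB superpage: the
 * L3 entry holds the first frame of the 1GB-aligned region and the offset
 * is rebuilt from the L2/L1 index bits, i.e. 512*512 frames per 1GB. */
static inline uint64_t mfn_within_1gb(uint64_t l3e_pfn, uint64_t gpa)
{
    uint64_t l2_idx = (gpa >> 21) & (SKETCH_PT_ENTRIES - 1);                /* bits 29..21 */
    uint64_t l1_idx = (gpa >> SKETCH_PAGE_SHIFT) & (SKETCH_PT_ENTRIES - 1); /* bits 20..12 */

    return l3e_pfn + l2_idx * SKETCH_PT_ENTRIES + l1_idx;
}

For example, gpa 0x40123456 inside a 1GB mapping whose L3 entry points at frame 0x100000 resolves to frame 0x100123.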

_______________________________________________
Xen-changelog mailing list
Xen-changelog@lists.xensource.com
http://lists.xensource.com/xen-changelog