
[RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>

As a lot of the x86 code can be re-used on Arm later on, this patch
splits IOREQ support into common and arch-specific parts.

This support is going to be used on Arm to be able to run device
emulators outside of the Xen hypervisor.

Please note, this is a split/cleanup of Julien's PoC:
"Add support for Guest IO forwarding to a device emulator"

Signed-off-by: Julien Grall <julien.grall@arm.com>
Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
---
xen/arch/x86/Kconfig | 1 +
xen/arch/x86/hvm/dm.c | 2 +-
xen/arch/x86/hvm/emulate.c | 2 +-
xen/arch/x86/hvm/hvm.c | 2 +-
xen/arch/x86/hvm/io.c | 2 +-
xen/arch/x86/hvm/ioreq.c | 1431 +--------------------------------------
xen/arch/x86/hvm/stdvga.c | 2 +-
xen/arch/x86/hvm/vmx/realmode.c | 1 +
xen/arch/x86/hvm/vmx/vvmx.c | 2 +-
xen/arch/x86/mm.c | 2 +-
xen/arch/x86/mm/shadow/common.c | 2 +-
xen/common/Kconfig | 3 +
xen/common/Makefile | 1 +
xen/common/hvm/Makefile | 1 +
xen/common/hvm/ioreq.c | 1430 ++++++++++++++++++++++++++++++++++++++
xen/include/asm-x86/hvm/ioreq.h | 45 +-
xen/include/asm-x86/hvm/vcpu.h | 7 -
xen/include/xen/hvm/ioreq.h | 89 +++
18 files changed, 1575 insertions(+), 1450 deletions(-)
create mode 100644 xen/common/hvm/Makefile
create mode 100644 xen/common/hvm/ioreq.c
create mode 100644 xen/include/xen/hvm/ioreq.h

diff --git a/xen/arch/x86/Kconfig b/xen/arch/x86/Kconfig
index a636a4b..f5a9f87 100644
--- a/xen/arch/x86/Kconfig
+++ b/xen/arch/x86/Kconfig
@@ -91,6 +91,7 @@ config PV_LINEAR_PT

config HVM
def_bool !PV_SHIM_EXCLUSIVE
+ select IOREQ_SERVER
prompt "HVM support"
---help---
Interfaces to support HVM domains. HVM domains require hardware
diff --git a/xen/arch/x86/hvm/dm.c b/xen/arch/x86/hvm/dm.c
index e3f8451..70adb27 100644
--- a/xen/arch/x86/hvm/dm.c
+++ b/xen/arch/x86/hvm/dm.c
@@ -16,13 +16,13 @@

#include <xen/event.h>
#include <xen/guest_access.h>
+#include <xen/hvm/ioreq.h>
#include <xen/hypercall.h>
#include <xen/nospec.h>
#include <xen/sched.h>

#include <asm/hap.h>
#include <asm/hvm/cacheattr.h>
-#include <asm/hvm/ioreq.h>
#include <asm/shadow.h>

#include <xsm/xsm.h>
diff --git a/xen/arch/x86/hvm/emulate.c b/xen/arch/x86/hvm/emulate.c
index 8b4e73a..78993b3 100644
--- a/xen/arch/x86/hvm/emulate.c
+++ b/xen/arch/x86/hvm/emulate.c
@@ -9,6 +9,7 @@
* Keir Fraser <keir@xen.org>
*/

+#include <xen/hvm/ioreq.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
@@ -20,7 +21,6 @@
#include <asm/xstate.h>
#include <asm/hvm/emulate.h>
#include <asm/hvm/hvm.h>
-#include <asm/hvm/ioreq.h>
#include <asm/hvm/monitor.h>
#include <asm/hvm/trace.h>
#include <asm/hvm/support.h>
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index 5bb4758..c05025d 100644
--- a/xen/arch/x86/hvm/hvm.c
+++ b/xen/arch/x86/hvm/hvm.c
@@ -19,6 +19,7 @@
*/

#include <xen/ctype.h>
+#include <xen/hvm/ioreq.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/trace.h>
@@ -64,7 +65,6 @@
#include <asm/hvm/trace.h>
#include <asm/hvm/nestedhvm.h>
#include <asm/hvm/monitor.h>
-#include <asm/hvm/ioreq.h>
#include <asm/hvm/viridian.h>
#include <asm/hvm/vm_event.h>
#include <asm/altp2m.h>
diff --git a/xen/arch/x86/hvm/io.c b/xen/arch/x86/hvm/io.c
index 724ab44..5d501d1 100644
--- a/xen/arch/x86/hvm/io.c
+++ b/xen/arch/x86/hvm/io.c
@@ -18,6 +18,7 @@
* this program; If not, see <http://www.gnu.org/licenses/>.
*/

+#include <xen/hvm/ioreq.h>
#include <xen/init.h>
#include <xen/mm.h>
#include <xen/lib.h>
@@ -35,7 +36,6 @@
#include <asm/shadow.h>
#include <asm/p2m.h>
#include <asm/hvm/hvm.h>
-#include <asm/hvm/ioreq.h>
#include <asm/hvm/support.h>
#include <asm/hvm/vpt.h>
#include <asm/hvm/vpic.h>
diff --git a/xen/arch/x86/hvm/ioreq.c b/xen/arch/x86/hvm/ioreq.c
index 7240070..dd21e85 100644
--- a/xen/arch/x86/hvm/ioreq.c
+++ b/xen/arch/x86/hvm/ioreq.c
@@ -17,6 +17,7 @@
*/

#include <xen/ctype.h>
+#include <xen/hvm/ioreq.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/trace.h>
@@ -28,1069 +29,16 @@
#include <xen/paging.h>
#include <xen/vpci.h>

-#include <asm/hvm/emulate.h>
-#include <asm/hvm/hvm.h>
-#include <asm/hvm/ioreq.h>
-#include <asm/hvm/vmx/vmx.h>
-
-#include <public/hvm/ioreq.h>
-#include <public/hvm/params.h>
-
-static void set_ioreq_server(struct domain *d, unsigned int id,
- struct hvm_ioreq_server *s)
-{
- ASSERT(id < MAX_NR_IOREQ_SERVERS);
- ASSERT(!s || !d->arch.hvm.ioreq_server.server[id]);
-
- d->arch.hvm.ioreq_server.server[id] = s;
-}
-
-#define GET_IOREQ_SERVER(d, id) \
- (d)->arch.hvm.ioreq_server.server[id]
-
-static struct hvm_ioreq_server *get_ioreq_server(const struct domain *d,
- unsigned int id)
-{
- if ( id >= MAX_NR_IOREQ_SERVERS )
- return NULL;
-
- return GET_IOREQ_SERVER(d, id);
-}
-
-/*
- * Iterate over all possible ioreq servers.
- *
- * NOTE: The iteration is backwards such that more recently created
- * ioreq servers are favoured in hvm_select_ioreq_server().
- * This is a semantic that previously existed when ioreq servers
- * were held in a linked list.
- */
-#define FOR_EACH_IOREQ_SERVER(d, id, s) \
- for ( (id) = MAX_NR_IOREQ_SERVERS; (id) != 0; ) \
- if ( !(s = GET_IOREQ_SERVER(d, --(id))) ) \
- continue; \
- else
-
-static ioreq_t *get_ioreq(struct hvm_ioreq_server *s, struct vcpu *v)
-{
- shared_iopage_t *p = s->ioreq.va;
-
- ASSERT((v == current) || !vcpu_runnable(v));
- ASSERT(p != NULL);
-
- return &p->vcpu_ioreq[v->vcpu_id];
-}
-
-bool hvm_io_pending(struct vcpu *v)
-{
- struct domain *d = v->domain;
- struct hvm_ioreq_server *s;
- unsigned int id;
-
- FOR_EACH_IOREQ_SERVER(d, id, s)
- {
- struct hvm_ioreq_vcpu *sv;
-
- list_for_each_entry ( sv,
- &s->ioreq_vcpu_list,
- list_entry )
- {
- if ( sv->vcpu == v && sv->pending )
- return true;
- }
- }
-
- return false;
-}
-
-static void hvm_io_assist(struct hvm_ioreq_vcpu *sv, uint64_t data)
-{
- struct vcpu *v = sv->vcpu;
- ioreq_t *ioreq = &v->arch.hvm.hvm_io.io_req;
-
- if ( hvm_ioreq_needs_completion(ioreq) )
- ioreq->data = data;
-
- sv->pending = false;
-}
-
-static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
-{
- unsigned int prev_state = STATE_IOREQ_NONE;
-
- while ( sv->pending )
- {
- unsigned int state = p->state;
-
- smp_rmb();
-
- recheck:
- if ( unlikely(state == STATE_IOREQ_NONE) )
- {
- /*
- * The only reason we should see this case is when an
- * emulator is dying and it races with an I/O being
- * requested.
- */
- hvm_io_assist(sv, ~0ul);
- break;
- }
-
- if ( unlikely(state < prev_state) )
- {
- gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n",
- prev_state, state);
- sv->pending = false;
- domain_crash(sv->vcpu->domain);
- return false; /* bail */
- }
-
- switch ( prev_state = state )
- {
- case STATE_IORESP_READY: /* IORESP_READY -> NONE */
- p->state = STATE_IOREQ_NONE;
- hvm_io_assist(sv, p->data);
- break;
- case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
- case STATE_IOREQ_INPROCESS:
- wait_on_xen_event_channel(sv->ioreq_evtchn,
- ({ state = p->state;
- smp_rmb();
- state != prev_state; }));
- goto recheck;
- default:
- gdprintk(XENLOG_ERR, "Weird HVM iorequest state %u\n", state);
- sv->pending = false;
- domain_crash(sv->vcpu->domain);
- return false; /* bail */
- }
- }
-
- return true;
-}
-
-bool handle_hvm_io_completion(struct vcpu *v)
-{
- struct domain *d = v->domain;
- struct hvm_vcpu_io *vio = &v->arch.hvm.hvm_io;
- struct hvm_ioreq_server *s;
- enum hvm_io_completion io_completion;
- unsigned int id;
-
- if ( has_vpci(d) && vpci_process_pending(v) )
- {
- raise_softirq(SCHEDULE_SOFTIRQ);
- return false;
- }
-
- FOR_EACH_IOREQ_SERVER(d, id, s)
- {
- struct hvm_ioreq_vcpu *sv;
-
- list_for_each_entry ( sv,
- &s->ioreq_vcpu_list,
- list_entry )
- {
- if ( sv->vcpu == v && sv->pending )
- {
- if ( !hvm_wait_for_io(sv, get_ioreq(s, v)) )
- return false;
-
- break;
- }
- }
- }
-
- vio->io_req.state = hvm_ioreq_needs_completion(&vio->io_req) ?
- STATE_IORESP_READY : STATE_IOREQ_NONE;
-
- msix_write_completion(v);
- vcpu_end_shutdown_deferral(v);
-
- io_completion = vio->io_completion;
- vio->io_completion = HVMIO_no_completion;
-
- switch ( io_completion )
- {
- case HVMIO_no_completion:
- break;
-
- case HVMIO_mmio_completion:
- return handle_mmio();
-
- case HVMIO_pio_completion:
- return handle_pio(vio->io_req.addr, vio->io_req.size,
- vio->io_req.dir);
-
- case HVMIO_realmode_completion:
- {
- struct hvm_emulate_ctxt ctxt;
-
- hvm_emulate_init_once(&ctxt, NULL, guest_cpu_user_regs());
- vmx_realmode_emulate_one(&ctxt);
- hvm_emulate_writeback(&ctxt);
-
- break;
- }
- default:
- ASSERT_UNREACHABLE();
- break;
- }
-
- return true;
-}
-
-static gfn_t hvm_alloc_legacy_ioreq_gfn(struct hvm_ioreq_server *s)
-{
- struct domain *d = s->target;
- unsigned int i;
-
- BUILD_BUG_ON(HVM_PARAM_BUFIOREQ_PFN != HVM_PARAM_IOREQ_PFN + 1);
-
- for ( i = HVM_PARAM_IOREQ_PFN; i <= HVM_PARAM_BUFIOREQ_PFN; i++ )
- {
- if ( !test_and_clear_bit(i, &d->arch.hvm.ioreq_gfn.legacy_mask) )
- return _gfn(d->arch.hvm.params[i]);
- }
-
- return INVALID_GFN;
-}
-
-static gfn_t hvm_alloc_ioreq_gfn(struct hvm_ioreq_server *s)
-{
- struct domain *d = s->target;
- unsigned int i;
-
- for ( i = 0; i < sizeof(d->arch.hvm.ioreq_gfn.mask) * 8; i++ )
- {
- if ( test_and_clear_bit(i, &d->arch.hvm.ioreq_gfn.mask) )
- return _gfn(d->arch.hvm.ioreq_gfn.base + i);
- }
-
- /*
- * If we are out of 'normal' GFNs then we may still have a 'legacy'
- * GFN available.
- */
- return hvm_alloc_legacy_ioreq_gfn(s);
-}
-
-static bool hvm_free_legacy_ioreq_gfn(struct hvm_ioreq_server *s,
- gfn_t gfn)
-{
- struct domain *d = s->target;
- unsigned int i;
-
- for ( i = HVM_PARAM_IOREQ_PFN; i <= HVM_PARAM_BUFIOREQ_PFN; i++ )
- {
- if ( gfn_eq(gfn, _gfn(d->arch.hvm.params[i])) )
- break;
- }
- if ( i > HVM_PARAM_BUFIOREQ_PFN )
- return false;
-
- set_bit(i, &d->arch.hvm.ioreq_gfn.legacy_mask);
- return true;
-}
-
-static void hvm_free_ioreq_gfn(struct hvm_ioreq_server *s, gfn_t gfn)
-{
- struct domain *d = s->target;
- unsigned int i = gfn_x(gfn) - d->arch.hvm.ioreq_gfn.base;
-
- ASSERT(!gfn_eq(gfn, INVALID_GFN));
-
- if ( !hvm_free_legacy_ioreq_gfn(s, gfn) )
- {
- ASSERT(i < sizeof(d->arch.hvm.ioreq_gfn.mask) * 8);
- set_bit(i, &d->arch.hvm.ioreq_gfn.mask);
- }
-}
-
-static void hvm_unmap_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
-{
- struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
-
- if ( gfn_eq(iorp->gfn, INVALID_GFN) )
- return;
-
- destroy_ring_for_helper(&iorp->va, iorp->page);
- iorp->page = NULL;
-
- hvm_free_ioreq_gfn(s, iorp->gfn);
- iorp->gfn = INVALID_GFN;
-}
-
-static int hvm_map_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
-{
- struct domain *d = s->target;
- struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
- int rc;
-
- if ( iorp->page )
- {
- /*
- * If a page has already been allocated (which will happen on
- * demand if hvm_get_ioreq_server_frame() is called), then
- * mapping a guest frame is not permitted.
- */
- if ( gfn_eq(iorp->gfn, INVALID_GFN) )
- return -EPERM;
-
- return 0;
- }
-
- if ( d->is_dying )
- return -EINVAL;
-
- iorp->gfn = hvm_alloc_ioreq_gfn(s);
-
- if ( gfn_eq(iorp->gfn, INVALID_GFN) )
- return -ENOMEM;
-
- rc = prepare_ring_for_helper(d, gfn_x(iorp->gfn), &iorp->page,
- &iorp->va);
-
- if ( rc )
- hvm_unmap_ioreq_gfn(s, buf);
-
- return rc;
-}
-
-static int hvm_alloc_ioreq_mfn(struct hvm_ioreq_server *s, bool buf)
-{
- struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
- struct page_info *page;
-
- if ( iorp->page )
- {
- /*
- * If a guest frame has already been mapped (which may happen
- * on demand if hvm_get_ioreq_server_info() is called), then
- * allocating a page is not permitted.
- */
- if ( !gfn_eq(iorp->gfn, INVALID_GFN) )
- return -EPERM;
-
- return 0;
- }
-
- page = alloc_domheap_page(s->target, MEMF_no_refcount);
-
- if ( !page )
- return -ENOMEM;
-
- if ( !get_page_and_type(page, s->target, PGT_writable_page) )
- {
- /*
- * The domain can't possibly know about this page yet, so failure
- * here is a clear indication of something fishy going on.
- */
- domain_crash(s->emulator);
- return -ENODATA;
- }
-
- iorp->va = __map_domain_page_global(page);
- if ( !iorp->va )
- goto fail;
-
- iorp->page = page;
- clear_page(iorp->va);
- return 0;
-
- fail:
- put_page_alloc_ref(page);
- put_page_and_type(page);
-
- return -ENOMEM;
-}
-
-static void hvm_free_ioreq_mfn(struct hvm_ioreq_server *s, bool buf)
-{
- struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
- struct page_info *page = iorp->page;
-
- if ( !page )
- return;
-
- iorp->page = NULL;
-
- unmap_domain_page_global(iorp->va);
- iorp->va = NULL;
-
- put_page_alloc_ref(page);
- put_page_and_type(page);
-}
-
-bool is_ioreq_server_page(struct domain *d, const struct page_info *page)
-{
- const struct hvm_ioreq_server *s;
- unsigned int id;
- bool found = false;
-
- spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- FOR_EACH_IOREQ_SERVER(d, id, s)
- {
- if ( (s->ioreq.page == page) || (s->bufioreq.page == page) )
- {
- found = true;
- break;
- }
- }
-
- spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- return found;
-}
-
-static void hvm_remove_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
-
-{
- struct domain *d = s->target;
- struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
-
- if ( gfn_eq(iorp->gfn, INVALID_GFN) )
- return;
-
- if ( guest_physmap_remove_page(d, iorp->gfn,
- page_to_mfn(iorp->page), 0) )
- domain_crash(d);
- clear_page(iorp->va);
-}
-
-static int hvm_add_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
-{
- struct domain *d = s->target;
- struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
- int rc;
-
- if ( gfn_eq(iorp->gfn, INVALID_GFN) )
- return 0;
-
- clear_page(iorp->va);
-
- rc = guest_physmap_add_page(d, iorp->gfn,
- page_to_mfn(iorp->page), 0);
- if ( rc == 0 )
- paging_mark_pfn_dirty(d, _pfn(gfn_x(iorp->gfn)));
-
- return rc;
-}
-
-static void hvm_update_ioreq_evtchn(struct hvm_ioreq_server *s,
- struct hvm_ioreq_vcpu *sv)
-{
- ASSERT(spin_is_locked(&s->lock));
-
- if ( s->ioreq.va != NULL )
- {
- ioreq_t *p = get_ioreq(s, sv->vcpu);
-
- p->vp_eport = sv->ioreq_evtchn;
- }
-}
-
-#define HANDLE_BUFIOREQ(s) \
- ((s)->bufioreq_handling != HVM_IOREQSRV_BUFIOREQ_OFF)
-
-static int hvm_ioreq_server_add_vcpu(struct hvm_ioreq_server *s,
- struct vcpu *v)
-{
- struct hvm_ioreq_vcpu *sv;
- int rc;
-
- sv = xzalloc(struct hvm_ioreq_vcpu);
-
- rc = -ENOMEM;
- if ( !sv )
- goto fail1;
-
- spin_lock(&s->lock);
-
- rc = alloc_unbound_xen_event_channel(v->domain, v->vcpu_id,
- s->emulator->domain_id, NULL);
- if ( rc < 0 )
- goto fail2;
-
- sv->ioreq_evtchn = rc;
-
- if ( v->vcpu_id == 0 && HANDLE_BUFIOREQ(s) )
- {
- rc = alloc_unbound_xen_event_channel(v->domain, 0,
- s->emulator->domain_id, NULL);
- if ( rc < 0 )
- goto fail3;
-
- s->bufioreq_evtchn = rc;
- }
-
- sv->vcpu = v;
-
- list_add(&sv->list_entry, &s->ioreq_vcpu_list);
-
- if ( s->enabled )
- hvm_update_ioreq_evtchn(s, sv);
-
- spin_unlock(&s->lock);
- return 0;
-
- fail3:
- free_xen_event_channel(v->domain, sv->ioreq_evtchn);
-
- fail2:
- spin_unlock(&s->lock);
- xfree(sv);
-
- fail1:
- return rc;
-}
-
-static void hvm_ioreq_server_remove_vcpu(struct hvm_ioreq_server *s,
- struct vcpu *v)
-{
- struct hvm_ioreq_vcpu *sv;
-
- spin_lock(&s->lock);
-
- list_for_each_entry ( sv,
- &s->ioreq_vcpu_list,
- list_entry )
- {
- if ( sv->vcpu != v )
- continue;
-
- list_del(&sv->list_entry);
-
- if ( v->vcpu_id == 0 && HANDLE_BUFIOREQ(s) )
- free_xen_event_channel(v->domain, s->bufioreq_evtchn);
-
- free_xen_event_channel(v->domain, sv->ioreq_evtchn);
-
- xfree(sv);
- break;
- }
-
- spin_unlock(&s->lock);
-}
-
-static void hvm_ioreq_server_remove_all_vcpus(struct hvm_ioreq_server *s)
-{
- struct hvm_ioreq_vcpu *sv, *next;
-
- spin_lock(&s->lock);
-
- list_for_each_entry_safe ( sv,
- next,
- &s->ioreq_vcpu_list,
- list_entry )
- {
- struct vcpu *v = sv->vcpu;
-
- list_del(&sv->list_entry);
-
- if ( v->vcpu_id == 0 && HANDLE_BUFIOREQ(s) )
- free_xen_event_channel(v->domain, s->bufioreq_evtchn);
-
- free_xen_event_channel(v->domain, sv->ioreq_evtchn);
-
- xfree(sv);
- }
-
- spin_unlock(&s->lock);
-}
-
-static int hvm_ioreq_server_map_pages(struct hvm_ioreq_server *s)
-{
- int rc;
-
- rc = hvm_map_ioreq_gfn(s, false);
-
- if ( !rc && HANDLE_BUFIOREQ(s) )
- rc = hvm_map_ioreq_gfn(s, true);
-
- if ( rc )
- hvm_unmap_ioreq_gfn(s, false);
-
- return rc;
-}
-
-static void hvm_ioreq_server_unmap_pages(struct hvm_ioreq_server *s)
-{
- hvm_unmap_ioreq_gfn(s, true);
- hvm_unmap_ioreq_gfn(s, false);
-}
-
-static int hvm_ioreq_server_alloc_pages(struct hvm_ioreq_server *s)
-{
- int rc;
-
- rc = hvm_alloc_ioreq_mfn(s, false);
-
- if ( !rc && (s->bufioreq_handling != HVM_IOREQSRV_BUFIOREQ_OFF) )
- rc = hvm_alloc_ioreq_mfn(s, true);
-
- if ( rc )
- hvm_free_ioreq_mfn(s, false);
-
- return rc;
-}
-
-static void hvm_ioreq_server_free_pages(struct hvm_ioreq_server *s)
-{
- hvm_free_ioreq_mfn(s, true);
- hvm_free_ioreq_mfn(s, false);
-}
-
-static void hvm_ioreq_server_free_rangesets(struct hvm_ioreq_server *s)
-{
- unsigned int i;
-
- for ( i = 0; i < NR_IO_RANGE_TYPES; i++ )
- rangeset_destroy(s->range[i]);
-}
-
-static int hvm_ioreq_server_alloc_rangesets(struct hvm_ioreq_server *s,
- ioservid_t id)
-{
- unsigned int i;
- int rc;
-
- for ( i = 0; i < NR_IO_RANGE_TYPES; i++ )
- {
- char *name;
-
- rc = asprintf(&name, "ioreq_server %d %s", id,
- (i == XEN_DMOP_IO_RANGE_PORT) ? "port" :
- (i == XEN_DMOP_IO_RANGE_MEMORY) ? "memory" :
- (i == XEN_DMOP_IO_RANGE_PCI) ? "pci" :
- "");
- if ( rc )
- goto fail;
-
- s->range[i] = rangeset_new(s->target, name,
- RANGESETF_prettyprint_hex);
-
- xfree(name);
-
- rc = -ENOMEM;
- if ( !s->range[i] )
- goto fail;
-
- rangeset_limit(s->range[i], MAX_NR_IO_RANGES);
- }
-
- return 0;
-
- fail:
- hvm_ioreq_server_free_rangesets(s);
-
- return rc;
-}
-
-static void hvm_ioreq_server_enable(struct hvm_ioreq_server *s)
-{
- struct hvm_ioreq_vcpu *sv;
-
- spin_lock(&s->lock);
-
- if ( s->enabled )
- goto done;
-
- hvm_remove_ioreq_gfn(s, false);
- hvm_remove_ioreq_gfn(s, true);
-
- s->enabled = true;
-
- list_for_each_entry ( sv,
- &s->ioreq_vcpu_list,
- list_entry )
- hvm_update_ioreq_evtchn(s, sv);
-
- done:
- spin_unlock(&s->lock);
-}
-
-static void hvm_ioreq_server_disable(struct hvm_ioreq_server *s)
-{
- spin_lock(&s->lock);
-
- if ( !s->enabled )
- goto done;
-
- hvm_add_ioreq_gfn(s, true);
- hvm_add_ioreq_gfn(s, false);
-
- s->enabled = false;
-
- done:
- spin_unlock(&s->lock);
-}
-
-static int hvm_ioreq_server_init(struct hvm_ioreq_server *s,
- struct domain *d, int bufioreq_handling,
- ioservid_t id)
-{
- struct domain *currd = current->domain;
- struct vcpu *v;
- int rc;
-
- s->target = d;
-
- get_knownalive_domain(currd);
- s->emulator = currd;
-
- spin_lock_init(&s->lock);
- INIT_LIST_HEAD(&s->ioreq_vcpu_list);
- spin_lock_init(&s->bufioreq_lock);
-
- s->ioreq.gfn = INVALID_GFN;
- s->bufioreq.gfn = INVALID_GFN;
-
- rc = hvm_ioreq_server_alloc_rangesets(s, id);
- if ( rc )
- return rc;
-
- s->bufioreq_handling = bufioreq_handling;
-
- for_each_vcpu ( d, v )
- {
- rc = hvm_ioreq_server_add_vcpu(s, v);
- if ( rc )
- goto fail_add;
- }
-
- return 0;
-
- fail_add:
- hvm_ioreq_server_remove_all_vcpus(s);
- hvm_ioreq_server_unmap_pages(s);
-
- hvm_ioreq_server_free_rangesets(s);
-
- put_domain(s->emulator);
- return rc;
-}
-
-static void hvm_ioreq_server_deinit(struct hvm_ioreq_server *s)
-{
- ASSERT(!s->enabled);
- hvm_ioreq_server_remove_all_vcpus(s);
-
- /*
- * NOTE: It is safe to call both hvm_ioreq_server_unmap_pages() and
- * hvm_ioreq_server_free_pages() in that order.
- * This is because the former will do nothing if the pages
- * are not mapped, leaving the page to be freed by the latter.
- * However if the pages are mapped then the former will set
- * the page_info pointer to NULL, meaning the latter will do
- * nothing.
- */
- hvm_ioreq_server_unmap_pages(s);
- hvm_ioreq_server_free_pages(s);
-
- hvm_ioreq_server_free_rangesets(s);
-
- put_domain(s->emulator);
-}
-
-int hvm_create_ioreq_server(struct domain *d, int bufioreq_handling,
- ioservid_t *id)
-{
- struct hvm_ioreq_server *s;
- unsigned int i;
- int rc;
-
- if ( bufioreq_handling > HVM_IOREQSRV_BUFIOREQ_ATOMIC )
- return -EINVAL;
-
- s = xzalloc(struct hvm_ioreq_server);
- if ( !s )
- return -ENOMEM;
-
- domain_pause(d);
- spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- for ( i = 0; i < MAX_NR_IOREQ_SERVERS; i++ )
- {
- if ( !GET_IOREQ_SERVER(d, i) )
- break;
- }
-
- rc = -ENOSPC;
- if ( i >= MAX_NR_IOREQ_SERVERS )
- goto fail;
-
- /*
- * It is safe to call set_ioreq_server() prior to
- * hvm_ioreq_server_init() since the target domain is paused.
- */
- set_ioreq_server(d, i, s);
-
- rc = hvm_ioreq_server_init(s, d, bufioreq_handling, i);
- if ( rc )
- {
- set_ioreq_server(d, i, NULL);
- goto fail;
- }
-
- if ( id )
- *id = i;
-
- spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
- domain_unpause(d);
-
- return 0;
-
- fail:
- spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
- domain_unpause(d);
-
- xfree(s);
- return rc;
-}
-
-int hvm_destroy_ioreq_server(struct domain *d, ioservid_t id)
-{
- struct hvm_ioreq_server *s;
- int rc;
-
- spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- s = get_ioreq_server(d, id);
-
- rc = -ENOENT;
- if ( !s )
- goto out;
-
- rc = -EPERM;
- if ( s->emulator != current->domain )
- goto out;
-
- domain_pause(d);
-
- p2m_set_ioreq_server(d, 0, s);
-
- hvm_ioreq_server_disable(s);
-
- /*
- * It is safe to call hvm_ioreq_server_deinit() prior to
- * set_ioreq_server() since the target domain is paused.
- */
- hvm_ioreq_server_deinit(s);
- set_ioreq_server(d, id, NULL);
-
- domain_unpause(d);
-
- xfree(s);
-
- rc = 0;
-
- out:
- spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- return rc;
-}
-
-int hvm_get_ioreq_server_info(struct domain *d, ioservid_t id,
- unsigned long *ioreq_gfn,
- unsigned long *bufioreq_gfn,
- evtchn_port_t *bufioreq_port)
-{
- struct hvm_ioreq_server *s;
- int rc;
-
- spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- s = get_ioreq_server(d, id);
-
- rc = -ENOENT;
- if ( !s )
- goto out;
-
- rc = -EPERM;
- if ( s->emulator != current->domain )
- goto out;
-
- if ( ioreq_gfn || bufioreq_gfn )
- {
- rc = hvm_ioreq_server_map_pages(s);
- if ( rc )
- goto out;
- }
-
- if ( ioreq_gfn )
- *ioreq_gfn = gfn_x(s->ioreq.gfn);
-
- if ( HANDLE_BUFIOREQ(s) )
- {
- if ( bufioreq_gfn )
- *bufioreq_gfn = gfn_x(s->bufioreq.gfn);
-
- if ( bufioreq_port )
- *bufioreq_port = s->bufioreq_evtchn;
- }
-
- rc = 0;
-
- out:
- spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- return rc;
-}
-
-int hvm_get_ioreq_server_frame(struct domain *d, ioservid_t id,
- unsigned long idx, mfn_t *mfn)
-{
- struct hvm_ioreq_server *s;
- int rc;
-
- ASSERT(is_hvm_domain(d));
-
- spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- s = get_ioreq_server(d, id);
-
- rc = -ENOENT;
- if ( !s )
- goto out;
-
- rc = -EPERM;
- if ( s->emulator != current->domain )
- goto out;
-
- rc = hvm_ioreq_server_alloc_pages(s);
- if ( rc )
- goto out;
-
- switch ( idx )
- {
- case XENMEM_resource_ioreq_server_frame_bufioreq:
- rc = -ENOENT;
- if ( !HANDLE_BUFIOREQ(s) )
- goto out;
-
- *mfn = page_to_mfn(s->bufioreq.page);
- rc = 0;
- break;
-
- case XENMEM_resource_ioreq_server_frame_ioreq(0):
- *mfn = page_to_mfn(s->ioreq.page);
- rc = 0;
- break;
-
- default:
- rc = -EINVAL;
- break;
- }
-
- out:
- spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- return rc;
-}
-
-int hvm_map_io_range_to_ioreq_server(struct domain *d, ioservid_t id,
- uint32_t type, uint64_t start,
- uint64_t end)
-{
- struct hvm_ioreq_server *s;
- struct rangeset *r;
- int rc;
-
- if ( start > end )
- return -EINVAL;
-
- spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- s = get_ioreq_server(d, id);
-
- rc = -ENOENT;
- if ( !s )
- goto out;
-
- rc = -EPERM;
- if ( s->emulator != current->domain )
- goto out;
-
- switch ( type )
- {
- case XEN_DMOP_IO_RANGE_PORT:
- case XEN_DMOP_IO_RANGE_MEMORY:
- case XEN_DMOP_IO_RANGE_PCI:
- r = s->range[type];
- break;
-
- default:
- r = NULL;
- break;
- }
-
- rc = -EINVAL;
- if ( !r )
- goto out;
-
- rc = -EEXIST;
- if ( rangeset_overlaps_range(r, start, end) )
- goto out;
-
- rc = rangeset_add_range(r, start, end);
-
- out:
- spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- return rc;
-}
+#include <public/hvm/ioreq.h>
+#include <public/hvm/params.h>

-int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
- uint32_t type, uint64_t start,
- uint64_t end)
+void handle_realmode_completion(void)
{
- struct hvm_ioreq_server *s;
- struct rangeset *r;
- int rc;
-
- if ( start > end )
- return -EINVAL;
-
- spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- s = get_ioreq_server(d, id);
-
- rc = -ENOENT;
- if ( !s )
- goto out;
-
- rc = -EPERM;
- if ( s->emulator != current->domain )
- goto out;
-
- switch ( type )
- {
- case XEN_DMOP_IO_RANGE_PORT:
- case XEN_DMOP_IO_RANGE_MEMORY:
- case XEN_DMOP_IO_RANGE_PCI:
- r = s->range[type];
- break;
-
- default:
- r = NULL;
- break;
- }
-
- rc = -EINVAL;
- if ( !r )
- goto out;
-
- rc = -ENOENT;
- if ( !rangeset_contains_range(r, start, end) )
- goto out;
-
- rc = rangeset_remove_range(r, start, end);
-
- out:
- spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+ struct hvm_emulate_ctxt ctxt;

- return rc;
+ hvm_emulate_init_once(&ctxt, NULL, guest_cpu_user_regs());
+ vmx_realmode_emulate_one(&ctxt);
+ hvm_emulate_writeback(&ctxt);
}

/*
@@ -1141,130 +89,12 @@ int hvm_map_mem_type_to_ioreq_server(struct domain *d, ioservid_t id,
return rc;
}

-int hvm_set_ioreq_server_state(struct domain *d, ioservid_t id,
- bool enabled)
-{
- struct hvm_ioreq_server *s;
- int rc;
-
- spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- s = get_ioreq_server(d, id);
-
- rc = -ENOENT;
- if ( !s )
- goto out;
-
- rc = -EPERM;
- if ( s->emulator != current->domain )
- goto out;
-
- domain_pause(d);
-
- if ( enabled )
- hvm_ioreq_server_enable(s);
- else
- hvm_ioreq_server_disable(s);
-
- domain_unpause(d);
-
- rc = 0;
-
- out:
- spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
- return rc;
-}
-
-int hvm_all_ioreq_servers_add_vcpu(struct domain *d, struct vcpu *v)
-{
- struct hvm_ioreq_server *s;
- unsigned int id;
- int rc;
-
- spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- FOR_EACH_IOREQ_SERVER(d, id, s)
- {
- rc = hvm_ioreq_server_add_vcpu(s, v);
- if ( rc )
- goto fail;
- }
-
- spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- return 0;
-
- fail:
- while ( ++id != MAX_NR_IOREQ_SERVERS )
- {
- s = GET_IOREQ_SERVER(d, id);
-
- if ( !s )
- continue;
-
- hvm_ioreq_server_remove_vcpu(s, v);
- }
-
- spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- return rc;
-}
-
-void hvm_all_ioreq_servers_remove_vcpu(struct domain *d, struct vcpu *v)
-{
- struct hvm_ioreq_server *s;
- unsigned int id;
-
- spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- FOR_EACH_IOREQ_SERVER(d, id, s)
- hvm_ioreq_server_remove_vcpu(s, v);
-
- spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-}
-
-void hvm_destroy_all_ioreq_servers(struct domain *d)
+void hvm_get_ioreq_server_range_type(struct domain *d,
+ ioreq_t *p,
+ uint8_t *type,
+ uint64_t *addr)
{
- struct hvm_ioreq_server *s;
- unsigned int id;
-
- if ( !relocate_portio_handler(d, 0xcf8, 0xcf8, 4) )
- return;
-
- spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
-
- /* No need to domain_pause() as the domain is being torn down */
-
- FOR_EACH_IOREQ_SERVER(d, id, s)
- {
- hvm_ioreq_server_disable(s);
-
- /*
- * It is safe to call hvm_ioreq_server_deinit() prior to
- * set_ioreq_server() since the target domain is being destroyed.
- */
- hvm_ioreq_server_deinit(s);
- set_ioreq_server(d, id, NULL);
-
- xfree(s);
- }
-
- spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
-}
-
-struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
- ioreq_t *p)
-{
- struct hvm_ioreq_server *s;
- uint32_t cf8;
- uint8_t type;
- uint64_t addr;
- unsigned int id;
-
- if ( p->type != IOREQ_TYPE_COPY && p->type != IOREQ_TYPE_PIO )
- return NULL;
-
- cf8 = d->arch.hvm.pci_cf8;
+ uint32_t cf8 = d->arch.hvm.pci_cf8;

if ( p->type == IOREQ_TYPE_PIO &&
(p->addr & ~3) == 0xcfc &&
@@ -1277,8 +107,8 @@ struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
reg = hvm_pci_decode_addr(cf8, p->addr, &sbdf);

/* PCI config data cycle */
- type = XEN_DMOP_IO_RANGE_PCI;
- addr = ((uint64_t)sbdf.sbdf << 32) | reg;
+ *type = XEN_DMOP_IO_RANGE_PCI;
+ *addr = ((uint64_t)sbdf.sbdf << 32) | reg;
/* AMD extended configuration space access? */
if ( CF8_ADDR_HI(cf8) &&
d->arch.cpuid->x86_vendor == X86_VENDOR_AMD &&
@@ -1290,230 +120,15 @@ struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,

if ( !rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) &&
(msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
- addr |= CF8_ADDR_HI(cf8);
+ *addr |= CF8_ADDR_HI(cf8);
}
}
else
{
- type = (p->type == IOREQ_TYPE_PIO) ?
- XEN_DMOP_IO_RANGE_PORT : XEN_DMOP_IO_RANGE_MEMORY;
- addr = p->addr;
- }
-
- FOR_EACH_IOREQ_SERVER(d, id, s)
- {
- struct rangeset *r;
-
- if ( !s->enabled )
- continue;
-
- r = s->range[type];
-
- switch ( type )
- {
- unsigned long start, end;
-
- case XEN_DMOP_IO_RANGE_PORT:
- start = addr;
- end = start + p->size - 1;
- if ( rangeset_contains_range(r, start, end) )
- return s;
-
- break;
-
- case XEN_DMOP_IO_RANGE_MEMORY:
- start = hvm_mmio_first_byte(p);
- end = hvm_mmio_last_byte(p);
-
- if ( rangeset_contains_range(r, start, end) )
- return s;
-
- break;
-
- case XEN_DMOP_IO_RANGE_PCI:
- if ( rangeset_contains_singleton(r, addr >> 32) )
- {
- p->type = IOREQ_TYPE_PCI_CONFIG;
- p->addr = addr;
- return s;
- }
-
- break;
- }
- }
-
- return NULL;
-}
-
-static int hvm_send_buffered_ioreq(struct hvm_ioreq_server *s, ioreq_t *p)
-{
- struct domain *d = current->domain;
- struct hvm_ioreq_page *iorp;
- buffered_iopage_t *pg;
- buf_ioreq_t bp = { .data = p->data,
- .addr = p->addr,
- .type = p->type,
- .dir = p->dir };
- /* Timeoffset sends 64b data, but no address. Use two consecutive slots. */
- int qw = 0;
-
- /* Ensure buffered_iopage fits in a page */
- BUILD_BUG_ON(sizeof(buffered_iopage_t) > PAGE_SIZE);
-
- iorp = &s->bufioreq;
- pg = iorp->va;
-
- if ( !pg )
- return X86EMUL_UNHANDLEABLE;
-
- /*
- * Return 0 for the cases we can't deal with:
- * - 'addr' is only a 20-bit field, so we cannot address beyond 1MB
- * - we cannot buffer accesses to guest memory buffers, as the guest
- * may expect the memory buffer to be synchronously accessed
- * - the count field is usually used with data_is_ptr and since we don't
- * support data_is_ptr we do not waste space for the count field either
- */
- if ( (p->addr > 0xffffful) || p->data_is_ptr || (p->count != 1) )
- return 0;
-
- switch ( p->size )
- {
- case 1:
- bp.size = 0;
- break;
- case 2:
- bp.size = 1;
- break;
- case 4:
- bp.size = 2;
- break;
- case 8:
- bp.size = 3;
- qw = 1;
- break;
- default:
- gdprintk(XENLOG_WARNING, "unexpected ioreq size: %u\n", p->size);
- return X86EMUL_UNHANDLEABLE;
- }
-
- spin_lock(&s->bufioreq_lock);
-
- if ( (pg->ptrs.write_pointer - pg->ptrs.read_pointer) >=
- (IOREQ_BUFFER_SLOT_NUM - qw) )
- {
- /* The queue is full: send the iopacket through the normal path. */
- spin_unlock(&s->bufioreq_lock);
- return X86EMUL_UNHANDLEABLE;
- }
-
- pg->buf_ioreq[pg->ptrs.write_pointer % IOREQ_BUFFER_SLOT_NUM] = bp;
-
- if ( qw )
- {
- bp.data = p->data >> 32;
- pg->buf_ioreq[(pg->ptrs.write_pointer+1) % IOREQ_BUFFER_SLOT_NUM] = bp;
- }
-
- /* Make the ioreq_t visible /before/ write_pointer. */
- smp_wmb();
- pg->ptrs.write_pointer += qw ? 2 : 1;
-
- /* Canonicalize read/write pointers to prevent their overflow. */
- while ( (s->bufioreq_handling == HVM_IOREQSRV_BUFIOREQ_ATOMIC) &&
- qw++ < IOREQ_BUFFER_SLOT_NUM &&
- pg->ptrs.read_pointer >= IOREQ_BUFFER_SLOT_NUM )
- {
- union bufioreq_pointers old = pg->ptrs, new;
- unsigned int n = old.read_pointer / IOREQ_BUFFER_SLOT_NUM;
-
- new.read_pointer = old.read_pointer - n * IOREQ_BUFFER_SLOT_NUM;
- new.write_pointer = old.write_pointer - n * IOREQ_BUFFER_SLOT_NUM;
- cmpxchg(&pg->ptrs.full, old.full, new.full);
- }
-
- notify_via_xen_event_channel(d, s->bufioreq_evtchn);
- spin_unlock(&s->bufioreq_lock);
-
- return X86EMUL_OKAY;
-}
-
-int hvm_send_ioreq(struct hvm_ioreq_server *s, ioreq_t *proto_p,
- bool buffered)
-{
- struct vcpu *curr = current;
- struct domain *d = curr->domain;
- struct hvm_ioreq_vcpu *sv;
-
- ASSERT(s);
-
- if ( buffered )
- return hvm_send_buffered_ioreq(s, proto_p);
-
- if ( unlikely(!vcpu_start_shutdown_deferral(curr)) )
- return X86EMUL_RETRY;
-
- list_for_each_entry ( sv,
- &s->ioreq_vcpu_list,
- list_entry )
- {
- if ( sv->vcpu == curr )
- {
- evtchn_port_t port = sv->ioreq_evtchn;
- ioreq_t *p = get_ioreq(s, curr);
-
- if ( unlikely(p->state != STATE_IOREQ_NONE) )
- {
- gprintk(XENLOG_ERR, "device model set bad IO state %d\n",
- p->state);
- break;
- }
-
- if ( unlikely(p->vp_eport != port) )
- {
- gprintk(XENLOG_ERR, "device model set bad event channel %d\n",
- p->vp_eport);
- break;
- }
-
- proto_p->state = STATE_IOREQ_NONE;
- proto_p->vp_eport = port;
- *p = *proto_p;
-
- prepare_wait_on_xen_event_channel(port);
-
- /*
- * Following happens /after/ blocking and setting up ioreq
- * contents. prepare_wait_on_xen_event_channel() is an implicit
- * barrier.
- */
- p->state = STATE_IOREQ_READY;
- notify_via_xen_event_channel(d, port);
-
- sv->pending = true;
- return X86EMUL_RETRY;
- }
- }
-
- return X86EMUL_UNHANDLEABLE;
-}
-
-unsigned int hvm_broadcast_ioreq(ioreq_t *p, bool buffered)
-{
- struct domain *d = current->domain;
- struct hvm_ioreq_server *s;
- unsigned int id, failed = 0;
-
- FOR_EACH_IOREQ_SERVER(d, id, s)
- {
- if ( !s->enabled )
- continue;
-
- if ( hvm_send_ioreq(s, p, buffered) == X86EMUL_UNHANDLEABLE )
- failed++;
+ *type = (p->type == IOREQ_TYPE_PIO) ?
+ XEN_DMOP_IO_RANGE_PORT : XEN_DMOP_IO_RANGE_MEMORY;
+ *addr = p->addr;
}
-
- return failed;
}

static int hvm_access_cf8(
@@ -1528,13 +143,19 @@ static int hvm_access_cf8(
return X86EMUL_UNHANDLEABLE;
}

-void hvm_ioreq_init(struct domain *d)
+void arch_hvm_ioreq_init(struct domain *d)
{
spin_lock_init(&d->arch.hvm.ioreq_server.lock);

register_portio_handler(d, 0xcf8, 4, hvm_access_cf8);
}

+void arch_hvm_ioreq_destroy(struct domain *d)
+{
+ if ( !relocate_portio_handler(d, 0xcf8, 0xcf8, 4) )
+ return;
+}
+
/*
* Local variables:
* mode: C
diff --git a/xen/arch/x86/hvm/stdvga.c b/xen/arch/x86/hvm/stdvga.c
index e267513..ab6d315 100644
--- a/xen/arch/x86/hvm/stdvga.c
+++ b/xen/arch/x86/hvm/stdvga.c
@@ -27,10 +27,10 @@
* can have side effects.
*/

+#include <xen/hvm/ioreq.h>
#include <xen/types.h>
#include <xen/sched.h>
#include <xen/domain_page.h>
-#include <asm/hvm/ioreq.h>
#include <asm/hvm/support.h>
#include <xen/numa.h>
#include <xen/paging.h>
diff --git a/xen/arch/x86/hvm/vmx/realmode.c b/xen/arch/x86/hvm/vmx/realmode.c
index bdbd9cb..b804262 100644
--- a/xen/arch/x86/hvm/vmx/realmode.c
+++ b/xen/arch/x86/hvm/vmx/realmode.c
@@ -9,6 +9,7 @@
* Keir Fraser <keir@xen.org>
*/

+#include <xen/hvm/ioreq.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
diff --git a/xen/arch/x86/hvm/vmx/vvmx.c b/xen/arch/x86/hvm/vmx/vvmx.c
index 7dfff6c..acfeb1c 100644
--- a/xen/arch/x86/hvm/vmx/vvmx.c
+++ b/xen/arch/x86/hvm/vmx/vvmx.c
@@ -18,11 +18,11 @@
* this program; If not, see <http://www.gnu.org/licenses/>.
*
*/
+#include <xen/hvm/ioreq.h>

#include <asm/types.h>
#include <asm/mtrr.h>
#include <asm/p2m.h>
-#include <asm/hvm/ioreq.h>
#include <asm/hvm/vmx/vmx.h>
#include <asm/hvm/vmx/vvmx.h>
#include <asm/hvm/nestedhvm.h>
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 82bc676..2b06e15 100644
--- a/xen/arch/x86/mm.c
+++ b/xen/arch/x86/mm.c
@@ -99,6 +99,7 @@
* doing the final put_page(), and remove it from the iommu if so.
*/

+#include <xen/hvm/ioreq.h>
#include <xen/init.h>
#include <xen/kernel.h>
#include <xen/lib.h>
@@ -141,7 +142,6 @@
#include <asm/io_apic.h>
#include <asm/pci.h>
#include <asm/guest.h>
-#include <asm/hvm/ioreq.h>

#include <asm/hvm/grant_table.h>
#include <asm/pv/domain.h>
diff --git a/xen/arch/x86/mm/shadow/common.c b/xen/arch/x86/mm/shadow/common.c
index 7737773..c84cbb2 100644
--- a/xen/arch/x86/mm/shadow/common.c
+++ b/xen/arch/x86/mm/shadow/common.c
@@ -20,6 +20,7 @@
* along with this program; If not, see <http://www.gnu.org/licenses/>.
*/

+#include <xen/hvm/ioreq.h>
#include <xen/types.h>
#include <xen/mm.h>
#include <xen/trace.h>
@@ -34,7 +35,6 @@
#include <asm/current.h>
#include <asm/flushtlb.h>
#include <asm/shadow.h>
-#include <asm/hvm/ioreq.h>
#include <xen/numa.h>
#include "private.h"

diff --git a/xen/common/Kconfig b/xen/common/Kconfig
index 15e3b79..fb6fb51 100644
--- a/xen/common/Kconfig
+++ b/xen/common/Kconfig
@@ -139,6 +139,9 @@ config HYPFS_CONFIG
Disable this option in case you want to spare some memory or you
want to hide the .config contents from dom0.

+config IOREQ_SERVER
+ bool
+
config KEXEC
bool "kexec support"
default y
diff --git a/xen/common/Makefile b/xen/common/Makefile
index 06881d0..f6fc3f8 100644
--- a/xen/common/Makefile
+++ b/xen/common/Makefile
@@ -70,6 +70,7 @@ extra-y := symbols-dummy.o

obj-$(CONFIG_COVERAGE) += coverage/
obj-y += sched/
+obj-$(CONFIG_IOREQ_SERVER) += hvm/
obj-$(CONFIG_UBSAN) += ubsan/

obj-$(CONFIG_NEEDS_LIBELF) += libelf/
diff --git a/xen/common/hvm/Makefile b/xen/common/hvm/Makefile
new file mode 100644
index 0000000..326215d
--- /dev/null
+++ b/xen/common/hvm/Makefile
@@ -0,0 +1 @@
+obj-y += ioreq.o
diff --git a/xen/common/hvm/ioreq.c b/xen/common/hvm/ioreq.c
new file mode 100644
index 0000000..7e1fa23
--- /dev/null
+++ b/xen/common/hvm/ioreq.c
@@ -0,0 +1,1430 @@
+/*
+ * hvm/ioreq.c: hardware virtual machine I/O emulation
+ *
+ * Copyright (c) 2016 Citrix Systems Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <xen/ctype.h>
+#include <xen/hvm/ioreq.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/trace.h>
+#include <xen/sched.h>
+#include <xen/irq.h>
+#include <xen/softirq.h>
+#include <xen/domain.h>
+#include <xen/domain_page.h>
+#include <xen/event.h>
+#include <xen/paging.h>
+#include <xen/vpci.h>
+
+#include <public/hvm/dm_op.h>
+#include <public/hvm/ioreq.h>
+#include <public/hvm/params.h>
+
+static void set_ioreq_server(struct domain *d, unsigned int id,
+ struct hvm_ioreq_server *s)
+{
+ ASSERT(id < MAX_NR_IOREQ_SERVERS);
+ ASSERT(!s || !d->arch.hvm.ioreq_server.server[id]);
+
+ d->arch.hvm.ioreq_server.server[id] = s;
+}
+
+/*
+ * Iterate over all possible ioreq servers.
+ *
+ * NOTE: The iteration is backwards such that more recently created
+ * ioreq servers are favoured in hvm_select_ioreq_server().
+ * This is a semantic that previously existed when ioreq servers
+ * were held in a linked list.
+ */
+#define FOR_EACH_IOREQ_SERVER(d, id, s) \
+ for ( (id) = MAX_NR_IOREQ_SERVERS; (id) != 0; ) \
+ if ( !(s = GET_IOREQ_SERVER(d, --(id))) ) \
+ continue; \
+ else
+
+static ioreq_t *get_ioreq(struct hvm_ioreq_server *s, struct vcpu *v)
+{
+ shared_iopage_t *p = s->ioreq.va;
+
+ ASSERT((v == current) || !vcpu_runnable(v));
+ ASSERT(p != NULL);
+
+ return &p->vcpu_ioreq[v->vcpu_id];
+}
+
+bool hvm_io_pending(struct vcpu *v)
+{
+ struct domain *d = v->domain;
+ struct hvm_ioreq_server *s;
+ unsigned int id;
+
+ FOR_EACH_IOREQ_SERVER(d, id, s)
+ {
+ struct hvm_ioreq_vcpu *sv;
+
+ list_for_each_entry ( sv,
+ &s->ioreq_vcpu_list,
+ list_entry )
+ {
+ if ( sv->vcpu == v && sv->pending )
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static void hvm_io_assist(struct hvm_ioreq_vcpu *sv, uint64_t data)
+{
+ struct vcpu *v = sv->vcpu;
+ ioreq_t *ioreq = &v->arch.hvm.hvm_io.io_req;
+
+ if ( hvm_ioreq_needs_completion(ioreq) )
+ ioreq->data = data;
+
+ sv->pending = false;
+}
+
+static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
+{
+ unsigned int prev_state = STATE_IOREQ_NONE;
+
+ while ( sv->pending )
+ {
+ unsigned int state = p->state;
+
+ smp_rmb();
+
+ recheck:
+ if ( unlikely(state == STATE_IOREQ_NONE) )
+ {
+ /*
+ * The only reason we should see this case is when an
+ * emulator is dying and it races with an I/O being
+ * requested.
+ */
+ hvm_io_assist(sv, ~0ul);
+ break;
+ }
+
+ if ( unlikely(state < prev_state) )
+ {
+ gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n",
+ prev_state, state);
+ sv->pending = false;
+ domain_crash(sv->vcpu->domain);
+ return false; /* bail */
+ }
+
+ switch ( prev_state = state )
+ {
+ case STATE_IORESP_READY: /* IORESP_READY -> NONE */
+ p->state = STATE_IOREQ_NONE;
+ hvm_io_assist(sv, p->data);
+ break;
+ case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
+ case STATE_IOREQ_INPROCESS:
+ wait_on_xen_event_channel(sv->ioreq_evtchn,
+ ({ state = p->state;
+ smp_rmb();
+ state != prev_state; }));
+ goto recheck;
+ default:
+ gdprintk(XENLOG_ERR, "Weird HVM iorequest state %u\n", state);
+ sv->pending = false;
+ domain_crash(sv->vcpu->domain);
+ return false; /* bail */
+ }
+ }
+
+ return true;
+}
+
+bool handle_hvm_io_completion(struct vcpu *v)
+{
+ struct domain *d = v->domain;
+ struct hvm_vcpu_io *vio = &v->arch.hvm.hvm_io;
+ struct hvm_ioreq_server *s;
+ enum hvm_io_completion io_completion;
+ unsigned int id;
+
+ if ( has_vpci(d) && vpci_process_pending(v) )
+ {
+ raise_softirq(SCHEDULE_SOFTIRQ);
+ return false;
+ }
+
+ FOR_EACH_IOREQ_SERVER(d, id, s)
+ {
+ struct hvm_ioreq_vcpu *sv;
+
+ list_for_each_entry ( sv,
+ &s->ioreq_vcpu_list,
+ list_entry )
+ {
+ if ( sv->vcpu == v && sv->pending )
+ {
+ if ( !hvm_wait_for_io(sv, get_ioreq(s, v)) )
+ return false;
+
+ break;
+ }
+ }
+ }
+
+ vio->io_req.state = hvm_ioreq_needs_completion(&vio->io_req) ?
+ STATE_IORESP_READY : STATE_IOREQ_NONE;
+
+ msix_write_completion(v);
+ vcpu_end_shutdown_deferral(v);
+
+ io_completion = vio->io_completion;
+ vio->io_completion = HVMIO_no_completion;
+
+ switch ( io_completion )
+ {
+ case HVMIO_no_completion:
+ break;
+
+ case HVMIO_mmio_completion:
+ return handle_mmio();
+
+ case HVMIO_pio_completion:
+ return handle_pio(vio->io_req.addr, vio->io_req.size,
+ vio->io_req.dir);
+
+ case HVMIO_realmode_completion:
+ handle_realmode_completion();
+ break;
+
+ default:
+ ASSERT_UNREACHABLE();
+ break;
+ }
+
+ return true;
+}
+
+static gfn_t hvm_alloc_legacy_ioreq_gfn(struct hvm_ioreq_server *s)
+{
+ struct domain *d = s->target;
+ unsigned int i;
+
+ BUILD_BUG_ON(HVM_PARAM_BUFIOREQ_PFN != HVM_PARAM_IOREQ_PFN + 1);
+
+ for ( i = HVM_PARAM_IOREQ_PFN; i <= HVM_PARAM_BUFIOREQ_PFN; i++ )
+ {
+ if ( !test_and_clear_bit(i, &d->arch.hvm.ioreq_gfn.legacy_mask) )
+ return _gfn(d->arch.hvm.params[i]);
+ }
+
+ return INVALID_GFN;
+}
+
+static gfn_t hvm_alloc_ioreq_gfn(struct hvm_ioreq_server *s)
+{
+ struct domain *d = s->target;
+ unsigned int i;
+
+ for ( i = 0; i < sizeof(d->arch.hvm.ioreq_gfn.mask) * 8; i++ )
+ {
+ if ( test_and_clear_bit(i, &d->arch.hvm.ioreq_gfn.mask) )
+ return _gfn(d->arch.hvm.ioreq_gfn.base + i);
+ }
+
+ /*
+ * If we are out of 'normal' GFNs then we may still have a 'legacy'
+ * GFN available.
+ */
+ return hvm_alloc_legacy_ioreq_gfn(s);
+}
+
+static bool hvm_free_legacy_ioreq_gfn(struct hvm_ioreq_server *s,
+ gfn_t gfn)
+{
+ struct domain *d = s->target;
+ unsigned int i;
+
+ for ( i = HVM_PARAM_IOREQ_PFN; i <= HVM_PARAM_BUFIOREQ_PFN; i++ )
+ {
+ if ( gfn_eq(gfn, _gfn(d->arch.hvm.params[i])) )
+ break;
+ }
+ if ( i > HVM_PARAM_BUFIOREQ_PFN )
+ return false;
+
+ set_bit(i, &d->arch.hvm.ioreq_gfn.legacy_mask);
+ return true;
+}
+
+static void hvm_free_ioreq_gfn(struct hvm_ioreq_server *s, gfn_t gfn)
+{
+ struct domain *d = s->target;
+ unsigned int i = gfn_x(gfn) - d->arch.hvm.ioreq_gfn.base;
+
+ ASSERT(!gfn_eq(gfn, INVALID_GFN));
+
+ if ( !hvm_free_legacy_ioreq_gfn(s, gfn) )
+ {
+ ASSERT(i < sizeof(d->arch.hvm.ioreq_gfn.mask) * 8);
+ set_bit(i, &d->arch.hvm.ioreq_gfn.mask);
+ }
+}
+
+static void hvm_unmap_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
+{
+ struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+
+ if ( gfn_eq(iorp->gfn, INVALID_GFN) )
+ return;
+
+ destroy_ring_for_helper(&iorp->va, iorp->page);
+ iorp->page = NULL;
+
+ hvm_free_ioreq_gfn(s, iorp->gfn);
+ iorp->gfn = INVALID_GFN;
+}
+
+static int hvm_map_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
+{
+ struct domain *d = s->target;
+ struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+ int rc;
+
+ if ( iorp->page )
+ {
+ /*
+ * If a page has already been allocated (which will happen on
+ * demand if hvm_get_ioreq_server_frame() is called), then
+ * mapping a guest frame is not permitted.
+ */
+ if ( gfn_eq(iorp->gfn, INVALID_GFN) )
+ return -EPERM;
+
+ return 0;
+ }
+
+ if ( d->is_dying )
+ return -EINVAL;
+
+ iorp->gfn = hvm_alloc_ioreq_gfn(s);
+
+ if ( gfn_eq(iorp->gfn, INVALID_GFN) )
+ return -ENOMEM;
+
+ rc = prepare_ring_for_helper(d, gfn_x(iorp->gfn), &iorp->page,
+ &iorp->va);
+
+ if ( rc )
+ hvm_unmap_ioreq_gfn(s, buf);
+
+ return rc;
+}
+
+static int hvm_alloc_ioreq_mfn(struct hvm_ioreq_server *s, bool buf)
+{
+ struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+ struct page_info *page;
+
+ if ( iorp->page )
+ {
+ /*
+ * If a guest frame has already been mapped (which may happen
+ * on demand if hvm_get_ioreq_server_info() is called), then
+ * allocating a page is not permitted.
+ */
+ if ( !gfn_eq(iorp->gfn, INVALID_GFN) )
+ return -EPERM;
+
+ return 0;
+ }
+
+ page = alloc_domheap_page(s->target, MEMF_no_refcount);
+
+ if ( !page )
+ return -ENOMEM;
+
+ if ( !get_page_and_type(page, s->target, PGT_writable_page) )
+ {
+ /*
+ * The domain can't possibly know about this page yet, so failure
+ * here is a clear indication of something fishy going on.
+ */
+ domain_crash(s->emulator);
+ return -ENODATA;
+ }
+
+ iorp->va = __map_domain_page_global(page);
+ if ( !iorp->va )
+ goto fail;
+
+ iorp->page = page;
+ clear_page(iorp->va);
+ return 0;
+
+ fail:
+ put_page_alloc_ref(page);
+ put_page_and_type(page);
+
+ return -ENOMEM;
+}
+
+static void hvm_free_ioreq_mfn(struct hvm_ioreq_server *s, bool buf)
+{
+ struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+ struct page_info *page = iorp->page;
+
+ if ( !page )
+ return;
+
+ iorp->page = NULL;
+
+ unmap_domain_page_global(iorp->va);
+ iorp->va = NULL;
+
+ put_page_alloc_ref(page);
+ put_page_and_type(page);
+}
+
+bool is_ioreq_server_page(struct domain *d, const struct page_info *page)
+{
+ const struct hvm_ioreq_server *s;
+ unsigned int id;
+ bool found = false;
+
+ spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ FOR_EACH_IOREQ_SERVER(d, id, s)
+ {
+ if ( (s->ioreq.page == page) || (s->bufioreq.page == page) )
+ {
+ found = true;
+ break;
+ }
+ }
+
+ spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ return found;
+}
+
+static void hvm_remove_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
+
+{
+ struct domain *d = s->target;
+ struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+
+ if ( gfn_eq(iorp->gfn, INVALID_GFN) )
+ return;
+
+ if ( guest_physmap_remove_page(d, iorp->gfn,
+ page_to_mfn(iorp->page), 0) )
+ domain_crash(d);
+ clear_page(iorp->va);
+}
+
+static int hvm_add_ioreq_gfn(struct hvm_ioreq_server *s, bool buf)
+{
+ struct domain *d = s->target;
+ struct hvm_ioreq_page *iorp = buf ? &s->bufioreq : &s->ioreq;
+ int rc;
+
+ if ( gfn_eq(iorp->gfn, INVALID_GFN) )
+ return 0;
+
+ clear_page(iorp->va);
+
+ rc = guest_physmap_add_page(d, iorp->gfn,
+ page_to_mfn(iorp->page), 0);
+ if ( rc == 0 )
+ paging_mark_pfn_dirty(d, _pfn(gfn_x(iorp->gfn)));
+
+ return rc;
+}
+
+static void hvm_update_ioreq_evtchn(struct hvm_ioreq_server *s,
+ struct hvm_ioreq_vcpu *sv)
+{
+ ASSERT(spin_is_locked(&s->lock));
+
+ if ( s->ioreq.va != NULL )
+ {
+ ioreq_t *p = get_ioreq(s, sv->vcpu);
+
+ p->vp_eport = sv->ioreq_evtchn;
+ }
+}
+
+#define HANDLE_BUFIOREQ(s) \
+ ((s)->bufioreq_handling != HVM_IOREQSRV_BUFIOREQ_OFF)
+
+static int hvm_ioreq_server_add_vcpu(struct hvm_ioreq_server *s,
+ struct vcpu *v)
+{
+ struct hvm_ioreq_vcpu *sv;
+ int rc;
+
+ sv = xzalloc(struct hvm_ioreq_vcpu);
+
+ rc = -ENOMEM;
+ if ( !sv )
+ goto fail1;
+
+ spin_lock(&s->lock);
+
+ rc = alloc_unbound_xen_event_channel(v->domain, v->vcpu_id,
+ s->emulator->domain_id, NULL);
+ if ( rc < 0 )
+ goto fail2;
+
+ sv->ioreq_evtchn = rc;
+
+ if ( v->vcpu_id == 0 && HANDLE_BUFIOREQ(s) )
+ {
+ rc = alloc_unbound_xen_event_channel(v->domain, 0,
+ s->emulator->domain_id, NULL);
+ if ( rc < 0 )
+ goto fail3;
+
+ s->bufioreq_evtchn = rc;
+ }
+
+ sv->vcpu = v;
+
+ list_add(&sv->list_entry, &s->ioreq_vcpu_list);
+
+ if ( s->enabled )
+ hvm_update_ioreq_evtchn(s, sv);
+
+ spin_unlock(&s->lock);
+ return 0;
+
+ fail3:
+ free_xen_event_channel(v->domain, sv->ioreq_evtchn);
+
+ fail2:
+ spin_unlock(&s->lock);
+ xfree(sv);
+
+ fail1:
+ return rc;
+}
+
+static void hvm_ioreq_server_remove_vcpu(struct hvm_ioreq_server *s,
+ struct vcpu *v)
+{
+ struct hvm_ioreq_vcpu *sv;
+
+ spin_lock(&s->lock);
+
+ list_for_each_entry ( sv,
+ &s->ioreq_vcpu_list,
+ list_entry )
+ {
+ if ( sv->vcpu != v )
+ continue;
+
+ list_del(&sv->list_entry);
+
+ if ( v->vcpu_id == 0 && HANDLE_BUFIOREQ(s) )
+ free_xen_event_channel(v->domain, s->bufioreq_evtchn);
+
+ free_xen_event_channel(v->domain, sv->ioreq_evtchn);
+
+ xfree(sv);
+ break;
+ }
+
+ spin_unlock(&s->lock);
+}
+
+static void hvm_ioreq_server_remove_all_vcpus(struct hvm_ioreq_server *s)
+{
+ struct hvm_ioreq_vcpu *sv, *next;
+
+ spin_lock(&s->lock);
+
+ list_for_each_entry_safe ( sv,
+ next,
+ &s->ioreq_vcpu_list,
+ list_entry )
+ {
+ struct vcpu *v = sv->vcpu;
+
+ list_del(&sv->list_entry);
+
+ if ( v->vcpu_id == 0 && HANDLE_BUFIOREQ(s) )
+ free_xen_event_channel(v->domain, s->bufioreq_evtchn);
+
+ free_xen_event_channel(v->domain, sv->ioreq_evtchn);
+
+ xfree(sv);
+ }
+
+ spin_unlock(&s->lock);
+}
+
+static int hvm_ioreq_server_map_pages(struct hvm_ioreq_server *s)
+{
+ int rc;
+
+ rc = hvm_map_ioreq_gfn(s, false);
+
+ if ( !rc && HANDLE_BUFIOREQ(s) )
+ rc = hvm_map_ioreq_gfn(s, true);
+
+ if ( rc )
+ hvm_unmap_ioreq_gfn(s, false);
+
+ return rc;
+}
+
+static void hvm_ioreq_server_unmap_pages(struct hvm_ioreq_server *s)
+{
+ hvm_unmap_ioreq_gfn(s, true);
+ hvm_unmap_ioreq_gfn(s, false);
+}
+
+static int hvm_ioreq_server_alloc_pages(struct hvm_ioreq_server *s)
+{
+ int rc;
+
+ rc = hvm_alloc_ioreq_mfn(s, false);
+
+ if ( !rc && (s->bufioreq_handling != HVM_IOREQSRV_BUFIOREQ_OFF) )
+ rc = hvm_alloc_ioreq_mfn(s, true);
+
+ if ( rc )
+ hvm_free_ioreq_mfn(s, false);
+
+ return rc;
+}
+
+static void hvm_ioreq_server_free_pages(struct hvm_ioreq_server *s)
+{
+ hvm_free_ioreq_mfn(s, true);
+ hvm_free_ioreq_mfn(s, false);
+}
+
+static void hvm_ioreq_server_free_rangesets(struct hvm_ioreq_server *s)
+{
+ unsigned int i;
+
+ for ( i = 0; i < NR_IO_RANGE_TYPES; i++ )
+ rangeset_destroy(s->range[i]);
+}
+
+static int hvm_ioreq_server_alloc_rangesets(struct hvm_ioreq_server *s,
+ ioservid_t id)
+{
+ unsigned int i;
+ int rc;
+
+ for ( i = 0; i < NR_IO_RANGE_TYPES; i++ )
+ {
+ char *name;
+
+ rc = asprintf(&name, "ioreq_server %d %s", id,
+ (i == XEN_DMOP_IO_RANGE_PORT) ? "port" :
+ (i == XEN_DMOP_IO_RANGE_MEMORY) ? "memory" :
+ (i == XEN_DMOP_IO_RANGE_PCI) ? "pci" :
+ "");
+ if ( rc )
+ goto fail;
+
+ s->range[i] = rangeset_new(s->target, name,
+ RANGESETF_prettyprint_hex);
+
+ xfree(name);
+
+ rc = -ENOMEM;
+ if ( !s->range[i] )
+ goto fail;
+
+ rangeset_limit(s->range[i], MAX_NR_IO_RANGES);
+ }
+
+ return 0;
+
+ fail:
+ hvm_ioreq_server_free_rangesets(s);
+
+ return rc;
+}
+
+static void hvm_ioreq_server_enable(struct hvm_ioreq_server *s)
+{
+ struct hvm_ioreq_vcpu *sv;
+
+ spin_lock(&s->lock);
+
+ if ( s->enabled )
+ goto done;
+
+ hvm_remove_ioreq_gfn(s, false);
+ hvm_remove_ioreq_gfn(s, true);
+
+ s->enabled = true;
+
+ list_for_each_entry ( sv,
+ &s->ioreq_vcpu_list,
+ list_entry )
+ hvm_update_ioreq_evtchn(s, sv);
+
+ done:
+ spin_unlock(&s->lock);
+}
+
+static void hvm_ioreq_server_disable(struct hvm_ioreq_server *s)
+{
+ spin_lock(&s->lock);
+
+ if ( !s->enabled )
+ goto done;
+
+ hvm_add_ioreq_gfn(s, true);
+ hvm_add_ioreq_gfn(s, false);
+
+ s->enabled = false;
+
+ done:
+ spin_unlock(&s->lock);
+}
+
+static int hvm_ioreq_server_init(struct hvm_ioreq_server *s,
+ struct domain *d, int bufioreq_handling,
+ ioservid_t id)
+{
+ struct domain *currd = current->domain;
+ struct vcpu *v;
+ int rc;
+
+ s->target = d;
+
+ get_knownalive_domain(currd);
+ s->emulator = currd;
+
+ spin_lock_init(&s->lock);
+ INIT_LIST_HEAD(&s->ioreq_vcpu_list);
+ spin_lock_init(&s->bufioreq_lock);
+
+ s->ioreq.gfn = INVALID_GFN;
+ s->bufioreq.gfn = INVALID_GFN;
+
+ rc = hvm_ioreq_server_alloc_rangesets(s, id);
+ if ( rc )
+ return rc;
+
+ s->bufioreq_handling = bufioreq_handling;
+
+ for_each_vcpu ( d, v )
+ {
+ rc = hvm_ioreq_server_add_vcpu(s, v);
+ if ( rc )
+ goto fail_add;
+ }
+
+ return 0;
+
+ fail_add:
+ hvm_ioreq_server_remove_all_vcpus(s);
+ hvm_ioreq_server_unmap_pages(s);
+
+ hvm_ioreq_server_free_rangesets(s);
+
+ put_domain(s->emulator);
+ return rc;
+}
+
+static void hvm_ioreq_server_deinit(struct hvm_ioreq_server *s)
+{
+ ASSERT(!s->enabled);
+ hvm_ioreq_server_remove_all_vcpus(s);
+
+ /*
+ * NOTE: It is safe to call both hvm_ioreq_server_unmap_pages() and
+ * hvm_ioreq_server_free_pages() in that order.
+ * This is because the former will do nothing if the pages
+ * are not mapped, leaving the page to be freed by the latter.
+ * However if the pages are mapped then the former will set
+ * the page_info pointer to NULL, meaning the latter will do
+ * nothing.
+ */
+ hvm_ioreq_server_unmap_pages(s);
+ hvm_ioreq_server_free_pages(s);
+
+ hvm_ioreq_server_free_rangesets(s);
+
+ put_domain(s->emulator);
+}
+
+int hvm_create_ioreq_server(struct domain *d, int bufioreq_handling,
+ ioservid_t *id)
+{
+ struct hvm_ioreq_server *s;
+ unsigned int i;
+ int rc;
+
+ if ( bufioreq_handling > HVM_IOREQSRV_BUFIOREQ_ATOMIC )
+ return -EINVAL;
+
+ s = xzalloc(struct hvm_ioreq_server);
+ if ( !s )
+ return -ENOMEM;
+
+ domain_pause(d);
+ spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ for ( i = 0; i < MAX_NR_IOREQ_SERVERS; i++ )
+ {
+ if ( !GET_IOREQ_SERVER(d, i) )
+ break;
+ }
+
+ rc = -ENOSPC;
+ if ( i >= MAX_NR_IOREQ_SERVERS )
+ goto fail;
+
+ /*
+ * It is safe to call set_ioreq_server() prior to
+ * hvm_ioreq_server_init() since the target domain is paused.
+ */
+ set_ioreq_server(d, i, s);
+
+ rc = hvm_ioreq_server_init(s, d, bufioreq_handling, i);
+ if ( rc )
+ {
+ set_ioreq_server(d, i, NULL);
+ goto fail;
+ }
+
+ if ( id )
+ *id = i;
+
+ spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+ domain_unpause(d);
+
+ return 0;
+
+ fail:
+ spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+ domain_unpause(d);
+
+ xfree(s);
+ return rc;
+}
+
+int hvm_destroy_ioreq_server(struct domain *d, ioservid_t id)
+{
+ struct hvm_ioreq_server *s;
+ int rc;
+
+ spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ s = get_ioreq_server(d, id);
+
+ rc = -ENOENT;
+ if ( !s )
+ goto out;
+
+ rc = -EPERM;
+ if ( s->emulator != current->domain )
+ goto out;
+
+ domain_pause(d);
+
+ p2m_set_ioreq_server(d, 0, s);
+
+ hvm_ioreq_server_disable(s);
+
+ /*
+ * It is safe to call hvm_ioreq_server_deinit() prior to
+ * set_ioreq_server() since the target domain is paused.
+ */
+ hvm_ioreq_server_deinit(s);
+ set_ioreq_server(d, id, NULL);
+
+ domain_unpause(d);
+
+ xfree(s);
+
+ rc = 0;
+
+ out:
+ spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ return rc;
+}
+
+int hvm_get_ioreq_server_info(struct domain *d, ioservid_t id,
+ unsigned long *ioreq_gfn,
+ unsigned long *bufioreq_gfn,
+ evtchn_port_t *bufioreq_port)
+{
+ struct hvm_ioreq_server *s;
+ int rc;
+
+ spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ s = get_ioreq_server(d, id);
+
+ rc = -ENOENT;
+ if ( !s )
+ goto out;
+
+ rc = -EPERM;
+ if ( s->emulator != current->domain )
+ goto out;
+
+ if ( ioreq_gfn || bufioreq_gfn )
+ {
+ rc = hvm_ioreq_server_map_pages(s);
+ if ( rc )
+ goto out;
+ }
+
+ if ( ioreq_gfn )
+ *ioreq_gfn = gfn_x(s->ioreq.gfn);
+
+ if ( HANDLE_BUFIOREQ(s) )
+ {
+ if ( bufioreq_gfn )
+ *bufioreq_gfn = gfn_x(s->bufioreq.gfn);
+
+ if ( bufioreq_port )
+ *bufioreq_port = s->bufioreq_evtchn;
+ }
+
+ rc = 0;
+
+ out:
+ spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ return rc;
+}
+
+int hvm_get_ioreq_server_frame(struct domain *d, ioservid_t id,
+ unsigned long idx, mfn_t *mfn)
+{
+ struct hvm_ioreq_server *s;
+ int rc;
+
+ ASSERT(is_hvm_domain(d));
+
+ spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ s = get_ioreq_server(d, id);
+
+ rc = -ENOENT;
+ if ( !s )
+ goto out;
+
+ rc = -EPERM;
+ if ( s->emulator != current->domain )
+ goto out;
+
+ rc = hvm_ioreq_server_alloc_pages(s);
+ if ( rc )
+ goto out;
+
+ switch ( idx )
+ {
+ case XENMEM_resource_ioreq_server_frame_bufioreq:
+ rc = -ENOENT;
+ if ( !HANDLE_BUFIOREQ(s) )
+ goto out;
+
+ *mfn = page_to_mfn(s->bufioreq.page);
+ rc = 0;
+ break;
+
+ case XENMEM_resource_ioreq_server_frame_ioreq(0):
+ *mfn = page_to_mfn(s->ioreq.page);
+ rc = 0;
+ break;
+
+ default:
+ rc = -EINVAL;
+ break;
+ }
+
+ out:
+ spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ return rc;
+}
+
+int hvm_map_io_range_to_ioreq_server(struct domain *d, ioservid_t id,
+ uint32_t type, uint64_t start,
+ uint64_t end)
+{
+ struct hvm_ioreq_server *s;
+ struct rangeset *r;
+ int rc;
+
+ if ( start > end )
+ return -EINVAL;
+
+ spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ s = get_ioreq_server(d, id);
+
+ rc = -ENOENT;
+ if ( !s )
+ goto out;
+
+ rc = -EPERM;
+ if ( s->emulator != current->domain )
+ goto out;
+
+ switch ( type )
+ {
+ case XEN_DMOP_IO_RANGE_PORT:
+ case XEN_DMOP_IO_RANGE_MEMORY:
+ case XEN_DMOP_IO_RANGE_PCI:
+ r = s->range[type];
+ break;
+
+ default:
+ r = NULL;
+ break;
+ }
+
+ rc = -EINVAL;
+ if ( !r )
+ goto out;
+
+ rc = -EEXIST;
+ if ( rangeset_overlaps_range(r, start, end) )
+ goto out;
+
+ rc = rangeset_add_range(r, start, end);
+
+ out:
+ spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ return rc;
+}
+
+int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
+ uint32_t type, uint64_t start,
+ uint64_t end)
+{
+ struct hvm_ioreq_server *s;
+ struct rangeset *r;
+ int rc;
+
+ if ( start > end )
+ return -EINVAL;
+
+ spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ s = get_ioreq_server(d, id);
+
+ rc = -ENOENT;
+ if ( !s )
+ goto out;
+
+ rc = -EPERM;
+ if ( s->emulator != current->domain )
+ goto out;
+
+ switch ( type )
+ {
+ case XEN_DMOP_IO_RANGE_PORT:
+ case XEN_DMOP_IO_RANGE_MEMORY:
+ case XEN_DMOP_IO_RANGE_PCI:
+ r = s->range[type];
+ break;
+
+ default:
+ r = NULL;
+ break;
+ }
+
+ rc = -EINVAL;
+ if ( !r )
+ goto out;
+
+ rc = -ENOENT;
+ if ( !rangeset_contains_range(r, start, end) )
+ goto out;
+
+ rc = rangeset_remove_range(r, start, end);
+
+ out:
+ spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ return rc;
+}
+
+int hvm_set_ioreq_server_state(struct domain *d, ioservid_t id,
+ bool enabled)
+{
+ struct hvm_ioreq_server *s;
+ int rc;
+
+ spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ s = get_ioreq_server(d, id);
+
+ rc = -ENOENT;
+ if ( !s )
+ goto out;
+
+ rc = -EPERM;
+ if ( s->emulator != current->domain )
+ goto out;
+
+ domain_pause(d);
+
+ if ( enabled )
+ hvm_ioreq_server_enable(s);
+ else
+ hvm_ioreq_server_disable(s);
+
+ domain_unpause(d);
+
+ rc = 0;
+
+ out:
+ spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+ return rc;
+}
+
+int hvm_all_ioreq_servers_add_vcpu(struct domain *d, struct vcpu *v)
+{
+ struct hvm_ioreq_server *s;
+ unsigned int id;
+ int rc;
+
+ spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ FOR_EACH_IOREQ_SERVER(d, id, s)
+ {
+ rc = hvm_ioreq_server_add_vcpu(s, v);
+ if ( rc )
+ goto fail;
+ }
+
+ spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ return 0;
+
+ fail:
+ while ( ++id != MAX_NR_IOREQ_SERVERS )
+ {
+ s = GET_IOREQ_SERVER(d, id);
+
+ if ( !s )
+ continue;
+
+ hvm_ioreq_server_remove_vcpu(s, v);
+ }
+
+ spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ return rc;
+}
+
+void hvm_all_ioreq_servers_remove_vcpu(struct domain *d, struct vcpu *v)
+{
+ struct hvm_ioreq_server *s;
+ unsigned int id;
+
+ spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ FOR_EACH_IOREQ_SERVER(d, id, s)
+ hvm_ioreq_server_remove_vcpu(s, v);
+
+ spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+}
+
+void hvm_destroy_all_ioreq_servers(struct domain *d)
+{
+ struct hvm_ioreq_server *s;
+ unsigned int id;
+
+ arch_hvm_ioreq_destroy(d);
+
+ spin_lock_recursive(&d->arch.hvm.ioreq_server.lock);
+
+ /* No need to domain_pause() as the domain is being torn down */
+
+ FOR_EACH_IOREQ_SERVER(d, id, s)
+ {
+ hvm_ioreq_server_disable(s);
+
+ /*
+ * It is safe to call hvm_ioreq_server_deinit() prior to
+ * set_ioreq_server() since the target domain is being destroyed.
+ */
+ hvm_ioreq_server_deinit(s);
+ set_ioreq_server(d, id, NULL);
+
+ xfree(s);
+ }
+
+ spin_unlock_recursive(&d->arch.hvm.ioreq_server.lock);
+}
+
+struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
+ ioreq_t *p)
+{
+ struct hvm_ioreq_server *s;
+ uint8_t type;
+ uint64_t addr;
+ unsigned int id;
+
+ if ( p->type != IOREQ_TYPE_COPY && p->type != IOREQ_TYPE_PIO )
+ return NULL;
+
+ hvm_get_ioreq_server_range_type(d, p, &type, &addr);
+
+ FOR_EACH_IOREQ_SERVER(d, id, s)
+ {
+ struct rangeset *r;
+
+ if ( !s->enabled )
+ continue;
+
+ r = s->range[type];
+
+ switch ( type )
+ {
+ unsigned long start, end;
+
+ case XEN_DMOP_IO_RANGE_PORT:
+ start = addr;
+ end = start + p->size - 1;
+ if ( rangeset_contains_range(r, start, end) )
+ return s;
+
+ break;
+
+ case XEN_DMOP_IO_RANGE_MEMORY:
+ start = hvm_mmio_first_byte(p);
+ end = hvm_mmio_last_byte(p);
+
+ if ( rangeset_contains_range(r, start, end) )
+ return s;
+
+ break;
+
+ case XEN_DMOP_IO_RANGE_PCI:
+ if ( rangeset_contains_singleton(r, addr >> 32) )
+ {
+ p->type = IOREQ_TYPE_PCI_CONFIG;
+ p->addr = addr;
+ return s;
+ }
+
+ break;
+ }
+ }
+
+ return NULL;
+}
+
+static int hvm_send_buffered_ioreq(struct hvm_ioreq_server *s, ioreq_t *p)
+{
+ struct domain *d = current->domain;
+ struct hvm_ioreq_page *iorp;
+ buffered_iopage_t *pg;
+ buf_ioreq_t bp = { .data = p->data,
+ .addr = p->addr,
+ .type = p->type,
+ .dir = p->dir };
+ /* Timeoffset sends 64b data, but no address. Use two consecutive slots. */
+ int qw = 0;
+
+ /* Ensure buffered_iopage fits in a page */
+ BUILD_BUG_ON(sizeof(buffered_iopage_t) > PAGE_SIZE);
+
+ iorp = &s->bufioreq;
+ pg = iorp->va;
+
+ if ( !pg )
+ return IOREQ_IO_UNHANDLED;
+
+ /*
+ * Return 0 for the cases we can't deal with:
+ * - 'addr' is only a 20-bit field, so we cannot address beyond 1MB
+ * - we cannot buffer accesses to guest memory buffers, as the guest
+ * may expect the memory buffer to be synchronously accessed
+ * - the count field is usually used with data_is_ptr and since we don't
+ * support data_is_ptr we do not waste space for the count field either
+ */
+ if ( (p->addr > 0xffffful) || p->data_is_ptr || (p->count != 1) )
+ return 0;
+
+ switch ( p->size )
+ {
+ case 1:
+ bp.size = 0;
+ break;
+ case 2:
+ bp.size = 1;
+ break;
+ case 4:
+ bp.size = 2;
+ break;
+ case 8:
+ bp.size = 3;
+ qw = 1;
+ break;
+ default:
+ gdprintk(XENLOG_WARNING, "unexpected ioreq size: %u\n", p->size);
+ return IOREQ_IO_UNHANDLED;
+ }
+
+ spin_lock(&s->bufioreq_lock);
+
+ if ( (pg->ptrs.write_pointer - pg->ptrs.read_pointer) >=
+ (IOREQ_BUFFER_SLOT_NUM - qw) )
+ {
+ /* The queue is full: send the iopacket through the normal path. */
+ spin_unlock(&s->bufioreq_lock);
+ return IOREQ_IO_UNHANDLED;
+ }
+
+ pg->buf_ioreq[pg->ptrs.write_pointer % IOREQ_BUFFER_SLOT_NUM] = bp;
+
+ if ( qw )
+ {
+ bp.data = p->data >> 32;
+ pg->buf_ioreq[(pg->ptrs.write_pointer+1) % IOREQ_BUFFER_SLOT_NUM] = bp;
+ }
+
+ /* Make the ioreq_t visible /before/ write_pointer. */
+ smp_wmb();
+ pg->ptrs.write_pointer += qw ? 2 : 1;
+
+ /* Canonicalize read/write pointers to prevent their overflow. */
+ while ( (s->bufioreq_handling == HVM_IOREQSRV_BUFIOREQ_ATOMIC) &&
+ qw++ < IOREQ_BUFFER_SLOT_NUM &&
+ pg->ptrs.read_pointer >= IOREQ_BUFFER_SLOT_NUM )
+ {
+ union bufioreq_pointers old = pg->ptrs, new;
+ unsigned int n = old.read_pointer / IOREQ_BUFFER_SLOT_NUM;
+
+ new.read_pointer = old.read_pointer - n * IOREQ_BUFFER_SLOT_NUM;
+ new.write_pointer = old.write_pointer - n * IOREQ_BUFFER_SLOT_NUM;
+ cmpxchg(&pg->ptrs.full, old.full, new.full);
+ }
+
+ notify_via_xen_event_channel(d, s->bufioreq_evtchn);
+ spin_unlock(&s->bufioreq_lock);
+
+ return IOREQ_IO_HANDLED;
+}
+
+int hvm_send_ioreq(struct hvm_ioreq_server *s, ioreq_t *proto_p,
+ bool buffered)
+{
+ struct vcpu *curr = current;
+ struct domain *d = curr->domain;
+ struct hvm_ioreq_vcpu *sv;
+
+ ASSERT(s);
+
+ if ( buffered )
+ return hvm_send_buffered_ioreq(s, proto_p);
+
+ if ( unlikely(!vcpu_start_shutdown_deferral(curr)) )
+ return IOREQ_IO_RETRY;
+
+ list_for_each_entry ( sv,
+ &s->ioreq_vcpu_list,
+ list_entry )
+ {
+ if ( sv->vcpu == curr )
+ {
+ evtchn_port_t port = sv->ioreq_evtchn;
+ ioreq_t *p = get_ioreq(s, curr);
+
+ if ( unlikely(p->state != STATE_IOREQ_NONE) )
+ {
+ gprintk(XENLOG_ERR, "device model set bad IO state %d\n",
+ p->state);
+ break;
+ }
+
+ if ( unlikely(p->vp_eport != port) )
+ {
+ gprintk(XENLOG_ERR, "device model set bad event channel %d\n",
+ p->vp_eport);
+ break;
+ }
+
+ proto_p->state = STATE_IOREQ_NONE;
+ proto_p->vp_eport = port;
+ *p = *proto_p;
+
+ prepare_wait_on_xen_event_channel(port);
+
+ /*
+ * Following happens /after/ blocking and setting up ioreq
+ * contents. prepare_wait_on_xen_event_channel() is an implicit
+ * barrier.
+ */
+ p->state = STATE_IOREQ_READY;
+ notify_via_xen_event_channel(d, port);
+
+ sv->pending = true;
+ return IOREQ_IO_RETRY;
+ }
+ }
+
+ return IOREQ_IO_UNHANDLED;
+}
+
+unsigned int hvm_broadcast_ioreq(ioreq_t *p, bool buffered)
+{
+ struct domain *d = current->domain;
+ struct hvm_ioreq_server *s;
+ unsigned int id, failed = 0;
+
+ FOR_EACH_IOREQ_SERVER(d, id, s)
+ {
+ if ( !s->enabled )
+ continue;
+
+ if ( hvm_send_ioreq(s, p, buffered) == IOREQ_IO_UNHANDLED )
+ failed++;
+ }
+
+ return failed;
+}
+
+void hvm_ioreq_init(struct domain *d)
+{
+ spin_lock_init(&d->arch.hvm.ioreq_server.lock);
+
+ arch_hvm_ioreq_init(d);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/include/asm-x86/hvm/ioreq.h b/xen/include/asm-x86/hvm/ioreq.h
index e2588e9..0e871e0 100644
--- a/xen/include/asm-x86/hvm/ioreq.h
+++ b/xen/include/asm-x86/hvm/ioreq.h
@@ -19,41 +19,26 @@
#ifndef __ASM_X86_HVM_IOREQ_H__
#define __ASM_X86_HVM_IOREQ_H__

-bool hvm_io_pending(struct vcpu *v);
-bool handle_hvm_io_completion(struct vcpu *v);
-bool is_ioreq_server_page(struct domain *d, const struct page_info *page);
+#include <asm/hvm/emulate.h>
+#include <asm/hvm/hvm.h>
+#include <asm/hvm/vmx/vmx.h>
+
+void handle_realmode_completion(void);
+
+void hvm_get_ioreq_server_range_type(struct domain *d,
+ ioreq_t *p,
+ uint8_t *type,
+ uint64_t *addr);

-int hvm_create_ioreq_server(struct domain *d, int bufioreq_handling,
- ioservid_t *id);
-int hvm_destroy_ioreq_server(struct domain *d, ioservid_t id);
-int hvm_get_ioreq_server_info(struct domain *d, ioservid_t id,
- unsigned long *ioreq_gfn,
- unsigned long *bufioreq_gfn,
- evtchn_port_t *bufioreq_port);
-int hvm_get_ioreq_server_frame(struct domain *d, ioservid_t id,
- unsigned long idx, mfn_t *mfn);
-int hvm_map_io_range_to_ioreq_server(struct domain *d, ioservid_t id,
- uint32_t type, uint64_t start,
- uint64_t end);
-int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
- uint32_t type, uint64_t start,
- uint64_t end);
int hvm_map_mem_type_to_ioreq_server(struct domain *d, ioservid_t id,
uint32_t type, uint32_t flags);
-int hvm_set_ioreq_server_state(struct domain *d, ioservid_t id,
- bool enabled);
-
-int hvm_all_ioreq_servers_add_vcpu(struct domain *d, struct vcpu *v);
-void hvm_all_ioreq_servers_remove_vcpu(struct domain *d, struct vcpu *v);
-void hvm_destroy_all_ioreq_servers(struct domain *d);

-struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
- ioreq_t *p);
-int hvm_send_ioreq(struct hvm_ioreq_server *s, ioreq_t *proto_p,
- bool buffered);
-unsigned int hvm_broadcast_ioreq(ioreq_t *p, bool buffered);
+void arch_hvm_ioreq_init(struct domain *d);
+void arch_hvm_ioreq_destroy(struct domain *d);

-void hvm_ioreq_init(struct domain *d);
+#define IOREQ_IO_HANDLED X86EMUL_OKAY
+#define IOREQ_IO_UNHANDLED X86EMUL_UNHANDLEABLE
+#define IOREQ_IO_RETRY X86EMUL_RETRY

#endif /* __ASM_X86_HVM_IOREQ_H__ */

diff --git a/xen/include/asm-x86/hvm/vcpu.h b/xen/include/asm-x86/hvm/vcpu.h
index 5ccd075..6c1feda 100644
--- a/xen/include/asm-x86/hvm/vcpu.h
+++ b/xen/include/asm-x86/hvm/vcpu.h
@@ -91,13 +91,6 @@ struct hvm_vcpu_io {
const struct g2m_ioport *g2m_ioport;
};

-static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
-{
- return ioreq->state == STATE_IOREQ_READY &&
- !ioreq->data_is_ptr &&
- (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != IOREQ_WRITE);
-}
-
struct nestedvcpu {
bool_t nv_guestmode; /* vcpu in guestmode? */
void *nv_vvmcx; /* l1 guest virtual VMCB/VMCS */
diff --git a/xen/include/xen/hvm/ioreq.h b/xen/include/xen/hvm/ioreq.h
new file mode 100644
index 0000000..40b7b5e
--- /dev/null
+++ b/xen/include/xen/hvm/ioreq.h
@@ -0,0 +1,89 @@
+/*
+ * hvm.h: Hardware virtual machine assist interface definitions.
+ *
+ * Copyright (c) 2016 Citrix Systems Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __HVM_IOREQ_H__
+#define __HVM_IOREQ_H__
+
+#include <xen/sched.h>
+
+#include <asm/hvm/ioreq.h>
+
+#define GET_IOREQ_SERVER(d, id) \
+ (d)->arch.hvm.ioreq_server.server[id]
+
+static inline struct hvm_ioreq_server *get_ioreq_server(const struct domain *d,
+ unsigned int id)
+{
+ if ( id >= MAX_NR_IOREQ_SERVERS )
+ return NULL;
+
+ return GET_IOREQ_SERVER(d, id);
+}
+
+static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
+{
+ return ioreq->state == STATE_IOREQ_READY &&
+ !ioreq->data_is_ptr &&
+ (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != IOREQ_WRITE);
+}
+
+bool hvm_io_pending(struct vcpu *v);
+bool handle_hvm_io_completion(struct vcpu *v);
+bool is_ioreq_server_page(struct domain *d, const struct page_info *page);
+
+int hvm_create_ioreq_server(struct domain *d, int bufioreq_handling,
+ ioservid_t *id);
+int hvm_destroy_ioreq_server(struct domain *d, ioservid_t id);
+int hvm_get_ioreq_server_info(struct domain *d, ioservid_t id,
+ unsigned long *ioreq_gfn,
+ unsigned long *bufioreq_gfn,
+ evtchn_port_t *bufioreq_port);
+int hvm_get_ioreq_server_frame(struct domain *d, ioservid_t id,
+ unsigned long idx, mfn_t *mfn);
+int hvm_map_io_range_to_ioreq_server(struct domain *d, ioservid_t id,
+ uint32_t type, uint64_t start,
+ uint64_t end);
+int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
+ uint32_t type, uint64_t start,
+ uint64_t end);
+int hvm_set_ioreq_server_state(struct domain *d, ioservid_t id,
+ bool enabled);
+
+int hvm_all_ioreq_servers_add_vcpu(struct domain *d, struct vcpu *v);
+void hvm_all_ioreq_servers_remove_vcpu(struct domain *d, struct vcpu *v);
+void hvm_destroy_all_ioreq_servers(struct domain *d);
+
+struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
+ ioreq_t *p);
+int hvm_send_ioreq(struct hvm_ioreq_server *s, ioreq_t *proto_p,
+ bool buffered);
+unsigned int hvm_broadcast_ioreq(ioreq_t *p, bool buffered);
+
+void hvm_ioreq_init(struct domain *d);
+
+#endif /* __HVM_IOREQ_H__ */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-file-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
--
2.7.4
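
For orientation when reviewing the split: the per-arch surface the new common
ioreq code relies on can be summarised as below. This is only a sketch
distilled from the asm-x86/hvm/ioreq.h changes above, not code from the
series; the IOREQ_IO_* values are placeholders (on x86 they alias the
X86EMUL_* codes as shown in the hunk above) and an Arm port would supply its
own definitions of all of these.

/* Per-arch hooks expected by xen/common/hvm/ioreq.c (sketch only). */
void arch_hvm_ioreq_init(struct domain *d);    /* e.g. x86 registers its 0xcf8 handler here */
void arch_hvm_ioreq_destroy(struct domain *d);

/* Derive the rangeset type/address used by hvm_select_ioreq_server(). */
void hvm_get_ioreq_server_range_type(struct domain *d, ioreq_t *p,
                                     uint8_t *type, uint64_t *addr);

/* Status codes returned by hvm_send_ioreq()/hvm_broadcast_ioreq(). */
#define IOREQ_IO_HANDLED   0   /* placeholder; X86EMUL_OKAY on x86         */
#define IOREQ_IO_UNHANDLED 1   /* placeholder; X86EMUL_UNHANDLEABLE on x86 */
#define IOREQ_IO_RETRY     2   /* placeholder; X86EMUL_RETRY on x86        */
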
RE: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
> -----Original Message-----
> From: Oleksandr Tyshchenko <olekstysh@gmail.com>
> Sent: 03 August 2020 19:21
> To: xen-devel@lists.xenproject.org
> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>; Jan Beulich <jbeulich@suse.com>; Andrew
> Cooper <andrew.cooper3@citrix.com>; Wei Liu <wl@xen.org>; Roger Pau Monné <roger.pau@citrix.com>;
> George Dunlap <george.dunlap@citrix.com>; Ian Jackson <ian.jackson@eu.citrix.com>; Julien Grall
> <julien@xen.org>; Stefano Stabellini <sstabellini@kernel.org>; Paul Durrant <paul@xen.org>; Jun
> Nakajima <jun.nakajima@intel.com>; Kevin Tian <kevin.tian@intel.com>; Tim Deegan <tim@xen.org>; Julien
> Grall <julien.grall@arm.com>
> Subject: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
>
> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>
> As a lot of x86 code can be re-used on Arm later on, this patch
> splits IOREQ support into common and arch specific parts.
>
> This support is going to be used on Arm to be able run device
> emulator outside of Xen hypervisor.
>
> Please note, this is a split/cleanup of Julien's PoC:
> "Add support for Guest IO forwarding to a device emulator"
>
> Signed-off-by: Julien Grall <julien.grall@arm.com>
> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> ---
> xen/arch/x86/Kconfig | 1 +
> xen/arch/x86/hvm/dm.c | 2 +-
> xen/arch/x86/hvm/emulate.c | 2 +-
> xen/arch/x86/hvm/hvm.c | 2 +-
> xen/arch/x86/hvm/io.c | 2 +-
> xen/arch/x86/hvm/ioreq.c | 1431 +--------------------------------------
> xen/arch/x86/hvm/stdvga.c | 2 +-
> xen/arch/x86/hvm/vmx/realmode.c | 1 +
> xen/arch/x86/hvm/vmx/vvmx.c | 2 +-
> xen/arch/x86/mm.c | 2 +-
> xen/arch/x86/mm/shadow/common.c | 2 +-
> xen/common/Kconfig | 3 +
> xen/common/Makefile | 1 +
> xen/common/hvm/Makefile | 1 +
> xen/common/hvm/ioreq.c | 1430 ++++++++++++++++++++++++++++++++++++++
> xen/include/asm-x86/hvm/ioreq.h | 45 +-
> xen/include/asm-x86/hvm/vcpu.h | 7 -
> xen/include/xen/hvm/ioreq.h | 89 +++
> 18 files changed, 1575 insertions(+), 1450 deletions(-)
> create mode 100644 xen/common/hvm/Makefile
> create mode 100644 xen/common/hvm/ioreq.c
> create mode 100644 xen/include/xen/hvm/ioreq.h

You need to adjust the MAINTAINERS file since there will now be common 'I/O EMULATION' code. Since I wrote most of ioreq.c, please retain me as a maintainer of the common code.

[snip]
> @@ -1528,13 +143,19 @@ static int hvm_access_cf8(
> return X86EMUL_UNHANDLEABLE;
> }
>
> -void hvm_ioreq_init(struct domain *d)
> +void arch_hvm_ioreq_init(struct domain *d)
> {
> spin_lock_init(&d->arch.hvm.ioreq_server.lock);
>
> register_portio_handler(d, 0xcf8, 4, hvm_access_cf8);
> }
>
> +void arch_hvm_ioreq_destroy(struct domain *d)
> +{
> + if ( !relocate_portio_handler(d, 0xcf8, 0xcf8, 4) )
> + return;

There's not really a lot of point in this. I think an empty function here would be ok.

> +}
> +
> /*
> * Local variables:
> * mode: C

[snip]
> +struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
> + ioreq_t *p)
> +{
> + struct hvm_ioreq_server *s;
> + uint8_t type;
> + uint64_t addr;
> + unsigned int id;
> +
> + if ( p->type != IOREQ_TYPE_COPY && p->type != IOREQ_TYPE_PIO )
> + return NULL;
> +
> + hvm_get_ioreq_server_range_type(d, p, &type, &addr);

Looking at this, I think it would make more sense to fold the check of p->type into hvm_get_ioreq_server_range_type() and have it return success/failure.

> +
> + FOR_EACH_IOREQ_SERVER(d, id, s)
> + {
> + struct rangeset *r;
> +
> + if ( !s->enabled )
> + continue;
> +
> + r = s->range[type];
> +
> + switch ( type )
> + {
> + unsigned long start, end;
> +
> + case XEN_DMOP_IO_RANGE_PORT:
> + start = addr;
> + end = start + p->size - 1;
> + if ( rangeset_contains_range(r, start, end) )
> + return s;
> +
> + break;
> +
> + case XEN_DMOP_IO_RANGE_MEMORY:
> + start = hvm_mmio_first_byte(p);
> + end = hvm_mmio_last_byte(p);
> +
> + if ( rangeset_contains_range(r, start, end) )
> + return s;
> +
> + break;
> +
> + case XEN_DMOP_IO_RANGE_PCI:
> + if ( rangeset_contains_singleton(r, addr >> 32) )
> + {
> + p->type = IOREQ_TYPE_PCI_CONFIG;
> + p->addr = addr;
> + return s;
> + }
> +
> + break;
> + }
> + }
> +
> + return NULL;
> +}

[snip]
> diff --git a/xen/include/xen/hvm/ioreq.h b/xen/include/xen/hvm/ioreq.h
> new file mode 100644
> index 0000000..40b7b5e
> --- /dev/null
> +++ b/xen/include/xen/hvm/ioreq.h
> @@ -0,0 +1,89 @@
> +/*
> + * hvm.h: Hardware virtual machine assist interface definitions.
> + *
> + * Copyright (c) 2016 Citrix Systems Inc.
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + *
> + * This program is distributed in the hope it will be useful, but WITHOUT
> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> + * more details.
> + *
> + * You should have received a copy of the GNU General Public License along with
> + * this program; If not, see <http://www.gnu.org/licenses/>.
> + */
> +
> +#ifndef __HVM_IOREQ_H__
> +#define __HVM_IOREQ_H__
> +
> +#include <xen/sched.h>
> +
> +#include <asm/hvm/ioreq.h>
> +
> +#define GET_IOREQ_SERVER(d, id) \
> + (d)->arch.hvm.ioreq_server.server[id]
> +
> +static inline struct hvm_ioreq_server *get_ioreq_server(const struct domain *d,
> + unsigned int id)
> +{
> + if ( id >= MAX_NR_IOREQ_SERVERS )
> + return NULL;
> +
> + return GET_IOREQ_SERVER(d, id);
> +}
> +
> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
> +{
> + return ioreq->state == STATE_IOREQ_READY &&
> + !ioreq->data_is_ptr &&
> + (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != IOREQ_WRITE);
> +}

I don't think having this in common code is correct. The short-cut of not completing PIO reads seems somewhat x86 specific. Does ARM even have the concept of PIO?

Paul

> +
> +bool hvm_io_pending(struct vcpu *v);
> +bool handle_hvm_io_completion(struct vcpu *v);
> +bool is_ioreq_server_page(struct domain *d, const struct page_info *page);
> +
> +int hvm_create_ioreq_server(struct domain *d, int bufioreq_handling,
> + ioservid_t *id);
> +int hvm_destroy_ioreq_server(struct domain *d, ioservid_t id);
> +int hvm_get_ioreq_server_info(struct domain *d, ioservid_t id,
> + unsigned long *ioreq_gfn,
> + unsigned long *bufioreq_gfn,
> + evtchn_port_t *bufioreq_port);
> +int hvm_get_ioreq_server_frame(struct domain *d, ioservid_t id,
> + unsigned long idx, mfn_t *mfn);
> +int hvm_map_io_range_to_ioreq_server(struct domain *d, ioservid_t id,
> + uint32_t type, uint64_t start,
> + uint64_t end);
> +int hvm_unmap_io_range_from_ioreq_server(struct domain *d, ioservid_t id,
> + uint32_t type, uint64_t start,
> + uint64_t end);
> +int hvm_set_ioreq_server_state(struct domain *d, ioservid_t id,
> + bool enabled);
> +
> +int hvm_all_ioreq_servers_add_vcpu(struct domain *d, struct vcpu *v);
> +void hvm_all_ioreq_servers_remove_vcpu(struct domain *d, struct vcpu *v);
> +void hvm_destroy_all_ioreq_servers(struct domain *d);
> +
> +struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
> + ioreq_t *p);
> +int hvm_send_ioreq(struct hvm_ioreq_server *s, ioreq_t *proto_p,
> + bool buffered);
> +unsigned int hvm_broadcast_ioreq(ioreq_t *p, bool buffered);
> +
> +void hvm_ioreq_init(struct domain *d);
> +
> +#endif /* __HVM_IOREQ_H__ */
> +
> +/*
> + * Local variables:
> + * mode: C
> + * c-file-style: "BSD"
> + * c-basic-offset: 4
> + * tab-width: 4
> + * indent-tabs-mode: nil
> + * End:
> + */
> --
> 2.7.4
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
On 04.08.20 10:45, Paul Durrant wrote:

Hi Paul

>> -----Original Message-----
>> From: Oleksandr Tyshchenko <olekstysh@gmail.com>
>> Sent: 03 August 2020 19:21
>> To: xen-devel@lists.xenproject.org
>> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>; Jan Beulich <jbeulich@suse.com>; Andrew
>> Cooper <andrew.cooper3@citrix.com>; Wei Liu <wl@xen.org>; Roger Pau Monné <roger.pau@citrix.com>;
>> George Dunlap <george.dunlap@citrix.com>; Ian Jackson <ian.jackson@eu.citrix.com>; Julien Grall
>> <julien@xen.org>; Stefano Stabellini <sstabellini@kernel.org>; Paul Durrant <paul@xen.org>; Jun
>> Nakajima <jun.nakajima@intel.com>; Kevin Tian <kevin.tian@intel.com>; Tim Deegan <tim@xen.org>; Julien
>> Grall <julien.grall@arm.com>
>> Subject: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
>>
>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>
>> As a lot of x86 code can be re-used on Arm later on, this patch
>> splits IOREQ support into common and arch specific parts.
>>
>> This support is going to be used on Arm to be able run device
>> emulator outside of Xen hypervisor.
>>
>> Please note, this is a split/cleanup of Julien's PoC:
>> "Add support for Guest IO forwarding to a device emulator"
>>
>> Signed-off-by: Julien Grall <julien.grall@arm.com>
>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>> ---
>> xen/arch/x86/Kconfig | 1 +
>> xen/arch/x86/hvm/dm.c | 2 +-
>> xen/arch/x86/hvm/emulate.c | 2 +-
>> xen/arch/x86/hvm/hvm.c | 2 +-
>> xen/arch/x86/hvm/io.c | 2 +-
>> xen/arch/x86/hvm/ioreq.c | 1431 +--------------------------------------
>> xen/arch/x86/hvm/stdvga.c | 2 +-
>> xen/arch/x86/hvm/vmx/realmode.c | 1 +
>> xen/arch/x86/hvm/vmx/vvmx.c | 2 +-
>> xen/arch/x86/mm.c | 2 +-
>> xen/arch/x86/mm/shadow/common.c | 2 +-
>> xen/common/Kconfig | 3 +
>> xen/common/Makefile | 1 +
>> xen/common/hvm/Makefile | 1 +
>> xen/common/hvm/ioreq.c | 1430 ++++++++++++++++++++++++++++++++++++++
>> xen/include/asm-x86/hvm/ioreq.h | 45 +-
>> xen/include/asm-x86/hvm/vcpu.h | 7 -
>> xen/include/xen/hvm/ioreq.h | 89 +++
>> 18 files changed, 1575 insertions(+), 1450 deletions(-)
>> create mode 100644 xen/common/hvm/Makefile
>> create mode 100644 xen/common/hvm/ioreq.c
>> create mode 100644 xen/include/xen/hvm/ioreq.h
> You need to adjust the MAINTAINERS file since there will now be common 'I/O EMULATION' code. Since I wrote most of ioreq.c, please retain me as a maintainer of the common code.

Oh, I completely forgot about the MAINTAINERS file. Sure, I will update the
file and retain you.


>
> [snip]
>> @@ -1528,13 +143,19 @@ static int hvm_access_cf8(
>> return X86EMUL_UNHANDLEABLE;
>> }
>>
>> -void hvm_ioreq_init(struct domain *d)
>> +void arch_hvm_ioreq_init(struct domain *d)
>> {
>> spin_lock_init(&d->arch.hvm.ioreq_server.lock);
>>
>> register_portio_handler(d, 0xcf8, 4, hvm_access_cf8);
>> }
>>
>> +void arch_hvm_ioreq_destroy(struct domain *d)
>> +{
>> + if ( !relocate_portio_handler(d, 0xcf8, 0xcf8, 4) )
>> + return;
> There's not really a lot of point in this. I think an empty function here would be ok.

ok


>> +struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
>> + ioreq_t *p)
>> +{
>> + struct hvm_ioreq_server *s;
>> + uint8_t type;
>> + uint64_t addr;
>> + unsigned int id;
>> +
>> + if ( p->type != IOREQ_TYPE_COPY && p->type != IOREQ_TYPE_PIO )
>> + return NULL;
>> +
>> + hvm_get_ioreq_server_range_type(d, p, &type, &addr);
> Looking at this, I think it would make more sense to fold the check of p->type into hvm_get_ioreq_server_range_type() and have it return success/failure.

ok, will update.


> diff --git a/xen/include/xen/hvm/ioreq.h b/xen/include/xen/hvm/ioreq.h
>> new file mode 100644
>> index 0000000..40b7b5e
>> --- /dev/null
>> +++ b/xen/include/xen/hvm/ioreq.h
>> @@ -0,0 +1,89 @@
>> +/*
>> + * hvm.h: Hardware virtual machine assist interface definitions.
>> + *
>> + * Copyright (c) 2016 Citrix Systems Inc.
>> + *
>> + * This program is free software; you can redistribute it and/or modify it
>> + * under the terms and conditions of the GNU General Public License,
>> + * version 2, as published by the Free Software Foundation.
>> + *
>> + * This program is distributed in the hope it will be useful, but WITHOUT
>> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
>> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
>> + * more details.
>> + *
>> + * You should have received a copy of the GNU General Public License along with
>> + * this program; If not, see <http://www.gnu.org/licenses/>.
>> + */
>> +
>> +#ifndef __HVM_IOREQ_H__
>> +#define __HVM_IOREQ_H__
>> +
>> +#include <xen/sched.h>
>> +
>> +#include <asm/hvm/ioreq.h>
>> +
>> +#define GET_IOREQ_SERVER(d, id) \
>> + (d)->arch.hvm.ioreq_server.server[id]
>> +
>> +static inline struct hvm_ioreq_server *get_ioreq_server(const struct domain *d,
>> + unsigned int id)
>> +{
>> + if ( id >= MAX_NR_IOREQ_SERVERS )
>> + return NULL;
>> +
>> + return GET_IOREQ_SERVER(d, id);
>> +}
>> +
>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
>> +{
>> + return ioreq->state == STATE_IOREQ_READY &&
>> + !ioreq->data_is_ptr &&
>> + (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != IOREQ_WRITE);
>> +}
> I don't think having this in common code is correct. The short-cut of not completing PIO reads seems somewhat x86 specific. Does ARM even have the concept of PIO?

I am not 100% sure here, but it seems that it doesn't.

Shall I make hvm_ioreq_needs_completion() per arch? The Arm variant would
have the same implementation, but without the "ioreq->type !=
IOREQ_TYPE_PIO" check...

--
Regards,

Oleksandr Tyshchenko
RE: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
> -----Original Message-----
> From: Oleksandr <olekstysh@gmail.com>
> Sent: 04 August 2020 12:10
> To: paul@xen.org; xen-devel@lists.xenproject.org
> Cc: 'Oleksandr Tyshchenko' <oleksandr_tyshchenko@epam.com>; 'Jan Beulich' <jbeulich@suse.com>; 'Andrew
> Cooper' <andrew.cooper3@citrix.com>; 'Wei Liu' <wl@xen.org>; 'Roger Pau Monné' <roger.pau@citrix.com>;
> 'George Dunlap' <george.dunlap@citrix.com>; 'Ian Jackson' <ian.jackson@eu.citrix.com>; 'Julien Grall'
> <julien@xen.org>; 'Stefano Stabellini' <sstabellini@kernel.org>; 'Jun Nakajima'
> <jun.nakajima@intel.com>; 'Kevin Tian' <kevin.tian@intel.com>; 'Tim Deegan' <tim@xen.org>; 'Julien
> Grall' <julien.grall@arm.com>
> Subject: Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
>
>
> On 04.08.20 10:45, Paul Durrant wrote:
>
> Hi Paul
>
> >> -----Original Message-----
> >> From: Oleksandr Tyshchenko <olekstysh@gmail.com>
> >> Sent: 03 August 2020 19:21
> >> To: xen-devel@lists.xenproject.org
> >> Cc: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>; Jan Beulich <jbeulich@suse.com>; Andrew
> >> Cooper <andrew.cooper3@citrix.com>; Wei Liu <wl@xen.org>; Roger Pau Monné <roger.pau@citrix.com>;
> >> George Dunlap <george.dunlap@citrix.com>; Ian Jackson <ian.jackson@eu.citrix.com>; Julien Grall
> >> <julien@xen.org>; Stefano Stabellini <sstabellini@kernel.org>; Paul Durrant <paul@xen.org>; Jun
> >> Nakajima <jun.nakajima@intel.com>; Kevin Tian <kevin.tian@intel.com>; Tim Deegan <tim@xen.org>;
> Julien
> >> Grall <julien.grall@arm.com>
> >> Subject: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
> >>
> >> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> >>
> >> As a lot of x86 code can be re-used on Arm later on, this patch
> >> splits IOREQ support into common and arch specific parts.
> >>
> >> This support is going to be used on Arm to be able run device
> >> emulator outside of Xen hypervisor.
> >>
> >> Please note, this is a split/cleanup of Julien's PoC:
> >> "Add support for Guest IO forwarding to a device emulator"
> >>
> >> Signed-off-by: Julien Grall <julien.grall@arm.com>
> >> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> >> ---
> >> xen/arch/x86/Kconfig | 1 +
> >> xen/arch/x86/hvm/dm.c | 2 +-
> >> xen/arch/x86/hvm/emulate.c | 2 +-
> >> xen/arch/x86/hvm/hvm.c | 2 +-
> >> xen/arch/x86/hvm/io.c | 2 +-
> >> xen/arch/x86/hvm/ioreq.c | 1431 +--------------------------------------
> >> xen/arch/x86/hvm/stdvga.c | 2 +-
> >> xen/arch/x86/hvm/vmx/realmode.c | 1 +
> >> xen/arch/x86/hvm/vmx/vvmx.c | 2 +-
> >> xen/arch/x86/mm.c | 2 +-
> >> xen/arch/x86/mm/shadow/common.c | 2 +-
> >> xen/common/Kconfig | 3 +
> >> xen/common/Makefile | 1 +
> >> xen/common/hvm/Makefile | 1 +
> >> xen/common/hvm/ioreq.c | 1430 ++++++++++++++++++++++++++++++++++++++
> >> xen/include/asm-x86/hvm/ioreq.h | 45 +-
> >> xen/include/asm-x86/hvm/vcpu.h | 7 -
> >> xen/include/xen/hvm/ioreq.h | 89 +++
> >> 18 files changed, 1575 insertions(+), 1450 deletions(-)
> >> create mode 100644 xen/common/hvm/Makefile
> >> create mode 100644 xen/common/hvm/ioreq.c
> >> create mode 100644 xen/include/xen/hvm/ioreq.h
> > You need to adjust the MAINTAINERS file since there will now be common 'I/O EMULATION' code. Since I
> wrote most of ioreq.c, please retain me as a maintainer of the common code.
>
> Oh, I completely forgot about MAINTAINERS file. Sure, I will update file
> and retain you.
>
>
> >
> > [snip]
> >> @@ -1528,13 +143,19 @@ static int hvm_access_cf8(
> >> return X86EMUL_UNHANDLEABLE;
> >> }
> >>
> >> -void hvm_ioreq_init(struct domain *d)
> >> +void arch_hvm_ioreq_init(struct domain *d)
> >> {
> >> spin_lock_init(&d->arch.hvm.ioreq_server.lock);
> >>
> >> register_portio_handler(d, 0xcf8, 4, hvm_access_cf8);
> >> }
> >>
> >> +void arch_hvm_ioreq_destroy(struct domain *d)
> >> +{
> >> + if ( !relocate_portio_handler(d, 0xcf8, 0xcf8, 4) )
> >> + return;
> > There's not really a lot of point in this. I think an empty function here would be ok.
>
> ok
>
>
> >> +struct hvm_ioreq_server *hvm_select_ioreq_server(struct domain *d,
> >> + ioreq_t *p)
> >> +{
> >> + struct hvm_ioreq_server *s;
> >> + uint8_t type;
> >> + uint64_t addr;
> >> + unsigned int id;
> >> +
> >> + if ( p->type != IOREQ_TYPE_COPY && p->type != IOREQ_TYPE_PIO )
> >> + return NULL;
> >> +
> >> + hvm_get_ioreq_server_range_type(d, p, &type, &addr);
> > Looking at this, I think it would make more sense to fold the check of p->type into
> hvm_get_ioreq_server_range_type() and have it return success/failure.
>
> ok, will update.
>
>
> > diff --git a/xen/include/xen/hvm/ioreq.h b/xen/include/xen/hvm/ioreq.h
> >> new file mode 100644
> >> index 0000000..40b7b5e
> >> --- /dev/null
> >> +++ b/xen/include/xen/hvm/ioreq.h
> >> @@ -0,0 +1,89 @@
> >> +/*
> >> + * hvm.h: Hardware virtual machine assist interface definitions.
> >> + *
> >> + * Copyright (c) 2016 Citrix Systems Inc.
> >> + *
> >> + * This program is free software; you can redistribute it and/or modify it
> >> + * under the terms and conditions of the GNU General Public License,
> >> + * version 2, as published by the Free Software Foundation.
> >> + *
> >> + * This program is distributed in the hope it will be useful, but WITHOUT
> >> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> >> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> >> + * more details.
> >> + *
> >> + * You should have received a copy of the GNU General Public License along with
> >> + * this program; If not, see <http://www.gnu.org/licenses/>.
> >> + */
> >> +
> >> +#ifndef __HVM_IOREQ_H__
> >> +#define __HVM_IOREQ_H__
> >> +
> >> +#include <xen/sched.h>
> >> +
> >> +#include <asm/hvm/ioreq.h>
> >> +
> >> +#define GET_IOREQ_SERVER(d, id) \
> >> + (d)->arch.hvm.ioreq_server.server[id]
> >> +
> >> +static inline struct hvm_ioreq_server *get_ioreq_server(const struct domain *d,
> >> + unsigned int id)
> >> +{
> >> + if ( id >= MAX_NR_IOREQ_SERVERS )
> >> + return NULL;
> >> +
> >> + return GET_IOREQ_SERVER(d, id);
> >> +}
> >> +
> >> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
> >> +{
> >> + return ioreq->state == STATE_IOREQ_READY &&
> >> + !ioreq->data_is_ptr &&
> >> + (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != IOREQ_WRITE);
> >> +}
> > I don't think having this in common code is correct. The short-cut of not completing PIO reads seems
> somewhat x86 specific. Does ARM even have the concept of PIO?
>
> I am not 100% sure here, but it seems that doesn't have.
>
> Shall I make hvm_ioreq_needs_completion() per arch? Arm variant would
> have the same implementation, but without "ioreq->type !=
> IOREQ_TYPE_PIO" check...
>

With your series applied, does any common code actually call hvm_ioreq_needs_completion()? I suspect it will remain x86 specific, without any need for an Arm variant.

Paul

> --
> Regards,
>
> Oleksandr Tyshchenko
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
On 04.08.20 14:23, Paul Durrant wrote:
>>
>>> diff --git a/xen/include/xen/hvm/ioreq.h b/xen/include/xen/hvm/ioreq.h
>>>> new file mode 100644
>>>> index 0000000..40b7b5e
>>>> --- /dev/null
>>>> +++ b/xen/include/xen/hvm/ioreq.h
>>>> @@ -0,0 +1,89 @@
>>>> +/*
>>>> + * hvm.h: Hardware virtual machine assist interface definitions.
>>>> + *
>>>> + * Copyright (c) 2016 Citrix Systems Inc.
>>>> + *
>>>> + * This program is free software; you can redistribute it and/or modify it
>>>> + * under the terms and conditions of the GNU General Public License,
>>>> + * version 2, as published by the Free Software Foundation.
>>>> + *
>>>> + * This program is distributed in the hope it will be useful, but WITHOUT
>>>> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
>>>> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
>>>> + * more details.
>>>> + *
>>>> + * You should have received a copy of the GNU General Public License along with
>>>> + * this program; If not, see <http://www.gnu.org/licenses/>.
>>>> + */
>>>> +
>>>> +#ifndef __HVM_IOREQ_H__
>>>> +#define __HVM_IOREQ_H__
>>>> +
>>>> +#include <xen/sched.h>
>>>> +
>>>> +#include <asm/hvm/ioreq.h>
>>>> +
>>>> +#define GET_IOREQ_SERVER(d, id) \
>>>> + (d)->arch.hvm.ioreq_server.server[id]
>>>> +
>>>> +static inline struct hvm_ioreq_server *get_ioreq_server(const struct domain *d,
>>>> + unsigned int id)
>>>> +{
>>>> + if ( id >= MAX_NR_IOREQ_SERVERS )
>>>> + return NULL;
>>>> +
>>>> + return GET_IOREQ_SERVER(d, id);
>>>> +}
>>>> +
>>>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
>>>> +{
>>>> + return ioreq->state == STATE_IOREQ_READY &&
>>>> + !ioreq->data_is_ptr &&
>>>> + (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != IOREQ_WRITE);
>>>> +}
>>> I don't think having this in common code is correct. The short-cut of not completing PIO reads seems
>> somewhat x86 specific. Does ARM even have the concept of PIO?
>>
>> I am not 100% sure here, but it seems that doesn't have.
>>
>> Shall I make hvm_ioreq_needs_completion() per arch? Arm variant would
>> have the same implementation, but without "ioreq->type !=
>> IOREQ_TYPE_PIO" check...
>>
> With your series applied, does any common code actually call hvm_ioreq_needs_completion()? I suspect it will remain x86 specific, without any need for an Arm variant.
Yes, it does. Please see common usage in hvm_io_assist() and
handle_hvm_io_completion() (current patch) and usage in Arm code
(arch/arm/io.c: io_state try_fwd_ioserv) [1]


[1]
https://lists.xenproject.org/archives/html/xen-devel/2020-08/msg00072.html


--
Regards,

Oleksandr Tyshchenko
RE: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
> -----Original Message-----
> From: Oleksandr <olekstysh@gmail.com>
> Sent: 04 August 2020 12:51
> To: paul@xen.org; xen-devel@lists.xenproject.org
> Cc: 'Oleksandr Tyshchenko' <oleksandr_tyshchenko@epam.com>; 'Jan Beulich' <jbeulich@suse.com>; 'Andrew
> Cooper' <andrew.cooper3@citrix.com>; 'Wei Liu' <wl@xen.org>; 'Roger Pau Monné' <roger.pau@citrix.com>;
> 'George Dunlap' <george.dunlap@citrix.com>; 'Ian Jackson' <ian.jackson@eu.citrix.com>; 'Julien Grall'
> <julien@xen.org>; 'Stefano Stabellini' <sstabellini@kernel.org>; 'Jun Nakajima'
> <jun.nakajima@intel.com>; 'Kevin Tian' <kevin.tian@intel.com>; 'Tim Deegan' <tim@xen.org>; 'Julien
> Grall' <julien.grall@arm.com>
> Subject: Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
>
>
> On 04.08.20 14:23, Paul Durrant wrote:
> >>
> >>> diff --git a/xen/include/xen/hvm/ioreq.h b/xen/include/xen/hvm/ioreq.h
> >>>> new file mode 100644
> >>>> index 0000000..40b7b5e
> >>>> --- /dev/null
> >>>> +++ b/xen/include/xen/hvm/ioreq.h
> >>>> @@ -0,0 +1,89 @@
> >>>> +/*
> >>>> + * hvm.h: Hardware virtual machine assist interface definitions.
> >>>> + *
> >>>> + * Copyright (c) 2016 Citrix Systems Inc.
> >>>> + *
> >>>> + * This program is free software; you can redistribute it and/or modify it
> >>>> + * under the terms and conditions of the GNU General Public License,
> >>>> + * version 2, as published by the Free Software Foundation.
> >>>> + *
> >>>> + * This program is distributed in the hope it will be useful, but WITHOUT
> >>>> + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> >>>> + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
> >>>> + * more details.
> >>>> + *
> >>>> + * You should have received a copy of the GNU General Public License along with
> >>>> + * this program; If not, see <http://www.gnu.org/licenses/>.
> >>>> + */
> >>>> +
> >>>> +#ifndef __HVM_IOREQ_H__
> >>>> +#define __HVM_IOREQ_H__
> >>>> +
> >>>> +#include <xen/sched.h>
> >>>> +
> >>>> +#include <asm/hvm/ioreq.h>
> >>>> +
> >>>> +#define GET_IOREQ_SERVER(d, id) \
> >>>> + (d)->arch.hvm.ioreq_server.server[id]
> >>>> +
> >>>> +static inline struct hvm_ioreq_server *get_ioreq_server(const struct domain *d,
> >>>> + unsigned int id)
> >>>> +{
> >>>> + if ( id >= MAX_NR_IOREQ_SERVERS )
> >>>> + return NULL;
> >>>> +
> >>>> + return GET_IOREQ_SERVER(d, id);
> >>>> +}
> >>>> +
> >>>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
> >>>> +{
> >>>> + return ioreq->state == STATE_IOREQ_READY &&
> >>>> + !ioreq->data_is_ptr &&
> >>>> + (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir != IOREQ_WRITE);
> >>>> +}
> >>> I don't think having this in common code is correct. The short-cut of not completing PIO reads
> seems
> >> somewhat x86 specific. Does ARM even have the concept of PIO?
> >>
> >> I am not 100% sure here, but it seems that doesn't have.
> >>
> >> Shall I make hvm_ioreq_needs_completion() per arch? Arm variant would
> >> have the same implementation, but without "ioreq->type !=
> >> IOREQ_TYPE_PIO" check...
> >>
> > With your series applied, does any common code actually call hvm_ioreq_needs_completion()? I suspect
> it will remain x86 specific, without any need for an Arm variant.
> Yes, it does. Please see common usage in hvm_io_assist() and
> handle_hvm_io_completion() (current patch) and usage in Arm code
> (arch/arm/io.c: io_state try_fwd_ioserv) [1]
>
>
> [1]
> https://lists.xenproject.org/archives/html/xen-devel/2020-08/msg00072.html
>

Yes, but that code is clearly not finished since, after setting io_completion, it says:

    /* XXX: Decide what to do */
    if ( rc == IO_RETRY )
        rc = IO_HANDLED;

So, it's not clear what the eventual implementation will be and whether it will need to make that call.

Paul

>
> --
> Regards,
>
> Oleksandr Tyshchenko
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
Hi,

On 04/08/2020 12:10, Oleksandr wrote:
> On 04.08.20 10:45, Paul Durrant wrote:
>>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
>>> +{
>>> +    return ioreq->state == STATE_IOREQ_READY &&
>>> +           !ioreq->data_is_ptr &&
>>> +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir !=
>>> IOREQ_WRITE);
>>> +}
>> I don't think having this in common code is correct. The short-cut of
>> not completing PIO reads seems somewhat x86 specific.

Hmmm, looking at the code, I think it doesn't wait for PIO writes to
complete (not reads). Did I miss anything?

> Does ARM even
>> have the concept of PIO?
>
> I am not 100% sure here, but it seems that doesn't have.

Technically, the PIOs exist on Arm, however they are accessed the same
way as MMIO and will have a dedicated area defined by the HW.

AFAICT, on Arm64, they are only used for PCI IO Bar.

Now the question is whether we want to expose them to the Device
Emulator as PIO or MMIO access. From a generic PoV, a DM shouldn't have
to care about the architecture used. It should just be able to request a
given IOport region.

So it may make sense to differentiate them in the common ioreq code as well.

I had a quick look at QEMU and wasn't able to tell if PIOs and MMIOs
address space are different on Arm as well. Paul, Stefano, do you know
what they are doing?

Cheers,

--
Julien Grall
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
On 04.08.2020 15:52, Julien Grall wrote:
> On 04/08/2020 12:10, Oleksandr wrote:
>> On 04.08.20 10:45, Paul Durrant wrote:
>>>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
>>>> +{
>>>> +    return ioreq->state == STATE_IOREQ_READY &&
>>>> +           !ioreq->data_is_ptr &&
>>>> +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir !=
>>>> IOREQ_WRITE);
>>>> +}
>>> I don't think having this in common code is correct. The short-cut of
>>> not completing PIO reads seems somewhat x86 specific.
>
> Hmmm, looking at the code, I think it doesn't wait for PIO writes to
> complete (not read). Did I miss anything?

The point of the check isn't to determine whether to wait, but
what to do after having waited. Reads need a retry round through
the emulator (to store the result in the designated place),
while writes don't have such a requirement (and hence guest
execution can continue immediately in the general case).

Jan
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
On Tue, 4 Aug 2020, Julien Grall wrote:
> On 04/08/2020 12:10, Oleksandr wrote:
> > On 04.08.20 10:45, Paul Durrant wrote:
> > > > +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
> > > > +{
> > > > +    return ioreq->state == STATE_IOREQ_READY &&
> > > > +           !ioreq->data_is_ptr &&
> > > > +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir !=
> > > > IOREQ_WRITE);
> > > > +}
> > > I don't think having this in common code is correct. The short-cut of not
> > > completing PIO reads seems somewhat x86 specific.
>
> Hmmm, looking at the code, I think it doesn't wait for PIO writes to complete
> (not read). Did I miss anything?
>
> > Does ARM even
> > > have the concept of PIO?
> >
> > I am not 100% sure here, but it seems that doesn't have.
>
> Technically, the PIOs exist on Arm, however they are accessed the same way as
> MMIO and will have a dedicated area defined by the HW.
>
> AFAICT, on Arm64, they are only used for PCI IO Bar.
>
> Now the question is whether we want to expose them to the Device Emulator as
> PIO or MMIO access. From a generic PoV, a DM shouldn't have to care about the
> architecture used. It should just be able to request a given IOport region.
>
> So it may make sense to differentiate them in the common ioreq code as well.
>
> I had a quick look at QEMU and wasn't able to tell if PIOs and MMIOs address
> space are different on Arm as well. Paul, Stefano, do you know what they are
> doing?

On the QEMU side, it looks like PIO (address_space_io) is used in
connection with the emulation of the "in" or "out" instructions, see
ioport.c:cpu_inb for instance. Some parts of PCI on QEMU emulate PIO
space regardless of the architecture, such as
hw/pci/pci_bridge.c:pci_bridge_initfn.

However, because there is no "in" and "out" on ARM, I don't think
address_space_io can be accessed. Specifically, there is no equivalent
for target/i386/misc_helper.c:helper_inb on ARM.

So I think PIO is unused on ARM in QEMU.


FYI the ioreq type for PCI conf space reads and writes is
IOREQ_TYPE_PCI_CONFIG (neither MMIO nor PIO) which is implemented as
pci_host_config_read_common/pci_host_config_write_common directly
(neither PIO nor MMIO).


It looks like PIO-specific things could be kept x86-specific, without
loss of functionalities on the ARM side.


> The point of the check isn't to determine whether to wait, but
> what to do after having waited. Reads need a retry round through
> the emulator (to store the result in the designated place),
> while writes don't have such a requirement (and hence guest
> execution can continue immediately in the general case).

The x86 code looks like this:

    rc = hvm_send_ioreq(s, &p, 0);
    if ( rc != X86EMUL_RETRY || currd->is_shutting_down )
        vio->io_req.state = STATE_IOREQ_NONE;
    else if ( !hvm_ioreq_needs_completion(&vio->io_req) )
        rc = X86EMUL_OKAY;

Basically hvm_send_ioreq is expected to return RETRY.
Then, if it is a PIO write operation only, it is turned into OKAY right
away. Otherwise, rc stays as RETRY.

So, normally, hvmemul_do_io is expected to return RETRY, because the
emulator is not done yet. Am I understanding the code correctly?

If so, who is handling RETRY on x86? I tried to follow the call chain
but ended up in the x86 emulator and got lost :-)


At some point later, after the emulator (QEMU) has completed the
request, handle_hvm_io_completion gets called, which ends up calling
handle_mmio(), finishing the job on the Xen side too.


In other words:
RETRY ==> emulation in progress
OKAY ==> emulation completed


Is that correct?
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
On 04.08.2020 21:11, Stefano Stabellini wrote:
>> The point of the check isn't to determine whether to wait, but
>> what to do after having waited. Reads need a retry round through
>> the emulator (to store the result in the designated place),
>> while writes don't have such a requirement (and hence guest
>> execution can continue immediately in the general case).
>
> The x86 code looks like this:
>
> rc = hvm_send_ioreq(s, &p, 0);
> if ( rc != X86EMUL_RETRY || currd->is_shutting_down )
> vio->io_req.state = STATE_IOREQ_NONE;
> else if ( !hvm_ioreq_needs_completion(&vio->io_req) )
> rc = X86EMUL_OKAY;
>
> Basically hvm_send_ioreq is expected to return RETRY.
> Then, if it is a PIO write operation only, it is turned into OKAY right
> away. Otherwise, rc stays as RETRY.
>
> So, normally, hvmemul_do_io is expected to return RETRY, because the
> emulator is not done yet. Am I understanding the code correctly?

"The emulator" unfortunately is ambiguous here: Do you mean qemu
(or whichever else ioreq server) or the x86 emulator inside Xen?
There are various conditions leading to RETRY. As far as
hvm_send_ioreq() goes, it is expected to return RETRY whenever
some sort of response is to be expected (the most notable
exception being the hvm_send_buffered_ioreq() path), or when
submitting the request isn't possible in the first place.

> If so, who is handling RETRY on x86? I tried to follow the call chain
> but ended up in the x86 emulator and got lost :-)

Not sure I understand the question correctly, but I'll try an
answer nevertheless: hvm_send_ioreq() arranges for the vCPU to be
put to sleep (prepare_wait_on_xen_event_channel()). Once the event
channel got signaled (and vCPU unblocked), hvm_do_resume() ->
handle_hvm_io_completion() -> hvm_wait_for_io() then checks whether
the wait reason has been satisfied (wait_on_xen_event_channel()),
and ...

> At some point later, after the emulator (QEMU) has completed the
> request, handle_hvm_io_completion gets called which ends up calling
> handle_mmio() finishing the job on the Xen side too.

..., as you say, handle_hvm_io_completion() invokes the retry of
the original operation (handle_mmio() or handle_pio() in
particular) if need be.

What's potentially confusing is that there's a second form of
retry, invoked by the x86 insn emulator itself when it needs to
split complex insns (the repeated string insns being the most
important example). This results in actually exiting back to guest
context without having advanced rIP, but after having updated
other register state suitably (to express the progress made so
far).
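
Putting the protocol side of this together, the per-slot cycle can be
modelled stand-alone as below. The state names are Xen's; the numeric values
are illustrative and merely respect the ordering the "state < prev_state"
check in hvm_wait_for_io() relies on. Everything else is a toy model, not
hypervisor code:

#include <assert.h>
#include <stdio.h>

enum ioreq_state {
    STATE_IOREQ_NONE      = 0, /* slot free                                */
    STATE_IOREQ_READY     = 1, /* Xen posted a request, DM not started yet */
    STATE_IOREQ_INPROCESS = 2, /* DM picked it up                          */
    STATE_IORESP_READY    = 3, /* DM wrote the response, Xen to consume it */
};

int main(void)
{
    enum ioreq_state s = STATE_IOREQ_NONE;

    /* hvm_send_ioreq(): fill the slot, set READY, notify the event channel. */
    s = STATE_IOREQ_READY;

    /* Device model: READY -> INPROCESS -> IORESP_READY, then notify back. */
    s = STATE_IOREQ_INPROCESS;
    s = STATE_IORESP_READY;

    /* hvm_wait_for_io(): consume the response, return the slot to NONE and
     * retry the instruction (handle_mmio()/handle_pio()) so the read result
     * lands in the designated place. */
    assert(s == STATE_IORESP_READY);
    s = STATE_IOREQ_NONE;

    printf("final state: %d\n", s);
    return 0;
}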

Jan
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
Hi,

On 04/08/2020 20:11, Stefano Stabellini wrote:
> On Tue, 4 Aug 2020, Julien Grall wrote:
>> On 04/08/2020 12:10, Oleksandr wrote:
>>> On 04.08.20 10:45, Paul Durrant wrote:
>>>>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
>>>>> +{
>>>>> +    return ioreq->state == STATE_IOREQ_READY &&
>>>>> +           !ioreq->data_is_ptr &&
>>>>> +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir !=
>>>>> IOREQ_WRITE);
>>>>> +}
>>>> I don't think having this in common code is correct. The short-cut of not
>>>> completing PIO reads seems somewhat x86 specific.
>>
>> Hmmm, looking at the code, I think it doesn't wait for PIO writes to complete
>> (not read). Did I miss anything?
>>
>>> Does ARM even
>>>> have the concept of PIO?
>>>
>>> I am not 100% sure here, but it seems that doesn't have.
>>
>> Technically, the PIOs exist on Arm, however they are accessed the same way as
>> MMIO and will have a dedicated area defined by the HW.
>>
>> AFAICT, on Arm64, they are only used for PCI IO Bar.
>>
>> Now the question is whether we want to expose them to the Device Emulator as
>> PIO or MMIO access. From a generic PoV, a DM shouldn't have to care about the
>> architecture used. It should just be able to request a given IOport region.
>>
>> So it may make sense to differentiate them in the common ioreq code as well.
>>
>> I had a quick look at QEMU and wasn't able to tell if PIOs and MMIOs address
>> space are different on Arm as well. Paul, Stefano, do you know what they are
>> doing?
>
> On the QEMU side, it looks like PIO (address_space_io) is used in
> connection with the emulation of the "in" or "out" instructions, see
> ioport.c:cpu_inb for instance. Some parts of PCI on QEMU emulate PIO
> space regardless of the architecture, such as
> hw/pci/pci_bridge.c:pci_bridge_initfn.
>
> However, because there is no "in" and "out" on ARM, I don't think
> address_space_io can be accessed. Specifically, there is no equivalent
> for target/i386/misc_helper.c:helper_inb on ARM.

So how are PCI I/O BARs accessed? Surely, they could be used on Arm, right?

Cheers,

--
Julien Grall
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common
Hi,

On 03/08/2020 19:21, Oleksandr Tyshchenko wrote:
> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>
> As a lot of x86 code can be re-used on Arm later on, this patch
> splits IOREQ support into common and arch specific parts.
>
> This support is going to be used on Arm to be able run device
> emulator outside of Xen hypervisor.
>
> Please note, this is a split/cleanup of Julien's PoC:
> "Add support for Guest IO forwarding to a device emulator"
>
> Signed-off-by: Julien Grall <julien.grall@arm.com>
> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
> ---
> xen/arch/x86/Kconfig | 1 +
> xen/arch/x86/hvm/dm.c | 2 +-
> xen/arch/x86/hvm/emulate.c | 2 +-
> xen/arch/x86/hvm/hvm.c | 2 +-
> xen/arch/x86/hvm/io.c | 2 +-
> xen/arch/x86/hvm/ioreq.c | 1431 +--------------------------------------
> xen/arch/x86/hvm/stdvga.c | 2 +-
> xen/arch/x86/hvm/vmx/realmode.c | 1 +
> xen/arch/x86/hvm/vmx/vvmx.c | 2 +-
> xen/arch/x86/mm.c | 2 +-
> xen/arch/x86/mm/shadow/common.c | 2 +-
> xen/common/Kconfig | 3 +
> xen/common/Makefile | 1 +
> xen/common/hvm/Makefile | 1 +
> xen/common/hvm/ioreq.c | 1430 ++++++++++++++++++++++++++++++++++++++
> xen/include/asm-x86/hvm/ioreq.h | 45 +-
> xen/include/asm-x86/hvm/vcpu.h | 7 -
> xen/include/xen/hvm/ioreq.h | 89 +++
> 18 files changed, 1575 insertions(+), 1450 deletions(-)

That's quite a lot of code moved in a single patch. How can we check the
code moved is still correct? Is it a verbatim copy?

> create mode 100644 xen/common/hvm/Makefile
> create mode 100644 xen/common/hvm/ioreq.c
> create mode 100644 xen/include/xen/hvm/ioreq.h

[...]

> +static bool hvm_wait_for_io(struct hvm_ioreq_vcpu *sv, ioreq_t *p)
> +{
> + unsigned int prev_state = STATE_IOREQ_NONE;
> +
> + while ( sv->pending )
> + {
> + unsigned int state = p->state;
> +
> + smp_rmb();
> +
> + recheck:
> + if ( unlikely(state == STATE_IOREQ_NONE) )
> + {
> + /*
> + * The only reason we should see this case is when an
> + * emulator is dying and it races with an I/O being
> + * requested.
> + */
> + hvm_io_assist(sv, ~0ul);
> + break;
> + }
> +
> + if ( unlikely(state < prev_state) )
> + {
> + gdprintk(XENLOG_ERR, "Weird HVM ioreq state transition %u -> %u\n",
> + prev_state, state);
> + sv->pending = false;
> + domain_crash(sv->vcpu->domain);
> + return false; /* bail */
> + }
> +
> + switch ( prev_state = state )
> + {
> + case STATE_IORESP_READY: /* IORESP_READY -> NONE */
> + p->state = STATE_IOREQ_NONE;
> + hvm_io_assist(sv, p->data);
> + break;
> + case STATE_IOREQ_READY: /* IOREQ_{READY,INPROCESS} -> IORESP_READY */
> + case STATE_IOREQ_INPROCESS:
> + wait_on_xen_event_channel(sv->ioreq_evtchn,
> + ({ state = p->state;
> + smp_rmb();
> + state != prev_state; }));
> + goto recheck;

I recall some discussion on security@ about this specific code. An IOREQ
server can be destroyed at any time. When destroying an IOREQ server, all
the vCPUs will be paused to avoid races.

On x86, this was considered to be safe because
wait_on_xen_event_channel() will never return if the vCPU is re-scheduled.

However, on Arm, this function will return even after rescheduling. In
this case, sv and p may point to invalid memory.

IIRC, the suggestion was to harden hvm_wait_for_io(). I guess we could
fetch sv and p again after wait_on_xen_event_channel().

Any opinions?
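
To make the idea a bit more concrete, here is a standalone illustration of
the hardening (plain C, deliberately not Xen code: lookup_server() and the
struct below are made up for the example; the real fix would have to
re-derive sv and p from the domain state after the wait):

#include <stdbool.h>
#include <stdio.h>

/* Toy stand-ins only -- none of this is Xen API. */
struct server {
    bool alive;
    int resp_state;
};

/* Hypothetical re-lookup: returns NULL once the server has been destroyed. */
static struct server *lookup_server(struct server *cached)
{
    return (cached && cached->alive) ? cached : NULL;
}

static bool wait_for_io(struct server *cached)
{
    /*
     * The real wait_on_xen_event_channel() would sit here.  On Arm it can
     * return after the vCPU has been rescheduled, so by the time we run
     * again the IOREQ server may already have been torn down.
     */
    struct server *s = lookup_server(cached); /* re-validate, don't trust cached */

    if ( !s )
        return false;             /* bail instead of touching freed state */

    return s->resp_state == 3;    /* e.g. STATE_IORESP_READY */
}

int main(void)
{
    struct server srv = { .alive = false, .resp_state = 0 };

    /* Server destroyed while "waiting": the hardened path just bails. */
    printf("io completed: %s\n", wait_for_io(&srv) ? "yes" : "no");

    return 0;
}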

Cheers,

--
Julien Grall
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common [ In reply to ]
On 03/08/2020 19:21, Oleksandr Tyshchenko wrote:
> diff --git a/xen/common/Makefile b/xen/common/Makefile
> index 06881d0..f6fc3f8 100644
> --- a/xen/common/Makefile
> +++ b/xen/common/Makefile
> @@ -70,6 +70,7 @@ extra-y := symbols-dummy.o
>
> obj-$(CONFIG_COVERAGE) += coverage/
> obj-y += sched/
> +obj-$(CONFIG_IOREQ_SERVER) += hvm/
> obj-$(CONFIG_UBSAN) += ubsan/
>
> obj-$(CONFIG_NEEDS_LIBELF) += libelf/
> diff --git a/xen/common/hvm/Makefile b/xen/common/hvm/Makefile
> new file mode 100644
> index 0000000..326215d
> --- /dev/null
> +++ b/xen/common/hvm/Makefile
> @@ -0,0 +1 @@
> +obj-y += ioreq.o
> diff --git a/xen/common/hvm/ioreq.c b/xen/common/hvm/ioreq.c
> new file mode 100644
> index 0000000..7e1fa23
> --- /dev/null
> +++ b/xen/common/hvm/ioreq.c
> <snip>

HVM is an internal detail of arch specific code.  It should not escape
into common code.

From x86's point of view, there is nothing conceptually wrong with
having an IOREQ server for PV guests, although it is very unlikely at
this point that adding support would be a good use of time.

Please make this into a proper top-level common set of functionality.

~Andrew
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common [ In reply to ]
On Wed, 5 Aug 2020, Jan Beulich wrote:
> On 04.08.2020 21:11, Stefano Stabellini wrote:
> >> The point of the check isn't to determine whether to wait, but
> >> what to do after having waited. Reads need a retry round through
> >> the emulator (to store the result in the designated place),
> >> while writes don't have such a requirement (and hence guest
> >> execution can continue immediately in the general case).
> >
> > The x86 code looks like this:
> >
> > rc = hvm_send_ioreq(s, &p, 0);
> > if ( rc != X86EMUL_RETRY || currd->is_shutting_down )
> > vio->io_req.state = STATE_IOREQ_NONE;
> > else if ( !hvm_ioreq_needs_completion(&vio->io_req) )
> > rc = X86EMUL_OKAY;
> >
> > Basically hvm_send_ioreq is expected to return RETRY.
> > Then, if it is a PIO write operation only, it is turned into OKAY right
> > away. Otherwise, rc stays as RETRY.
> >
> > So, normally, hvmemul_do_io is expected to return RETRY, because the
> > emulator is not done yet. Am I understanding the code correctly?
>
> "The emulator" unfortunately is ambiguous here: Do you mean qemu
> (or whichever else ioreq server) or the x86 emulator inside Xen?

I meant QEMU. I'll use "QEMU" instead of "emulator" in this thread going
forward for clarity.


> There are various conditions leading to RETRY. As far as
> hvm_send_ioreq() goes, it is expected to return RETRY whenever
> some sort of response is to be expected (the most notable
> exception being the hvm_send_buffered_ioreq() path), or when
> submitting the request isn't possible in the first place.
>
> > If so, who is handling RETRY on x86? It tried to follow the call chain
> > but ended up in the x86 emulator and got lost :-)
>
> Not sure I understand the question correctly, but I'll try an
> answer nevertheless: hvm_send_ioreq() arranges for the vCPU to be
> put to sleep (prepare_wait_on_xen_event_channel()). Once the event
> channel got signaled (and vCPU unblocked), hvm_do_resume() ->
> handle_hvm_io_completion() -> hvm_wait_for_io() then check whether
> the wait reason has been satisfied (wait_on_xen_event_channel()),
> and ...
>
> > At some point later, after the emulator (QEMU) has completed the
> > request, handle_hvm_io_completion gets called which ends up calling
> > handle_mmio() finishing the job on the Xen side too.
>
> ..., as you say, handle_hvm_io_completion() invokes the retry of
> the original operation (handle_mmio() or handle_pio() in
> particular) if need be.

OK, thanks for the details. My interpretation seems to be correct.

In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv should
return IO_RETRY. Then, xen/arch/arm/traps.c:do_trap_stage2_abort_guest
also needs to handle try_handle_mmio returning IO_RETRY the first time
around, and IO_HANDLED after QEMU does its job.

What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
early and let the scheduler do its job? Something like:

enum io_state state = try_handle_mmio(regs, hsr, gpa);

switch ( state )
{
case IO_ABORT:
    goto inject_abt;
case IO_HANDLED:
    advance_pc(regs, hsr);
    return;
case IO_RETRY:
    /* finish later */
    return;
case IO_UNHANDLED:
    /* IO unhandled, try another way to handle it. */
    break;
default:
    ASSERT_UNREACHABLE();
}

Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
handle_hvm_io_completion() after QEMU completes the emulation. Today,
handle_mmio just sets the user register with the read value.

But it would be better if it called again the original function
do_trap_stage2_abort_guest to actually retry the original operation.
This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
IO_HANDLED instead of IO_RETRY, thus, it will advance_pc (the program
counter) completing the handling of this instruction.

The user register with the read value could be set by try_handle_mmio if
try_fwd_ioserv returns IO_HANDLED instead of IO_RETRY.

Is that how the state machine is expected to work?
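
FWIW, here is a tiny self-contained model of that two-pass flow (plain C, no
Xen internals; the fake shared ioreq state and the printfs are just for
illustration -- the real decision is of course taken in try_fwd_ioserv() and
do_trap_stage2_abort_guest()):

#include <stdio.h>

enum io_state { IO_UNHANDLED, IO_HANDLED, IO_RETRY };
enum { STATE_IOREQ_NONE, STATE_IOREQ_READY, STATE_IORESP_READY };

/* Toy stand-in for the shared ioreq slot updated by the device model. */
static int ioreq_state = STATE_IOREQ_NONE;

/* First pass forwards the request and reports IO_RETRY; once the device
 * model has filled in the response, a second pass reports IO_HANDLED. */
static enum io_state try_fwd_ioserv(void)
{
    if ( ioreq_state == STATE_IORESP_READY )
        return IO_HANDLED;

    ioreq_state = STATE_IOREQ_READY;   /* "send" the request to QEMU */
    return IO_RETRY;
}

static void abort_guest(const char *pass)
{
    switch ( try_fwd_ioserv() )
    {
    case IO_HANDLED:
        printf("%s: IO_HANDLED -> advance_pc()\n", pass);
        break;
    case IO_RETRY:
        printf("%s: IO_RETRY -> return, finish later\n", pass);
        break;
    default:
        printf("%s: unhandled\n", pass);
        break;
    }
}

int main(void)
{
    abort_guest("first pass");          /* trap: request forwarded, IO_RETRY */
    ioreq_state = STATE_IORESP_READY;   /* QEMU completes the emulation */
    abort_guest("re-execution");        /* second pass: IO_HANDLED, PC advances */
    return 0;
}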


> What's potentially confusing is that there's a second form of
> retry, invoked by the x86 insn emulator itself when it needs to
> split complex insns (the repeated string insns being the most
> important example). This results in actually exiting back to guest
> context without having advanced rIP, but after having updated
> other register state suitably (to express the progress made so
> far).

Ah! And it seems to be exactly the same label: X86EMUL_RETRY. It would
be a good idea to differentiate between them.
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common [ In reply to ]
On Wed, 5 Aug 2020, Julien Grall wrote:
> On 04/08/2020 20:11, Stefano Stabellini wrote:
> > On Tue, 4 Aug 2020, Julien Grall wrote:
> > > On 04/08/2020 12:10, Oleksandr wrote:
> > > > On 04.08.20 10:45, Paul Durrant wrote:
> > > > > > +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
> > > > > > +{
> > > > > > +    return ioreq->state == STATE_IOREQ_READY &&
> > > > > > +           !ioreq->data_is_ptr &&
> > > > > > +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir !=
> > > > > > IOREQ_WRITE);
> > > > > > +}
> > > > > I don't think having this in common code is correct. The short-cut of
> > > > > not
> > > > > completing PIO reads seems somewhat x86 specific.
> > >
> > > Hmmm, looking at the code, I think it doesn't wait for PIO writes to
> > > complete
> > > (not read). Did I miss anything?
> > >
> > > > Does ARM even
> > > > > have the concept of PIO?
> > > >
> > > > I am not 100% sure here, but it seems that doesn't have.
> > >
> > > Technically, the PIOs exist on Arm, however they are accessed the same way
> > > as
> > > MMIO and will have a dedicated area defined by the HW.
> > >
> > > AFAICT, on Arm64, they are only used for PCI IO Bar.
> > >
> > > Now the question is whether we want to expose them to the Device Emulator
> > > as
> > > PIO or MMIO access. From a generic PoV, a DM shouldn't have to care about
> > > the
> > > architecture used. It should just be able to request a given IOport
> > > region.
> > >
> > > So it may make sense to differentiate them in the common ioreq code as
> > > well.
> > >
> > > I had a quick look at QEMU and wasn't able to tell if PIOs and MMIOs
> > > address
> > > space are different on Arm as well. Paul, Stefano, do you know what they
> > > are
> > > doing?
> >
> > On the QEMU side, it looks like PIO (address_space_io) is used in
> > connection with the emulation of the "in" or "out" instructions, see
> > ioport.c:cpu_inb for instance. Some parts of PCI on QEMU emulate PIO
> > space regardless of the architecture, such as
> > hw/pci/pci_bridge.c:pci_bridge_initfn.
> >
> > However, because there is no "in" and "out" on ARM, I don't think
> > address_space_io can be accessed. Specifically, there is no equivalent
> > for target/i386/misc_helper.c:helper_inb on ARM.
>
> So how PCI I/O BAR are accessed? Surely, they could be used on Arm, right?

PIO is also memory mapped on ARM and it seems to have its own MMIO
address window.
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common [ In reply to ]
On 06.08.2020 02:37, Stefano Stabellini wrote:
> What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
> early and let the scheduler do its job? Something like:
>
> enum io_state state = try_handle_mmio(regs, hsr, gpa);
>
> switch ( state )
> {
> case IO_ABORT:
> goto inject_abt;
> case IO_HANDLED:
> advance_pc(regs, hsr);
> return;
> case IO_RETRY:
> /* finish later */
> return;
> case IO_UNHANDLED:
> /* IO unhandled, try another way to handle it. */
> break;
> default:
> ASSERT_UNREACHABLE();
> }
>
> Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
> handle_hvm_io_completion() after QEMU completes the emulation. Today,
> handle_mmio just sets the user register with the read value.
>
> But it would be better if it called again the original function
> do_trap_stage2_abort_guest to actually retry the original operation.
> This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
> IO_HANDLED instead of IO_RETRY, thus, it will advance_pc (the program
> counter) completing the handling of this instruction.
>
> The user register with the read value could be set by try_handle_mmio if
> try_fwd_ioserv returns IO_HANDLED instead of IO_RETRY.
>
> Is that how the state machine is expected to work?

I think so. Just because it has taken us quite some time (years) on
the x86 side to get reasonably close to how hardware would behave
(I think we're still not fully there): The re-execution path needs
to make sure it observes exactly the same machine state as the
original path did. In particular changes to memory (by another vCPU)
must not be observed.

Jan
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common [ In reply to ]
On 05.08.20 19:15, Andrew Cooper wrote:

Hi Andrew

> On 03/08/2020 19:21, Oleksandr Tyshchenko wrote:
>> diff --git a/xen/common/Makefile b/xen/common/Makefile
>> index 06881d0..f6fc3f8 100644
>> --- a/xen/common/Makefile
>> +++ b/xen/common/Makefile
>> @@ -70,6 +70,7 @@ extra-y := symbols-dummy.o
>>
>> obj-$(CONFIG_COVERAGE) += coverage/
>> obj-y += sched/
>> +obj-$(CONFIG_IOREQ_SERVER) += hvm/
>> obj-$(CONFIG_UBSAN) += ubsan/
>>
>> obj-$(CONFIG_NEEDS_LIBELF) += libelf/
>> diff --git a/xen/common/hvm/Makefile b/xen/common/hvm/Makefile
>> new file mode 100644
>> index 0000000..326215d
>> --- /dev/null
>> +++ b/xen/common/hvm/Makefile
>> @@ -0,0 +1 @@
>> +obj-y += ioreq.o
>> diff --git a/xen/common/hvm/ioreq.c b/xen/common/hvm/ioreq.c
>> new file mode 100644
>> index 0000000..7e1fa23
>> --- /dev/null
>> +++ b/xen/common/hvm/ioreq.c
>> <snip>
> HVM is an internal detail of arch specific code.  It should not escape
> into common code.
>
> From x86's point of view, there is nothing conceptually wrong with
> having an IOREQ server for PV guests, although it is very unlikely at
> this point that adding support would be a good use of time.

Got it.


> Please make this into a proper top-level common set of functionality.

ok.


--
Regards,

Oleksandr Tyshchenko
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common [ In reply to ]
Hi,

On 06/08/2020 01:37, Stefano Stabellini wrote:
> On Wed, 5 Aug 2020, Julien Grall wrote:
>> On 04/08/2020 20:11, Stefano Stabellini wrote:
>>> On Tue, 4 Aug 2020, Julien Grall wrote:
>>>> On 04/08/2020 12:10, Oleksandr wrote:
>>>>> On 04.08.20 10:45, Paul Durrant wrote:
>>>>>>> +static inline bool hvm_ioreq_needs_completion(const ioreq_t *ioreq)
>>>>>>> +{
>>>>>>> +    return ioreq->state == STATE_IOREQ_READY &&
>>>>>>> +           !ioreq->data_is_ptr &&
>>>>>>> +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir !=
>>>>>>> IOREQ_WRITE);
>>>>>>> +}
>>>>>> I don't think having this in common code is correct. The short-cut of
>>>>>> not
>>>>>> completing PIO reads seems somewhat x86 specific.
>>>>
>>>> Hmmm, looking at the code, I think it doesn't wait for PIO writes to
>>>> complete
>>>> (not read). Did I miss anything?
>>>>
>>>>> Does ARM even
>>>>>> have the concept of PIO?
>>>>>
>>>>> I am not 100% sure here, but it seems that doesn't have.
>>>>
>>>> Technically, the PIOs exist on Arm, however they are accessed the same way
>>>> as
>>>> MMIO and will have a dedicated area defined by the HW.
>>>>
>>>> AFAICT, on Arm64, they are only used for PCI IO Bar.
>>>>
>>>> Now the question is whether we want to expose them to the Device Emulator
>>>> as
>>>> PIO or MMIO access. From a generic PoV, a DM shouldn't have to care about
>>>> the
>>>> architecture used. It should just be able to request a given IOport
>>>> region.
>>>>
>>>> So it may make sense to differentiate them in the common ioreq code as
>>>> well.
>>>>
>>>> I had a quick look at QEMU and wasn't able to tell if PIOs and MMIOs
>>>> address
>>>> space are different on Arm as well. Paul, Stefano, do you know what they
>>>> are
>>>> doing?
>>>
>>> On the QEMU side, it looks like PIO (address_space_io) is used in
>>> connection with the emulation of the "in" or "out" instructions, see
>>> ioport.c:cpu_inb for instance. Some parts of PCI on QEMU emulate PIO
>>> space regardless of the architecture, such as
>>> hw/pci/pci_bridge.c:pci_bridge_initfn.
>>>
>>> However, because there is no "in" and "out" on ARM, I don't think
>>> address_space_io can be accessed. Specifically, there is no equivalent
>>> for target/i386/misc_helper.c:helper_inb on ARM.
>>
>> So how PCI I/O BAR are accessed? Surely, they could be used on Arm, right?
>
> PIO is also memory mapped on ARM and it seems to have its own MMIO
> address window.
This part is already well-understood :). However, this only tells us how
an OS accesses a PIO.

What I am trying to figure out is how the hardware (or QEMU) is meant to
work.

From my understanding, the MMIO access will be received by the
hostbridge and then forwarded to the appropriate PCI device. The two
questions I am trying to answer are: how are the I/O BARs configured?
Will they contain an MMIO address or an offset?

If the answer is the latter, then we will need PIO because a DM will
never see the MMIO address (the hostbridge will be emulated in Xen).

I am still trying to navigate through the code and haven't managed to find
an answer so far.

Cheers,

--
Julien Grall
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common [ In reply to ]
On 05.08.20 16:30, Julien Grall wrote:
> Hi,

Hi Julien


>
> On 03/08/2020 19:21, Oleksandr Tyshchenko wrote:
>> From: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>>
>> As a lot of x86 code can be re-used on Arm later on, this patch
>> splits IOREQ support into common and arch specific parts.
>>
>> This support is going to be used on Arm to be able run device
>> emulator outside of Xen hypervisor.
>>
>> Please note, this is a split/cleanup of Julien's PoC:
>> "Add support for Guest IO forwarding to a device emulator"
>>
>> Signed-off-by: Julien Grall <julien.grall@arm.com>
>> Signed-off-by: Oleksandr Tyshchenko <oleksandr_tyshchenko@epam.com>
>> ---
>>   xen/arch/x86/Kconfig            |    1 +
>>   xen/arch/x86/hvm/dm.c           |    2 +-
>>   xen/arch/x86/hvm/emulate.c      |    2 +-
>>   xen/arch/x86/hvm/hvm.c          |    2 +-
>>   xen/arch/x86/hvm/io.c           |    2 +-
>>   xen/arch/x86/hvm/ioreq.c        | 1431 +--------------------------------------
>>   xen/arch/x86/hvm/stdvga.c       |    2 +-
>>   xen/arch/x86/hvm/vmx/realmode.c |    1 +
>>   xen/arch/x86/hvm/vmx/vvmx.c     |    2 +-
>>   xen/arch/x86/mm.c               |    2 +-
>>   xen/arch/x86/mm/shadow/common.c |    2 +-
>>   xen/common/Kconfig              |    3 +
>>   xen/common/Makefile             |    1 +
>>   xen/common/hvm/Makefile         |    1 +
>>   xen/common/hvm/ioreq.c          | 1430 ++++++++++++++++++++++++++++++++++++++
>>   xen/include/asm-x86/hvm/ioreq.h |   45 +-
>>   xen/include/asm-x86/hvm/vcpu.h  |    7 -
>>   xen/include/xen/hvm/ioreq.h     |   89 +++
>>   18 files changed, 1575 insertions(+), 1450 deletions(-)
>
> That's quite a lot of code moved in a single patch. How can we check
> the code moved is still correct? Is it a verbatim copy?
In this patch I mostly tried to separate out the common IOREQ part, which
also resulted in updating the x86 sources to include the new header. Also,
I moved hvm_ioreq_needs_completion() to the common header (which probably
should have been a separate patch). It was a verbatim copy initially (w/o
hvm_map_mem_type_to_ioreq_server) and was then updated to deal with the
arch specific parts.
In which way do you want me to split this patch?

I could think of the following:

1. Copy of x86's ioreq.c/ioreq.h to common code
2. Update common ioreq.c/ioreq.h
3. Update x86's parts to be able to deal with common code
4. Move hvm_ioreq_needs_completion() to common code

--
Regards,

Oleksandr Tyshchenko
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common [ In reply to ]
On Thu, 6 Aug 2020, Jan Beulich wrote:
> On 06.08.2020 02:37, Stefano Stabellini wrote:
> > What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
> > early and let the scheduler do its job? Something like:
> >
> > enum io_state state = try_handle_mmio(regs, hsr, gpa);
> >
> > switch ( state )
> > {
> > case IO_ABORT:
> > goto inject_abt;
> > case IO_HANDLED:
> > advance_pc(regs, hsr);
> > return;
> > case IO_RETRY:
> > /* finish later */
> > return;
> > case IO_UNHANDLED:
> > /* IO unhandled, try another way to handle it. */
> > break;
> > default:
> > ASSERT_UNREACHABLE();
> > }
> >
> > Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
> > handle_hvm_io_completion() after QEMU completes the emulation. Today,
> > handle_mmio just sets the user register with the read value.
> >
> > But it would be better if it called again the original function
> > do_trap_stage2_abort_guest to actually retry the original operation.
> > This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
> > IO_HANDLED instead of IO_RETRY, thus, it will advance_pc (the program
> > counter) completing the handling of this instruction.
> >
> > The user register with the read value could be set by try_handle_mmio if
> > try_fwd_ioserv returns IO_HANDLED instead of IO_RETRY.
> >
> > Is that how the state machine is expected to work?
>
> I think so. Just because it has taken us quite some time (years) on
> the x86 side to get reasonably close to how hardware would behave
> (I think we're still not fully there): The re-execution path needs
> to make sure it observes exactly the same machine state as the
> original path did. In particular changes to memory (by another vCPU)
> must not be observed.

Thanks for the heads up. I think I understand how it is supposed to work
now. I hope Oleksandr is on the same page.
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common [ In reply to ]
On Thu, 6 Aug 2020, Julien Grall wrote:
> On 06/08/2020 01:37, Stefano Stabellini wrote:
> > On Wed, 5 Aug 2020, Julien Grall wrote:
> > > On 04/08/2020 20:11, Stefano Stabellini wrote:
> > > > On Tue, 4 Aug 2020, Julien Grall wrote:
> > > > > On 04/08/2020 12:10, Oleksandr wrote:
> > > > > > On 04.08.20 10:45, Paul Durrant wrote:
> > > > > > > > +static inline bool hvm_ioreq_needs_completion(const ioreq_t
> > > > > > > > *ioreq)
> > > > > > > > +{
> > > > > > > > +    return ioreq->state == STATE_IOREQ_READY &&
> > > > > > > > +           !ioreq->data_is_ptr &&
> > > > > > > > +           (ioreq->type != IOREQ_TYPE_PIO || ioreq->dir !=
> > > > > > > > IOREQ_WRITE);
> > > > > > > > +}
> > > > > > > I don't think having this in common code is correct. The short-cut
> > > > > > > of
> > > > > > > not
> > > > > > > completing PIO reads seems somewhat x86 specific.
> > > > >
> > > > > Hmmm, looking at the code, I think it doesn't wait for PIO writes to
> > > > > complete
> > > > > (not read). Did I miss anything?
> > > > >
> > > > > > Does ARM even
> > > > > > > have the concept of PIO?
> > > > > >
> > > > > > I am not 100% sure here, but it seems that doesn't have.
> > > > >
> > > > > Technically, the PIOs exist on Arm, however they are accessed the same
> > > > > way
> > > > > as
> > > > > MMIO and will have a dedicated area defined by the HW.
> > > > >
> > > > > AFAICT, on Arm64, they are only used for PCI IO Bar.
> > > > >
> > > > > Now the question is whether we want to expose them to the Device
> > > > > Emulator
> > > > > as
> > > > > PIO or MMIO access. From a generic PoV, a DM shouldn't have to care
> > > > > about
> > > > > the
> > > > > architecture used. It should just be able to request a given IOport
> > > > > region.
> > > > >
> > > > > So it may make sense to differentiate them in the common ioreq code as
> > > > > well.
> > > > >
> > > > > I had a quick look at QEMU and wasn't able to tell if PIOs and MMIOs
> > > > > address
> > > > > space are different on Arm as well. Paul, Stefano, do you know what
> > > > > they
> > > > > are
> > > > > doing?
> > > >
> > > > On the QEMU side, it looks like PIO (address_space_io) is used in
> > > > connection with the emulation of the "in" or "out" instructions, see
> > > > ioport.c:cpu_inb for instance. Some parts of PCI on QEMU emulate PIO
> > > > space regardless of the architecture, such as
> > > > hw/pci/pci_bridge.c:pci_bridge_initfn.
> > > >
> > > > However, because there is no "in" and "out" on ARM, I don't think
> > > > address_space_io can be accessed. Specifically, there is no equivalent
> > > > for target/i386/misc_helper.c:helper_inb on ARM.
> > >
> > > So how PCI I/O BAR are accessed? Surely, they could be used on Arm, right?
> >
> > PIO is also memory mapped on ARM and it seems to have its own MMIO
> > address window.
> This part is already well-understood :). However, this only tell us how an OS
> is accessing a PIO.
>
> What I am trying to figure out is how the hardware (or QEMU) is meant to work.
>
> From my understanding, the MMIO access will be received by the hostbridge and
> then forwarded to the appropriate PCI device. The two questions I am trying to
> answer is: How the I/O BARs are configured? Will it contain an MMIO address or
> an offset?
>
> If the answer is the latter, then we will need PIO because a DM will never see
> the MMIO address (the hostbridge will be emulated in Xen).

Now I understand the question :-)

This is the way I understand it works. Let's say that the PIO aperture
is 0x1000-0x2000 which is aliased to 0x3eff0000-0x3eff1000.
0x1000-0x2000 are addresses that cannot be accessed directly.
0x3eff0000-0x3eff1000 is the range that works.

A PCI device PIO BAR will have an address in the 0x1000-0x2000 range,
for instance 0x1100.

However, when the operating system accesses 0x1100, it will issue a read
to 0x3eff0100.

Xen will trap the read to 0x3eff0100 and send it to QEMU.

QEMU has to know that 0x3eff0000-0x3eff1000 is the alias to the PIO
aperture and that 0x3eff0100 corresponds to PCI device foobar. Similarly,
QEMU also has to know the address range of the MMIO aperture and its
remappings, if any (it is possible to have address remapping for MMIO
addresses too.)
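
To put those numbers into a minimal standalone sketch (the window and alias
constants below are taken straight from the example above, not from any real
board or from Xen/QEMU):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PIO_BASE        0x1000UL      /* PIO aperture as seen in the I/O BAR */
#define PIO_SIZE        0x1000UL
#define PIO_ALIAS_BASE  0x3eff0000UL  /* MMIO window the OS actually touches */

/* Translate a trapped MMIO address back to the PIO port the BAR refers to. */
static bool mmio_to_pio(uint64_t mmio_addr, uint16_t *port)
{
    if ( mmio_addr < PIO_ALIAS_BASE || mmio_addr >= PIO_ALIAS_BASE + PIO_SIZE )
        return false;

    *port = (uint16_t)(PIO_BASE + (mmio_addr - PIO_ALIAS_BASE));
    return true;
}

int main(void)
{
    uint16_t port;

    if ( mmio_to_pio(0x3eff0100UL, &port) )
        printf("0x3eff0100 -> PIO port 0x%x\n", (unsigned int)port);  /* 0x1100 */

    return 0;
}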

I think today this information is built into QEMU, not configurable. It
works fine because *I think* the PCI aperture is pretty much the same on
x86 boards, at least the one supported by QEMU for Xen.

On ARM, I think we should explicitly declare the PCI MMIO aperture and
its alias/address-remapping. When we do that, we can also declare the
PIO aperture and its alias/address-remapping.
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common [ In reply to ]
On 06.08.20 23:32, Stefano Stabellini wrote:

Hi Stefano

> On Thu, 6 Aug 2020, Jan Beulich wrote:
>> On 06.08.2020 02:37, Stefano Stabellini wrote:
>>> What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
>>> early and let the scheduler do its job? Something like:
>>>
>>> enum io_state state = try_handle_mmio(regs, hsr, gpa);
>>>
>>> switch ( state )
>>> {
>>> case IO_ABORT:
>>> goto inject_abt;
>>> case IO_HANDLED:
>>> advance_pc(regs, hsr);
>>> return;
>>> case IO_RETRY:
>>> /* finish later */
>>> return;
>>> case IO_UNHANDLED:
>>> /* IO unhandled, try another way to handle it. */
>>> break;
>>> default:
>>> ASSERT_UNREACHABLE();
>>> }
>>>
>>> Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
>>> handle_hvm_io_completion() after QEMU completes the emulation. Today,
>>> handle_mmio just sets the user register with the read value.
>>>
>>> But it would be better if it called again the original function
>>> do_trap_stage2_abort_guest to actually retry the original operation.
>>> This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
>>> IO_HANDLED instead of IO_RETRY, thus, it will advance_pc (the program
>>> counter) completing the handling of this instruction.
>>>
>>> The user register with the read value could be set by try_handle_mmio if
>>> try_fwd_ioserv returns IO_HANDLED instead of IO_RETRY.
>>>
>>> Is that how the state machine is expected to work?
>> I think so. Just because it has taken us quite some time (years) on
>> the x86 side to get reasonably close to how hardware would behave
>> (I think we're still not fully there): The re-execution path needs
>> to make sure it observes exactly the same machine state as the
>> original path did. In particular changes to memory (by another vCPU)
>> must not be observed.
> Thanks for the heads up. I think I understand how it is supposed to work
> now. I hope Oleksandr is on the same page.

Not completely. I am still going through the discussion and navigating
the code, trying to understand the bits that are still missing for me.

Thanks for trying to clarify how it is supposed to work.


--
Regards,

Oleksandr Tyshchenko
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common [ In reply to ]
On 06.08.20 03:37, Stefano Stabellini wrote:

Hi Stefano

Trying to simulate the IO_RETRY handling mechanism (according to the model
below), I continuously get IO_RETRY from try_fwd_ioserv() ...

> OK, thanks for the details. My interpretation seems to be correct.
>
> In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv should
> return IO_RETRY. Then, xen/arch/arm/traps.c:do_trap_stage2_abort_guest
> also needs to handle try_handle_mmio returning IO_RETRY the first
> around, and IO_HANDLED when after QEMU does its job.
>
> What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
> early and let the scheduler do its job? Something like:
>
> enum io_state state = try_handle_mmio(regs, hsr, gpa);
>
> switch ( state )
> {
> case IO_ABORT:
> goto inject_abt;
> case IO_HANDLED:
> advance_pc(regs, hsr);
> return;
> case IO_RETRY:
> /* finish later */
> return;
> case IO_UNHANDLED:
> /* IO unhandled, try another way to handle it. */
> break;
> default:
> ASSERT_UNREACHABLE();
> }
>
> Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
> handle_hvm_io_completion() after QEMU completes the emulation. Today,
> handle_mmio just sets the user register with the read value.
>
> But it would be better if it called again the original function
> do_trap_stage2_abort_guest to actually retry the original operation.
> This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
> IO_HANDLED instead of IO_RETRY,
I may be missing some important point, but I fail to see why try_handle_mmio
(try_fwd_ioserv) would return IO_HANDLED instead of IO_RETRY at this stage.
Or does the current try_fwd_ioserv() logic need rework?


> thus, it will advance_pc (the program
> counter) completing the handling of this instruction.
>
> The user register with the read value could be set by try_handle_mmio if
> try_fwd_ioserv returns IO_HANDLED instead of IO_RETRY.
>
> Is that how the state machine is expected to work?



--
Regards,

Oleksandr Tyshchenko
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common [ In reply to ]
On Fri, 7 Aug 2020, Oleksandr wrote:
> On 06.08.20 03:37, Stefano Stabellini wrote:
>
> Hi Stefano
>
> Trying to simulate IO_RETRY handling mechanism (according to model below) I
> continuously get IO_RETRY from try_fwd_ioserv() ...
>
> > OK, thanks for the details. My interpretation seems to be correct.
> >
> > In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv should
> > return IO_RETRY. Then, xen/arch/arm/traps.c:do_trap_stage2_abort_guest
> > also needs to handle try_handle_mmio returning IO_RETRY the first
> > around, and IO_HANDLED when after QEMU does its job.
> >
> > What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
> > early and let the scheduler do its job? Something like:
> >
> > enum io_state state = try_handle_mmio(regs, hsr, gpa);
> >
> > switch ( state )
> > {
> > case IO_ABORT:
> > goto inject_abt;
> > case IO_HANDLED:
> > advance_pc(regs, hsr);
> > return;
> > case IO_RETRY:
> > /* finish later */
> > return;
> > case IO_UNHANDLED:
> > /* IO unhandled, try another way to handle it. */
> > break;
> > default:
> > ASSERT_UNREACHABLE();
> > }
> >
> > Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
> > handle_hvm_io_completion() after QEMU completes the emulation. Today,
> > handle_mmio just sets the user register with the read value.
> >
> > But it would be better if it called again the original function
> > do_trap_stage2_abort_guest to actually retry the original operation.
> > This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
> > IO_HANDLED instead of IO_RETRY,
> I may miss some important point, but I failed to see why try_handle_mmio
> (try_fwd_ioserv) will return IO_HANDLED instead of IO_RETRY at this stage.
> Or current try_fwd_ioserv() logic needs rework?

I think you should check ioreq->state in try_fwd_ioserv(): if the
result is ready, then ioreq->state should be STATE_IORESP_READY, and you
can return IO_HANDLED.

That is assuming that you are looking at the live version of the ioreq
shared with QEMU instead of a private copy of it, which I am not sure
about. Looking at try_fwd_ioserv() it would seem that vio->io_req is just
a copy? The live version is the one returned by get_ioreq()?

Even in handle_hvm_io_completion, instead of setting vio->io_req.state
to STATE_IORESP_READY by hand, it would be better to look at the live
version of the ioreq because QEMU will have already set ioreq->state
to STATE_IORESP_READY (hw/i386/xen/xen-hvm.c:cpu_handle_ioreq).
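
A trivial standalone illustration of why a snapshot taken at forwarding time
is not enough (plain C, no Xen types; it only shows the copy-vs-live idea):

#include <stdio.h>
#include <string.h>

enum { STATE_IOREQ_NONE, STATE_IOREQ_READY, STATE_IORESP_READY };

struct ioreq { int state; unsigned long data; };

int main(void)
{
    /* "Shared page" slot that the device model updates directly. */
    struct ioreq live = { .state = STATE_IOREQ_READY };

    /* Private snapshot taken when the request was forwarded. */
    struct ioreq copy;
    memcpy(&copy, &live, sizeof(copy));

    /* QEMU completes the emulation and updates only the live slot. */
    live.state = STATE_IORESP_READY;
    live.data  = 0xabcd;

    /* The copy still says IOREQ_READY; only the live slot can be used to
     * decide whether to return IO_HANDLED. */
    printf("copy state %d, live state %d, live data %#lx\n",
           copy.state, live.state, live.data);

    return 0;
}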


> > thus, it will advance_pc (the program
> > counter) completing the handling of this instruction.
> >
> > The user register with the read value could be set by try_handle_mmio if
> > try_fwd_ioserv returns IO_HANDLED instead of IO_RETRY.
> >
> > Is that how the state machine is expected to work?
Re: [RFC PATCH V1 01/12] hvm/ioreq: Make x86's IOREQ feature common [ In reply to ]
On 08.08.20 00:50, Stefano Stabellini wrote:

Hi Stefano

> On Fri, 7 Aug 2020, Oleksandr wrote:
>> On 06.08.20 03:37, Stefano Stabellini wrote:
>>
>> Hi Stefano
>>
>> Trying to simulate IO_RETRY handling mechanism (according to model below) I
>> continuously get IO_RETRY from try_fwd_ioserv() ...
>>
>>> OK, thanks for the details. My interpretation seems to be correct.
>>>
>>> In which case, it looks like xen/arch/arm/io.c:try_fwd_ioserv should
>>> return IO_RETRY. Then, xen/arch/arm/traps.c:do_trap_stage2_abort_guest
>>> also needs to handle try_handle_mmio returning IO_RETRY the first
>>> around, and IO_HANDLED when after QEMU does its job.
>>>
>>> What should do_trap_stage2_abort_guest do on IO_RETRY? Simply return
>>> early and let the scheduler do its job? Something like:
>>>
>>> enum io_state state = try_handle_mmio(regs, hsr, gpa);
>>>
>>> switch ( state )
>>> {
>>> case IO_ABORT:
>>> goto inject_abt;
>>> case IO_HANDLED:
>>> advance_pc(regs, hsr);
>>> return;
>>> case IO_RETRY:
>>> /* finish later */
>>> return;
>>> case IO_UNHANDLED:
>>> /* IO unhandled, try another way to handle it. */
>>> break;
>>> default:
>>> ASSERT_UNREACHABLE();
>>> }
>>>
>>> Then, xen/arch/arm/ioreq.c:handle_mmio() gets called by
>>> handle_hvm_io_completion() after QEMU completes the emulation. Today,
>>> handle_mmio just sets the user register with the read value.
>>>
>>> But it would be better if it called again the original function
>>> do_trap_stage2_abort_guest to actually retry the original operation.
>>> This time do_trap_stage2_abort_guest calls try_handle_mmio() and gets
>>> IO_HANDLED instead of IO_RETRY,
>> I may miss some important point, but I failed to see why try_handle_mmio
>> (try_fwd_ioserv) will return IO_HANDLED instead of IO_RETRY at this stage.
>> Or current try_fwd_ioserv() logic needs rework?
> I think you should check the ioreq->state in try_fwd_ioserv(), if the
> result is ready, then ioreq->state should be STATE_IORESP_READY, and you
> can return IO_HANDLED.

Indeed! I was just coming to the same conclusion when I saw your answer :)

This is a dirty test patch:


---
 xen/arch/arm/io.c               | 12 ++++++++++++
 xen/arch/arm/ioreq.c            | 12 ++++++++++++
 xen/arch/arm/traps.c            |  6 ++++--
 xen/include/asm-arm/hvm/ioreq.h |  2 ++
 xen/include/asm-arm/traps.h     |  3 +++
 5 files changed, 33 insertions(+), 2 deletions(-)

diff --git a/xen/arch/arm/io.c b/xen/arch/arm/io.c
index 436f669..65a08f8 100644
--- a/xen/arch/arm/io.c
+++ b/xen/arch/arm/io.c
@@ -130,6 +130,10 @@ static enum io_state try_fwd_ioserv(struct cpu_user_regs *regs,
     {
     case STATE_IOREQ_NONE:
         break;
+
+    case STATE_IORESP_READY:
+        return IO_HANDLED;
+
     default:
         printk("d%u wrong state %u\n", v->domain->domain_id,
                vio->io_req.state);
@@ -156,9 +160,11 @@ static enum io_state try_fwd_ioserv(struct cpu_user_regs *regs,
     else
         vio->io_completion = HVMIO_mmio_completion;

+#if 0
     /* XXX: Decide what to do */
     if ( rc == IO_RETRY )
         rc = IO_HANDLED;
+#endif

     return rc;
 }
@@ -185,6 +191,12 @@ enum io_state try_handle_mmio(struct cpu_user_regs *regs,

 #ifdef CONFIG_IOREQ_SERVER
         rc = try_fwd_ioserv(regs, v, &info);
+        if ( rc == IO_HANDLED )
+        {
+            printk("HANDLED %s[%d]\n", __func__, __LINE__);
+            handle_mmio_finish();
+        } else
+            printk("RETRY %s[%d]\n", __func__, __LINE__);
 #endif

         return rc;
diff --git a/xen/arch/arm/ioreq.c b/xen/arch/arm/ioreq.c
index 8f60c41..c8ed454 100644
--- a/xen/arch/arm/ioreq.c
+++ b/xen/arch/arm/ioreq.c
@@ -33,8 +33,20 @@
 #include <public/hvm/dm_op.h>
 #include <public/hvm/ioreq.h>

+#include <asm/traps.h>
+
 bool handle_mmio(void)
 {
+    struct cpu_user_regs *regs = guest_cpu_user_regs();
+    const union hsr hsr = { .bits = regs->hsr };
+
+    do_trap_stage2_abort_guest(regs, hsr);
+
+    return true;
+}
+
+bool handle_mmio_finish(void)
+{
     struct vcpu *v = current;
     struct cpu_user_regs *regs = guest_cpu_user_regs();
     const union hsr hsr = { .bits = regs->hsr };
diff --git a/xen/arch/arm/traps.c b/xen/arch/arm/traps.c
index ea472d1..3493d77 100644
--- a/xen/arch/arm/traps.c
+++ b/xen/arch/arm/traps.c
@@ -1882,7 +1882,7 @@ static bool try_map_mmio(gfn_t gfn)
     return !map_regions_p2mt(d, gfn, 1, mfn, p2m_mmio_direct_c);
 }

-static void do_trap_stage2_abort_guest(struct cpu_user_regs *regs,
+void do_trap_stage2_abort_guest(struct cpu_user_regs *regs,
                                        const union hsr hsr)
 {
     /*
@@ -1965,11 +1965,13 @@ static void do_trap_stage2_abort_guest(struct cpu_user_regs *regs,
             case IO_HANDLED:
                 advance_pc(regs, hsr);
                 return;
+            case IO_RETRY:
+                /* finish later */
+                return;
             case IO_UNHANDLED:
                 /* IO unhandled, try another way to handle it. */
                 break;
             default:
-                /* XXX: Handle IO_RETRY */
                 ASSERT_UNREACHABLE();
             }
         }
diff --git a/xen/include/asm-arm/hvm/ioreq.h b/xen/include/asm-arm/hvm/ioreq.h
index 392ce64..fb4684d 100644
--- a/xen/include/asm-arm/hvm/ioreq.h
+++ b/xen/include/asm-arm/hvm/ioreq.h
@@ -27,6 +27,8 @@

 bool handle_mmio(void);

+bool handle_mmio_finish(void);
+
 static inline bool handle_pio(uint16_t port, unsigned int size, int dir)
 {
     /* XXX */
diff --git a/xen/include/asm-arm/traps.h b/xen/include/asm-arm/traps.h
index 997c378..392fdb1 100644
--- a/xen/include/asm-arm/traps.h
+++ b/xen/include/asm-arm/traps.h
@@ -40,6 +40,9 @@ void advance_pc(struct cpu_user_regs *regs, const union hsr hsr);

 void inject_undef_exception(struct cpu_user_regs *regs, const union hsr hsr);

+void do_trap_stage2_abort_guest(struct cpu_user_regs *regs,
+                                        const union hsr hsr);
+
 /* read as zero and write ignore */
 void handle_raz_wi(struct cpu_user_regs *regs, int regidx, bool read,
                    const union hsr hsr, int min_el);
--
2.7.4


>
> That is assuming that you are looking at the live version of the ioreq
> shared with QEMU instead of a private copy of it, which I am not sure.
> Looking at try_fwd_ioserv() it would seem that vio->io_req is just a
> copy? The live version is returned by get_ioreq() ?
>
> Even in handle_hvm_io_completion, instead of setting vio->io_req.state
> to STATE_IORESP_READY by hand, it would be better to look at the live
> version of the ioreq because QEMU will have already set ioreq->state
> to STATE_IORESP_READY (hw/i386/xen/xen-hvm.c:cpu_handle_ioreq).

I need to recheck that.


Thank you.


--
Regards,

Oleksandr Tyshchenko
