Mailing List Archive

[RFC PATCH 3/3] Implement 3-level event channel routines.
From: Wei Liu <liuw@liuw.name>

Add fields to struct domain to hold pointers to the 3-level shared arrays.
Also add a per-vCPU second-level selector in struct vcpu.

These structures are mapped via a new evtchn op. The guest should fall back
to the 2-level event channel if the mapping fails.

The routines are more or less the same as the 2-level ones.

Signed-off-by: Wei Liu <wei.liu2@citrix.com>
---
xen/arch/x86/domain.c | 2 +
xen/arch/x86/irq.c | 2 +-
xen/common/event_channel.c | 520 ++++++++++++++++++++++++++++++++++--
xen/common/keyhandler.c | 12 +-
xen/common/schedule.c | 2 +-
xen/include/public/event_channel.h | 24 ++
xen/include/public/xen.h | 17 +-
xen/include/xen/event.h | 4 +
xen/include/xen/sched.h | 12 +-
9 files changed, 565 insertions(+), 30 deletions(-)

diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 7a07c06..b457b00 100644
--- a/xen/arch/x86/domain.c
+++ b/xen/arch/x86/domain.c
@@ -2058,6 +2058,8 @@ int domain_relinquish_resources(struct domain *d)
}
}

+ event_channel_unmap_nlevel(d);
+
d->arch.relmem = RELMEM_shared;
/* fallthrough */

diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c
index 05cede5..d517e39 100644
--- a/xen/arch/x86/irq.c
+++ b/xen/arch/x86/irq.c
@@ -1452,7 +1452,7 @@ int pirq_guest_unmask(struct domain *d)
{
pirq = pirqs[i]->pirq;
if ( pirqs[i]->masked &&
- !test_bit(pirqs[i]->evtchn, &shared_info(d, evtchn_mask)) )
+ !evtchn_is_masked(d, pirqs[i]->evtchn) )
pirq_guest_eoi(pirqs[i]);
}
} while ( ++pirq < d->nr_pirqs && n == ARRAY_SIZE(pirqs) );
diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
index 9898f8e..fb3a7b4 100644
--- a/xen/common/event_channel.c
+++ b/xen/common/event_channel.c
@@ -27,6 +27,7 @@
#include <xen/guest_access.h>
#include <xen/keyhandler.h>
#include <asm/current.h>
+#include <xen/paging.h>

#include <public/xen.h>
#include <public/event_channel.h>
@@ -413,6 +414,85 @@ static long evtchn_bind_pirq(evtchn_bind_pirq_t *bind)
return rc;
}

+static inline int __evtchn_is_masked_l2(struct domain *d, int port)
+{
+ return test_bit(port, &shared_info(d, evtchn_mask));
+}
+
+static inline int __evtchn_is_masked_l3(struct domain *d, int port)
+{
+ return test_bit(port % EVTCHNS_PER_PAGE,
+ d->evtchn_mask[port / EVTCHNS_PER_PAGE]);
+}
+
+int evtchn_is_masked(struct domain *d, int port)
+{
+ switch ( d->evtchn_level )
+ {
+ case 2:
+ return __evtchn_is_masked_l2(d, port);
+ case 3:
+ return __evtchn_is_masked_l3(d, port);
+ default:
+ printk(" %s: unknown event channel level %d for domain %d \n",
+ __FUNCTION__, d->evtchn_level, d->domain_id);
+ return 1;
+ }
+}
+
+static inline int __evtchn_is_pending_l2(struct domain *d, int port)
+{
+ return test_bit(port, &shared_info(d, evtchn_pending));
+}
+
+static inline int __evtchn_is_pending_l3(struct domain *d, int port)
+{
+ return test_bit(port % EVTCHNS_PER_PAGE,
+ d->evtchn_pending[port / EVTCHNS_PER_PAGE]);
+}
+
+int evtchn_is_pending(struct domain *d, int port)
+{
+ switch ( d->evtchn_level )
+ {
+ case 2:
+ return __evtchn_is_pending_l2(d, port);
+ case 3:
+ return __evtchn_is_pending_l3(d, port);
+ default:
+ printk(" %s: unknown event channel level %d for domain %d\n",
+ __FUNCTION__, d->evtchn_level, d->domain_id);
+ return 0;
+ }
+}
+
+static inline void __evtchn_clear_pending_l2(struct domain *d, int port)
+{
+ clear_bit(port, &shared_info(d, evtchn_pending));
+}
+
+static inline void __evtchn_clear_pending_l3(struct domain *d, int port)
+{
+ clear_bit(port % EVTCHNS_PER_PAGE,
+ d->evtchn_pending[port / EVTCHNS_PER_PAGE]);
+}
+
+static void evtchn_clear_pending(struct domain *d, int port)
+{
+ switch ( d->evtchn_level )
+ {
+ case 2:
+ __evtchn_clear_pending_l2(d, port);
+ break;
+ case 3:
+ __evtchn_clear_pending_l3(d, port);
+ break;
+ default:
+ printk("Cannot clear pending for %d level event channel"
+ " for domain %d, port %d\n",
+ d->evtchn_level, d->domain_id, port);
+ }
+}

static long __evtchn_close(struct domain *d1, int port1)
{
@@ -529,7 +609,7 @@ static long __evtchn_close(struct domain *d1, int port1)
}

/* Clear pending event to avoid unexpected behavior on re-bind. */
- clear_bit(port1, &shared_info(d1, evtchn_pending));
+ evtchn_clear_pending(d1, port1);

/* Reset binding to vcpu0 when the channel is freed. */
chn1->state = ECS_FREE;
@@ -606,16 +686,15 @@ int evtchn_send(struct domain *d, unsigned int lport)
ret = -EINVAL;
}

-out:
+ out:
spin_unlock(&ld->event_lock);

return ret;
}

-static void evtchn_set_pending(struct vcpu *v, int port)
+static void __evtchn_set_pending_l2(struct vcpu *v, int port)
{
struct domain *d = v->domain;
- int vcpuid;

/*
* The following bit operations must happen in strict order.
@@ -633,7 +712,50 @@ static void evtchn_set_pending(struct vcpu *v, int port)
{
vcpu_mark_events_pending(v);
}
-
+}
+
+static void __evtchn_set_pending_l3(struct vcpu *v, int port)
+{
+ struct domain *d = v->domain;
+
+ int page_no = port / EVTCHNS_PER_PAGE;
+ int offset = port % EVTCHNS_PER_PAGE;
+ int l1cb = BITS_PER_EVTCHN_WORD(d)*BITS_PER_EVTCHN_WORD(d);
+ int l2cb = BITS_PER_EVTCHN_WORD(d);
+
+ if ( test_and_set_bit(offset, d->evtchn_pending[page_no]) )
+ return;
+
+ if ( !test_bit(offset, d->evtchn_mask[page_no]) &&
+ !test_and_set_bit(port / l2cb,
+ v->evtchn_pending_sel_l2) &&
+ !test_and_set_bit(port / l1cb,
+ &vcpu_info(v, evtchn_pending_sel)) )
+ {
+ vcpu_mark_events_pending(v);
+ }
+}
+
+static void evtchn_set_pending(struct vcpu *v, int port)
+{
+ struct domain *d = v->domain;
+ int vcpuid;
+
+ switch ( d->evtchn_level )
+ {
+ case 2:
+ __evtchn_set_pending_l2(v, port);
+ break;
+ case 3:
+ __evtchn_set_pending_l3(v, port);
+ break;
+ default:
+ printk("Cannot set pending for %d level event channel"
+ " for domain %d, port %d\n",
+ d->evtchn_level, d->domain_id, port);
+ return;
+ }
+
/* Check if some VCPU might be polling for this event. */
if ( likely(bitmap_empty(d->poll_mask, d->max_vcpus)) )
return;
@@ -916,21 +1038,16 @@ long evtchn_bind_vcpu(unsigned int port, unsigned int vcpu_id)
}


-int evtchn_unmask(unsigned int port)
+static int __evtchn_unmask_l2(unsigned int port)
{
struct domain *d = current->domain;
struct vcpu *v;

- ASSERT(spin_is_locked(&d->event_lock));
-
- if ( unlikely(!port_is_valid(d, port)) )
- return -EINVAL;
-
v = d->vcpu[evtchn_from_port(d, port)->notify_vcpu_id];

/*
* These operations must happen in strict order. Based on
- * include/xen/event.h:evtchn_set_pending().
+ * __evtchn_set_pending_l2().
*/
if ( test_and_clear_bit(port, &shared_info(d, evtchn_mask)) &&
test_bit (port, &shared_info(d, evtchn_pending)) &&
@@ -943,6 +1060,58 @@ int evtchn_unmask(unsigned int port)
return 0;
}

+static int __evtchn_unmask_l3(unsigned int port)
+{
+ struct domain *d = current->domain;
+ struct vcpu *v;
+
+ int page_no = port / EVTCHNS_PER_PAGE;
+ int offset = port % EVTCHNS_PER_PAGE;
+ int l1cb = BITS_PER_EVTCHN_WORD(d)*BITS_PER_EVTCHN_WORD(d);
+ int l2cb = BITS_PER_EVTCHN_WORD(d);
+
+ v = d->vcpu[evtchn_from_port(d, port)->notify_vcpu_id];
+
+ if ( test_and_clear_bit(offset, d->evtchn_mask[page_no]) &&
+ test_bit(offset, d->evtchn_pending[page_no]) &&
+ !test_and_set_bit(port / l2cb,
+ v->evtchn_pending_sel_l2) &&
+ !test_and_set_bit(port / l1cb,
+ &vcpu_info(v, evtchn_pending_sel)) )
+ {
+ vcpu_mark_events_pending(v);
+ }
+
+ return 0;
+}
+
+int evtchn_unmask(unsigned int port)
+{
+ struct domain *d = current->domain;
+ int rc = -EINVAL;
+
+ ASSERT(spin_is_locked(&d->event_lock));
+
+ if ( unlikely(!port_is_valid(d, port)) )
+ return -EINVAL;
+
+ switch ( d->evtchn_level )
+ {
+ case 2:
+ rc = __evtchn_unmask_l2(port);
+ break;
+ case 3:
+ rc = __evtchn_unmask_l3(port);
+ break;
+ default:
+ printk("Cannot unmask port %d for %d level event channel"
+ " for domain %d\n", port,
+ d->evtchn_level, d->domain_id);
+ }
+
+ return rc;
+}
+

static long evtchn_reset(evtchn_reset_t *r)
{
@@ -969,6 +1138,290 @@ out:
return rc;
}

+static void __unmap_l2_sel(struct vcpu *v)
+{
+ unsigned long mfn;
+
+ if ( v->evtchn_pending_sel_l2 != 0 )
+ {
+ unmap_domain_page_global(v->evtchn_pending_sel_l2);
+ mfn = virt_to_mfn(v->evtchn_pending_sel_l2);
+ put_page_and_type(mfn_to_page(mfn));
+
+ v->evtchn_pending_sel_l2 = 0;
+ }
+}
+
+static int __map_l2_sel(struct vcpu *v, unsigned long gfn, unsigned long off)
+{
+ void *mapping;
+ int rc = -EINVAL;
+ struct page_info *page;
+ struct domain *d = v->domain;
+
+ if ( off >= PAGE_SIZE )
+ return rc;
+
+ page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC);
+ if ( !page )
+ goto out;
+
+ if ( !get_page_type(page, PGT_writable_page) )
+ {
+ put_page(page);
+ goto out;
+ }
+
+ mapping = __map_domain_page_global(page);
+ if ( mapping == NULL )
+ {
+ put_page_and_type(page);
+ rc = -ENOMEM;
+ goto out;
+ }
+
+ v->evtchn_pending_sel_l2 = (unsigned long *)(mapping + off);
+ rc = 0;
+
+ out:
+ return rc;
+}
+
+
+static void __unmap_l3_arrays(struct domain *d)
+{
+ int i;
+ unsigned long mfn;
+
+ for ( i = 0; i < MAX_L3_PAGES; i++ )
+ {
+ if ( d->evtchn_pending[i] != 0 )
+ {
+ unmap_domain_page_global(d->evtchn_pending[i]);
+ mfn = virt_to_mfn(d->evtchn_pending[i]);
+ put_page_and_type(mfn_to_page(mfn));
+ d->evtchn_pending[i] = 0;
+ }
+ if ( d->evtchn_mask[i] != 0 )
+ {
+ unmap_domain_page_global(d->evtchn_mask[i]);
+ mfn = virt_to_mfn(d->evtchn_mask[i]);
+ put_page_and_type(mfn_to_page(mfn));
+ d->evtchn_mask[i] = 0;
+ }
+ }
+}
+
+static int __map_l3_arrays(struct domain *d, unsigned long *pending,
+ unsigned long *mask)
+{
+ int i;
+ void *pending_mapping, *mask_mapping;
+ struct page_info *pending_page, *mask_page;
+ unsigned long pending_gfn, mask_gfn;
+ int rc = -EINVAL;
+
+ for ( i = 0;
+ i < MAX_L3_PAGES && pending[i] != 0 && mask[i] != 0;
+ i++ )
+ {
+ pending_gfn = pending[i];
+ mask_gfn = mask[i];
+
+ pending_page = get_page_from_gfn(d, pending_gfn, NULL, P2M_ALLOC);
+ if ( !pending_page )
+ goto err;
+
+ mask_page = get_page_from_gfn(d, mask_gfn, NULL, P2M_ALLOC);
+ if ( !mask_page )
+ {
+ put_page(pending_page);
+ goto err;
+ }
+
+ if ( !get_page_type(pending_page, PGT_writable_page) )
+ {
+ put_page(pending_page);
+ put_page(mask_page);
+ goto err;
+ }
+
+ if ( !get_page_type(mask_page, PGT_writable_page) )
+ {
+ put_page_and_type(pending_page);
+ put_page(mask_page);
+ goto err;
+ }
+
+ pending_mapping = __map_domain_page_global(pending_page);
+ if ( !pending_mapping )
+ {
+ put_page_and_type(pending_page);
+ put_page_and_type(mask_page);
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ mask_mapping = __map_domain_page_global(mask_page);
+ if ( !mask_mapping )
+ {
+ unmap_domain_page_global(pending_mapping);
+ put_page_and_type(pending_page);
+ put_page_and_type(mask_page);
+ rc = -ENOMEM;
+ goto err;
+ }
+
+ d->evtchn_pending[i] = pending_mapping;
+ d->evtchn_mask[i] = mask_mapping;
+ }
+
+ rc = 0;
+
+err:
+ return rc;
+}
+
+static void __evtchn_unmap_all_3level(struct domain *d)
+{
+ struct vcpu *v;
+ /* This is called when destroying a domain, so no pausing... */
+ for_each_vcpu ( d, v )
+ __unmap_l2_sel(v);
+ __unmap_l3_arrays(d);
+}
+
+void event_channel_unmap_nlevel(struct domain *d)
+{
+ switch ( d->evtchn_level )
+ {
+ case 3:
+ __evtchn_unmap_all_3level(d);
+ break;
+ default:
+ break;
+ }
+}
+
+static void __evtchn_migrate_bitmap_l3(struct domain *d)
+{
+ struct vcpu *v;
+
+ /* Easy way to migrate, just move existing selector down one level
+ * then copy the pending array and mask array */
+ for_each_vcpu ( d, v )
+ {
+ memcpy(&v->evtchn_pending_sel_l2[0],
+ &vcpu_info(v, evtchn_pending_sel),
+ sizeof(vcpu_info(v, evtchn_pending_sel)));
+
+ memset(&vcpu_info(v, evtchn_pending_sel), 0,
+ sizeof(vcpu_info(v, evtchn_pending_sel)));
+
+ set_bit(0, &vcpu_info(v, evtchn_pending_sel));
+ }
+
+ memcpy(d->evtchn_pending[0], &shared_info(d, evtchn_pending),
+ sizeof(shared_info(d, evtchn_pending)));
+ memcpy(d->evtchn_mask[0], &shared_info(d, evtchn_mask),
+ sizeof(shared_info(d, evtchn_mask)));
+}
+
+static long __evtchn_register_3level(struct evtchn_register_3level *r)
+{
+ struct domain *d = current->domain;
+ unsigned long mfns[r->nr_vcpus];
+ unsigned long offsets[r->nr_vcpus];
+ unsigned char was_up[r->nr_vcpus];
+ int rc, i;
+ struct vcpu *v;
+
+ if ( d->evtchn_level == 3 )
+ return -EINVAL;
+
+ if ( r->nr_vcpus > d->max_vcpus )
+ return -EINVAL;
+
+ for ( i = 0; i < MAX_L3_PAGES; i++ )
+ if ( d->evtchn_pending[i] || d->evtchn_mask[i] )
+ return -EINVAL;
+
+ for_each_vcpu ( d, v )
+ if ( v->evtchn_pending_sel_l2 )
+ return -EINVAL;
+
+ if ( copy_from_user(mfns, r->l2sel_mfn,
+ sizeof(unsigned long)*r->nr_vcpus) )
+ return -EFAULT;
+
+ if ( copy_from_user(offsets, r->l2sel_offset,
+ sizeof(unsigned long)*r->nr_vcpus) )
+ return -EFAULT;
+
+ /* put cpu offline */
+ for_each_vcpu ( d, v )
+ {
+ if ( v == current )
+ was_up[v->vcpu_id] = 1;
+ else
+ was_up[v->vcpu_id] = !test_and_set_bit(_VPF_down,
+ &v->pause_flags);
+ }
+
+ /* map evtchn pending array and evtchn mask array */
+ rc = __map_l3_arrays(d, r->evtchn_pending, r->evtchn_mask);
+ if ( rc )
+ goto out;
+
+ for_each_vcpu ( d, v )
+ {
+ if ( (rc = __map_l2_sel(v, mfns[v->vcpu_id], offsets[v->vcpu_id])) )
+ {
+ struct vcpu *v1;
+ for_each_vcpu ( d, v1 )
+ __unmap_l2_sel(v1);
+ __unmap_l3_arrays(d);
+ goto out;
+ }
+ }
+
+ /* Scan current bitmap and migrate all outstanding events to new bitmap */
+ __evtchn_migrate_bitmap_l3(d);
+
+ /* make sure all writes take effect before switching to new routines */
+ wmb();
+ d->evtchn_level = 3;
+
+ out:
+ /* bring cpus back online */
+ for_each_vcpu ( d, v )
+ if ( was_up[v->vcpu_id] &&
+ test_and_clear_bit(_VPF_down, &v->pause_flags) )
+ vcpu_wake(v);
+
+ return rc;
+}
+
+static long evtchn_register_nlevel(evtchn_register_nlevel_t *r)
+{
+ struct domain *d = current->domain;
+ int rc;
+
+ spin_lock(&d->event_lock);
+
+ switch ( r->level )
+ {
+ case 3:
+ rc = __evtchn_register_3level(&r->u.l3);
+ break;
+ default:
+ rc = -EINVAL;
+ }
+
+ spin_unlock(&d->event_lock);
+
+ return rc;
+}

long do_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
{
@@ -1078,6 +1531,14 @@ long do_event_channel_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
break;
}

+ case EVTCHNOP_register_nlevel: {
+ struct evtchn_register_nlevel reg;
+ if ( copy_from_guest(&reg, arg, 1) != 0 )
+ return -EFAULT;
+ rc = evtchn_register_nlevel(&reg);
+ break;
+ }
+
default:
rc = -ENOSYS;
break;
@@ -1251,7 +1712,6 @@ void evtchn_move_pirqs(struct vcpu *v)
spin_unlock(&d->event_lock);
}

-
static void domain_dump_evtchn_info(struct domain *d)
{
unsigned int port;
@@ -1260,8 +1720,10 @@ static void domain_dump_evtchn_info(struct domain *d)
bitmap_scnlistprintf(keyhandler_scratch, sizeof(keyhandler_scratch),
d->poll_mask, d->max_vcpus);
printk("Event channel information for domain %d:\n"
+ "using %d-level event channel\n"
"Polling vCPUs: {%s}\n"
- " port [p/m]\n", d->domain_id, keyhandler_scratch);
+ " port [p/m]\n", d->domain_id, d->evtchn_level,
+ keyhandler_scratch);

spin_lock(&d->event_lock);

@@ -1269,6 +1731,8 @@ static void domain_dump_evtchn_info(struct domain *d)
{
const struct evtchn *chn;
char *ssid;
+ int page_no = port / EVTCHNS_PER_PAGE;
+ int offset = port % EVTCHNS_PER_PAGE;

if ( !port_is_valid(d, port) )
continue;
@@ -1276,11 +1740,28 @@ static void domain_dump_evtchn_info(struct domain *d)
if ( chn->state == ECS_FREE )
continue;

- printk(" %4u [%d/%d]: s=%d n=%d x=%d",
- port,
- !!test_bit(port, &shared_info(d, evtchn_pending)),
- !!test_bit(port, &shared_info(d, evtchn_mask)),
- chn->state, chn->notify_vcpu_id, chn->xen_consumer);
+ printk(" %4u", port);
+
+ switch ( d->evtchn_level )
+ {
+ case 2:
+ printk(" [%d/%d]:",
+ !!test_bit(port, &shared_info(d, evtchn_pending)),
+ !!test_bit(port, &shared_info(d, evtchn_mask)));
+ break;
+ case 3:
+ printk(" [%d/%d]:",
+ !!test_bit(offset, d->evtchn_pending[page_no]),
+ !!test_bit(offset, d->evtchn_mask[page_no]));
+ break;
+ default:
+ printk(" %s: unknown event channel level %d for domain %d \n",
+ __FUNCTION__, d->evtchn_level, d->domain_id);
+ goto out;
+ }
+
+ printk(" s=%d n=%d x=%d", chn->state, chn->notify_vcpu_id,
+ chn->xen_consumer);

switch ( chn->state )
{
@@ -1310,6 +1791,7 @@ static void domain_dump_evtchn_info(struct domain *d)
}
}

+ out:
spin_unlock(&d->event_lock);
}

diff --git a/xen/common/keyhandler.c b/xen/common/keyhandler.c
index 2c5c230..294cca9 100644
--- a/xen/common/keyhandler.c
+++ b/xen/common/keyhandler.c
@@ -298,15 +298,15 @@ static void dump_domains(unsigned char key)
{
for_each_vcpu ( d, v )
{
+ unsigned int bits = BITS_PER_EVTCHN_WORD(d);
+ for (int i = 2; i < d->evtchn_level; i++)
+ bits *= BITS_PER_EVTCHN_WORD(d);
printk("Notifying guest %d:%d (virq %d, port %d, stat %d/%d/%d)\n",
d->domain_id, v->vcpu_id,
VIRQ_DEBUG, v->virq_to_evtchn[VIRQ_DEBUG],
- test_bit(v->virq_to_evtchn[VIRQ_DEBUG],
- &shared_info(d, evtchn_pending)),
- test_bit(v->virq_to_evtchn[VIRQ_DEBUG],
- &shared_info(d, evtchn_mask)),
- test_bit(v->virq_to_evtchn[VIRQ_DEBUG] /
- BITS_PER_EVTCHN_WORD(d),
+ evtchn_is_pending(d, v->virq_to_evtchn[VIRQ_DEBUG]),
+ evtchn_is_masked(d, v->virq_to_evtchn[VIRQ_DEBUG]),
+ test_bit(v->virq_to_evtchn[VIRQ_DEBUG] / bits,
&vcpu_info(v, evtchn_pending_sel)));
send_guest_vcpu_virq(v, VIRQ_DEBUG);
}
diff --git a/xen/common/schedule.c b/xen/common/schedule.c
index ae798c9..b676c9c 100644
--- a/xen/common/schedule.c
+++ b/xen/common/schedule.c
@@ -693,7 +693,7 @@ static long do_poll(struct sched_poll *sched_poll)
goto out;

rc = 0;
- if ( test_bit(port, &shared_info(d, evtchn_pending)) )
+ if ( evtchn_is_pending(d, port) )
goto out;
}

diff --git a/xen/include/public/event_channel.h b/xen/include/public/event_channel.h
index 07ff321..29cd6e9 100644
--- a/xen/include/public/event_channel.h
+++ b/xen/include/public/event_channel.h
@@ -71,6 +71,7 @@
#define EVTCHNOP_bind_vcpu 8
#define EVTCHNOP_unmask 9
#define EVTCHNOP_reset 10
+#define EVTCHNOP_register_nlevel 11
/* ` } */

typedef uint32_t evtchn_port_t;
@@ -258,6 +259,29 @@ struct evtchn_reset {
typedef struct evtchn_reset evtchn_reset_t;

/*
+ * EVTCHNOP_register_nlevel: Register N level event channels.
+ * NOTES:
+ * 1. currently only 3-level is supported.
+ * 2. should fall back to basic 2-level if this call fails.
+ */
+#define MAX_L3_PAGES 8
+struct evtchn_register_3level {
+ unsigned long evtchn_pending[MAX_L3_PAGES];
+ unsigned long evtchn_mask[MAX_L3_PAGES];
+ unsigned long *l2sel_mfn;
+ unsigned long *l2sel_offset;
+ uint32_t nr_vcpus;
+};
+
+struct evtchn_register_nlevel {
+ uint32_t level;
+ union {
+ struct evtchn_register_3level l3;
+ } u;
+};
+typedef struct evtchn_register_nlevel evtchn_register_nlevel_t;
+
+/*
* ` enum neg_errnoval
* ` HYPERVISOR_event_channel_op_compat(struct evtchn_op *op)
* `
diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
index 5593066..1d4ef2d 100644
--- a/xen/include/public/xen.h
+++ b/xen/include/public/xen.h
@@ -554,9 +554,24 @@ DEFINE_XEN_GUEST_HANDLE(multicall_entry_t);

/*
* Event channel endpoints per domain:
+ * 2-level:
* 1024 if a long is 32 bits; 4096 if a long is 64 bits.
+ * 3-level:
+ * 32k if a long is 32 bits; 256k if a long is 64 bits;
*/
-#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
+#define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned long) * 64)
+#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * 64)
+#define NR_EVENT_CHANNELS(x) ({ unsigned int __v = 0; \
+ switch (x) { \
+ case 2: \
+ __v = NR_EVENT_CHANNELS_L2; break; \
+ case 3: \
+ __v = NR_EVENT_CHANNELS_L3; break; \
+ default: \
+ BUG(); \
+ } \
+ __v;})
+

struct vcpu_time_info {
/*
diff --git a/xen/include/xen/event.h b/xen/include/xen/event.h
index 71c3e92..e7cd6be 100644
--- a/xen/include/xen/event.h
+++ b/xen/include/xen/event.h
@@ -102,4 +102,8 @@ void notify_via_xen_event_channel(struct domain *ld, int lport);
mb(); /* set blocked status /then/ caller does his work */ \
} while ( 0 )

+extern void event_channel_unmap_nlevel(struct domain *d);
+int evtchn_is_masked(struct domain *d, int port);
+int evtchn_is_pending(struct domain *d, int port);
+
#endif /* __XEN_EVENT_H__ */
diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
index 5f23213..ae78549 100644
--- a/xen/include/xen/sched.h
+++ b/xen/include/xen/sched.h
@@ -64,7 +64,8 @@ extern struct domain *dom0;
__v;})

#define EVTCHNS_PER_BUCKET 512
-#define NR_EVTCHN_BUCKETS (NR_EVENT_CHANNELS / EVTCHNS_PER_BUCKET)
+#define NR_EVTCHN_BUCKETS (NR_EVENT_CHANNELS_L3 / EVTCHNS_PER_BUCKET)
+#define EVTCHNS_PER_PAGE (PAGE_SIZE * 8)

struct evtchn
{
@@ -104,7 +105,7 @@ void evtchn_destroy_final(struct domain *d); /* from complete_domain_destroy */

struct waitqueue_vcpu;

-struct vcpu
+struct vcpu
{
int vcpu_id;

@@ -112,6 +113,9 @@ struct vcpu

vcpu_info_t *vcpu_info;

+ /* For 3-level event channel */
+ unsigned long *evtchn_pending_sel_l2;
+
struct domain *domain;

struct vcpu *next_in_list;
@@ -275,6 +279,10 @@ struct domain
struct evtchn **evtchn;
spinlock_t event_lock;
unsigned int evtchn_level;
+#define L3_PAGES (NR_EVENT_CHANNELS_L3/BITS_PER_LONG*sizeof(unsigned long)/PAGE_SIZE)
+ unsigned long *evtchn_pending[L3_PAGES];
+ unsigned long *evtchn_mask[L3_PAGES];
+#undef L3_PAGES

struct grant_table *grant_table;

--
1.7.10.4


Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
On Mon, 2012-12-31 at 18:22 +0000, Wei Liu wrote:
> diff --git a/xen/common/event_channel.c b/xen/common/event_channel.c
> index 9898f8e..fb3a7b4 100644
> --- a/xen/common/event_channel.c
> +++ b/xen/common/event_channel.c
> @@ -27,6 +27,7 @@
> #include <xen/guest_access.h>
> #include <xen/keyhandler.h>
> #include <asm/current.h>
> +#include <xen/paging.h>
>
> #include <public/xen.h>
> #include <public/event_channel.h>
> @@ -413,6 +414,85 @@ static long evtchn_bind_pirq(evtchn_bind_pirq_t *bind)
> return rc;
> }
>
> +static inline int __evtchn_is_masked_l2(struct domain *d, int port)
> +{
> + return test_bit(port, &shared_info(d, evtchn_mask));
> +}
> +
> +static inline int __evtchn_is_masked_l3(struct domain *d, int port)
> +{
> + return test_bit(port % EVTCHNS_PER_PAGE,
> + d->evtchn_mask[port / EVTCHNS_PER_PAGE]);
> +}
> +
> +int evtchn_is_masked(struct domain *d, int port)
> +{
> + switch ( d->evtchn_level )
> + {
> + case 2:
> + return __evtchn_is_masked_l2(d, port);
> + case 3:
> + return __evtchn_is_masked_l3(d, port);
> + default:
> + printk(" %s: unknown event channel level %d for domain %d \n",
> + __FUNCTION__, d->evtchn_level, d->domain_id);
> + return 1;
> + }
> +}

How much of this sort of thing goes away if we arrange for
d->evtchn_mask to point to &shared_info(d, evtchn_mask) when
evtchn_level == 2? (Likewise for pending etc)
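
Something like the following (untested sketch, modulo compat-guest details)
is what I have in mind; with the aliasing set up once at domain creation the
lookup no longer needs to switch on the level:

    /* e.g. in evtchn_init(), for the default 2-level case: */
    d->evtchn_pending[0] = (unsigned long *)&shared_info(d, evtchn_pending);
    d->evtchn_mask[0]    = (unsigned long *)&shared_info(d, evtchn_mask);

    /* ... after which all valid 2-level ports land in page 0 and the
     * accessor collapses to a single path: */
    int evtchn_is_masked(struct domain *d, int port)
    {
        return test_bit(port % EVTCHNS_PER_PAGE,
                        d->evtchn_mask[port / EVTCHNS_PER_PAGE]);
    }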

> @@ -633,7 +712,50 @@ static void evtchn_set_pending(struct vcpu *v, int port)
> {
> vcpu_mark_events_pending(v);
> }
> -
> +}
> +
> +static void __evtchn_set_pending_l3(struct vcpu *v, int port)
> +{
> + struct domain *d = v->domain;
> +
> + int page_no = port / EVTCHNS_PER_PAGE;
> + int offset = port % EVTCHNS_PER_PAGE;
> + int l1cb = BITS_PER_EVTCHN_WORD(d)*BITS_PER_EVTCHN_WORD(d);
> + int l2cb = BITS_PER_EVTCHN_WORD(d);

What does "cb" in these variables stand for?

> +
> + if ( test_and_set_bit(offset, d->evtchn_pending[page_no]) )
> + return;
> +
> + if ( !test_bit(offset, d->evtchn_mask[page_no]) &&
> + !test_and_set_bit(port / l2cb,
> + v->evtchn_pending_sel_l2) &&
> + !test_and_set_bit(port / l1cb,
> + &vcpu_info(v, evtchn_pending_sel)) )
> + {
> + vcpu_mark_events_pending(v);
> + }
> +}

It doesn't seem like it would be too hard to merge this with the l2
version. Perhaps using a scheme similar to Linux's page table where a
per-level macro collapses into a nop (with the appropriate return).
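
E.g. something along these lines (rough and untested; set_l2_selector() and
l1_index() are made-up helpers, not in the patch), so one function body can
serve both levels:

    static inline int set_l2_selector(struct vcpu *v, int port)
    {
        struct domain *d = v->domain;

        /* No middle selector for 2-level: collapse to "1" so the caller
         * falls straight through to the l1 update. */
        if ( d->evtchn_level == 2 )
            return 1;
        return !test_and_set_bit(port / BITS_PER_EVTCHN_WORD(d),
                                 v->evtchn_pending_sel_l2);
    }

    /* A matching l1_index(d, port) helper would hide the fact that the
     * top-level divisor is BITS_PER_EVTCHN_WORD(d) for 2-level but its
     * square for 3-level. */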

If you do end up duplicating the function then you should duplicate the
comment about the importance of the ordering and the barriers etc too.

You should write down the data structure somewhere. In particular I'm not
sure if l1 is at the top or the bottom. I think it's the top. It would also
be useful to be explicit about which l3 structures correspond to the fields
in the shared info and vcpu info, and which level is the new one.
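
For reference, my reading of the layout (please correct this and record it
in the code if it's wrong):

    /*
     * l1 (top):  vcpu_info->evtchn_pending_sel        existing, per-vcpu;
     *            each bit covers one word of the l2 selector.
     * l2 (new):  v->evtchn_pending_sel_l2             per-vcpu, in a
     *            guest-provided page; each bit covers one word of the
     *            l3 bitmaps.
     * l3 (leaf): d->evtchn_pending[] / d->evtchn_mask[]  per-domain, up to
     *            MAX_L3_PAGES guest-provided pages with one bit per port;
     *            these replace shared_info.evtchn_pending/mask for
     *            3-level guests.
     */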

> +static int __map_l2_sel(struct vcpu *v, unsigned long gfn, unsigned long off)
> +{
> + void *mapping;
> + int rc = -EINVAL;
> + struct page_info *page;
> + struct domain *d = v->domain;
> +
> + if ( off >= PAGE_SIZE )
> + return rc;
> +
> + page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC);
> + if ( !page )
> + goto out;
> +
> + if ( !get_page_type(page, PGT_writable_page) )
> + {
> + put_page(page);
> + goto out;
> + }
> +
> + mapping = __map_domain_page_global(page);

I don't think you can validly use map_domain_page for a long lived
mapping. domain_page.h says "Allow temporary mapping of domain page
frames into Xen space.".

Are these arrays per-vcpu or shared? Because if they are per-vcpu you
don't need the global variant.

> +static int __map_l3_arrays(struct domain *d, unsigned long *pending,
> + unsigned long *mask)
> +{
> + int i;
> + void *pending_mapping, *mask_mapping;
> + struct page_info *pending_page, *mask_page;
> + unsigned long pending_gfn, mask_gfn;
> + int rc = -EINVAL;
> +
> + for ( i = 0;
> + i < MAX_L3_PAGES && pending[i] != 0 && mask[i] != 0;
> + i++ )
> + {
> + pending_gfn = pending[i];
> + mask_gfn = mask[i];
> +
> + pending_page = get_page_from_gfn(d, pending_gfn, NULL, P2M_ALLOC);
> + if ( !pending_page )

I think you need to initialise rc here and for each subsequent goto err.
The initial EINVAL is overwritten to ENOMEM below on the first iteration
and isn't reset back to EINVAL at the top of the loop.
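
I.e. something along these lines:

    for ( i = 0; i < MAX_L3_PAGES && pending[i] != 0 && mask[i] != 0; i++ )
    {
        rc = -EINVAL;   /* reset before anything below can fail */
        ...
    }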

> + goto err;
> +
> + mask_page = get_page_from_gfn(d, mask_gfn, NULL, P2M_ALLOC);
> + if ( !mask_page )
> + {
> + put_page(pending_page);
> + goto err;
> + }
> +
> + if ( !get_page_type(pending_page, PGT_writable_page) )
> + {
> + put_page(pending_page);
> + put_page(mask_page);
> + goto err;
> + }
> +
> + if ( !get_page_type(mask_page, PGT_writable_page) )
> + {
> + put_page_and_type(pending_page);
> + put_page(mask_page);
> + goto err;
> + }
> +
> + pending_mapping = __map_domain_page_global(pending_page);
> + if ( !pending_mapping )
> + {
> + put_page_and_type(pending_page);
> + put_page_and_type(mask_page);
> + rc = -ENOMEM;
> + goto err;
> + }
> +
> + mask_mapping = __map_domain_page_global(mask_page);
> + if ( !mask_mapping )
> + {
> + unmap_domain_page_global(pending_mapping);
> + put_page_and_type(pending_page);
> + put_page_and_type(mask_page);
> + rc = -ENOMEM;
> + goto err;
> + }
> +
> + d->evtchn_pending[i] = pending_mapping;
> + d->evtchn_mask[i] = mask_mapping;
> + }
> +
> + rc = 0;
> +
> +err:
> + return rc;
> +}

> diff --git a/xen/include/public/event_channel.h b/xen/include/public/event_channel.h
> index 07ff321..29cd6e9 100644
> --- a/xen/include/public/event_channel.h
> +++ b/xen/include/public/event_channel.h
> @@ -71,6 +71,7 @@
> #define EVTCHNOP_bind_vcpu 8
> #define EVTCHNOP_unmask 9
> #define EVTCHNOP_reset 10
> +#define EVTCHNOP_register_nlevel 11
> /* ` } */
>
> typedef uint32_t evtchn_port_t;
> @@ -258,6 +259,29 @@ struct evtchn_reset {
> typedef struct evtchn_reset evtchn_reset_t;
>
> /*
> + * EVTCHNOP_register_nlevel: Register N level event channels.
> + * NOTES:
> + * 1. currently only 3-level is supported.
> + * 2. should fall back to basic 2-level if this call fails.
> + */
> +#define MAX_L3_PAGES 8
> +struct evtchn_register_3level {
> + unsigned long evtchn_pending[MAX_L3_PAGES];
> + unsigned long evtchn_mask[MAX_L3_PAGES];
> + unsigned long *l2sel_mfn;
> + unsigned long *l2sel_offset;

You can't use bare pointers in a hypercall argument like this, you have
to use the GUEST_HANDLE stuff.

Please also document which are IN and OUT parameters (look at other
struct definitions for examples).

The evtchn_{pending,mask} can't be the actual pages, are they mfns?
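
As an illustration only (keeping the patch's field names; whether the bitmap
arrays hold gfns is exactly the question above), roughly:

    struct evtchn_register_3level {
        /* IN parameters. */
        unsigned long evtchn_pending[MAX_L3_PAGES]; /* gfns of leaf bitmap pages? */
        unsigned long evtchn_mask[MAX_L3_PAGES];
        XEN_GUEST_HANDLE(ulong) l2sel_mfn;          /* one entry per vcpu */
        XEN_GUEST_HANDLE(ulong) l2sel_offset;       /* one entry per vcpu */
        uint32_t nr_vcpus;
    };
    typedef struct evtchn_register_3level evtchn_register_3level_t;
    DEFINE_XEN_GUEST_HANDLE(evtchn_register_3level_t);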

> + uint32_t nr_vcpus;
> +};
> +
> +struct evtchn_register_nlevel {
> + uint32_t level;
> + union {
> + struct evtchn_register_3level l3;

Might be more extensible to N>3 if evtchn_{pending,mask} were also guest
handles rather than fixed size arrays?

> + } u;
> +};
> +typedef struct evtchn_register_nlevel evtchn_register_nlevel_t;
> +
> +/*
> * ` enum neg_errnoval
> * ` HYPERVISOR_event_channel_op_compat(struct evtchn_op *op)
> * `
> diff --git a/xen/include/public/xen.h b/xen/include/public/xen.h
> index 5593066..1d4ef2d 100644
> --- a/xen/include/public/xen.h
> +++ b/xen/include/public/xen.h
> @@ -554,9 +554,24 @@ DEFINE_XEN_GUEST_HANDLE(multicall_entry_t);
>
> /*
> * Event channel endpoints per domain:
> + * 2-level:
> * 1024 if a long is 32 bits; 4096 if a long is 64 bits.
> + * 3-level:
> + * 32k if a long is 32 bits; 256k if a long is 64 bits;
> */
> -#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
> +#define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned long) * 64)
> +#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * 64)
> +#define NR_EVENT_CHANNELS(x) ({ unsigned int __v = 0; \
> + switch (x) { \
> + case 2: \
> + __v = NR_EVENT_CHANNELS_L2; break; \
> + case 3: \
> + __v = NR_EVENT_CHANNELS_L3; break; \
> + default: \
> + BUG(); \
> + } \
> + __v;})

xen/include/public is supposed to be ANSI-C and I think the #define ({...})
syntax is a gcc-ism.

I think the ..._L{N} defines are sufficient.

> diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h
> index 5f23213..ae78549 100644
> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -64,7 +64,8 @@ extern struct domain *dom0;
> __v;})
>
> #define EVTCHNS_PER_BUCKET 512
> -#define NR_EVTCHN_BUCKETS (NR_EVENT_CHANNELS / EVTCHNS_PER_BUCKET)
> +#define NR_EVTCHN_BUCKETS (NR_EVENT_CHANNELS_L3 / EVTCHNS_PER_BUCKET)
> +#define EVTCHNS_PER_PAGE (PAGE_SIZE * 8)
>
> struct evtchn
> {
> @@ -104,7 +105,7 @@ void evtchn_destroy_final(struct domain *d); /* from complete_domain_destroy */
>
> struct waitqueue_vcpu;
>
> -struct vcpu
> +struct vcpu
> {
> int vcpu_id;
>
> @@ -112,6 +113,9 @@ struct vcpu
>
> vcpu_info_t *vcpu_info;
>
> + /* For 3-level event channel */
> + unsigned long *evtchn_pending_sel_l2;
> +
> struct domain *domain;
>
> struct vcpu *next_in_list;
> @@ -275,6 +279,10 @@ struct domain
> struct evtchn **evtchn;
> spinlock_t event_lock;
> unsigned int evtchn_level;
> +#define L3_PAGES (NR_EVENT_CHANNELS_L3/BITS_PER_LONG*sizeof(unsigned long)/PAGE_SIZE)
> + unsigned long *evtchn_pending[L3_PAGES];
> + unsigned long *evtchn_mask[L3_PAGES];
> +#undef L3_PAGES
>
> struct grant_table *grant_table;
>
> --
> 1.7.10.4
>
>



Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
>>> On 02.01.13 at 17:45, Ian Campbell <Ian.Campbell@citrix.com> wrote:
> On Mon, 2012-12-31 at 18:22 +0000, Wei Liu wrote:
>> + mapping = __map_domain_page_global(page);
>
> I don't think you can validly use map_domain_page for a long lived
> mapping. domain_page.h says "Allow temporary mapping of domain page
> frames into Xen space.".

That's what distinguishes map_domain_page_global() from
map_domain_page().

> Are these array per-vcpu or shared? Because if they are per-vcpu you
> don't need the global variant.

You still would, as the non-global set of mappings that can be
active at a time is limited. But the leaf pages ought to be shared
anyway.

Jan


Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
>>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@citrix.com> wrote:
> @@ -633,7 +712,50 @@ static void evtchn_set_pending(struct vcpu *v, int port)
> {
> vcpu_mark_events_pending(v);
> }
> -
> +}
> +
> +static void __evtchn_set_pending_l3(struct vcpu *v, int port)
> +{
> + struct domain *d = v->domain;
> +
> + int page_no = port / EVTCHNS_PER_PAGE;

Anything used as array index should be unsigned int.

> + int offset = port % EVTCHNS_PER_PAGE;
> + int l1cb = BITS_PER_EVTCHN_WORD(d)*BITS_PER_EVTCHN_WORD(d);
> + int l2cb = BITS_PER_EVTCHN_WORD(d);

These always being powers of 2 (afaict), using divides rather
than shifts is pretty inefficient (and I don't think the compiler has
reasonable chances to recognize this and do the replacement for
you).
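
E.g. (untested), given that the word size can only be 32 or 64:

    unsigned int l2_shift = (BITS_PER_EVTCHN_WORD(d) == 64) ? 6 : 5;
    unsigned int l1_shift = 2 * l2_shift;

    if ( !test_bit(offset, d->evtchn_mask[page_no]) &&
         !test_and_set_bit(port >> l2_shift, v->evtchn_pending_sel_l2) &&
         !test_and_set_bit(port >> l1_shift,
                           &vcpu_info(v, evtchn_pending_sel)) )
        vcpu_mark_events_pending(v);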

> +
> + if ( test_and_set_bit(offset, d->evtchn_pending[page_no]) )
> + return;
> +
> + if ( !test_bit(offset, d->evtchn_mask[page_no]) &&
> + !test_and_set_bit(port / l2cb,
> + v->evtchn_pending_sel_l2) &&
> + !test_and_set_bit(port / l1cb,
> + &vcpu_info(v, evtchn_pending_sel)) )

Indentation.

> @@ -969,6 +1138,290 @@ out:
> return rc;
> }
>
> +static void __unmap_l2_sel(struct vcpu *v)
> +{
> + unsigned long mfn;
> +
> + if ( v->evtchn_pending_sel_l2 != 0 )
> + {
> + unmap_domain_page_global(v->evtchn_pending_sel_l2);
> + mfn = virt_to_mfn(v->evtchn_pending_sel_l2);

virt_to_mfn() is not valid on the output of
map_domain_page{,_global}() (same further down). Yes, there is
at least one example of this in the existing code, but that's wrong
too, and is getting eliminated by the 16Tb patch series I'm in the
process of putting together.

> + put_page_and_type(mfn_to_page(mfn));
> +
> + v->evtchn_pending_sel_l2 = 0;
> + }
> +}
> +
> +static int __map_l2_sel(struct vcpu *v, unsigned long gfn, unsigned long off)
> +{
> + void *mapping;
> + int rc = -EINVAL;
> + struct page_info *page;
> + struct domain *d = v->domain;
> +
> + if ( off >= PAGE_SIZE )

Is e.g. off == PAGE_SIZE-1 really valid here? As you're mapping
guest memory here, _anything_ that is invalid must be rejected, or
else you're creating a security hole.
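
E.g. (sketch; L2SEL_BYTES is a stand-in for however many bytes the per-vcpu
l2 selector occupies, i.e. NR_EVENT_CHANNELS_L3 / BITS_PER_EVTCHN_WORD(d) / 8):

    if ( (off & (sizeof(unsigned long) - 1)) ||
         off > PAGE_SIZE - L2SEL_BYTES )
        return -EINVAL;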

> + return rc;
> +
> + page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC);
> + if ( !page )
> + goto out;

Please be consistent here - always use return (as above) or
(less preferred by me personally) always use goto.

> +
> + if ( !get_page_type(page, PGT_writable_page) )
> + {
> + put_page(page);
> + goto out;
> + }
> +
> + mapping = __map_domain_page_global(page);
> + if ( mapping == NULL )
> + {
> + put_page_and_type(page);
> + rc = -ENOMEM;
> + goto out;
> + }
> +
> + v->evtchn_pending_sel_l2 = (unsigned long *)(mapping + off);

Pointless cast?

> +static int __map_l3_arrays(struct domain *d, unsigned long *pending,
> + unsigned long *mask)
> +{
> + int i;
> + void *pending_mapping, *mask_mapping;
> + struct page_info *pending_page, *mask_page;
> + unsigned long pending_gfn, mask_gfn;
> + int rc = -EINVAL;
> +
> + for ( i = 0;
> + i < MAX_L3_PAGES && pending[i] != 0 && mask[i] != 0;
> + i++ )
> + {
> + pending_gfn = pending[i];
> + mask_gfn = mask[i];
> +
> + pending_page = get_page_from_gfn(d, pending_gfn, NULL, P2M_ALLOC);
> + if ( !pending_page )
> + goto err;
> +
> + mask_page = get_page_from_gfn(d, mask_gfn, NULL, P2M_ALLOC);
> + if ( !mask_page )
> + {
> + put_page(pending_page);
> + goto err;
> + }
> +
> + if ( !get_page_type(pending_page, PGT_writable_page) )
> + {
> + put_page(pending_page);
> + put_page(mask_page);
> + goto err;
> + }
> +
> + if ( !get_page_type(mask_page, PGT_writable_page) )
> + {
> + put_page_and_type(pending_page);
> + put_page(mask_page);
> + goto err;
> + }
> +
> + pending_mapping = __map_domain_page_global(pending_page);
> + if ( !pending_mapping )
> + {
> + put_page_and_type(pending_page);
> + put_page_and_type(mask_page);
> + rc = -ENOMEM;
> + goto err;
> + }
> +
> + mask_mapping = __map_domain_page_global(mask_page);
> + if ( !mask_mapping )
> + {
> + unmap_domain_page_global(pending_mapping);
> + put_page_and_type(pending_page);
> + put_page_and_type(mask_page);
> + rc = -ENOMEM;
> + goto err;

The error paths in this function constitute well over half of it - can
this not be consolidated in some way?
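
One possible shape (untested sketch; __map_one_gfn() is a made-up helper,
and __unmap_l3_arrays() already copes with partially filled arrays):

    static void *__map_one_gfn(struct domain *d, unsigned long gfn)
    {
        struct page_info *page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC);
        void *mapping;

        if ( !page )
            return NULL;
        if ( !get_page_type(page, PGT_writable_page) )
        {
            put_page(page);
            return NULL;
        }
        mapping = __map_domain_page_global(page);
        if ( !mapping )
            put_page_and_type(page);
        return mapping;
    }

    /* ... so each loop iteration reduces to: */
    d->evtchn_pending[i] = __map_one_gfn(d, pending[i]);
    d->evtchn_mask[i] = __map_one_gfn(d, mask[i]);
    if ( !d->evtchn_pending[i] || !d->evtchn_mask[i] )
    {
        __unmap_l3_arrays(d);
        return -EINVAL; /* or -ENOMEM, if the distinction matters */
    }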

> +static void __evtchn_migrate_bitmap_l3(struct domain *d)
> +{
> + struct vcpu *v;
> +
> + /* Easy way to migrate, just move existing selector down one level
> + * then copy the pending array and mask array */
> + for_each_vcpu ( d, v )
> + {
> + memcpy(&v->evtchn_pending_sel_l2[0],
> + &vcpu_info(v, evtchn_pending_sel),
> + sizeof(vcpu_info(v, evtchn_pending_sel)));
> +
> + memset(&vcpu_info(v, evtchn_pending_sel), 0,
> + sizeof(vcpu_info(v, evtchn_pending_sel)));
> +
> + set_bit(0, &vcpu_info(v, evtchn_pending_sel));
> + }
> +
> + memcpy(d->evtchn_pending[0], &shared_info(d, evtchn_pending),
> + sizeof(shared_info(d, evtchn_pending)));
> + memcpy(d->evtchn_mask[0], &shared_info(d, evtchn_mask),
> + sizeof(shared_info(d, evtchn_mask)));
> +}
> +
> +static long __evtchn_register_3level(struct evtchn_register_3level *r)
> +{
> + struct domain *d = current->domain;
> + unsigned long mfns[r->nr_vcpus];
> + unsigned long offsets[r->nr_vcpus];
> + unsigned char was_up[r->nr_vcpus];

This is an absolute no-go in the hypervisor: Did you consider what
happens when a guest has a few hundred vCPU-s?
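
If the arrays are kept at all they would at least need to come from the
heap, e.g. (sketch, error handling abbreviated):

    unsigned long *mfns = xmalloc_array(unsigned long, r->nr_vcpus);
    unsigned long *offsets = xmalloc_array(unsigned long, r->nr_vcpus);
    unsigned char *was_up = xmalloc_array(unsigned char, r->nr_vcpus);

    if ( !mfns || !offsets || !was_up )
    {
        rc = -ENOMEM;
        goto out_free;
    }
    /* ... */
 out_free:
    xfree(was_up);
    xfree(offsets);
    xfree(mfns);
    return rc;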

> + int rc, i;
> + struct vcpu *v;
> +
> + if ( d->evtchn_level == 3 )
> + return -EINVAL;
> +
> + if ( r->nr_vcpus > d->max_vcpus )
> + return -EINVAL;
> +
> + for ( i = 0; i < MAX_L3_PAGES; i++ )
> + if ( d->evtchn_pending[i] || d->evtchn_mask[i] )
> + return -EINVAL;

This and the check immediately below should be redundant with
the level == 3 check earlier on (or if they aren't redundant, then
you need to fix the code elsewhere).

> +
> + for_each_vcpu ( d, v )
> + if ( v->evtchn_pending_sel_l2 )
> + return -EINVAL;
> +
> + if ( copy_from_user(mfns, r->l2sel_mfn,
> + sizeof(unsigned long)*r->nr_vcpus) )

copy_from_guest()!

> + return -EFAULT;
> +
> + if ( copy_from_user(offsets, r->l2sel_offset,
> + sizeof(unsigned long)*r->nr_vcpus) )
> + return -EFAULT;
> +
> + /* put cpu offline */
> + for_each_vcpu ( d, v )
> + {
> + if ( v == current )
> + was_up[v->vcpu_id] = 1;
> + else
> + was_up[v->vcpu_id] = !test_and_set_bit(_VPF_down,
> + &v->pause_flags);

This does not in any way halt that remote vCPU.

But anyway - is it really that useful to allow establishing the 3rd
level with all vCPU-s already up and event processing already
happening for the subject domain? I.e. wouldn't it suffice to
require the guest to do the setup _before_ setting up the first
event channel (which would imply only vCPU 0 being active)?

> @@ -1251,7 +1712,6 @@ void evtchn_move_pirqs(struct vcpu *v)
> spin_unlock(&d->event_lock);
> }
>
> -

???

> static void domain_dump_evtchn_info(struct domain *d)
> {
> unsigned int port;
> @@ -1276,11 +1740,28 @@ static void domain_dump_evtchn_info(struct domain *d)
> if ( chn->state == ECS_FREE )
> continue;
>
> - printk(" %4u [%d/%d]: s=%d n=%d x=%d",
> - port,
> - !!test_bit(port, &shared_info(d, evtchn_pending)),
> - !!test_bit(port, &shared_info(d, evtchn_mask)),
> - chn->state, chn->notify_vcpu_id, chn->xen_consumer);
> + printk(" %4u", port);
> +
> + switch ( d->evtchn_level )
> + {
> + case 2:
> + printk(" [%d/%d]:",
> + !!test_bit(port, &shared_info(d, evtchn_pending)),
> + !!test_bit(port, &shared_info(d, evtchn_mask)));
> + break;
> + case 3:
> + printk(" [%d/%d]:",
> + !!test_bit(offset, d->evtchn_pending[page_no]),
> + !!test_bit(offset, d->evtchn_mask[page_no]));

Can't you, btw, set [0] of these two to the &shared_info() values,
and thus fold code here and perhaps elsewhere?

> + break;
> + default:
> + printk(" %s: unknown event channel level %d for domain %d \n",
> + __FUNCTION__, d->evtchn_level, d->domain_id);
> + goto out;
> + }
> +
> + printk(" s=%d n=%d x=%d", chn->state, chn->notify_vcpu_id,
> + chn->xen_consumer);

So still all event channels in one go?

> --- a/xen/common/keyhandler.c
> +++ b/xen/common/keyhandler.c
> @@ -298,15 +298,15 @@ static void dump_domains(unsigned char key)
> {
> for_each_vcpu ( d, v )
> {
> + unsigned int bits = BITS_PER_EVTCHN_WORD(d);
> + for (int i = 2; i < d->evtchn_level; i++)

Did you check that gcc 4.1.x deals with this non-C89 code fine,
with no warning?

> --- a/xen/common/schedule.c
> +++ b/xen/common/schedule.c
> @@ -693,7 +693,7 @@ static long do_poll(struct sched_poll *sched_poll)
> goto out;
>
> rc = 0;
> - if ( test_bit(port, &shared_info(d, evtchn_pending)) )
> + if ( evtchn_is_pending(d, port) )

It would likely make the patch more readable if mechanical
adjustments like this got split out into a prerequisite patch, even
if used only in very few places.

> --- a/xen/include/public/event_channel.h
> +++ b/xen/include/public/event_channel.h
> @@ -71,6 +71,7 @@
> #define EVTCHNOP_bind_vcpu 8
> #define EVTCHNOP_unmask 9
> #define EVTCHNOP_reset 10
> +#define EVTCHNOP_register_nlevel 11
> /* ` } */
>
> typedef uint32_t evtchn_port_t;
> @@ -258,6 +259,29 @@ struct evtchn_reset {
> typedef struct evtchn_reset evtchn_reset_t;
>
> /*
> + * EVTCHNOP_register_nlevel: Register N level event channels.
> + * NOTES:
> + * 1. currently only 3-level is supported.
> + * 2. should fall back to basic 2-level if this call fails.
> + */
> +#define MAX_L3_PAGES 8

Without explanation, no-one will easily understand whether this
number is made up or the result of some calculation. Also, with it
having "MAX" in its name, I'd conclude there can be fewer than 8
pages passed by the guest, yet the interface lacks any indication
of how many pages are being passed.
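
For reference, I take it the 8 is NR_EVENT_CHANNELS_L3 bits spread over 4k
pages for a 64-bit long, which would be worth spelling out, e.g.:

    /*
     * 8 == NR_EVENT_CHANNELS_L3 / (PAGE_SIZE * 8)
     *   == 262144 bits / 32768 bits per 4k page    (64-bit long;
     *      a 32-bit long needs only 2 of the 8 pages)
     */
    #define MAX_L3_PAGES 8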

> +struct evtchn_register_3level {
> + unsigned long evtchn_pending[MAX_L3_PAGES];
> + unsigned long evtchn_mask[MAX_L3_PAGES];
> + unsigned long *l2sel_mfn;
> + unsigned long *l2sel_offset;
> + uint32_t nr_vcpus;
> +};
> +
> +struct evtchn_register_nlevel {
> + uint32_t level;
> + union {
> + struct evtchn_register_3level l3;
> + } u;
> +};
> +typedef struct evtchn_register_nlevel evtchn_register_nlevel_t;
> +
> +/*
> * ` enum neg_errnoval
> * ` HYPERVISOR_event_channel_op_compat(struct evtchn_op *op)
> * `
> --- a/xen/include/public/xen.h
> +++ b/xen/include/public/xen.h
> @@ -554,9 +554,24 @@ DEFINE_XEN_GUEST_HANDLE(multicall_entry_t);
>
> /*
> * Event channel endpoints per domain:
> + * 2-level:
> * 1024 if a long is 32 bits; 4096 if a long is 64 bits.
> + * 3-level:
> + * 32k if a long is 32 bits; 256k if a long is 64 bits;
> */
> -#define NR_EVENT_CHANNELS (sizeof(unsigned long) * sizeof(unsigned long) * 64)
> +#define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned long) * 64)
> +#define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * 64)
> +#define NR_EVENT_CHANNELS(x) ({ unsigned int __v = 0; \

This is not a valid thing to do in a public header: You can't replace
the object like NR_EVENT_CHANNELS with a function like alternative,
you need to retain the old definition for consumers unaware of the
new extension.
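
I.e. keep the object-like macro with its original expansion and only add
the new constants alongside it:

    #define NR_EVENT_CHANNELS_L2 (sizeof(unsigned long) * sizeof(unsigned long) * 64)
    #define NR_EVENT_CHANNELS_L3 (NR_EVENT_CHANNELS_L2 * 64)
    #define NR_EVENT_CHANNELS NR_EVENT_CHANNELS_L2 /* legacy name, meaning unchanged */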

> --- a/xen/include/xen/sched.h
> +++ b/xen/include/xen/sched.h
> @@ -64,7 +64,8 @@ extern struct domain *dom0;
> __v;})
>
> #define EVTCHNS_PER_BUCKET 512
> -#define NR_EVTCHN_BUCKETS (NR_EVENT_CHANNELS / EVTCHNS_PER_BUCKET)
> +#define NR_EVTCHN_BUCKETS (NR_EVENT_CHANNELS_L3 / EVTCHNS_PER_BUCKET)
> +#define EVTCHNS_PER_PAGE (PAGE_SIZE * 8)

So is this 8 the same as the one above? If so, why don't you use
the #define? If not, where does this 8 come from?

> @@ -104,7 +105,7 @@ void evtchn_destroy_final(struct domain *d); /* from
> complete_domain_destroy */
>
> struct waitqueue_vcpu;
>
> -struct vcpu
> +struct vcpu

???

> @@ -275,6 +279,10 @@ struct domain
> struct evtchn **evtchn;
> spinlock_t event_lock;
> unsigned int evtchn_level;
> +#define L3_PAGES (NR_EVENT_CHANNELS_L3/BITS_PER_LONG*sizeof(unsigned long)/PAGE_SIZE)

Without the use of BITS_TO_LONGS() and PFN_UP() it is not clear
that you don't cut off any non-zero bits here. Hence you ought to
either use those macros, or put a validating BUILD_BUG_ON()
somewhere.
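
I.e. either (sketch):

    #define L3_PAGES PFN_UP(BITS_TO_LONGS(NR_EVENT_CHANNELS_L3) * sizeof(unsigned long))

    /* or keep the plain division and assert, e.g. in evtchn_init(), that
     * nothing gets cut off: */
    BUILD_BUG_ON(NR_EVENT_CHANNELS_L3 % (8 * PAGE_SIZE));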

Jan

Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
On Thu, 2013-01-03 at 11:35 +0000, Jan Beulich wrote:
> >>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@citrix.com> wrote:
> >
> > +static void __unmap_l2_sel(struct vcpu *v)
> > +{
> > + unsigned long mfn;
> > +
> > + if ( v->evtchn_pending_sel_l2 != 0 )
> > + {
> > + unmap_domain_page_global(v->evtchn_pending_sel_l2);
> > + mfn = virt_to_mfn(v->evtchn_pending_sel_l2);
>
> virt_to_mfn() is not valid on the output of
> map_domain_page{,_global}() (same further down). Yes, there is
> at least one example of this in the existing code, but that's wrong
> too, and is getting eliminated by the 16Tb patch series I'm in the
> process of putting together.
>

So what's the correct way to do this? Do I need to wait for your patch
series?


Wei.


Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
>>> On 08.01.13 at 18:33, Wei Liu <Wei.Liu2@citrix.com> wrote:
> On Thu, 2013-01-03 at 11:35 +0000, Jan Beulich wrote:
>> >>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@citrix.com> wrote:
>> >
>> > +static void __unmap_l2_sel(struct vcpu *v)
>> > +{
>> > + unsigned long mfn;
>> > +
>> > + if ( v->evtchn_pending_sel_l2 != 0 )
>> > + {
>> > + unmap_domain_page_global(v->evtchn_pending_sel_l2);
>> > + mfn = virt_to_mfn(v->evtchn_pending_sel_l2);
>>
>> virt_to_mfn() is not valid on the output of
>> map_domain_page{,_global}() (same further down). Yes, there is
>> at least one example of this in the existing code, but that's wrong
>> too, and is getting eliminated by the 16Tb patch series I'm in the
>> process of putting together.
>>
>
> So what's the correct way to do this? Do I need to wait for your patch
> series?

Considering that the old 32-bit case of map_domain_page()
doesn't matter anymore, using domain_page_map_to_mfn()
here would be the way to go (while the 32-bit version of it
didn't allow map_domain_page_global() to be handled through
it, my 64-bit implementation of it will).

And of course there's the obvious alternative of storing the
MFN rather than re-obtaining it in the teardown path.
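
For the latter, something like (sketch; the field name is made up):

    /* struct vcpu gains e.g.: */
    unsigned long evtchn_l2sel_mfn;

    /* __map_l2_sel(), once get_page_type() has succeeded: */
    v->evtchn_l2sel_mfn = page_to_mfn(page);

    /* __unmap_l2_sel() then never derives the MFN from the mapping: */
    unmap_domain_page_global(v->evtchn_pending_sel_l2);
    put_page_and_type(mfn_to_page(v->evtchn_l2sel_mfn));
    v->evtchn_pending_sel_l2 = NULL;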

Jan


Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
On Wed, 2013-01-09 at 08:38 +0000, Jan Beulich wrote:
> >>> On 08.01.13 at 18:33, Wei Liu <Wei.Liu2@citrix.com> wrote:
> > On Thu, 2013-01-03 at 11:35 +0000, Jan Beulich wrote:
> >> >>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@citrix.com> wrote:
> >> >
> >> > +static void __unmap_l2_sel(struct vcpu *v)
> >> > +{
> >> > + unsigned long mfn;
> >> > +
> >> > + if ( v->evtchn_pending_sel_l2 != 0 )
> >> > + {
> >> > + unmap_domain_page_global(v->evtchn_pending_sel_l2);
> >> > + mfn = virt_to_mfn(v->evtchn_pending_sel_l2);
> >>
> >> virt_to_mfn() is not valid on the output of
> >> map_domain_page{,_global}() (same further down). Yes, there is
> >> at least one example of this in the existing code, but that's wrong
> >> too, and is getting eliminated by the 16Tb patch series I'm in the
> >> process of putting together.
> >>
> >
> > So what's the correct way to do this? Do I need to wait for your patch
> > series?
>
> Considering that the old 32-bit case of map_domain_page()
> doesn't matter anymore,

We still care about it on 32-bit ARM FWIW. (I'm not sure if that
invalidates your advice below though)

> using domain_page_map_to_mfn()
> here would be the way to go (while the 32-bit version of it
> didn't allow map_domain_page_global() to be handled through
> it, my 64-bit implementation of it will).
>
> And of course there's the obvious alternative of storing the
> MFN rather than re-obtaining it in the teardown path.
>
> Jan
>
>



Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
>>> On 09.01.13 at 11:56, Ian Campbell <Ian.Campbell@citrix.com> wrote:
> On Wed, 2013-01-09 at 08:38 +0000, Jan Beulich wrote:
>> >>> On 08.01.13 at 18:33, Wei Liu <Wei.Liu2@citrix.com> wrote:
>> > On Thu, 2013-01-03 at 11:35 +0000, Jan Beulich wrote:
>> >> >>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@citrix.com> wrote:
>> >> >
>> >> > +static void __unmap_l2_sel(struct vcpu *v)
>> >> > +{
>> >> > + unsigned long mfn;
>> >> > +
>> >> > + if ( v->evtchn_pending_sel_l2 != 0 )
>> >> > + {
>> >> > + unmap_domain_page_global(v->evtchn_pending_sel_l2);
>> >> > + mfn = virt_to_mfn(v->evtchn_pending_sel_l2);
>> >>
>> >> virt_to_mfn() is not valid on the output of
>> >> map_domain_page{,_global}() (same further down). Yes, there is
>> >> at least one example of this in the existing code, but that's wrong
>> >> too, and is getting eliminated by the 16Tb patch series I'm in the
>> >> process of putting together.
>> >>
>> >
>> > So what's the correct way to do this? Do I need to wait for your patch
>> > series?
>>
>> Considering that the old 32-bit case of map_domain_page()
>> doesn't matter anymore,
>
> > We still care about it on 32-bit ARM FWIW. (I'm not sure if that
> invalidates your advice below though)

Right - which would require ARM to implement
domain_page_map_to_mfn() (so far used only in x86 and tmem
code).

>> using domain_page_map_to_mfn()
>> here would be the way to go (while the 32-bit version of it
>> didn't allow map_domain_page_global() to be handled through
>> it, my 64-bit implementation of it will).
>>
>> And of course there's the obvious alternative of storing the
>> MFN rather than re-obtaining it in the teardown path.

This second option of course would alway be valid, without
architecture specific changes.

Jan


Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
On Wed, 2013-01-09 at 11:24 +0000, Jan Beulich wrote:
> >>> On 09.01.13 at 11:56, Ian Campbell <Ian.Campbell@citrix.com> wrote:
> > On Wed, 2013-01-09 at 08:38 +0000, Jan Beulich wrote:
> >> >>> On 08.01.13 at 18:33, Wei Liu <Wei.Liu2@citrix.com> wrote:
> >> > On Thu, 2013-01-03 at 11:35 +0000, Jan Beulich wrote:
> >> >> >>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@citrix.com> wrote:
> >> >> >
> >> >> > +static void __unmap_l2_sel(struct vcpu *v)
> >> >> > +{
> >> >> > + unsigned long mfn;
> >> >> > +
> >> >> > + if ( v->evtchn_pending_sel_l2 != 0 )
> >> >> > + {
> >> >> > + unmap_domain_page_global(v->evtchn_pending_sel_l2);
> >> >> > + mfn = virt_to_mfn(v->evtchn_pending_sel_l2);
> >> >>
> >> >> virt_to_mfn() is not valid on the output of
> >> >> map_domain_page{,_global}() (same further down). Yes, there is
> >> >> at least one example of this in the existing code, but that's wrong
> >> >> too, and is getting eliminated by the 16Tb patch series I'm in the
> >> >> process of putting together.
> >> >>
> >> >
> >> > So what's the correct way to do this? Do I need to wait for your patch
> >> > series?
> >>
> >> Considering that the old 32-bit case of map_domain_page()
> >> doesn't matter anymore,
> >
> > We still care about it on 32-bit ARM FWIW. (I'm not sure if that
> > invalidates your advice below though)
>
> Right - which would require ARM to implement
> domain_page_map_to_mfn() (so far used only in x86 and tmem
> code).

I only see it in x86 (specifically hvm_unmap_guest_frame) but I guess
this is a useful interface for ARM to implement in any case and it
doesn't look to be terribly hard.

> >> using domain_page_map_to_mfn()
> >> here would be the way to go (while the 32-bit version of it
> >> didn't allow map_domain_page_global() to be handled through
> >> it, my 64-bit implementation of it will).
> >>
> >> And of course there's the obvious alternative of storing the
> >> MFN rather than re-obtaining it in the teardown path.
>
> This second option of course would alway be valid, without
> architecture specific changes.

Right.

Ian.


Re: [RFC PATCH 3/3] Implement 3-level event channel routines.
>>> On 09.01.13 at 12:31, Ian Campbell <Ian.Campbell@citrix.com> wrote:
> On Wed, 2013-01-09 at 11:24 +0000, Jan Beulich wrote:
>> >>> On 09.01.13 at 11:56, Ian Campbell <Ian.Campbell@citrix.com> wrote:
>> > On Wed, 2013-01-09 at 08:38 +0000, Jan Beulich wrote:
>> >> >>> On 08.01.13 at 18:33, Wei Liu <Wei.Liu2@citrix.com> wrote:
>> >> > On Thu, 2013-01-03 at 11:35 +0000, Jan Beulich wrote:
>> >> >> >>> On 31.12.12 at 19:22, Wei Liu <wei.liu2@citrix.com> wrote:
>> >> >> >
>> >> >> > +static void __unmap_l2_sel(struct vcpu *v)
>> >> >> > +{
>> >> >> > + unsigned long mfn;
>> >> >> > +
>> >> >> > + if ( v->evtchn_pending_sel_l2 != 0 )
>> >> >> > + {
>> >> >> > + unmap_domain_page_global(v->evtchn_pending_sel_l2);
>> >> >> > + mfn = virt_to_mfn(v->evtchn_pending_sel_l2);
>> >> >>
>> >> >> virt_to_mfn() is not valid on the output of
>> >> >> map_domain_page{,_global}() (same further down). Yes, there is
>> >> >> at least one example of this in the existing code, but that's wrong
>> >> >> too, and is getting eliminated by the 16Tb patch series I'm in the
>> >> >> process of putting together.
>> >> >>
>> >> >
>> >> > So what's the correct way to do this? Do I need to wait for your patch
>> >> > series?
>> >>
>> >> Considering that the old 32-bit case of map_domain_page()
>> >> doesn't matter anymore,
>> >
>> > We still care about it on 32-bit ARM FWIW. (I'm not sure if that
>> > invalidates your advice below though)
>>
>> Right - which would require ARM to implement
>> domain_page_map_to_mfn() (so far used only in x86 and tmem
>> code).
>
> I only see it in x86 (specifically hvm_unmap_guest_frame) but I guess
> this is a useful interface for ARM to implement in any case and it
> doesn't look to be terribly hard.

Oh, the tmem one is from my 16Tb series that I'm hoping to post
soon.

Jan

