Mailing List Archive

[xen-unstable] credit2: Add credit2 scheduler to hypervisor
# HG changeset patch
# User Keir Fraser <keir.fraser@citrix.com>
# Date 1271243241 -3600
# Node ID 9b91a253528b8bc2f174a9a47df690f3123678b2
# Parent dca0baecc909ac0f2e6725d05718adbb69fc2dd5
credit2: Add credit2 scheduler to hypervisor

This is the core credit2 patch. It adds the new credit2 scheduler to
the hypervisor, as the non-default scheduler. It should be emphasized
that this is still in the development phase, and is probably still
unstable. It is known to be suboptimal for multi-socket systems.

Signed-off-by: George Dunlap <george.dunlap@eu.citrix.com>
---
xen/common/Makefile | 1
xen/common/sched_credit2.c | 1125 ++++++++++++++++++++++++++++++++++++++++++++
xen/common/schedule.c | 2
xen/include/public/domctl.h | 4
4 files changed, 1132 insertions(+)

diff -r dca0baecc909 -r 9b91a253528b xen/common/Makefile
--- a/xen/common/Makefile Wed Apr 14 12:06:05 2010 +0100
+++ b/xen/common/Makefile Wed Apr 14 12:07:21 2010 +0100
@@ -13,6 +13,7 @@ obj-y += page_alloc.o
obj-y += page_alloc.o
obj-y += rangeset.o
obj-y += sched_credit.o
+obj-y += sched_credit2.o
obj-y += sched_sedf.o
obj-y += schedule.o
obj-y += shutdown.o
diff -r dca0baecc909 -r 9b91a253528b xen/common/sched_credit2.c
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/xen/common/sched_credit2.c Wed Apr 14 12:07:21 2010 +0100
@@ -0,0 +1,1125 @@
+
+/****************************************************************************
+ * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd
+ ****************************************************************************
+ *
+ * File: common/csched_credit2.c
+ * Author: George Dunlap
+ *
+ * Description: Credit-based SMP CPU scheduler
+ * Based on an earlier verson by Emmanuel Ackaouy.
+ */
+
+#include <xen/config.h>
+#include <xen/init.h>
+#include <xen/lib.h>
+#include <xen/sched.h>
+#include <xen/domain.h>
+#include <xen/delay.h>
+#include <xen/event.h>
+#include <xen/time.h>
+#include <xen/perfc.h>
+#include <xen/sched-if.h>
+#include <xen/softirq.h>
+#include <asm/atomic.h>
+#include <xen/errno.h>
+#include <xen/trace.h>
+
+#if __i386__
+#define PRI_stime "lld"
+#else
+#define PRI_stime "ld"
+#endif
+
+#define d2printk(x...)
+//#define d2printk printk
+
+#define TRC_CSCHED2_TICK TRC_SCHED_CLASS + 1
+#define TRC_CSCHED2_RUNQ_POS TRC_SCHED_CLASS + 2
+#define TRC_CSCHED2_CREDIT_BURN TRC_SCHED_CLASS + 3
+#define TRC_CSCHED2_CREDIT_ADD TRC_SCHED_CLASS + 4
+#define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS + 5
+
+/*
+ * WARNING: This is still in an experimental phase. Status and work can be found at the
+ * credit2 wiki page:
+ * http://wiki.xensource.com/xenwiki/Credit2_Scheduler_Development
+ * TODO:
+ * + Immediate bug-fixes
+ * - Do per-runqueue, grab proper lock for dump debugkey
+ * + Multiple sockets
+ * - Detect cpu layout and make runqueue map, one per L2 (make_runq_map())
+ * - Simple load balancer / runqueue assignment
+ * - Runqueue load measurement
+ * - Load-based load balancer
+ * + Hyperthreading
+ * - Look for non-busy core if possible
+ * - "Discount" time run on a thread with busy siblings
+ * + Algorithm:
+ * - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g.,
+ * a flash animation in the background) can we schedule it with low enough latency
+ * so that audio doesn't skip?
+ * - Cap and reservation: How to implement with the current system?
+ * + Optimizing
+ * - Profiling, making new algorithms, making math more efficient (no long division)
+ */
+
+/*
+ * Design:
+ *
+ * VMs "burn" credits based on their weight; higher weight means
+ * credits burn more slowly. The highest weight vcpu burns credits at
+ * a rate of 1 credit per nanosecond. Others burn proportionally
+ * more.
+ *
+ * vcpus are inserted into the runqueue by credit order.
+ *
+ * Credits are "reset" when the next vcpu in the runqueue is less than
+ * or equal to zero. At that point, everyone's credits are "clipped"
+ * to a small value, and a fixed credit is added to everyone.
+ *
+ * The plan is for all cores that share an L2 will share the same
+ * runqueue. At the moment, there is one global runqueue for all
+ * cores.
+ */
+
+/*
+ * Locking:
+ * - Schedule-lock is per-runqueue
+ * + Protects runqueue data, runqueue insertion, &c
+ * + Also protects updates to private sched vcpu structure
+ * + Must be grabbed using vcpu_schedule_lock_irq() to make sure vcpu->processr
+ * doesn't change under our feet.
+ * - Private data lock
+ * + Protects access to global domain list
+ * + All other private data is written at init and only read afterwards.
+ * Ordering:
+ * - We grab private->schedule when updating domain weight; so we
+ * must never grab private if a schedule lock is held.
+ */
+
+/*
+ * Basic constants
+ */
+/* Default weight: How much a new domain starts with */
+#define CSCHED_DEFAULT_WEIGHT 256
+/* Min timer: Minimum length a timer will be set, to
+ * achieve efficiency */
+#define CSCHED_MIN_TIMER MICROSECS(500)
+/* Amount of credit VMs begin with, and are reset to.
+ * ATM, set so that highest-weight VMs can only run for 10ms
+ * before a reset event. */
+#define CSCHED_CREDIT_INIT MILLISECS(10)
+/* Carryover: How much "extra" credit may be carried over after
+ * a reset. */
+#define CSCHED_CARRYOVER_MAX CSCHED_MIN_TIMER
+/* Reset: Value below which credit will be reset. */
+#define CSCHED_CREDIT_RESET 0
+/* Max timer: Maximum time a guest can be run for. */
+#define CSCHED_MAX_TIMER MILLISECS(2)
+
+
+#define CSCHED_IDLE_CREDIT (-(1<<30))
+
+/*
+ * Flags
+ */
+/* CSFLAG_scheduled: Is this vcpu either running on, or context-switching off,
+ * a physical cpu?
+ * + Accessed only with runqueue lock held
+ * + Set when chosen as next in csched_schedule().
+ * + Cleared after context switch has been saved in csched_context_saved()
+ * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should
+ * set CSFLAG_delayed_runq_add
+ * + Checked to be false in runq_insert.
+ */
+#define __CSFLAG_scheduled 1
+#define CSFLAG_scheduled (1<<__CSFLAG_scheduled)
+/* CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it'd done
+ * being context switched out?
+ * + Set when scheduling out in csched_schedule() if prev is runnable
+ * + Set in csched_vcpu_wake if it finds CSFLAG_scheduled set
+ * + Read in csched_context_saved(). If set, it adds prev to the runqueue and
+ * clears the bit.
+ */
+#define __CSFLAG_delayed_runq_add 2
+#define CSFLAG_delayed_runq_add (1<<__CSFLAG_delayed_runq_add)
+
+
+/*
+ * Useful macros
+ */
+#define CSCHED_VCPU(_vcpu) ((struct csched_vcpu *) (_vcpu)->sched_priv)
+#define CSCHED_DOM(_dom) ((struct csched_dom *) (_dom)->sched_priv)
+/* CPU to runq_id macro */
+#define c2r(_cpu) (csched_priv.runq_map[(_cpu)])
+/* CPU to runqueue struct macro */
+#define RQD(_cpu) (&csched_priv.rqd[c2r(_cpu)])
+
+/*
+ * Per-runqueue data
+ */
+struct csched_runqueue_data {
+ int id;
+ struct list_head runq; /* Ordered list of runnable vms */
+ struct list_head svc; /* List of all vcpus assigned to this runqueue */
+ int max_weight;
+ int cpu_min, cpu_max; /* Range of physical cpus this runqueue runs */
+};
+
+/*
+ * System-wide private data
+ */
+struct csched_private {
+ spinlock_t lock;
+ uint32_t ncpus;
+ struct domain *idle_domain;
+
+ struct list_head sdom; /* Used mostly for dump keyhandler. */
+
+ int runq_map[NR_CPUS];
+ uint32_t runq_count;
+ struct csched_runqueue_data rqd[NR_CPUS];
+};
+
+/*
+ * Virtual CPU
+ */
+struct csched_vcpu {
+ struct list_head rqd_elem; /* On the runqueue data list */
+ struct list_head sdom_elem; /* On the domain vcpu list */
+ struct list_head runq_elem; /* On the runqueue */
+
+ /* Up-pointers */
+ struct csched_dom *sdom;
+ struct vcpu *vcpu;
+
+ int weight;
+
+ int credit;
+ s_time_t start_time; /* When we were scheduled (used for credit) */
+ unsigned flags; /* 16 bits doesn't seem to play well with clear_bit() */
+
+};
+
+/*
+ * Domain
+ */
+struct csched_dom {
+ struct list_head vcpu;
+ struct list_head sdom_elem;
+ struct domain *dom;
+ uint16_t weight;
+ uint16_t nr_vcpus;
+};
+
+
+/*
+ * Global variables
+ */
+static struct csched_private csched_priv;
+
+/*
+ * Time-to-credit, credit-to-time.
+ * FIXME: Do pre-calculated division?
+ */
+static s_time_t t2c(struct csched_runqueue_data *rqd, s_time_t time, struct csched_vcpu *svc)
+{
+ return time * rqd->max_weight / svc->weight;
+}
+
+static s_time_t c2t(struct csched_runqueue_data *rqd, s_time_t credit, struct csched_vcpu *svc)
+{
+ return credit * svc->weight / rqd->max_weight;
+}
+
+/*
+ * Runqueue related code
+ */
+
+static /*inline*/ int
+__vcpu_on_runq(struct csched_vcpu *svc)
+{
+ return !list_empty(&svc->runq_elem);
+}
+
+static /*inline*/ struct csched_vcpu *
+__runq_elem(struct list_head *elem)
+{
+ return list_entry(elem, struct csched_vcpu, runq_elem);
+}
+
+static int
+__runq_insert(struct list_head *runq, struct csched_vcpu *svc)
+{
+ struct list_head *iter;
+ int pos = 0;
+
+ d2printk("rqi d%dv%d\n",
+ svc->vcpu->domain->domain_id,
+ svc->vcpu->vcpu_id);
+
+ /* Idle vcpus not allowed on the runqueue anymore */
+ BUG_ON(is_idle_vcpu(svc->vcpu));
+ BUG_ON(svc->vcpu->is_running);
+ BUG_ON(test_bit(__CSFLAG_scheduled, &svc->flags));
+
+ list_for_each( iter, runq )
+ {
+ struct csched_vcpu * iter_svc = __runq_elem(iter);
+
+ if ( svc->credit > iter_svc->credit )
+ {
+ d2printk(" p%d d%dv%d\n",
+ pos,
+ iter_svc->vcpu->domain->domain_id,
+ iter_svc->vcpu->vcpu_id);
+ break;
+ }
+ pos++;
+ }
+
+ list_add_tail(&svc->runq_elem, iter);
+
+ return pos;
+}
+
+static void
+runq_insert(unsigned int cpu, struct csched_vcpu *svc)
+{
+ struct list_head * runq = &RQD(cpu)->runq;
+ int pos = 0;
+
+ ASSERT( spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock) );
+
+ BUG_ON( __vcpu_on_runq(svc) );
+ BUG_ON( c2r(cpu) != c2r(svc->vcpu->processor) );
+
+ pos = __runq_insert(runq, svc);
+
+ {
+ struct {
+ unsigned dom:16,vcpu:16;
+ unsigned pos;
+ } d;
+ d.dom = svc->vcpu->domain->domain_id;
+ d.vcpu = svc->vcpu->vcpu_id;
+ d.pos = pos;
+ trace_var(TRC_CSCHED2_RUNQ_POS, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+
+ return;
+}
+
+static inline void
+__runq_remove(struct csched_vcpu *svc)
+{
+ BUG_ON( !__vcpu_on_runq(svc) );
+ list_del_init(&svc->runq_elem);
+}
+
+void burn_credits(struct csched_runqueue_data *rqd, struct csched_vcpu *, s_time_t);
+
+/* Check to see if the item on the runqueue is higher priority than what's
+ * currently running; if so, wake up the processor */
+static /*inline*/ void
+runq_tickle(unsigned int cpu, struct csched_vcpu *new, s_time_t now)
+{
+ int i, ipid=-1;
+ s_time_t lowest=(1<<30);
+ struct csched_runqueue_data *rqd = RQD(cpu);
+
+ d2printk("rqt d%dv%d cd%dv%d\n",
+ new->vcpu->domain->domain_id,
+ new->vcpu->vcpu_id,
+ current->domain->domain_id,
+ current->vcpu_id);
+
+ /* Find the cpu in this queue group that has the lowest credits */
+ for ( i=rqd->cpu_min ; i < rqd->cpu_max ; i++ )
+ {
+ struct csched_vcpu * cur;
+
+ /* Skip cpus that aren't online */
+ if ( !cpu_online(i) )
+ continue;
+
+ cur = CSCHED_VCPU(per_cpu(schedule_data, i).curr);
+
+ /* FIXME: keep track of idlers, chose from the mask */
+ if ( is_idle_vcpu(cur->vcpu) )
+ {
+ ipid = i;
+ lowest = CSCHED_IDLE_CREDIT;
+ break;
+ }
+ else
+ {
+ /* Update credits for current to see if we want to preempt */
+ burn_credits(rqd, cur, now);
+
+ if ( cur->credit < lowest )
+ {
+ ipid = i;
+ lowest = cur->credit;
+ }
+
+ /* TRACE */ {
+ struct {
+ unsigned dom:16,vcpu:16;
+ unsigned credit;
+ } d;
+ d.dom = cur->vcpu->domain->domain_id;
+ d.vcpu = cur->vcpu->vcpu_id;
+ d.credit = cur->credit;
+ trace_var(TRC_CSCHED2_TICKLE_CHECK, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+ }
+ }
+
+ if ( ipid != -1 )
+ {
+ int cdiff = lowest - new->credit;
+
+ if ( lowest == CSCHED_IDLE_CREDIT || cdiff < 0 ) {
+ d2printk("si %d\n", ipid);
+ cpu_raise_softirq(ipid, SCHEDULE_SOFTIRQ);
+ }
+ else
+ /* FIXME: Wake up later? */;
+ }
+}
+
+/*
+ * Credit-related code
+ */
+static void reset_credit(int cpu, s_time_t now)
+{
+ struct list_head *iter;
+
+ list_for_each( iter, &RQD(cpu)->svc )
+ {
+ struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, rqd_elem);
+
+ BUG_ON( is_idle_vcpu(svc->vcpu) );
+
+ /* "Clip" credits to max carryover */
+ if ( svc->credit > CSCHED_CARRYOVER_MAX )
+ svc->credit = CSCHED_CARRYOVER_MAX;
+ /* And add INIT */
+ svc->credit += CSCHED_CREDIT_INIT;
+ svc->start_time = now;
+
+ /* FIXME: Trace credit */
+ }
+
+ /* No need to resort runqueue, as everyone's order should be the same. */
+}
+
+void burn_credits(struct csched_runqueue_data *rqd, struct csched_vcpu *svc, s_time_t now)
+{
+ s_time_t delta;
+
+ /* Assert svc is current */
+ ASSERT(svc==CSCHED_VCPU(per_cpu(schedule_data, svc->vcpu->processor).curr));
+
+ if ( is_idle_vcpu(svc->vcpu) )
+ {
+ BUG_ON(svc->credit != CSCHED_IDLE_CREDIT);
+ return;
+ }
+
+ delta = now - svc->start_time;
+
+ if ( delta > 0 ) {
+ /* This will round down; should we consider rounding up...? */
+ svc->credit -= t2c(rqd, delta, svc);
+ svc->start_time = now;
+
+ d2printk("b d%dv%d c%d\n",
+ svc->vcpu->domain->domain_id,
+ svc->vcpu->vcpu_id,
+ svc->credit);
+ } else {
+ d2printk("%s: Time went backwards? now %"PRI_stime" start %"PRI_stime"\n",
+ __func__, now, svc->start_time);
+ }
+
+ /* TRACE */
+ {
+ struct {
+ unsigned dom:16,vcpu:16;
+ unsigned credit;
+ int delta;
+ } d;
+ d.dom = svc->vcpu->domain->domain_id;
+ d.vcpu = svc->vcpu->vcpu_id;
+ d.credit = svc->credit;
+ d.delta = delta;
+ trace_var(TRC_CSCHED2_CREDIT_BURN, 1,
+ sizeof(d),
+ (unsigned char *)&d);
+ }
+}
+
+/* Find the domain with the highest weight. */
+void update_max_weight(struct csched_runqueue_data *rqd, int new_weight, int old_weight)
+{
+ /* Try to avoid brute-force search:
+ * - If new_weight is larger, max_weigth <- new_weight
+ * - If old_weight != max_weight, someone else is still max_weight
+ * (No action required)
+ * - If old_weight == max_weight, brute-force search for max weight
+ */
+ if ( new_weight > rqd->max_weight )
+ {
+ rqd->max_weight = new_weight;
+ printk("%s: Runqueue id %d max weight %d\n", __func__, rqd->id, rqd->max_weight);
+ }
+ else if ( old_weight == rqd->max_weight )
+ {
+ struct list_head *iter;
+ int max_weight = 1;
+
+ list_for_each( iter, &rqd->svc )
+ {
+ struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, rqd_elem);
+
+ if ( svc->weight > max_weight )
+ max_weight = svc->weight;
+ }
+
+ rqd->max_weight = max_weight;
+ printk("%s: Runqueue %d max weight %d\n", __func__, rqd->id, rqd->max_weight);
+ }
+}
+
+#ifndef NDEBUG
+static /*inline*/ void
+__csched_vcpu_check(struct vcpu *vc)
+{
+ struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+ struct csched_dom * const sdom = svc->sdom;
+
+ BUG_ON( svc->vcpu != vc );
+ BUG_ON( sdom != CSCHED_DOM(vc->domain) );
+ if ( sdom )
+ {
+ BUG_ON( is_idle_vcpu(vc) );
+ BUG_ON( sdom->dom != vc->domain );
+ }
+ else
+ {
+ BUG_ON( !is_idle_vcpu(vc) );
+ }
+}
+#define CSCHED_VCPU_CHECK(_vc) (__csched_vcpu_check(_vc))
+#else
+#define CSCHED_VCPU_CHECK(_vc)
+#endif
+
+static int
+csched_vcpu_init(struct vcpu *vc)
+{
+ struct domain * const dom = vc->domain;
+ struct csched_dom *sdom = CSCHED_DOM(dom);
+ struct csched_vcpu *svc;
+
+ printk("%s: Initializing d%dv%d\n",
+ __func__, dom->domain_id, vc->vcpu_id);
+
+ /* Allocate per-VCPU info */
+ svc = xmalloc(struct csched_vcpu);
+ if ( svc == NULL )
+ return -1;
+
+ INIT_LIST_HEAD(&svc->rqd_elem);
+ INIT_LIST_HEAD(&svc->sdom_elem);
+ INIT_LIST_HEAD(&svc->runq_elem);
+
+ svc->sdom = sdom;
+ svc->vcpu = vc;
+ svc->flags = 0U;
+ vc->sched_priv = svc;
+
+ if ( ! is_idle_vcpu(vc) )
+ {
+ BUG_ON( sdom == NULL );
+
+ svc->credit = CSCHED_CREDIT_INIT;
+ svc->weight = sdom->weight;
+
+ /* FIXME: Do we need the private lock here? */
+ list_add_tail(&svc->sdom_elem, &sdom->vcpu);
+
+ /* Add vcpu to runqueue of initial processor */
+ /* FIXME: Abstract for multiple runqueues */
+ vcpu_schedule_lock_irq(vc);
+
+ list_add_tail(&svc->rqd_elem, &RQD(vc->processor)->svc);
+ update_max_weight(RQD(vc->processor), svc->weight, 0);
+
+ vcpu_schedule_unlock_irq(vc);
+
+ sdom->nr_vcpus++;
+ }
+ else
+ {
+ BUG_ON( sdom != NULL );
+ svc->credit = CSCHED_IDLE_CREDIT;
+ svc->weight = 0;
+ if ( csched_priv.idle_domain == NULL )
+ csched_priv.idle_domain = dom;
+ }
+
+ CSCHED_VCPU_CHECK(vc);
+ return 0;
+}
+
+static void
+csched_vcpu_destroy(struct vcpu *vc)
+{
+ struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+ struct csched_dom * const sdom = svc->sdom;
+
+ BUG_ON( sdom == NULL );
+ BUG_ON( !list_empty(&svc->runq_elem) );
+
+ /* Remove from runqueue */
+ vcpu_schedule_lock_irq(vc);
+
+ list_del_init(&svc->rqd_elem);
+ update_max_weight(RQD(vc->processor), 0, svc->weight);
+
+ vcpu_schedule_unlock_irq(vc);
+
+ /* Remove from sdom list. Don't need a lock for this, as it's called
+ * syncronously when nothing else can happen. */
+ list_del_init(&svc->sdom_elem);
+
+ sdom->nr_vcpus--;
+
+ xfree(svc);
+}
+
+static void
+csched_vcpu_sleep(struct vcpu *vc)
+{
+ struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+
+ BUG_ON( is_idle_vcpu(vc) );
+
+ if ( per_cpu(schedule_data, vc->processor).curr == vc )
+ cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
+ else if ( __vcpu_on_runq(svc) )
+ __runq_remove(svc);
+}
+
+static void
+csched_vcpu_wake(struct vcpu *vc)
+{
+ struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+ const unsigned int cpu = vc->processor;
+ s_time_t now = 0;
+
+ /* Schedule lock should be held at this point. */
+
+ d2printk("w d%dv%d\n", vc->domain->domain_id, vc->vcpu_id);
+
+ BUG_ON( is_idle_vcpu(vc) );
+
+ /* Make sure svc priority mod happens before runq check */
+ if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
+ {
+ goto out;
+ }
+
+ if ( unlikely(__vcpu_on_runq(svc)) )
+ {
+ /* If we've boosted someone that's already on a runqueue, prioritize
+ * it and inform the cpu in question. */
+ goto out;
+ }
+
+ /* If the context hasn't been saved for this vcpu yet, we can't put it on
+ * another runqueue. Instead, we set a flag so that it will be put on the runqueue
+ * after the context has been saved. */
+ if ( unlikely (test_bit(__CSFLAG_scheduled, &svc->flags) ) )
+ {
+ set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
+ goto out;
+ }
+
+ now = NOW();
+
+ /* Put the VCPU on the runq */
+ runq_insert(cpu, svc);
+ runq_tickle(cpu, svc, now);
+
+out:
+ d2printk("w-\n");
+ return;
+}
+
+static void
+csched_context_saved(struct vcpu *vc)
+{
+ struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+
+ vcpu_schedule_lock_irq(vc);
+
+ /* This vcpu is now eligible to be put on the runqueue again */
+ clear_bit(__CSFLAG_scheduled, &svc->flags);
+
+ /* If someone wants it on the runqueue, put it there. */
+ /*
+ * NB: We can get rid of CSFLAG_scheduled by checking for
+ * vc->is_running and __vcpu_on_runq(svc) here. However,
+ * since we're accessing the flags cacheline anyway,
+ * it seems a bit pointless; especially as we have plenty of
+ * bits free.
+ */
+ if ( test_bit(__CSFLAG_delayed_runq_add, &svc->flags) )
+ {
+ const unsigned int cpu = vc->processor;
+
+ clear_bit(__CSFLAG_delayed_runq_add, &svc->flags);
+
+ BUG_ON(__vcpu_on_runq(svc));
+
+ runq_insert(cpu, svc);
+ runq_tickle(cpu, svc, NOW());
+ }
+
+ vcpu_schedule_unlock_irq(vc);
+}
+
+static int
+csched_cpu_pick(struct vcpu *vc)
+{
+ /* FIXME: Chose a schedule group based on load */
+ /* FIXME: Migrate the vcpu to the new runqueue list, updating
+ max_weight for each runqueue */
+ return 0;
+}
+
+static int
+csched_dom_cntl(
+ struct domain *d,
+ struct xen_domctl_scheduler_op *op)
+{
+ struct csched_dom * const sdom = CSCHED_DOM(d);
+ unsigned long flags;
+
+ if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
+ {
+ op->u.credit2.weight = sdom->weight;
+ }
+ else
+ {
+ ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);
+
+ if ( op->u.credit2.weight != 0 )
+ {
+ struct list_head *iter;
+ int old_weight;
+
+ /* Must hold csched_priv lock to update sdom, runq lock to
+ * update csvcs. */
+ spin_lock_irqsave(&csched_priv.lock, flags);
+
+ old_weight = sdom->weight;
+
+ sdom->weight = op->u.credit2.weight;
+
+ /* Update weights for vcpus, and max_weight for runqueues on which they reside */
+ list_for_each ( iter, &sdom->vcpu )
+ {
+ struct csched_vcpu *svc = list_entry(iter, struct csched_vcpu, sdom_elem);
+
+ /* NB: Locking order is important here. Because we grab this lock here, we
+ * must never lock csched_priv.lock if we're holding a runqueue
+ * lock. */
+ vcpu_schedule_lock_irq(svc->vcpu);
+
+ svc->weight = sdom->weight;
+ update_max_weight(RQD(svc->vcpu->processor), svc->weight, old_weight);
+
+ vcpu_schedule_unlock_irq(svc->vcpu);
+ }
+
+ spin_unlock_irqrestore(&csched_priv.lock, flags);
+ }
+ }
+
+ return 0;
+}
+
+static int
+csched_dom_init(struct domain *dom)
+{
+ struct csched_dom *sdom;
+ int flags;
+
+ printk("%s: Initializing domain %d\n", __func__, dom->domain_id);
+
+ if ( is_idle_domain(dom) )
+ return 0;
+
+ sdom = xmalloc(struct csched_dom);
+ if ( sdom == NULL )
+ return -ENOMEM;
+
+ /* Initialize credit and weight */
+ INIT_LIST_HEAD(&sdom->vcpu);
+ INIT_LIST_HEAD(&sdom->sdom_elem);
+ sdom->dom = dom;
+ sdom->weight = CSCHED_DEFAULT_WEIGHT;
+ sdom->nr_vcpus = 0;
+
+ dom->sched_priv = sdom;
+
+ spin_lock_irqsave(&csched_priv.lock, flags);
+
+ list_add_tail(&sdom->sdom_elem, &csched_priv.sdom);
+
+ spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+ return 0;
+}
+
+static void
+csched_dom_destroy(struct domain *dom)
+{
+ struct csched_dom *sdom = CSCHED_DOM(dom);
+ int flags;
+
+ BUG_ON(!list_empty(&sdom->vcpu));
+
+ spin_lock_irqsave(&csched_priv.lock, flags);
+
+ list_del_init(&sdom->sdom_elem);
+
+ spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+ xfree(CSCHED_DOM(dom));
+}
+
+/* How long should we let this vcpu run for? */
+static s_time_t
+csched_runtime(int cpu, struct csched_vcpu *snext)
+{
+ s_time_t time = CSCHED_MAX_TIMER;
+ struct csched_runqueue_data *rqd = RQD(cpu);
+ struct list_head *runq = &rqd->runq;
+
+ if ( is_idle_vcpu(snext->vcpu) )
+ return CSCHED_MAX_TIMER;
+
+ /* Basic time */
+ time = c2t(rqd, snext->credit, snext);
+
+ /* Next guy on runqueue */
+ if ( ! list_empty(runq) )
+ {
+ struct csched_vcpu *svc = __runq_elem(runq->next);
+ s_time_t ntime;
+
+ if ( ! is_idle_vcpu(svc->vcpu) )
+ {
+ ntime = c2t(rqd, snext->credit - svc->credit, snext);
+
+ if ( time > ntime )
+ time = ntime;
+ }
+ }
+
+ /* Check limits */
+ if ( time < CSCHED_MIN_TIMER )
+ time = CSCHED_MIN_TIMER;
+ else if ( time > CSCHED_MAX_TIMER )
+ time = CSCHED_MAX_TIMER;
+
+ return time;
+}
+
+void __dump_execstate(void *unused);
+
+/*
+ * This function is in the critical path. It is designed to be simple and
+ * fast for the common case.
+ */
+static struct task_slice
+csched_schedule(s_time_t now)
+{
+ const int cpu = smp_processor_id();
+ struct csched_runqueue_data *rqd = RQD(cpu);
+ struct list_head * const runq = &rqd->runq;
+ struct csched_vcpu * const scurr = CSCHED_VCPU(current);
+ struct csched_vcpu *snext = NULL;
+ struct task_slice ret;
+
+ CSCHED_VCPU_CHECK(current);
+
+ d2printk("sc p%d c d%dv%d now %"PRI_stime"\n",
+ cpu,
+ scurr->vcpu->domain->domain_id,
+ scurr->vcpu->vcpu_id,
+ now);
+
+
+ /* Protected by runqueue lock */
+
+ /* Update credits */
+ burn_credits(rqd, scurr, now);
+
+ /*
+ * Select next runnable local VCPU (ie top of local runq).
+ *
+ * If the current vcpu is runnable, and has higher credit than
+ * the next guy on the queue (or there is noone else), we want to run him again.
+ *
+ * If the current vcpu is runnable, and the next guy on the queue
+ * has higher credit, we want to mark current for delayed runqueue
+ * add, and remove the next guy from the queue.
+ *
+ * If the current vcpu is not runnable, we want to chose the idle
+ * vcpu for this processor.
+ */
+ if ( list_empty(runq) )
+ snext = CSCHED_VCPU(csched_priv.idle_domain->vcpu[cpu]);
+ else
+ snext = __runq_elem(runq->next);
+
+ if ( !is_idle_vcpu(current) && vcpu_runnable(current) )
+ {
+ /* If the current vcpu is runnable, and has higher credit
+ * than the next on the runqueue, run him again.
+ * Otherwise, set him for delayed runq add. */
+ if ( scurr->credit > snext->credit)
+ snext = scurr;
+ else
+ set_bit(__CSFLAG_delayed_runq_add, &scurr->flags);
+ }
+
+ if ( snext != scurr && !is_idle_vcpu(snext->vcpu) )
+ {
+ __runq_remove(snext);
+ if ( snext->vcpu->is_running )
+ {
+ printk("p%d: snext d%dv%d running on p%d! scurr d%dv%d\n",
+ cpu,
+ snext->vcpu->domain->domain_id, snext->vcpu->vcpu_id,
+ snext->vcpu->processor,
+ scurr->vcpu->domain->domain_id,
+ scurr->vcpu->vcpu_id);
+ BUG();
+ }
+ set_bit(__CSFLAG_scheduled, &snext->flags);
+ }
+
+ if ( !is_idle_vcpu(snext->vcpu) && snext->credit <= CSCHED_CREDIT_RESET )
+ reset_credit(cpu, now);
+
+#if 0
+ /*
+ * Update idlers mask if necessary. When we're idling, other CPUs
+ * will tickle us when they get extra work.
+ */
+ if ( is_idle_vcpu(snext->vcpu) )
+ {
+ if ( !cpu_isset(cpu, csched_priv.idlers) )
+ cpu_set(cpu, csched_priv.idlers);
+ }
+ else if ( cpu_isset(cpu, csched_priv.idlers) )
+ {
+ cpu_clear(cpu, csched_priv.idlers);
+ }
+#endif
+
+ if ( !is_idle_vcpu(snext->vcpu) )
+ {
+ snext->start_time = now;
+ snext->vcpu->processor = cpu; /* Safe because lock for old processor is held */
+ }
+ /*
+ * Return task to run next...
+ */
+ ret.time = csched_runtime(cpu, snext);
+ ret.task = snext->vcpu;
+
+ CSCHED_VCPU_CHECK(ret.task);
+ return ret;
+}
+
+static void
+csched_dump_vcpu(struct csched_vcpu *svc)
+{
+ printk("[%i.%i] flags=%x cpu=%i",
+ svc->vcpu->domain->domain_id,
+ svc->vcpu->vcpu_id,
+ svc->flags,
+ svc->vcpu->processor);
+
+ printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight);
+
+ printk("\n");
+}
+
+static void
+csched_dump_pcpu(int cpu)
+{
+ struct list_head *runq, *iter;
+ struct csched_vcpu *svc;
+ int loop;
+ char cpustr[100];
+
+ /* FIXME: Do locking properly for access to runqueue structures */
+
+ runq = &RQD(cpu)->runq;
+
+ cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_map,cpu));
+ printk(" sibling=%s, ", cpustr);
+ cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_map,cpu));
+ printk("core=%s\n", cpustr);
+
+ /* current VCPU */
+ svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
+ if ( svc )
+ {
+ printk("\trun: ");
+ csched_dump_vcpu(svc);
+ }
+
+ loop = 0;
+ list_for_each( iter, runq )
+ {
+ svc = __runq_elem(iter);
+ if ( svc )
+ {
+ printk("\t%3d: ", ++loop);
+ csched_dump_vcpu(svc);
+ }
+ }
+}
+
+static void
+csched_dump(void)
+{
+ struct list_head *iter_sdom, *iter_svc;
+ int loop;
+
+ printk("info:\n"
+ "\tncpus = %u\n"
+ "\tdefault-weight = %d\n",
+ csched_priv.ncpus,
+ CSCHED_DEFAULT_WEIGHT);
+
+ /* FIXME: Locking! */
+
+ printk("active vcpus:\n");
+ loop = 0;
+ list_for_each( iter_sdom, &csched_priv.sdom )
+ {
+ struct csched_dom *sdom;
+ sdom = list_entry(iter_sdom, struct csched_dom, sdom_elem);
+
+ list_for_each( iter_svc, &sdom->vcpu )
+ {
+ struct csched_vcpu *svc;
+ svc = list_entry(iter_svc, struct csched_vcpu, sdom_elem);
+
+ printk("\t%3d: ", ++loop);
+ csched_dump_vcpu(svc);
+ }
+ }
+}
+
+static void
+make_runq_map(void)
+{
+ int cpu, cpu_count=0;
+
+ /* FIXME: Read pcpu layout and do this properly */
+ for_each_possible_cpu( cpu )
+ {
+ csched_priv.runq_map[cpu] = 0;
+ cpu_count++;
+ }
+ csched_priv.runq_count = 1;
+
+ /* Move to the init code...? */
+ csched_priv.rqd[0].cpu_min = 0;
+ csched_priv.rqd[0].cpu_max = cpu_count;
+}
+
+static void
+csched_init(void)
+{
+ int i;
+
+ printk("Initializing Credit2 scheduler\n" \
+ " WARNING: This is experimental software in development.\n" \
+ " Use at your own risk.\n");
+
+ spin_lock_init(&csched_priv.lock);
+ INIT_LIST_HEAD(&csched_priv.sdom);
+
+ csched_priv.ncpus = 0;
+
+ make_runq_map();
+
+ for ( i=0; i<csched_priv.runq_count ; i++ )
+ {
+ struct csched_runqueue_data *rqd = csched_priv.rqd + i;
+
+ rqd->max_weight = 1;
+ rqd->id = i;
+ INIT_LIST_HEAD(&rqd->svc);
+ INIT_LIST_HEAD(&rqd->runq);
+ }
+
+ /* Initialize pcpu structures */
+ for_each_possible_cpu(i)
+ {
+ int runq_id;
+ spinlock_t *lock;
+
+ /* Point the per-cpu schedule lock to the runq_id lock */
+ runq_id = csched_priv.runq_map[i];
+ lock = &per_cpu(schedule_data, runq_id)._lock;
+
+ per_cpu(schedule_data, i).schedule_lock = lock;
+
+ csched_priv.ncpus++;
+ }
+}
+
+struct scheduler sched_credit2_def = {
+ .name = "SMP Credit Scheduler rev2",
+ .opt_name = "credit2",
+ .sched_id = XEN_SCHEDULER_CREDIT2,
+
+ .init_domain = csched_dom_init,
+ .destroy_domain = csched_dom_destroy,
+
+ .init_vcpu = csched_vcpu_init,
+ .destroy_vcpu = csched_vcpu_destroy,
+
+ .sleep = csched_vcpu_sleep,
+ .wake = csched_vcpu_wake,
+
+ .adjust = csched_dom_cntl,
+
+ .pick_cpu = csched_cpu_pick,
+ .do_schedule = csched_schedule,
+ .context_saved = csched_context_saved,
+
+ .dump_cpu_state = csched_dump_pcpu,
+ .dump_settings = csched_dump,
+ .init = csched_init,
+};
diff -r dca0baecc909 -r 9b91a253528b xen/common/schedule.c
--- a/xen/common/schedule.c Wed Apr 14 12:06:05 2010 +0100
+++ b/xen/common/schedule.c Wed Apr 14 12:07:21 2010 +0100
@@ -56,9 +56,11 @@ DEFINE_PER_CPU(struct schedule_data, sch

extern const struct scheduler sched_sedf_def;
extern const struct scheduler sched_credit_def;
+extern const struct scheduler sched_credit2_def;
static const struct scheduler *__initdata schedulers[] = {
&sched_sedf_def,
&sched_credit_def,
+ &sched_credit2_def,
NULL
};

diff -r dca0baecc909 -r 9b91a253528b xen/include/public/domctl.h
--- a/xen/include/public/domctl.h Wed Apr 14 12:06:05 2010 +0100
+++ b/xen/include/public/domctl.h Wed Apr 14 12:07:21 2010 +0100
@@ -303,6 +303,7 @@ DEFINE_XEN_GUEST_HANDLE(xen_domctl_max_v
/* Scheduler types. */
#define XEN_SCHEDULER_SEDF 4
#define XEN_SCHEDULER_CREDIT 5
+#define XEN_SCHEDULER_CREDIT2 6
/* Set or get info? */
#define XEN_DOMCTL_SCHEDOP_putinfo 0
#define XEN_DOMCTL_SCHEDOP_getinfo 1
@@ -321,6 +322,9 @@ struct xen_domctl_scheduler_op {
uint16_t weight;
uint16_t cap;
} credit;
+ struct xen_domctl_sched_credit2 {
+ uint16_t weight;
+ } credit2;
} u;
};
typedef struct xen_domctl_scheduler_op xen_domctl_scheduler_op_t;

_______________________________________________
Xen-changelog mailing list
Xen-changelog@lists.xensource.com
http://lists.xensource.com/xen-changelog