# HG changeset patch
# User kaf24@firebug.cl.cam.ac.uk
# Node ID 6f62ad959f6b6d7fd98b2adfb3f07a9a15613e07
# Parent c9772105fead52abf64213aa1eda6419871acf94
Support CFQ scheduling of guest block requests by creating
a kernel thread per blkif connection. General cleanup work
in blkback driver.

Signed-off-by: Gerd Knorr <kraxel@suse.de>
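
The gist of the change: instead of one global xenblkd daemon batching work
across every block interface, each blkif connection now gets its own kernel
thread, so the CFQ elevator can treat each guest's I/O as coming from a
distinct process. Stripped of the blkback specifics, the per-connection
thread pattern boils down to roughly the sketch below (my_conn and friends
are invented names for illustration only; just the kthread/waitqueue calls
correspond to what the patch actually uses):

#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/err.h>
#include <asm/atomic.h>

struct my_conn {
	wait_queue_head_t   wq;
	atomic_t            io_pending;
	struct task_struct *thread;
};

static int my_conn_thread(void *arg)
{
	struct my_conn *conn = arg;

	while (!kthread_should_stop()) {
		/* Sleep until the interrupt handler flags new requests. */
		wait_event_interruptible(conn->wq,
					 atomic_read(&conn->io_pending) ||
					 kthread_should_stop());
		atomic_set(&conn->io_pending, 0);
		/* ... consume this connection's ring, submit bios ... */
	}
	return 0;
}

static int my_conn_start(struct my_conn *conn, int id)
{
	init_waitqueue_head(&conn->wq);
	atomic_set(&conn->io_pending, 0);
	conn->thread = kthread_run(my_conn_thread, conn, "myblkd.%d", id);
	if (IS_ERR(conn->thread)) {
		int err = PTR_ERR(conn->thread);
		conn->thread = NULL;
		return err;
	}
	return 0;
}

static void my_conn_stop(struct my_conn *conn)
{
	if (conn->thread)
		kthread_stop(conn->thread); /* wakes the thread, waits for exit */
}

The producer side only has to do atomic_inc(&conn->io_pending);
wake_up(&conn->wq); which is exactly what blkif_be_int() is changed
to do below.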

diff -r c9772105fead -r 6f62ad959f6b linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Thu Dec 8 15:04:41 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/blkback.c Thu Dec 8 15:53:53 2005
@@ -12,6 +12,8 @@
*/

#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
#include <asm-xen/balloon.h>
#include <asm/hypervisor.h>
#include "common.h"
@@ -21,26 +23,26 @@
* pulled from a communication ring are quite likely to end up being part of
* the same scatter/gather request at the disc.
*
- * ** TRY INCREASING 'MAX_PENDING_REQS' IF WRITE SPEEDS SEEM TOO LOW **
+ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
+ *
* This will increase the chances of being able to write whole tracks.
* 64 should be enough to keep us competitive with Linux.
*/
-#define MAX_PENDING_REQS 64
-#define BATCH_PER_DOMAIN 16
-
-static unsigned long mmap_vstart;
-#define MMAP_PAGES \
- (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST)
-#ifdef __ia64__
-static void *pending_vaddrs[MMAP_PAGES];
-#define MMAP_VADDR(_idx, _i) \
- (unsigned long)(pending_vaddrs[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
-#else
-#define MMAP_VADDR(_req,_seg) \
- (mmap_vstart + \
- ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \
- ((_seg) * PAGE_SIZE))
-#endif
+static int blkif_reqs = 64;
+static int mmap_pages;
+
+static int __init set_blkif_reqs(char *str)
+{
+ get_option(&str, &blkif_reqs);
+ return 1;
+}
+__setup("blkif_reqs=", set_blkif_reqs);
+
+/* Run-time switchable: /sys/module/blkback/parameters/ */
+static unsigned int log_stats = 0;
+static unsigned int debug_lvl = 0;
+module_param(log_stats, int, 0644);
+module_param(debug_lvl, int, 0644);

/*
* Each outstanding request that we've passed to the lower device layers has a
@@ -55,43 +57,33 @@
atomic_t pendcnt;
unsigned short operation;
int status;
+ struct list_head free_list;
} pending_req_t;

-/*
- * We can't allocate pending_req's in order, since they may complete out of
- * order. We therefore maintain an allocation ring. This ring also indicates
- * when enough work has been passed down -- at that point the allocation ring
- * will be empty.
- */
-static pending_req_t pending_reqs[MAX_PENDING_REQS];
-static unsigned char pending_ring[MAX_PENDING_REQS];
-static spinlock_t pend_prod_lock = SPIN_LOCK_UNLOCKED;
-/* NB. We use a different index type to differentiate from shared blk rings. */
-typedef unsigned int PEND_RING_IDX;
-#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1))
-static PEND_RING_IDX pending_prod, pending_cons;
-#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons)
-
-static request_queue_t *plugged_queue;
-static inline void flush_plugged_queue(void)
-{
- request_queue_t *q = plugged_queue;
- if (q != NULL) {
- if ( q->unplug_fn != NULL )
- q->unplug_fn(q);
- blk_put_queue(q);
- plugged_queue = NULL;
- }
-}
-
-/* When using grant tables to map a frame for device access then the
- * handle returned must be used to unmap the frame. This is needed to
- * drop the ref count on the frame.
- */
-static grant_handle_t pending_grant_handles[MMAP_PAGES];
-#define pending_handle(_idx, _i) \
- (pending_grant_handles[((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) + (_i)])
+static pending_req_t *pending_reqs;
+static struct list_head pending_free;
+static spinlock_t pending_free_lock = SPIN_LOCK_UNLOCKED;
+static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq);
+
#define BLKBACK_INVALID_HANDLE (~0)
+
+static unsigned long mmap_vstart;
+static unsigned long *pending_vaddrs;
+static grant_handle_t *pending_grant_handles;
+
+static inline int vaddr_pagenr(pending_req_t *req, int seg)
+{
+ return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
+}
+
+static inline unsigned long vaddr(pending_req_t *req, int seg)
+{
+ return pending_vaddrs[vaddr_pagenr(req, seg)];
+}
+
+#define pending_handle(_req, _seg) \
+ (pending_grant_handles[vaddr_pagenr(_req, _seg)])
+

#ifdef CONFIG_XEN_BLKDEV_TAP_BE
/*
@@ -105,26 +97,79 @@
static inline domid_t ID_TO_DOM(unsigned long id) { return (id >> 16); }
#endif

-static int do_block_io_op(blkif_t *blkif, int max_to_do);
-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req);
+static int do_block_io_op(blkif_t *blkif);
+static void dispatch_rw_block_io(blkif_t *blkif,
+ blkif_request_t *req,
+ pending_req_t *pending_req);
static void make_response(blkif_t *blkif, unsigned long id,
unsigned short op, int st);

-static void fast_flush_area(int idx, int nr_pages)
+/******************************************************************
+ * misc small helpers
+ */
+static pending_req_t* alloc_req(void)
+{
+ pending_req_t *req = NULL;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pending_free_lock, flags);
+ if (!list_empty(&pending_free)) {
+ req = list_entry(pending_free.next, pending_req_t, free_list);
+ list_del(&req->free_list);
+ }
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+ return req;
+}
+
+static void free_req(pending_req_t *req)
+{
+ unsigned long flags;
+ int was_empty;
+
+ spin_lock_irqsave(&pending_free_lock, flags);
+ was_empty = list_empty(&pending_free);
+ list_add(&req->free_list, &pending_free);
+ spin_unlock_irqrestore(&pending_free_lock, flags);
+ if (was_empty)
+ wake_up(&pending_free_wq);
+}
+
+static void unplug_queue(blkif_t *blkif)
+{
+ if (blkif->plug == NULL)
+ return;
+ if (blkif->plug->unplug_fn)
+ blkif->plug->unplug_fn(blkif->plug);
+ blk_put_queue(blkif->plug);
+ blkif->plug = NULL;
+}
+
+static void plug_queue(blkif_t *blkif, struct bio *bio)
+{
+ request_queue_t *q = bdev_get_queue(bio->bi_bdev);
+
+ if (q == blkif->plug)
+ return;
+ unplug_queue(blkif);
+ blk_get_queue(q);
+ blkif->plug = q;
+}
+
+static void fast_flush_area(pending_req_t *req)
{
struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
unsigned int i, invcount = 0;
grant_handle_t handle;
int ret;

- for (i = 0; i < nr_pages; i++) {
- handle = pending_handle(idx, i);
+ for (i = 0; i < req->nr_pages; i++) {
+ handle = pending_handle(req, i);
if (handle == BLKBACK_INVALID_HANDLE)
continue;
- unmap[invcount].host_addr = MMAP_VADDR(idx, i);
+ unmap[invcount].host_addr = vaddr(req, i);
unmap[invcount].dev_bus_addr = 0;
unmap[invcount].handle = handle;
- pending_handle(idx, i) = BLKBACK_INVALID_HANDLE;
+ pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
invcount++;
}

@@ -133,109 +178,83 @@
BUG_ON(ret);
}

-
-/******************************************************************
- * BLOCK-DEVICE SCHEDULER LIST MAINTENANCE
- */
-
-static struct list_head blkio_schedule_list;
-static spinlock_t blkio_schedule_list_lock;
-
-static int __on_blkdev_list(blkif_t *blkif)
-{
- return blkif->blkdev_list.next != NULL;
-}
-
-static void remove_from_blkdev_list(blkif_t *blkif)
-{
- unsigned long flags;
-
- if (!__on_blkdev_list(blkif))
- return;
-
- spin_lock_irqsave(&blkio_schedule_list_lock, flags);
- if (__on_blkdev_list(blkif)) {
- list_del(&blkif->blkdev_list);
- blkif->blkdev_list.next = NULL;
- blkif_put(blkif);
- }
- spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-static void add_to_blkdev_list_tail(blkif_t *blkif)
-{
- unsigned long flags;
-
- if (__on_blkdev_list(blkif))
- return;
-
- spin_lock_irqsave(&blkio_schedule_list_lock, flags);
- if (!__on_blkdev_list(blkif) && (blkif->status == CONNECTED)) {
- list_add_tail(&blkif->blkdev_list, &blkio_schedule_list);
- blkif_get(blkif);
- }
- spin_unlock_irqrestore(&blkio_schedule_list_lock, flags);
-}
-
-
/******************************************************************
* SCHEDULER FUNCTIONS
*/

-static DECLARE_WAIT_QUEUE_HEAD(blkio_schedule_wait);
-
-static int blkio_schedule(void *arg)
-{
- DECLARE_WAITQUEUE(wq, current);
-
- blkif_t *blkif;
- struct list_head *ent;
-
- daemonize("xenblkd");
-
+static void print_stats(blkif_t *blkif)
+{
+ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d\n",
+ current->comm, blkif->st_oo_req,
+ blkif->st_rd_req, blkif->st_wr_req);
+ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+ blkif->st_rd_req = 0;
+ blkif->st_wr_req = 0;
+ blkif->st_oo_req = 0;
+}
+
+int blkif_schedule(void *arg)
+{
+ blkif_t *blkif = arg;
+
+ blkif_get(blkif);
+ if (debug_lvl)
+ printk(KERN_DEBUG "%s: started\n", current->comm);
for (;;) {
- /* Wait for work to do. */
- add_wait_queue(&blkio_schedule_wait, &wq);
- set_current_state(TASK_INTERRUPTIBLE);
- if ( (NR_PENDING_REQS == MAX_PENDING_REQS) ||
- list_empty(&blkio_schedule_list) )
- schedule();
- __set_current_state(TASK_RUNNING);
- remove_wait_queue(&blkio_schedule_wait, &wq);
-
- /* Queue up a batch of requests. */
- while ((NR_PENDING_REQS < MAX_PENDING_REQS) &&
- !list_empty(&blkio_schedule_list)) {
- ent = blkio_schedule_list.next;
- blkif = list_entry(ent, blkif_t, blkdev_list);
- blkif_get(blkif);
- remove_from_blkdev_list(blkif);
- if (do_block_io_op(blkif, BATCH_PER_DOMAIN))
- add_to_blkdev_list_tail(blkif);
- blkif_put(blkif);
- }
-
- /* Push the batch through to disc. */
- flush_plugged_queue();
- }
-}
-
-static void maybe_trigger_blkio_schedule(void)
-{
- /*
- * Needed so that two processes, which together make the following
- * predicate true, don't both read stale values and evaluate the
- * predicate incorrectly. Incredibly unlikely to stall the scheduler
- * on x86, but...
- */
- smp_mb();
-
- if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) &&
- !list_empty(&blkio_schedule_list))
- wake_up(&blkio_schedule_wait);
-}
-
-
+ if (kthread_should_stop()) {
+ /* asked to quit? */
+ if (!atomic_read(&blkif->io_pending))
+ break;
+ if (debug_lvl)
+ printk(KERN_DEBUG "%s: I/O pending, "
+ "delaying exit\n", current->comm);
+ }
+
+ if (!atomic_read(&blkif->io_pending)) {
+ /* Wait for work to do. */
+ wait_event_interruptible(
+ blkif->wq,
+ (atomic_read(&blkif->io_pending) ||
+ kthread_should_stop()));
+ } else if (list_empty(&pending_free)) {
+ /* Wait for pending_req becoming available. */
+ wait_event_interruptible(
+ pending_free_wq,
+ !list_empty(&pending_free));
+ }
+
+ if (blkif->status != CONNECTED) {
+ /* make sure we are connected */
+ if (debug_lvl)
+ printk(KERN_DEBUG "%s: not connected "
+ "(%d pending)\n",
+ current->comm,
+ atomic_read(&blkif->io_pending));
+ wait_event_interruptible(
+ blkif->wq,
+ (blkif->status == CONNECTED ||
+ kthread_should_stop()));
+ continue;
+ }
+
+ /* Schedule I/O */
+ atomic_set(&blkif->io_pending, 0);
+ if (do_block_io_op(blkif))
+ atomic_inc(&blkif->io_pending);
+ unplug_queue(blkif);
+
+ if (log_stats && time_after(jiffies, blkif->st_print))
+ print_stats(blkif);
+ }
+
+ if (log_stats)
+ print_stats(blkif);
+ if (debug_lvl)
+ printk(KERN_DEBUG "%s: exiting\n", current->comm);
+ blkif->xenblkd = NULL;
+ blkif_put(blkif);
+ return 0;
+}

/******************************************************************
* COMPLETION CALLBACK -- Called as bh->b_end_io()
@@ -243,8 +262,6 @@

static void __end_block_io_op(pending_req_t *pending_req, int uptodate)
{
- unsigned long flags;
-
/* An error fails the entire request. */
if (!uptodate) {
DPRINTK("Buffer not up-to-date at end of operation\n");
@@ -252,15 +269,11 @@
}

if (atomic_dec_and_test(&pending_req->pendcnt)) {
- int pending_idx = pending_req - pending_reqs;
- fast_flush_area(pending_idx, pending_req->nr_pages);
+ fast_flush_area(pending_req);
make_response(pending_req->blkif, pending_req->id,
pending_req->operation, pending_req->status);
blkif_put(pending_req->blkif);
- spin_lock_irqsave(&pend_prod_lock, flags);
- pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
- spin_unlock_irqrestore(&pend_prod_lock, flags);
- maybe_trigger_blkio_schedule();
+ free_req(pending_req);
}
}

@@ -281,8 +294,9 @@
irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs)
{
blkif_t *blkif = dev_id;
- add_to_blkdev_list_tail(blkif);
- maybe_trigger_blkio_schedule();
+
+ atomic_inc(&blkif->io_pending);
+ wake_up(&blkif->wq);
return IRQ_HANDLED;
}

@@ -292,10 +306,11 @@
* DOWNWARD CALLS -- These interface with the block-device layer proper.
*/

-static int do_block_io_op(blkif_t *blkif, int max_to_do)
+static int do_block_io_op(blkif_t *blkif)
{
blkif_back_ring_t *blk_ring = &blkif->blk_ring;
blkif_request_t *req;
+ pending_req_t *pending_req;
RING_IDX rc, rp;
int more_to_do = 0;

@@ -304,8 +319,10 @@
rmb(); /* Ensure we see queued requests up to 'rp'. */

while ((rc != rp) && !RING_REQUEST_CONS_OVERFLOW(blk_ring, rc)) {
- if ((max_to_do-- == 0) ||
- (NR_PENDING_REQS == MAX_PENDING_REQS)) {
+
+ pending_req = alloc_req();
+ if (NULL == pending_req) {
+ blkif->st_oo_req++;
more_to_do = 1;
break;
}
@@ -315,28 +332,31 @@

switch (req->operation) {
case BLKIF_OP_READ:
+ blkif->st_rd_req++;
+ dispatch_rw_block_io(blkif, req, pending_req);
+ break;
case BLKIF_OP_WRITE:
- dispatch_rw_block_io(blkif, req);
+ blkif->st_wr_req++;
+ dispatch_rw_block_io(blkif, req, pending_req);
break;
-
default:
DPRINTK("error: unknown block io operation [%d]\n",
req->operation);
make_response(blkif, req->id, req->operation,
BLKIF_RSP_ERROR);
+ free_req(pending_req);
break;
}
}
-
return more_to_do;
}

-static void dispatch_rw_block_io(blkif_t *blkif, blkif_request_t *req)
+static void dispatch_rw_block_io(blkif_t *blkif,
+ blkif_request_t *req,
+ pending_req_t *pending_req)
{
extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]);
int operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ;
- int i, pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)];
- pending_req_t *pending_req;
struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
struct phys_req preq;
struct {
@@ -344,32 +364,36 @@
} seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
unsigned int nseg;
struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
- int nbio = 0;
- request_queue_t *q;
- int ret, errors = 0;
+ int ret, i, nbio = 0;

/* Check that number of segments is sane. */
nseg = req->nr_segments;
if (unlikely(nseg == 0) ||
unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
DPRINTK("Bad number of segments in request (%d)\n", nseg);
- goto bad_descriptor;
+ goto fail_response;
}

preq.dev = req->handle;
preq.sector_number = req->sector_number;
preq.nr_sects = 0;

+ pending_req->blkif = blkif;
+ pending_req->id = req->id;
+ pending_req->operation = operation;
+ pending_req->status = BLKIF_RSP_OKAY;
+ pending_req->nr_pages = nseg;
+
for (i = 0; i < nseg; i++) {
seg[i].nsec = req->seg[i].last_sect -
req->seg[i].first_sect + 1;

if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
(seg[i].nsec <= 0))
- goto bad_descriptor;
+ goto fail_response;
preq.nr_sects += seg[i].nsec;

- map[i].host_addr = MMAP_VADDR(pending_idx, i);
+ map[i].host_addr = vaddr(pending_req, i);
map[i].dom = blkif->domid;
map[i].ref = req->seg[i].gref;
map[i].flags = GNTMAP_host_map;
@@ -381,26 +405,22 @@
BUG_ON(ret);

for (i = 0; i < nseg; i++) {
- if (likely(map[i].status == 0)) {
- pending_handle(pending_idx, i) = map[i].handle;
+ if (unlikely(map[i].status != 0)) {
+ DPRINTK("invalid buffer -- could not remap it\n");
+ goto fail_flush;
+ }
+
+ pending_handle(pending_req, i) = map[i].handle;
#ifdef __ia64__
- MMAP_VADDR(pending_idx,i) = gnttab_map_vaddr(map[i]);
+ pending_vaddrs[vaddr_pagenr(pending_req, i)] =
+ gnttab_map_vaddr(map[i]);
#else
- set_phys_to_machine(__pa(MMAP_VADDR(
- pending_idx, i)) >> PAGE_SHIFT,
- FOREIGN_FRAME(map[i].dev_bus_addr>>PAGE_SHIFT));
+ set_phys_to_machine(__pa(vaddr(
+ pending_req, i)) >> PAGE_SHIFT,
+ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT));
#endif
- seg[i].buf = map[i].dev_bus_addr |
- (req->seg[i].first_sect << 9);
- } else {
- errors++;
- }
- }
-
- if (errors) {
- DPRINTK("invalid buffer -- could not remap it\n");
- fast_flush_area(pending_idx, nseg);
- goto bad_descriptor;
+ seg[i].buf = map[i].dev_bus_addr |
+ (req->seg[i].first_sect << 9);
}

if (vbd_translate(&preq, blkif, operation) != 0) {
@@ -408,37 +428,25 @@
operation == READ ? "read" : "write",
preq.sector_number,
preq.sector_number + preq.nr_sects, preq.dev);
- goto bad_descriptor;
- }
-
- pending_req = &pending_reqs[pending_idx];
- pending_req->blkif = blkif;
- pending_req->id = req->id;
- pending_req->operation = operation;
- pending_req->status = BLKIF_RSP_OKAY;
- pending_req->nr_pages = nseg;
+ goto fail_flush;
+ }

for (i = 0; i < nseg; i++) {
if (((int)preq.sector_number|(int)seg[i].nsec) &
((bdev_hardsect_size(preq.bdev) >> 9) - 1)) {
DPRINTK("Misaligned I/O request from domain %d",
blkif->domid);
- goto cleanup_and_fail;
+ goto fail_put_bio;
}

while ((bio == NULL) ||
(bio_add_page(bio,
- virt_to_page(MMAP_VADDR(pending_idx, i)),
+ virt_to_page(vaddr(pending_req, i)),
seg[i].nsec << 9,
seg[i].buf & ~PAGE_MASK) == 0)) {
bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i);
- if (unlikely(bio == NULL)) {
- cleanup_and_fail:
- for (i = 0; i < (nbio-1); i++)
- bio_put(biolist[i]);
- fast_flush_area(pending_idx, nseg);
- goto bad_descriptor;
- }
+ if (unlikely(bio == NULL))
+ goto fail_put_bio;

bio->bi_bdev = preq.bdev;
bio->bi_private = pending_req;
@@ -449,14 +457,8 @@
preq.sector_number += seg[i].nsec;
}

- if ((q = bdev_get_queue(bio->bi_bdev)) != plugged_queue) {
- flush_plugged_queue();
- blk_get_queue(q);
- plugged_queue = q;
- }
-
+ plug_queue(blkif, bio);
atomic_set(&pending_req->pendcnt, nbio);
- pending_cons++;
blkif_get(blkif);

for (i = 0; i < nbio; i++)
@@ -464,8 +466,14 @@

return;

- bad_descriptor:
+ fail_put_bio:
+ for (i = 0; i < (nbio-1); i++)
+ bio_put(biolist[i]);
+ fail_flush:
+ fast_flush_area(pending_req);
+ fail_response:
make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR);
+ free_req(pending_req);
}


@@ -481,6 +489,7 @@
blkif_response_t *resp;
unsigned long flags;
blkif_back_ring_t *blk_ring = &blkif->blk_ring;
+ int more_to_do = 0;
int notify;

spin_lock_irqsave(&blkif->blk_ring_lock, flags);
@@ -499,76 +508,69 @@
* notifications if requests are already in flight (lower
* overheads and promotes batching).
*/
- int more_to_do;
RING_FINAL_CHECK_FOR_REQUESTS(blk_ring, more_to_do);
- if (more_to_do) {
- add_to_blkdev_list_tail(blkif);
- maybe_trigger_blkio_schedule();
- }
- }
- else if (!__on_blkdev_list(blkif)
- && RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
- /* Keep pulling requests as they become available... */
- add_to_blkdev_list_tail(blkif);
- maybe_trigger_blkio_schedule();
- }
-
+
+ } else if (RING_HAS_UNCONSUMED_REQUESTS(blk_ring)) {
+ more_to_do = 1;
+
+ }
spin_unlock_irqrestore(&blkif->blk_ring_lock, flags);

+ if (more_to_do) {
+ atomic_inc(&blkif->io_pending);
+ wake_up(&blkif->wq);
+ }
if (notify)
notify_remote_via_irq(blkif->irq);
}

-void blkif_deschedule(blkif_t *blkif)
-{
- remove_from_blkdev_list(blkif);
-}
-
static int __init blkif_init(void)
{
+ struct page *page;
int i;
- struct page *page;
- int ret;
-
- for (i = 0; i < MMAP_PAGES; i++)
- pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;

if (xen_init() < 0)
return -ENODEV;

+ mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
+ pending_reqs = kmalloc(sizeof(pending_reqs[0]) *
+ blkif_reqs, GFP_KERNEL);
+ pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) *
+ mmap_pages, GFP_KERNEL);
+ pending_vaddrs = kmalloc(sizeof(pending_vaddrs[0]) *
+ mmap_pages, GFP_KERNEL);
+ if (!pending_reqs || !pending_grant_handles || !pending_vaddrs) {
+ printk("%s: out of memory\n", __FUNCTION__);
+ return -1;
+ }
+
blkif_interface_init();
-
+
#ifdef __ia64__
- {
- extern unsigned long alloc_empty_foreign_map_page_range(unsigned long pages);
- int i;
-
- mmap_vstart = alloc_empty_foreign_map_page_range(MMAP_PAGES);
- printk("Allocated mmap_vstart: 0x%lx\n", mmap_vstart);
- for(i = 0; i < MMAP_PAGES; i++)
- pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
- BUG_ON(mmap_vstart == NULL);
- }
-#else
- page = balloon_alloc_empty_page_range(MMAP_PAGES);
+ extern unsigned long alloc_empty_foreign_map_page_range(
+ unsigned long pages);
+ mmap_vstart = (unsigned long)
+ alloc_empty_foreign_map_page_range(mmap_pages);
+#else /* ! ia64 */
+ page = balloon_alloc_empty_page_range(mmap_pages);
BUG_ON(page == NULL);
mmap_vstart = (unsigned long)pfn_to_kaddr(page_to_pfn(page));
#endif
-
- pending_cons = 0;
- pending_prod = MAX_PENDING_REQS;
+ printk("%s: reqs=%d, pages=%d, mmap_vstart=0x%lx\n",
+ __FUNCTION__, blkif_reqs, mmap_pages, mmap_vstart);
+ BUG_ON(mmap_vstart == 0);
+ for (i = 0; i < mmap_pages; i++) {
+ pending_vaddrs[i] = mmap_vstart + (i << PAGE_SHIFT);
+ pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
+ }
+
- memset(pending_reqs, 0, sizeof(pending_reqs));
+ memset(pending_reqs, 0, blkif_reqs * sizeof(pending_reqs[0]));
- for (i = 0; i < MAX_PENDING_REQS; i++)
- pending_ring[i] = i;
+ INIT_LIST_HEAD(&pending_free);
+
+ for (i = 0; i < blkif_reqs; i++)
+ list_add_tail(&pending_reqs[i].free_list, &pending_free);

- spin_lock_init(&blkio_schedule_list_lock);
- INIT_LIST_HEAD(&blkio_schedule_list);
-
- ret = kernel_thread(blkio_schedule, NULL, CLONE_FS | CLONE_FILES);
- BUG_ON(ret < 0);
-
blkif_xenbus_init();
-
return 0;
}
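
The other structural change in blkback.c above is the request pool: the old
pending_ring index ring is replaced by a kmalloc'ed array of request
descriptors whose free entries are chained on a list, guarded by a spinlock,
plus a wait queue so the service thread can sleep when the pool runs dry.
In isolation the pattern is just the following (an illustrative sketch with
made-up names, not code lifted from the changeset):

#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct pool_entry {
	struct list_head free_list;
	/* ... per-request state ... */
};

static struct pool_entry pool_entries[64];
static LIST_HEAD(pool_free);
static spinlock_t pool_lock = SPIN_LOCK_UNLOCKED;
static DECLARE_WAIT_QUEUE_HEAD(pool_wq);

static void pool_init(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(pool_entries); i++)
		list_add_tail(&pool_entries[i].free_list, &pool_free);
}

static struct pool_entry *pool_get(void)
{
	struct pool_entry *e = NULL;
	unsigned long flags;

	spin_lock_irqsave(&pool_lock, flags);
	if (!list_empty(&pool_free)) {
		e = list_entry(pool_free.next, struct pool_entry, free_list);
		list_del(&e->free_list);
	}
	spin_unlock_irqrestore(&pool_lock, flags);
	return e;	/* NULL means the pool is exhausted */
}

static void pool_put(struct pool_entry *e)
{
	unsigned long flags;
	int was_empty;

	spin_lock_irqsave(&pool_lock, flags);
	was_empty = list_empty(&pool_free);
	list_add(&e->free_list, &pool_free);
	spin_unlock_irqrestore(&pool_lock, flags);
	if (was_empty)	/* a consumer may be sleeping on an empty pool */
		wake_up(&pool_wq);
}

A consumer that needs to block simply does
wait_event_interruptible(pool_wq, !list_empty(&pool_free)) before retrying
pool_get(), which is what blkif_schedule() does with pending_free_wq.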

diff -r c9772105fead -r 6f62ad959f6b linux-2.6-xen-sparse/drivers/xen/blkback/common.h
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/common.h Thu Dec 8 15:04:41 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/common.h Thu Dec 8 15:53:53 2005
@@ -60,9 +60,19 @@
/* Is this a blktap frontend */
unsigned int is_blktap;
#endif
- struct list_head blkdev_list;
spinlock_t blk_ring_lock;
atomic_t refcnt;
+
+ wait_queue_head_t wq;
+ struct task_struct *xenblkd;
+ atomic_t io_pending;
+ request_queue_t *plug;
+
+ /* statistics */
+ unsigned long st_print;
+ int st_rd_req;
+ int st_wr_req;
+ int st_oo_req;

struct work_struct free_work;

@@ -101,11 +111,10 @@

void blkif_interface_init(void);

-void blkif_deschedule(blkif_t *blkif);
-
void blkif_xenbus_init(void);

irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs);
+int blkif_schedule(void *arg);

void update_blkif_status(blkif_t *blkif);

diff -r c9772105fead -r 6f62ad959f6b linux-2.6-xen-sparse/drivers/xen/blkback/interface.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c Thu Dec 8 15:04:41 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/interface.c Thu Dec 8 15:53:53 2005
@@ -24,6 +24,8 @@
blkif->status = DISCONNECTED;
spin_lock_init(&blkif->blk_ring_lock);
atomic_set(&blkif->refcnt, 1);
+ init_waitqueue_head(&blkif->wq);
+ blkif->st_print = jiffies;

return blkif;
}
diff -r c9772105fead -r 6f62ad959f6b linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c
--- a/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Thu Dec 8 15:04:41 2005
+++ b/linux-2.6-xen-sparse/drivers/xen/blkback/xenbus.c Thu Dec 8 15:53:53 2005
@@ -20,6 +20,7 @@

#include <stdarg.h>
#include <linux/module.h>
+#include <linux/kthread.h>
#include <asm-xen/xenbus.h>
#include "common.h"

@@ -92,6 +93,8 @@
}
if (be->blkif) {
be->blkif->status = DISCONNECTED;
+ if (be->blkif->xenblkd)
+ kthread_stop(be->blkif->xenblkd);
blkif_put(be->blkif);
be->blkif = NULL;
}
@@ -217,6 +220,17 @@
be->major = 0;
be->minor = 0;
xenbus_dev_fatal(dev, err, "creating vbd structure");
+ return;
+ }
+
+ be->blkif->xenblkd = kthread_run(blkif_schedule, be->blkif,
+ "xvd %d %02x:%02x",
+ be->blkif->domid,
+ be->major, be->minor);
+ if (IS_ERR(be->blkif->xenblkd)) {
+ err = PTR_ERR(be->blkif->xenblkd);
+ be->blkif->xenblkd = NULL;
+ xenbus_dev_error(dev, err, "start xenblkd");
return;
}


_______________________________________________
Xen-changelog mailing list
Xen-changelog@lists.xensource.com
http://lists.xensource.com/xen-changelog