Mailing List Archive: [Patch 4/7] pvSCSI driver

> # HG changeset patch
> # User t.horikoshi@jp.fujitsu.com
> # Date 1202956035 -32400
> # Node ID 0aa34865e7497723b1156d4d2feac3a9bdf1305d
> # Parent 3b3cd9e0743097910676093a7a3f79e5acac757a
> [LINUX][scsifront] add scsi frontend driver
>
> Signed-off-by: Tomonari Horikoshi <t.horikoshi@jp.fujitsu.com>
> Signed-off-by: Jun Kamada <kama@jp.fujitsu.com>
> Signed-off-by: Akira Hayakawa <hayakawa.akira@jp.fujitsu.com>
>
> diff -r 3b3cd9e07430 -r 0aa34865e749 drivers/xen/scsifront/comfront.c
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/drivers/xen/scsifront/comfront.c Thu Feb 14 11:27:15 2008 +0900
What's a comfront?

...
> +#include <linux/version.h>
> +#include "comfront.h"
> +
> +extern int scsifront_cmd_done(struct comfront_info *,
> + struct vscsiif_response *);
Ick.

> +int do_comfront_cmd_done(struct comfront_info *);
> +
> +static inline int GET_ID_FROM_FREELIST(struct comfront_info *info)
Why is this all-caps? It's not a macro.

> +{
> + unsigned long flags;
> + uint32_t free;
> +
> + spin_lock_irqsave(&info->shadow_lock, flags);
> +
> + free = info->shadow_free;
> + BUG_ON(free > VSCSIIF_DEFAULT_CMD_PER_LUN);
> + info->shadow_free = info->shadow[free].rqid;
> + info->shadow[free].rqid = 0x0fff; /* debug */
> +
> + info->shadow[free].cond = 0;
> +
> + spin_unlock_irqrestore(&info->shadow_lock, flags);
> +
> + return free;
> +}
> +
> +void ADD_ID_TO_FREELIST(struct comfront_info *info, uint32_t id)
Again.

> +{
> + unsigned long flags;
> +
> + spin_lock_irqsave(&info->shadow_lock, flags);
> +
> + info->shadow[id].rqid = info->shadow_free;
> + info->shadow[id].req_scsi_cmnd = 0;
> + info->shadow_free = id;
> +
> + spin_unlock_irqrestore(&info->shadow_lock, flags);
> +}
> +
> +
> +struct vscsiif_request * comfront_pre_request(struct comfront_info *info)
> +{
> + struct vscsiif_front_ring *ring = &(info->ring);
> + struct vscsiif_request *ring_req;
> + uint32_t id;
> +
> + ring_req = RING_GET_REQUEST(&(info->ring), ring->req_prod_pvt);
> +
> + ring->req_prod_pvt++;
> +
> + id = GET_ID_FROM_FREELIST(info); /* use id by response */
> + ring_req->rqid = (uint16_t)id;
> +
> + return ring_req;
> +}
> +
> +void comfront_do_request(struct comfront_info *info)
> +{
> + struct vscsiif_front_ring *ring = &(info->ring);
> + unsigned int irq = info->irq;
> + int notify;
> +
> + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(ring, notify);
> +
> + if (notify)
> + notify_remote_via_irq(irq);
> +}
> +
> +static void comfront_notify_work(struct comfront_info *info)
> +{
> + info->waiting_resp = 1;
> + wake_up(&info->wq);
> +}
> +
> +irqreturn_t comfront_intr(int irq, void *dev_id, struct pt_regs *ptregs)
> +{
> + comfront_notify_work((struct comfront_info *)dev_id);
> + return IRQ_HANDLED;
> +}
> +
> +static void __sync_cmd_done(struct comfront_info *info,
> + struct vscsiif_response *ring_res)
> +{
> + uint16_t id = ring_res->rqid;
> +
> + info->ring_res = *ring_res;
> +
> + info->shadow[id].cond++;
Can cond ever be anything other than 0 or 1? Does it need any kind of
locking?

> + wake_up(&(info->shadow[id].wq));
> +}
> +
> +int do_comfront_cmd_done(struct comfront_info *info)
> +{
> + struct vscsiif_response *ring_res;
> + RING_IDX i, rp;
> + int more_to_do = 0;
> + unsigned long flags;
> +
> + if (info->dev->state != XenbusStateConnected)
> + return 0;
> +
> + spin_lock_irqsave(&info->io_lock, flags);
> +
> + rp = info->ring.sring->rsp_prod;
> + rmb();
> +
> + for (i = info->ring.rsp_cons; i != rp; i++) {
> +
> + ring_res = RING_GET_RESPONSE(&info->ring, i);
> +
> + if (info->shadow[ring_res->rqid].cmd == VSCSIIF_CMND_SCSI) {
> + if (scsifront_cmd_done(info, ring_res)) {
> + BUG();
> + }
> + } else {
> + __sync_cmd_done(info, ring_res);
Can this ever happen?

> + }
> + }
> +
> + info->ring.rsp_cons = i;
> +

> + if (i != info->ring.req_prod_pvt) {
> + RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
> + } else {
> + info->ring.sring->rsp_event = i + 1;
> + }
I think you just want

RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);

here, don't you? I can't immediately see anything that breaks with
your approach, but I can't see any reason why the standard one doesn't
work, either.

> +
> + spin_unlock_irqrestore(&info->io_lock, flags);
> +
> + return more_to_do;
> +}
> +
> +int comfront_schedule(void *data)
> +{
> + struct comfront_info *info = (struct comfront_info *)data;
> +
> + while (!kthread_should_stop()) {
> + wait_event_interruptible(
> + info->wq,
> + info->waiting_resp || kthread_should_stop());
Could you replace the test of waiting_resp with
RING_HAS_UNCONSUMED_RESPONSES()? That seems like it would be a bit
clearer, at least to me. It'd also eliminate the only consumer of
that field, so you could stop maintaining it.

> +
> + info->waiting_resp = 0;
> + smp_mb();
> +
> + if (do_comfront_cmd_done(info))
> + info->waiting_resp = 1;
> + }
> +
> + info->kthread = NULL;
> +
> + return 0;
> +}
I don't really understand why you need to do the frontend stuff from a
kthread rather than just doing it directly from the interrupt handler.
The only thing you really do from the thread is call sc->scsi_done(),
which other HBAs seem to do from their interrupt handler. Perhaps I'm
missing something obvious.

> +
> +struct vscsiif_response *comfront_do_request_and_wait_response(
> + struct comfront_info *info,
> + struct vscsiif_request *ring_req)
> +{
> + struct vscsiif_response *ring_res;
> + uint16_t rqid = ring_req->rqid;
> +
> + info->shadow[rqid].cmd = ring_req->cmd;
> +
> + comfront_do_request(info);
> + wait_event_interruptible(info->shadow[rqid].wq,
> + info->shadow[rqid].cond);
> + info->shadow[rqid].cond--;
> +
> + ring_res = &(info->ring_res);
> +
> + return ring_res;
> +}
I don't think this is ever actually used, is it? If it is, you may
need some additional locking around comfront_do_request().

> diff -r 3b3cd9e07430 -r 0aa34865e749 drivers/xen/scsifront/comfront.h
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/drivers/xen/scsifront/comfront.h Thu Feb 14 11:27:15 2008 +0900
> @@ -0,0 +1,111 @@
...
> +
> +#define GRANT_INVALID_REF 0
I'm not entirely convinced that this is really the right place for
that. I don't think you actually use it, either.

> +#define VSCSIIF_DEFAULT_CMD_PER_LUN \
> + __RING_SIZE((struct vscsiif_sring *)0, PAGE_SIZE)
This is more of a maximum commands per LUN than a default, I think.
Of course, given that the number of commands per LUN is hard coded to
a constant it doesn't really make much difference.

> +
> +struct comfront_shadow {
> + uint16_t rqid;
I think this is the reference of the next free shadow, isn't it?
That's kind of a misleading name.

> + unsigned char cmd;
> + wait_queue_head_t wq;

> + int cond;
That's not the most descriptive name imaginable.

> + unsigned int sc_data_direction;
Is this field 0 for read and 1 for write, or 1 for write and 0 for
read? Or is it something more complicated?

> + unsigned int use_sg;
Hmm. The name implies that that's a flag, but this is actually the
count of the number of SG segments, isn't it? That's kind of
confusing (even if it does match up with the Linux terminology).

> + unsigned int request_bufflen;
> + unsigned long req_scsi_cmnd;
Okay, so cmd is the command on the ring, and req_scsi_cmnd is the SCSI
command in the CDB?

> + int gref[VSCSIIF_SG_TABLESIZE];
> +};
> +
> +struct comfront_info {
> + struct xenbus_device *dev;
> + unsigned int hst_id;
> + unsigned int chn_id;
> + unsigned int tgt_id;
> + unsigned int lun_id;
Okay, so you have a channel, target, and LUN in each comfront, and
hence one 3-tuple for each ring, and also in each request? Is there
any reason why the per-request values would ever be different from the
per-ring ones?

> + spinlock_t io_lock;
> + spinlock_t shadow_lock;
> + unsigned int evtchn;
> + unsigned int irq;
> + int ring_ref;
grant_ref_t, please.

> + struct vscsiif_front_ring ring;
> + struct vscsiif_response ring_res;
Hmm. The ring can have several responses on it. What's special about
this one?

> + struct comfront_shadow shadow[VSCSIIF_DEFAULT_CMD_PER_LUN];
> + uint32_t shadow_free;
Is this a count of the number of free shadows, or a pointer to the
first one?

> +
> + struct task_struct *kthread;
> + wait_queue_head_t wq;
> + unsigned int waiting_resp;
> +};
> +
> +#define DPRINTK(_f, _a...) \
> + pr_debug("(file=%s, line=%d) " _f, \
> + __FILE__ , __LINE__ , ## _a )
> +
> +
> +void ADD_ID_TO_FREELIST(struct comfront_info *info, uint32_t id);
> +
> +struct vscsiif_request * comfront_pre_request(struct comfront_info *);
> +struct vscsiif_response *comfront_do_request_and_wait_response(
> + struct comfront_info *info,
> + struct vscsiif_request *ring_req);
> +void comfront_do_request(struct comfront_info *);
> +irqreturn_t comfront_intr(int, void *, struct pt_regs *);
> +int comfront_schedule(void *);
> +
> +#endif /* __XEN_DRIVERS_SCSIFRONT_H__ */
> diff -r 3b3cd9e07430 -r 0aa34865e749 drivers/xen/scsifront/scsifront.c
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/drivers/xen/scsifront/scsifront.c Thu Feb 14 11:27:15 2008 +0900
> @@ -0,0 +1,668 @@
...
> +
> +#include <linux/version.h>
> +#include "comfront.h"
> +
> +#define VSCSIIF_DEFAULT_TASK_COMM_LEN 16
Okay, so this is the length of the buffer used to build the thread
name, and isn't used anywhere else, yes? That probably doesn't
warrant a top-level #define.

> +
> +struct list_head vscsi_host_list;
> +static DEFINE_SPINLOCK(vscsi_hostlist_lock);
> +
> +struct vscsi_host_list {
> + unsigned short phy_hostno;
> + struct Scsi_Host *host;
> + struct list_head list;
> +};
> +
> +static void add_host_list(struct Scsi_Host *host, unsigned short phy_hostno)
> +{
> + unsigned long flags;
> + struct vscsi_host_list *vscsi_host;
> +
> + vscsi_host = kmalloc(sizeof(struct vscsi_host_list), GFP_KERNEL);
Need to check for failure here.

> + vscsi_host->phy_hostno = phy_hostno;
> + vscsi_host->host = host;
> + spin_lock_irqsave(&vscsi_hostlist_lock, flags);
> + list_add(&vscsi_host->list, &vscsi_host_list);
> + spin_unlock_irqrestore(&vscsi_hostlist_lock, flags);
> +}
> +
> +struct Scsi_Host *phyhostno_to_vhostno_host_lookup(unsigned short tgt_no)
Is a target number always the same thing as a phyhostno? What's a
vhostno?

> +{
> + unsigned long flags;
> + struct Scsi_Host *shost = ERR_PTR(-ENXIO);
> + struct vscsi_host_list *vscsi_host;
> +
> + spin_lock_irqsave(&vscsi_hostlist_lock, flags);
> + list_for_each_entry(vscsi_host, &vscsi_host_list, list) {
> + if (tgt_no == vscsi_host->phy_hostno) {
> + shost = scsi_host_lookup(vscsi_host->host->host_no);
> + break;
> + }
> + }
> + spin_unlock_irqrestore(&vscsi_hostlist_lock, flags);
> +
> + return shost;
> +}
> +

> +static struct scsi_device *scsifront_device_lookup(struct comfront_info *info)
> +{
> + struct Scsi_Host *shost;
> + struct scsi_device *sdev = NULL;
> +
> + shost = scsi_host_lookup(info->hst_id);
> + if (IS_ERR(shost))
> + return sdev;
> +
> + sdev = scsi_device_lookup(shost, info->chn_id,
> + info->tgt_id, info->lun_id);
> + if (sdev && scsi_device_get(sdev))
> + sdev = NULL;
> +
> + return sdev;
> +}
Could you not just cache a pointer to the scsi_device in the
comfront_info and make this trivial?

> +
> +static void scsifront_free(struct comfront_info *info)
> +{
> + struct scsi_device *sdev;
> +
> + sdev = scsifront_device_lookup(info);
> + if (sdev) {
> + scsi_remove_device(sdev);
> + scsi_device_put(sdev);
> + }
> +
> + flush_scheduled_work();
What is this supposed to be waiting for? You don't see to be
explicitly using any workqueues here.

> +
> + if (info->ring_ref != GRANT_INVALID_REF) {
> + gnttab_end_foreign_access(info->ring_ref,
> + (unsigned long)info->ring.sring);
> + info->ring_ref = GRANT_INVALID_REF;
> + info->ring.sring = NULL;
Do you not want to free the sring?

> + }
> +
> + if (info->irq)
> + unbind_from_irqhandler(info->irq, info);
> + info->irq = 0;
> +}
> +
> +
> +static int map_data_for_request(struct comfront_info *info,
> + struct scsi_cmnd *sc, struct vscsiif_request *ring_req, uint32_t id)
> +{
> + grant_ref_t gref_head;
> + struct page *page;
> + int err, i, ref, ref_cnt = 0;
> + int write = (sc->sc_data_direction == DMA_TO_DEVICE);
> + int nr_pages, off, len, bytes;
> + unsigned long buffer_pfn;
> + unsigned int data_len = 0;
> +
> + if (sc->sc_data_direction == DMA_NONE)
> + return 0;
> +
> + err = gnttab_alloc_grant_references(VSCSIIF_SG_TABLESIZE, &gref_head);
> + if (err) {
> + printk(KERN_ERR "scsifront: gnttab_alloc_grant_references() error\n");
> + return -ENOMEM;
> + }
> +
> + if (sc->use_sg) {
> + /* quoted scsi_lib.c/scsi_req_map_sg . */
> + struct scatterlist *sg = (struct scatterlist *)sc->request_buffer;
> + nr_pages = (sc->request_bufflen + sg[0].offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
Hmm. That's the number of pages is required if the original buffer is
contiguous in the virtual address space, which might be significantly
less than the number of pages mentioned in the scatter-gather list.
For example, if it's a true scatter-gather operation then you might
have to take a sector from each of 100 different pages.
request_bufflen would then be 512*100=51200 bytes, so nr_pages would
probably be 13, but the actual number of pages touched would be 100.
That's going to lead to you overflowing the request structure in just
a few instructions' time.

I think the right answer is probably going to be to not even try to
calculate nr_pages in advance, and instead detect the overflow when
you're actually populating the request.

Of course, I'm basically ignorant of the Linux SCSI subsystem, so it
could be that you do have the required guarantees, and nothing needs
to change. Still, a comment might be nice.

> +
> + if (nr_pages > VSCSIIF_SG_TABLESIZE) {
> + ref_cnt = (-ENOMEM);
I'm not convinced that ENOMEM is the right error code here; it seems
like E2BIG would be more appropriate. It doesn't make a great deal of
difference, though, because the caller throws the error code away,
anyway.

Incidentally, do you know what happens when requests fail here? It
would be really rather bad if Linux just backed off a bit and retried,
because it would keep failing forever.

> + goto big_to_sg;
> + }
> +
> + for (i = 0; i < sc->use_sg; i++) {
> + page = sg[i].page;
> + off = sg[i].offset;
> + len = sg[i].length;
> + data_len += len;
> +
> + buffer_pfn = page_to_phys(page) >> PAGE_SHIFT;
> +
> + while (len > 0) {
> + bytes = min_t(unsigned int, len, PAGE_SIZE - off);
> +
> + ref = gnttab_claim_grant_reference(&gref_head);
> + BUG_ON(ref == -ENOSPC);
> +
> + gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id,
> + buffer_pfn, write);
> +
> + info->shadow[id].gref[ref_cnt] = ref;
> + ring_req->seg[ref_cnt].gref = ref;
> + ring_req->seg[ref_cnt].offset = (uint16_t)off;
> + ring_req->seg[ref_cnt].length = (uint16_t)bytes;
> +
> + buffer_pfn++;
> + len -= bytes;
> + off = 0;
> + ref_cnt++;
> + }
> + }
> + } else if (sc->request_bufflen) {
> + unsigned long end = ((unsigned long)sc->request_buffer
> + + sc->request_bufflen + PAGE_SIZE - 1) >> PAGE_SHIFT;
> + unsigned long start = (unsigned long)sc->request_buffer >> PAGE_SHIFT;
> +
> + page = virt_to_page(sc->request_buffer);
> + nr_pages = end - start;
> + len = sc->request_bufflen;
> +
> + if (nr_pages > VSCSIIF_SG_TABLESIZE) {
> + ref_cnt = (-ENOMEM);
> + goto big_to_sg;
> + }
> +
> + buffer_pfn = page_to_phys(page) >> PAGE_SHIFT;
> +
> + off = offset_in_page((unsigned long)sc->request_buffer);
> + for (i = 0; i < nr_pages; i++) {
> + bytes = PAGE_SIZE - off;
> +
> + if (bytes > len)
> + bytes = len;
> +
> + ref = gnttab_claim_grant_reference(&gref_head);
> + BUG_ON(ref == -ENOSPC);
> +
> + gnttab_grant_foreign_access_ref(ref, info->dev->otherend_id,
> + buffer_pfn, write);
> +
> + info->shadow[id].gref[i] = ref;
> + ring_req->seg[i].gref = ref;
> + ring_req->seg[i].offset = (uint16_t)off;
> + ring_req->seg[i].length = (uint16_t)bytes;
> +
> + buffer_pfn++;
> + len -= bytes;
> + off = 0;
> + ref_cnt++;
> + }
> + }
> +
> +big_to_sg:
> +
> + gnttab_free_grant_references(gref_head);
> +
> + return ref_cnt;
> +}
> +
> +static int scsifront_queuecommand(struct scsi_cmnd *sc,
> + void (*done)(struct scsi_cmnd *))
> +{
> + struct comfront_info *info = (struct comfront_info *) sc->device->hostdata;
> + struct vscsiif_request *ring_req;
> + unsigned int ref_cnt;
> + uint16_t rqid;
> +
> + if (info->dev->state != XenbusStateConnected) {
> + printk(KERN_ERR "scsifront: XenbusState not connected %u!\n",
> + info->dev->state);
> + sc->result = DID_NO_CONNECT << 16;
> + done(sc);
> + return 0;
> + }
> +
> + if (RING_FULL(&info->ring)) {
> + return SCSI_MLQUEUE_HOST_BUSY;
> + }
> +
> + sc->scsi_done = done;
> + sc->result = 0;
> +
> + ring_req = comfront_pre_request(info);
> + rqid = ring_req->rqid;
> + ring_req->cmd = VSCSIIF_CMND_SCSI;
> +
> + ring_req->id = sc->device->id;
> + ring_req->lun = sc->device->lun;
> + ring_req->channel = sc->device->channel;
> + ring_req->cmd_len = sc->cmd_len;
> +
> + BUG_ON(sc->cmd_len > VSCSIIF_MAX_COMMAND_SIZE);
> + BUG_ON(sc == NULL);
The second BUG_ON() is kind of pointless, given that we've already
dereferenced sc at this point. It might make sense towards the top of
the function.

> +
> + if ( sc->cmd_len )
> + memcpy(ring_req->cmnd, sc->cmnd, sc->cmd_len);
> + else
> + memset(ring_req->cmnd, 0, VSCSIIF_MAX_COMMAND_SIZE);
What does it mean to send a zero-length command to a scsi host?

> + ring_req->sc_data_direction = (uint8_t)sc->sc_data_direction;
> + ring_req->request_bufflen = sc->request_bufflen;
> + /*ring_req->timeout_per_command = (int32_t)sc->timeout_per_command;*/
> +
> + info->shadow[rqid].req_scsi_cmnd = (unsigned long)sc;
> + info->shadow[rqid].sc_data_direction = sc->sc_data_direction;
> + info->shadow[rqid].request_bufflen = sc->request_bufflen;
> + info->shadow[rqid].cmd = ring_req->cmd;
> +
> + ref_cnt = map_data_for_request(info, sc, ring_req, rqid);
> + if (unlikely(ref_cnt < 0)) {
> + ADD_ID_TO_FREELIST(info, rqid);
> + notify_remote_via_irq(info->irq);
Why do you need to notify the remote here?

> + return SCSI_MLQUEUE_HOST_BUSY;
> + }
> +
> + ring_req->use_sg = (uint8_t)ref_cnt;
> + info->shadow[rqid].use_sg = ref_cnt;
> +
> + comfront_do_request(info);
> +
> + return 0;
> +}
> +
> +
> +static int scsifront_eh_abort_handler(struct scsi_cmnd *sc)
> +{
> + /* not implemented */
> + printk(KERN_ERR "scsifront: abort! 0 return.\n");
> + BUG();
Yay!

> + return 0;
> +}
> +
> +static void scsifront_gnttab_done(struct comfront_shadow *s, uint32_t id)
> +{
> + int i;
> +
> + if (s->sc_data_direction == DMA_NONE)
> + return;
> +
> + if (s->use_sg) {
> + for (i = 0; i < s->use_sg; i++) {
> + if (unlikely(gnttab_query_foreign_access(
> + s->gref[i]) != 0)) {
> + printk(KERN_ALERT "scsifront: "
> + "grant still in use by backend.\n");
> + BUG();
> + }
> + gnttab_end_foreign_access(s->gref[i], 0UL);
> + }
> + } else if (s->request_bufflen) {
> + gnttab_end_foreign_access(s->gref[0], 0UL);
> + } else {
> + return;
> + }
> +}
> +
> +int scsifront_cmd_done(struct comfront_info *info,
> + struct vscsiif_response *ring_res)
> +{
> + struct scsi_cmnd *sc;
> + uint32_t id;
> +
> + id = ring_res->rqid;
> + sc = (struct scsi_cmnd *)info->shadow[id].req_scsi_cmnd;
> +

> + if (sc == NULL)
> + return 1;
Can that ever happen?

> +
> + scsifront_gnttab_done(&info->shadow[id], id);
> + ADD_ID_TO_FREELIST(info, id);
> +
> + sc->result = ring_res->rslt;
> + sc->resid = 0;
> +
> + BUG_ON(ring_res->sense_len > VSCSIIF_SENSE_BUFFERSIZE);
> +
> + if(ring_res->rslt != 0) {
> + if (ring_res->sense_len)
> + memcpy(sc->sense_buffer, ring_res->sense_buffer,
> + ring_res->sense_len);
> + }
> +
> + sc->scsi_done(sc);
> +
> + return 0;
> +}
> +
> +static int scsifront_alloc_ring(struct comfront_info *info)
> +{
> + struct xenbus_device *dev = info->dev;
> + struct vscsiif_sring *sring;
> + int err = -ENOMEM;
> +
> + info->ring_ref = GRANT_INVALID_REF;
> +
> + sring = (struct vscsiif_sring *) __get_free_page(GFP_KERNEL);
> + if (!sring) {
> + xenbus_dev_fatal(dev, err, "fail to allocate shared ring (Front to Back)");
> + return err;
> + }
> + SHARED_RING_INIT(sring);
> + FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
> +
> + err = xenbus_grant_ring(dev, virt_to_mfn(sring));
> + if (err < 0) {
> + free_page((unsigned long) sring);
> + info->ring.sring = NULL;
> + xenbus_dev_fatal(dev, err, "fail to grant shared ring (Front to Back)");
> + goto free_sring;
> + }
> + info->ring_ref = err;
> +
> + err = bind_listening_port_to_irqhandler(
> + dev->otherend_id, comfront_intr,
> + SA_SAMPLE_RANDOM, "scsifront", info);
> +
> + if (err <= 0) {
> + xenbus_dev_fatal(dev, err, "bind_listening_port_to_irqhandler");
> + goto fail;
> + }
> + info->irq = err;
> +
> + return 0;
> +fail:
> + /* free resource */
> +free_sring:
> + scsifront_free(info);
> +
> + return err;
> +}
> +
> +static int scsifront_init_ring(struct comfront_info *info)
> +{
> + struct xenbus_device *dev = info->dev;
> + struct xenbus_transaction xbt;
> + int err;
> +
> + DPRINTK("%s\n",__FUNCTION__);
> +
> + err = scsifront_alloc_ring(info);
> + if (err)
> + return err;
> + DPRINTK("%u %u\n", info->ring_ref, info->evtchn);
> +
> +again:
> + err = xenbus_transaction_start(&xbt);
> + if (err) {
> + xenbus_dev_fatal(dev, err, "starting transaction");
> + }
> +
> + err = xenbus_printf(xbt, dev->nodename, "ring-ref", "%u",
> + info->ring_ref);
> + if (err) {
> + xenbus_dev_fatal(dev, err, "%s", "writing ring-ref");
> + goto fail;
> + }
> +
> + err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u",
> + irq_to_evtchn_port(info->irq));
> +
> + if (err) {
> + xenbus_dev_fatal(dev, err, "%s", "writing event-channel");
> + goto fail;
> + }
> +
> + err = xenbus_transaction_end(xbt, 0);
> + if (err) {
> + if (err == -EAGAIN)
> + goto again;
> + xenbus_dev_fatal(dev, err, "completing transaction");
> + } else
> + xenbus_switch_state(dev, XenbusStateInitialised);
> +
> + return 0;
> +
> +fail:
> + xenbus_transaction_end(xbt, 1);
> + /* free resource */
> + scsifront_free(info);
> +
> + return err;
> +}
> +
> +static struct scsi_host_template scsifront_sht = {
> + .module = THIS_MODULE,
> + .name = "Xen SCSI frontend driver",
> + .queuecommand = scsifront_queuecommand,
> + .eh_abort_handler = scsifront_eh_abort_handler,
> + .cmd_per_lun = VSCSIIF_DEFAULT_CMD_PER_LUN,
> + .can_queue = VSCSIIF_DEFAULT_CAN_QUEUE,
> + .this_id = -1,
> + .sg_tablesize = VSCSIIF_SG_TABLESIZE,
> + .use_clustering = DISABLE_CLUSTERING,
> + .proc_name = "scsifront",
> +};
> +
> +static int scsifront_connect(struct comfront_info *info)
> +{
> + struct xenbus_device *dev = info->dev;
> + struct Scsi_Host *host;
> + struct scsi_device *sdev;
> + char name[VSCSIIF_DEFAULT_TASK_COMM_LEN];
> + unsigned int hst_id, chn_id;
> + unsigned int tgt_id, lun_id;
> +
> + int err = -ENOMEM;
> +
> + DPRINTK("%u\n", dev->state);
> + if (dev->state == XenbusStateConnected)
> + return 0;
> +
> + err = xenbus_scanf(XBT_NIL, dev->nodename, "b-dev", "%d:%d:%d:%d",
> + &hst_id, &chn_id, &tgt_id, &lun_id);
> + if (err != 4) {
> + xenbus_dev_fatal(dev, err, "reading dev");
> + return err;
> + }
> +
> + snprintf(name, VSCSIIF_DEFAULT_TASK_COMM_LEN, "vscsi.%u:%u:%u:%u",
> + hst_id, chn_id, tgt_id, lun_id);
> + info->kthread = kthread_run(comfront_schedule, info, name);
> + if (IS_ERR(info->kthread)) {
> + err = PTR_ERR(info->kthread);
> + info->kthread = NULL;
It looks like you're discarding this error. Was that deliberate?

> + }
> +
> + host = phyhostno_to_vhostno_host_lookup(hst_id);
> +
> + if (IS_ERR(host)) {
> + /* disk connected first by this host number */
> + host = scsi_host_alloc(&scsifront_sht,
> + sizeof(struct comfront_info));
> + add_host_list(host, hst_id);
> +
> + host->max_id = 256;
> + host->max_channel = 0;
> + host->max_lun = 255;
> +
> + if ((err = scsi_add_host(host, &dev->dev)) != 0) {
> + printk(KERN_ERR "scsifront: scsi_add_host() error\n");
> + return err;
> + }
> +
> + }
Okay, so the devices on xenbus kind-of represent individual SCSI
targets, but you then kind-of merge them back together when they have
the same host ID? That's quite exciting. Could you explain why you
did this, please?

> +
> + info->hst_id = host->host_no;
> + info->chn_id = chn_id;
> + info->tgt_id = tgt_id;
> + info->lun_id = lun_id;
> +
> + xenbus_switch_state(dev, XenbusStateConnected);
> +
> + sdev = __scsi_add_device(host, chn_id, tgt_id, lun_id, info);
> + if (!IS_ERR(sdev)) {
> + scsi_device_put(sdev);
> + }
Why not just use scsi_add_device()?

> +
> + err = xenbus_printf(XBT_NIL, dev->nodename, "f-dev", "%d:%d:%d:%d",
> + host->host_no, chn_id, tgt_id, lun_id);
> + if (err) {
> + xenbus_dev_fatal(dev, err, "%s", "writing f-dev");
> + }
> +
> + return 0;
> +}
> +
> +static int scsifront_disconnect(struct comfront_info *info)
> +{
> + struct xenbus_device *dev = info->dev;
> + struct scsi_device *sdev;
> +
> + DPRINTK("%s: %s remove\n",__FUNCTION__ ,dev->nodename);
> +
> + sdev = scsifront_device_lookup(info);
> + if (!sdev)
> + return -1;
> +
> + scsi_remove_device(sdev);
> + scsi_device_put(sdev);
> +
> + xenbus_frontend_closed(dev);
> +
> + return 0;
> +}
> +
> +static int scsifront_probe(struct xenbus_device *dev,
> + const struct xenbus_device_id *id)
> +{
> + struct comfront_info *info;
> + int i, err = -ENOMEM;
> +
> + if ((info = kzalloc(sizeof(*info), GFP_KERNEL)) == NULL) {
> + printk(KERN_ERR "scsifront: kzalloc error\n");
> + return -ENOMEM;
> + }
> +
> + dev->dev.driver_data = info;
> + info->dev = dev;
> +
> + for (i = 0; i < VSCSIIF_DEFAULT_CMD_PER_LUN; i++) {
> + info->shadow[i].rqid = i + 1;
> + init_waitqueue_head(&(info->shadow[i].wq));
> + info->shadow[i].cond = 0;
> + }
> + info->shadow[VSCSIIF_DEFAULT_CMD_PER_LUN - 1].rqid = 0x0fff;
> +
> + err = scsifront_init_ring(info);
> + if (err) {
> + return err;
> + }
> +
> + init_waitqueue_head(&info->wq);
> + spin_lock_init(&info->io_lock);
> + spin_lock_init(&info->shadow_lock);
> +
> + return 0;
> +}
> +
> +
> +static int scsifront_remove(struct xenbus_device *dev)
> +{
> + struct comfront_info *info = dev->dev.driver_data;
> +
> + DPRINTK("%s: %s removed\n",__FUNCTION__ ,dev->nodename);
> +
> + if (info->kthread) {
> + kthread_stop(info->kthread);
> + info->kthread = NULL;
> + }
> +
> + scsifront_free(info);
> +
> + return 0;
> +}
> +
> +static void scsifront_backend_changed(struct xenbus_device *dev,
> + XenbusState backend_state)
> +{
> + struct comfront_info *info = dev->dev.driver_data;
> +
> + DPRINTK("%p %u %u\n", dev, dev->state, backend_state);
> +
> + switch (backend_state) {
> + case XenbusStateUnknown:
> + case XenbusStateInitialising:
> + case XenbusStateInitWait:
> + case XenbusStateInitialised:
> + case XenbusStateClosed:
> + break;
> +
> + case XenbusStateConnected:
> + scsifront_connect(info);
> + break;
> +
> + case XenbusStateClosing:
> + scsifront_disconnect(info);
> + break;
> + }
> +}
Okay, so you're state machine has the frontend coming up in
Initialising (courtesy of the tools), then transitioning to
Initialised once it's all set up, waiting for the backend to go to
Connected, and then going to Connected itself?

> +
> +static struct xenbus_device_id scsifront_ids[] = {
> + { "vscsi" },
> + { "" }
> +};
> +
> +
> +static struct xenbus_driver scsifront_driver = {
> + .name = "vscsi",
> + .owner = THIS_MODULE,
> + .ids = scsifront_ids,
> + .probe = scsifront_probe,
> + .remove = scsifront_remove,
> +/* .resume = scsifront_resume, */
> + .otherend_changed = scsifront_backend_changed,
> +};
> +
> +static int __init scsifront_init(void)
> +{
> + int err;
> +
> + if (!is_running_on_xen())
> + return -ENODEV;
> +
> + INIT_LIST_HEAD(&vscsi_host_list);
> +
> + err = xenbus_register_frontend(&scsifront_driver);
> +
> + return err;
> +}
> +
> +static void scsifront_exit(void)
> +{
> + xenbus_unregister_driver(&scsifront_driver);
> +
> +}
> +
> +module_init(scsifront_init);
> +module_exit(scsifront_exit);
> +
> +MODULE_DESCRIPTION("Xen SCSI frontend driver");
> +MODULE_LICENSE("GPL");

Mailing List Archive

Attached Files:

Attached Files:

Attached Files:

Attached Files: