Mailing List Archive

[PATCH, RFC 6/7] IOMMU: add phantom function support
In addition to the device context entry generated for a device's base
function, context entries also need to be generated for each of its
phantom functions.

To distinguish the different use cases, a variant of pci_get_pdev() is
introduced, pci_get_real_pdev(), which returns the underlying actual
device even when it is passed a phantom function number.

--- a/xen/drivers/passthrough/amd/iommu_cmd.c
+++ b/xen/drivers/passthrough/amd/iommu_cmd.c
@@ -339,7 +339,15 @@ static void amd_iommu_flush_all_iotlbs(s
return;

for_each_pdev( d, pdev )
- amd_iommu_flush_iotlb(pdev->devfn, pdev, gaddr, order);
+ {
+ u8 devfn = pdev->devfn;
+
+ do {
+ amd_iommu_flush_iotlb(devfn, pdev, gaddr, order);
+ devfn += pdev->phantom_stride;
+ } while ( devfn != pdev->devfn &&
+ PCI_SLOT(devfn) == PCI_SLOT(pdev->devfn) );
+ }
}

/* Flush iommu cache after p2m changes. */
--- a/xen/drivers/passthrough/amd/iommu_init.c
+++ b/xen/drivers/passthrough/amd/iommu_init.c
@@ -667,7 +667,7 @@ void parse_ppr_log_entry(struct amd_iomm
devfn = PCI_DEVFN2(device_id);

spin_lock(&pcidevs_lock);
- pdev = pci_get_pdev(iommu->seg, bus, devfn);
+ pdev = pci_get_real_pdev(iommu->seg, bus, devfn);
spin_unlock(&pcidevs_lock);

if ( pdev )
--- a/xen/drivers/passthrough/amd/iommu_map.c
+++ b/xen/drivers/passthrough/amd/iommu_map.c
@@ -598,7 +598,6 @@ static int update_paging_mode(struct dom
for_each_pdev( d, pdev )
{
bdf = PCI_BDF2(pdev->bus, pdev->devfn);
- req_id = get_dma_requestor_id(pdev->seg, bdf);
iommu = find_iommu_for_device(pdev->seg, bdf);
if ( !iommu )
{
@@ -607,16 +606,21 @@ static int update_paging_mode(struct dom
}

spin_lock_irqsave(&iommu->lock, flags);
- device_entry = iommu->dev_table.buffer +
- (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
-
- /* valid = 0 only works for dom0 passthrough mode */
- amd_iommu_set_root_page_table((u32 *)device_entry,
- page_to_maddr(hd->root_table),
- hd->domain_id,
- hd->paging_mode, 1);
-
- amd_iommu_flush_device(iommu, req_id);
+ do {
+ req_id = get_dma_requestor_id(pdev->seg, bdf);
+ device_entry = iommu->dev_table.buffer +
+ (req_id * IOMMU_DEV_TABLE_ENTRY_SIZE);
+
+ /* valid = 0 only works for dom0 passthrough mode */
+ amd_iommu_set_root_page_table((u32 *)device_entry,
+ page_to_maddr(hd->root_table),
+ hd->domain_id,
+ hd->paging_mode, 1);
+
+ amd_iommu_flush_device(iommu, req_id);
+ bdf += pdev->phantom_stride;
+ } while ( PCI_DEVFN2(bdf) != pdev->devfn &&
+ PCI_SLOT(bdf) == PCI_SLOT(pdev->devfn) );
spin_unlock_irqrestore(&iommu->lock, flags);
}

--- a/xen/drivers/passthrough/iommu.c
+++ b/xen/drivers/passthrough/iommu.c
@@ -154,6 +154,8 @@ void __init iommu_dom0_init(struct domai
int iommu_add_device(struct pci_dev *pdev)
{
struct hvm_iommu *hd;
+ int rc;
+ u8 devfn;

if ( !pdev->domain )
return -EINVAL;
@@ -164,7 +166,20 @@ int iommu_add_device(struct pci_dev *pde
if ( !iommu_enabled || !hd->platform_ops )
return 0;

- return hd->platform_ops->add_device(pdev->devfn, pdev);
+ rc = hd->platform_ops->add_device(pdev->devfn, pdev);
+ if ( rc || !pdev->phantom_stride )
+ return rc;
+
+ for ( devfn = pdev->devfn ; ; )
+ {
+ devfn += pdev->phantom_stride;
+ if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
+ return 0;
+ rc = hd->platform_ops->add_device(devfn, pdev);
+ if ( rc )
+ printk(XENLOG_WARNING "IOMMU: add %04x:%02x:%02x.%u failed (%d)\n",
+ pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), rc);
+ }
}

int iommu_enable_device(struct pci_dev *pdev)
@@ -187,6 +202,8 @@ int iommu_enable_device(struct pci_dev *
int iommu_remove_device(struct pci_dev *pdev)
{
struct hvm_iommu *hd;
+ u8 devfn;
+
if ( !pdev->domain )
return -EINVAL;

@@ -194,6 +211,22 @@ int iommu_remove_device(struct pci_dev *
if ( !iommu_enabled || !hd->platform_ops )
return 0;

+ for ( devfn = pdev->devfn ; pdev->phantom_stride; )
+ {
+ int rc;
+
+ devfn += pdev->phantom_stride;
+ if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
+ break;
+ rc = hd->platform_ops->remove_device(devfn, pdev);
+ if ( !rc )
+ continue;
+
+ printk(XENLOG_ERR "IOMMU: remove %04x:%02x:%02x.%u failed (%d)\n",
+ pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn), rc);
+ return rc;
+ }
+
return hd->platform_ops->remove_device(pdev->devfn, pdev);
}

@@ -241,6 +274,18 @@ static int assign_device(struct domain *
if ( (rc = hd->platform_ops->assign_device(d, devfn, pdev)) )
goto done;

+ for ( ; pdev->phantom_stride; rc = 0 )
+ {
+ devfn += pdev->phantom_stride;
+ if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
+ break;
+ rc = hd->platform_ops->assign_device(d, devfn, pdev);
+ if ( rc )
+ printk(XENLOG_G_WARNING "d%d: assign %04x:%02x:%02x.%u failed (%d)\n",
+ d->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+ rc);
+ }
+
if ( has_arch_pdevs(d) && !need_iommu(d) )
{
d->need_iommu = 1;
@@ -373,6 +418,21 @@ int deassign_device(struct domain *d, u1
if ( !pdev )
return -ENODEV;

+ while ( pdev->phantom_stride )
+ {
+ devfn += pdev->phantom_stride;
+ if ( PCI_SLOT(devfn) != PCI_SLOT(pdev->devfn) )
+ break;
+ ret = hd->platform_ops->reassign_device(d, dom0, devfn, pdev);
+ if ( !ret )
+ continue;
+
+ printk(XENLOG_G_ERR "d%d: deassign %04x:%02x:%02x.%u failed (%d)\n",
+ d->domain_id, seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn), ret);
+ return ret;
+ }
+
+ devfn = pdev->devfn;
ret = hd->platform_ops->reassign_device(d, dom0, devfn, pdev);
if ( ret )
{
--- a/xen/drivers/passthrough/pci.c
+++ b/xen/drivers/passthrough/pci.c
@@ -144,6 +144,8 @@ static struct pci_dev *alloc_pdev(struct
/* update bus2bridge */
switch ( pdev->type = pdev_type(pseg->nr, bus, devfn) )
{
+ int pos;
+ u16 cap;
u8 sec_bus, sub_bus;

case DEV_TYPE_PCIe_BRIDGE:
@@ -167,6 +169,20 @@ static struct pci_dev *alloc_pdev(struct
break;

case DEV_TYPE_PCIe_ENDPOINT:
+ pos = pci_find_cap_offset(pseg->nr, bus, PCI_SLOT(devfn),
+ PCI_FUNC(devfn), PCI_CAP_ID_EXP);
+ BUG_ON(!pos);
+ cap = pci_conf_read16(pseg->nr, bus, PCI_SLOT(devfn),
+ PCI_FUNC(devfn), pos + PCI_EXP_DEVCAP);
+ if ( cap & PCI_EXP_DEVCAP_PHANTOM )
+ {
+ pdev->phantom_stride = 8 >> MASK_EXTR(cap,
+ PCI_EXP_DEVCAP_PHANTOM);
+ if ( PCI_FUNC(devfn) >= pdev->phantom_stride )
+ pdev->phantom_stride = 0;
+ }
+ break;
+
case DEV_TYPE_PCI:
break;

@@ -290,6 +306,27 @@ struct pci_dev *pci_get_pdev(int seg, in
return NULL;
}

+struct pci_dev *pci_get_real_pdev(int seg, int bus, int devfn)
+{
+ struct pci_dev *pdev;
+ int stride;
+
+ if ( seg < 0 || bus < 0 || devfn < 0 )
+ return NULL;
+
+ for ( pdev = pci_get_pdev(seg, bus, devfn), stride = 4;
+ !pdev && stride; stride >>= 1 )
+ {
+ if ( !(devfn & (8 - stride)) )
+ continue;
+ pdev = pci_get_pdev(seg, bus, devfn & ~(8 - stride));
+ if ( pdev && stride != pdev->phantom_stride )
+ pdev = NULL;
+ }
+
+ return pdev;
+}
+
struct pci_dev *pci_get_pdev_by_domain(
struct domain *d, int seg, int bus, int devfn)
{
@@ -488,8 +525,19 @@ int pci_add_device(u16 seg, u8 bus, u8 d

out:
spin_unlock(&pcidevs_lock);
- printk(XENLOG_DEBUG "PCI add %s %04x:%02x:%02x.%u\n", pdev_type,
- seg, bus, slot, func);
+ if ( !ret )
+ {
+ printk(XENLOG_DEBUG "PCI add %s %04x:%02x:%02x.%u\n", pdev_type,
+ seg, bus, slot, func);
+ while ( pdev->phantom_stride )
+ {
+ func += pdev->phantom_stride;
+ if ( PCI_SLOT(func) )
+ break;
+ printk(XENLOG_DEBUG "PCI phantom %04x:%02x:%02x.%u\n",
+ seg, bus, slot, func);
+ }
+ }
return ret;
}

@@ -681,7 +729,7 @@ void pci_check_disable_device(u16 seg, u
u16 cword;

spin_lock(&pcidevs_lock);
- pdev = pci_get_pdev(seg, bus, devfn);
+ pdev = pci_get_real_pdev(seg, bus, devfn);
if ( pdev )
{
if ( now < pdev->fault.time ||
@@ -698,6 +746,7 @@ void pci_check_disable_device(u16 seg, u

/* Tell the device to stop DMAing; we can't rely on the guest to
* control it for us. */
+ devfn = pdev->devfn;
cword = pci_conf_read16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
PCI_COMMAND);
pci_conf_write16(seg, bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
@@ -759,6 +808,27 @@ struct setup_dom0 {
int (*handler)(u8 devfn, struct pci_dev *);
};

+static void setup_one_dom0_device(const struct setup_dom0 *ctxt,
+ struct pci_dev *pdev)
+{
+ u8 devfn = pdev->devfn;
+
+ do {
+ int err = ctxt->handler(devfn, pdev);
+
+ if ( err )
+ {
+ printk(XENLOG_ERR "setup %04x:%02x:%02x.%u for d%d failed (%d)\n",
+ pdev->seg, pdev->bus, PCI_SLOT(devfn), PCI_FUNC(devfn),
+ ctxt->d->domain_id, err);
+ if ( devfn == pdev->devfn )
+ return;
+ }
+ devfn += pdev->phantom_stride;
+ } while ( devfn != pdev->devfn &&
+ PCI_SLOT(devfn) == PCI_SLOT(pdev->devfn) );
+}
+
static int __init _setup_dom0_pci_devices(struct pci_seg *pseg, void *arg)
{
struct setup_dom0 *ctxt = arg;
@@ -777,12 +847,12 @@ static int __init _setup_dom0_pci_device
{
pdev->domain = ctxt->d;
list_add(&pdev->domain_list, &ctxt->d->arch.pdev_list);
- ctxt->handler(devfn, pdev);
+ setup_one_dom0_device(ctxt, pdev);
}
else if ( pdev->domain == dom_xen )
{
pdev->domain = ctxt->d;
- ctxt->handler(devfn, pdev);
+ setup_one_dom0_device(ctxt, pdev);
pdev->domain = dom_xen;
}
else if ( pdev->domain != ctxt->d )
--- a/xen/include/xen/lib.h
+++ b/xen/include/xen/lib.h
@@ -58,6 +58,9 @@ do {

#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]) + __must_be_array(x))

+#define MASK_EXTR(v, m) (((v) & (m)) / ((m) & -(m)))
+#define MASK_INSR(v, m) (((v) * ((m) & -(m))) & (m))
+
#define reserve_bootmem(_p,_l) ((void)0)

struct domain;
--- a/xen/include/xen/pci.h
+++ b/xen/include/xen/pci.h
@@ -63,6 +63,8 @@ struct pci_dev {
const u8 bus;
const u8 devfn;

+ u8 phantom_stride;
+
enum pdev_type {
DEV_TYPE_PCI_UNKNOWN,
DEV_TYPE_PCIe_ENDPOINT,
@@ -114,6 +116,7 @@ int pci_ro_device(int seg, int bus, int
void arch_pci_ro_device(int seg, int bdf);
int pci_hide_device(int bus, int devfn);
struct pci_dev *pci_get_pdev(int seg, int bus, int devfn);
+struct pci_dev *pci_get_real_pdev(int seg, int bus, int devfn);
struct pci_dev *pci_get_pdev_by_domain(
struct domain *, int seg, int bus, int devfn);
void pci_check_disable_device(u16 seg, u8 bus, u8 devfn);