Mailing List Archive

[PATCH v19 044/130] KVM: TDX: Do TDX specific vcpu initialization
From: Isaku Yamahata <isaku.yamahata@intel.com>

TD guest vcpu needs TDX specific initialization before running. Repurpose
KVM_MEMORY_ENCRYPT_OP to vcpu-scope, add a new sub-command
KVM_TDX_INIT_VCPU, and implement the callback for it.

Signed-off-by: Sean Christopherson <sean.j.christopherson@intel.com>
Signed-off-by: Isaku Yamahata <isaku.yamahata@intel.com>
---
v18:
- Use tdh_sys_rd() instead of struct tdsysinfo_struct.
- Rename tdx_reclaim_td_page() => tdx_reclaim_control_page()
- Remove the change of tools/arch/x86/include/uapi/asm/kvm.h.
---
arch/x86/include/asm/kvm-x86-ops.h | 1 +
arch/x86/include/asm/kvm_host.h | 1 +
arch/x86/include/uapi/asm/kvm.h | 1 +
arch/x86/kvm/vmx/main.c | 9 ++
arch/x86/kvm/vmx/tdx.c | 184 ++++++++++++++++++++++++++++-
arch/x86/kvm/vmx/tdx.h | 8 ++
arch/x86/kvm/vmx/x86_ops.h | 4 +
arch/x86/kvm/x86.c | 6 +
8 files changed, 211 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h
index f78200492a3d..a8e96804a252 100644
--- a/arch/x86/include/asm/kvm-x86-ops.h
+++ b/arch/x86/include/asm/kvm-x86-ops.h
@@ -129,6 +129,7 @@ KVM_X86_OP(leave_smm)
KVM_X86_OP(enable_smi_window)
#endif
KVM_X86_OP(mem_enc_ioctl)
+KVM_X86_OP_OPTIONAL(vcpu_mem_enc_ioctl)
KVM_X86_OP_OPTIONAL(mem_enc_register_region)
KVM_X86_OP_OPTIONAL(mem_enc_unregister_region)
KVM_X86_OP_OPTIONAL(vm_copy_enc_context_from)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 0e2408a4707e..5da3c211955d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1778,6 +1778,7 @@ struct kvm_x86_ops {
#endif

int (*mem_enc_ioctl)(struct kvm *kvm, void __user *argp);
+ int (*vcpu_mem_enc_ioctl)(struct kvm_vcpu *vcpu, void __user *argp);
int (*mem_enc_register_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*mem_enc_unregister_region)(struct kvm *kvm, struct kvm_enc_region *argp);
int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd);
diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h
index 9ac0246bd974..4000a2e087a8 100644
--- a/arch/x86/include/uapi/asm/kvm.h
+++ b/arch/x86/include/uapi/asm/kvm.h
@@ -571,6 +571,7 @@ struct kvm_pmu_event_filter {
enum kvm_tdx_cmd_id {
KVM_TDX_CAPABILITIES = 0,
KVM_TDX_INIT_VM,
+ KVM_TDX_INIT_VCPU,

KVM_TDX_CMD_NR_MAX,
};
diff --git a/arch/x86/kvm/vmx/main.c b/arch/x86/kvm/vmx/main.c
index 5796fb45433f..d0f75020579f 100644
--- a/arch/x86/kvm/vmx/main.c
+++ b/arch/x86/kvm/vmx/main.c
@@ -131,6 +131,14 @@ static int vt_mem_enc_ioctl(struct kvm *kvm, void __user *argp)
return tdx_vm_ioctl(kvm, argp);
}

+static int vt_vcpu_mem_enc_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ if (!is_td_vcpu(vcpu))
+ return -EINVAL;
+
+ return tdx_vcpu_ioctl(vcpu, argp);
+}
+
#define VMX_REQUIRED_APICV_INHIBITS \
(BIT(APICV_INHIBIT_REASON_DISABLE)| \
BIT(APICV_INHIBIT_REASON_ABSENT) | \
@@ -291,6 +299,7 @@ struct kvm_x86_ops vt_x86_ops __initdata = {
.get_untagged_addr = vmx_get_untagged_addr,

.mem_enc_ioctl = vt_mem_enc_ioctl,
+ .vcpu_mem_enc_ioctl = vt_vcpu_mem_enc_ioctl,
};

struct kvm_x86_init_ops vt_init_ops __initdata = {
diff --git a/arch/x86/kvm/vmx/tdx.c b/arch/x86/kvm/vmx/tdx.c
index 51283d2cd011..aa1da51b8af7 100644
--- a/arch/x86/kvm/vmx/tdx.c
+++ b/arch/x86/kvm/vmx/tdx.c
@@ -49,6 +49,7 @@ struct tdx_info {
u64 xfam_fixed1;

u8 nr_tdcs_pages;
+ u8 nr_tdvpx_pages;

u16 num_cpuid_config;
/* This must the last member. */
@@ -104,6 +105,11 @@ static __always_inline hpa_t set_hkid_to_hpa(hpa_t pa, u16 hkid)
return pa | ((hpa_t)hkid << boot_cpu_data.x86_phys_bits);
}

+static inline bool is_td_vcpu_created(struct vcpu_tdx *tdx)
+{
+ return tdx->td_vcpu_created;
+}
+
static inline bool is_td_created(struct kvm_tdx *kvm_tdx)
{
return kvm_tdx->tdr_pa;
@@ -121,6 +127,11 @@ static inline bool is_hkid_assigned(struct kvm_tdx *kvm_tdx)
return kvm_tdx->hkid > 0;
}

+static inline bool is_td_finalized(struct kvm_tdx *kvm_tdx)
+{
+ return kvm_tdx->finalized;
+}
+
static void tdx_clear_page(unsigned long page_pa)
{
const void *zero_page = (const void *) __va(page_to_phys(ZERO_PAGE(0)));
@@ -399,7 +410,32 @@ int tdx_vcpu_create(struct kvm_vcpu *vcpu)

void tdx_vcpu_free(struct kvm_vcpu *vcpu)
{
- /* This is stub for now. More logic will come. */
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ int i;
+
+ /*
+ * This method can be called when vcpu allocation/initialization
+ * failed. So it's possible that hkid, tdvpx and tdvpr are not assigned
+ * yet.
+ */
+ if (is_hkid_assigned(to_kvm_tdx(vcpu->kvm))) {
+ WARN_ON_ONCE(tdx->tdvpx_pa);
+ WARN_ON_ONCE(tdx->tdvpr_pa);
+ return;
+ }
+
+ if (tdx->tdvpx_pa) {
+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
+ if (tdx->tdvpx_pa[i])
+ tdx_reclaim_control_page(tdx->tdvpx_pa[i]);
+ }
+ kfree(tdx->tdvpx_pa);
+ tdx->tdvpx_pa = NULL;
+ }
+ if (tdx->tdvpr_pa) {
+ tdx_reclaim_control_page(tdx->tdvpr_pa);
+ tdx->tdvpr_pa = 0;
+ }
}

void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -408,8 +444,13 @@ void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
/* Ignore INIT silently because TDX doesn't support INIT event. */
if (init_event)
return;
+ if (KVM_BUG_ON(is_td_vcpu_created(to_tdx(vcpu)), vcpu->kvm))
+ return;

- /* This is stub for now. More logic will come here. */
+ /*
+ * Don't update mp_state to runnable because more initialization
+ * is needed by TDX_VCPU_INIT.
+ */
}

static int tdx_get_capabilities(struct kvm_tdx_cmd *cmd)
@@ -904,6 +945,137 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp)
return r;
}

+/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
+static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
+{
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ unsigned long *tdvpx_pa = NULL;
+ unsigned long tdvpr_pa;
+ unsigned long va;
+ int ret, i;
+ u64 err;
+
+ if (is_td_vcpu_created(tdx))
+ return -EINVAL;
+
+ /*
+ * vcpu_free method frees allocated pages. Avoid partial setup so
+ * that the method can't handle it.
+ */
+ va = __get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!va)
+ return -ENOMEM;
+ tdvpr_pa = __pa(va);
+
+ tdvpx_pa = kcalloc(tdx_info->nr_tdvpx_pages, sizeof(*tdx->tdvpx_pa),
+ GFP_KERNEL_ACCOUNT);
+ if (!tdvpx_pa) {
+ ret = -ENOMEM;
+ goto free_tdvpr;
+ }
+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
+ va = __get_free_page(GFP_KERNEL_ACCOUNT);
+ if (!va) {
+ ret = -ENOMEM;
+ goto free_tdvpx;
+ }
+ tdvpx_pa[i] = __pa(va);
+ }
+
+ err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa);
+ if (KVM_BUG_ON(err, vcpu->kvm)) {
+ ret = -EIO;
+ pr_tdx_error(TDH_VP_CREATE, err, NULL);
+ goto free_tdvpx;
+ }
+ tdx->tdvpr_pa = tdvpr_pa;
+
+ tdx->tdvpx_pa = tdvpx_pa;
+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
+ err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]);
+ if (KVM_BUG_ON(err, vcpu->kvm)) {
+ pr_tdx_error(TDH_VP_ADDCX, err, NULL);
+ for (; i < tdx_info->nr_tdvpx_pages; i++) {
+ free_page((unsigned long)__va(tdvpx_pa[i]));
+ tdvpx_pa[i] = 0;
+ }
+ /* vcpu_free method frees TDVPX and TDR donated to TDX */
+ return -EIO;
+ }
+ }
+
+ err = tdh_vp_init(tdx->tdvpr_pa, vcpu_rcx);
+ if (KVM_BUG_ON(err, vcpu->kvm)) {
+ pr_tdx_error(TDH_VP_INIT, err, NULL);
+ return -EIO;
+ }
+
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ tdx->td_vcpu_created = true;
+ return 0;
+
+free_tdvpx:
+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
+ if (tdvpx_pa[i])
+ free_page((unsigned long)__va(tdvpx_pa[i]));
+ tdvpx_pa[i] = 0;
+ }
+ kfree(tdvpx_pa);
+ tdx->tdvpx_pa = NULL;
+free_tdvpr:
+ if (tdvpr_pa)
+ free_page((unsigned long)__va(tdvpr_pa));
+ tdx->tdvpr_pa = 0;
+
+ return ret;
+}
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
+{
+ struct msr_data apic_base_msr;
+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
+ struct vcpu_tdx *tdx = to_tdx(vcpu);
+ struct kvm_tdx_cmd cmd;
+ int ret;
+
+ if (tdx->initialized)
+ return -EINVAL;
+
+ if (!is_hkid_assigned(kvm_tdx) || is_td_finalized(kvm_tdx))
+ return -EINVAL;
+
+ if (copy_from_user(&cmd, argp, sizeof(cmd)))
+ return -EFAULT;
+
+ if (cmd.error)
+ return -EINVAL;
+
+ /* Currently only KVM_TDX_INIT_VCPU is defined for vcpu operation. */
+ if (cmd.flags || cmd.id != KVM_TDX_INIT_VCPU)
+ return -EINVAL;
+
+ /*
+ * As TDX requires X2APIC, set local apic mode to X2APIC. User space
+ * VMM, e.g. qemu, is required to set CPUID[0x1].ecx.X2APIC=1 by
+ * KVM_SET_CPUID2. Otherwise kvm_set_apic_base() will fail.
+ */
+ apic_base_msr = (struct msr_data) {
+ .host_initiated = true,
+ .data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
+ (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0),
+ };
+ if (kvm_set_apic_base(vcpu, &apic_base_msr))
+ return -EINVAL;
+
+ ret = tdx_td_vcpu_init(vcpu, (u64)cmd.data);
+ if (ret)
+ return ret;
+
+ tdx->initialized = true;
+ return 0;
+}
+
#define TDX_MD_MAP(_fid, _ptr) \
{ .fid = MD_FIELD_ID_##_fid, \
.ptr = (_ptr), }
@@ -953,13 +1125,14 @@ static int tdx_md_read(struct tdx_md_map *maps, int nr_maps)

static int __init tdx_module_setup(void)
{
- u16 num_cpuid_config, tdcs_base_size;
+ u16 num_cpuid_config, tdcs_base_size, tdvps_base_size;
int ret;
u32 i;

struct tdx_md_map mds[] = {
TDX_MD_MAP(NUM_CPUID_CONFIG, &num_cpuid_config),
TDX_MD_MAP(TDCS_BASE_SIZE, &tdcs_base_size),
+ TDX_MD_MAP(TDVPS_BASE_SIZE, &tdvps_base_size),
};

struct tdx_metadata_field_mapping fields[] = {
@@ -1013,6 +1186,11 @@ static int __init tdx_module_setup(void)
}

tdx_info->nr_tdcs_pages = tdcs_base_size / PAGE_SIZE;
+ /*
+ * TDVPS = TDVPR(4K page) + TDVPX(multiple 4K pages).
+ * -1 for TDVPR.
+ */
+ tdx_info->nr_tdvpx_pages = tdvps_base_size / PAGE_SIZE - 1;

/*
* Make TDH.VP.ENTER preserve RBP so that the stack unwinder
diff --git a/arch/x86/kvm/vmx/tdx.h b/arch/x86/kvm/vmx/tdx.h
index 173ed19207fb..d3077151252c 100644
--- a/arch/x86/kvm/vmx/tdx.h
+++ b/arch/x86/kvm/vmx/tdx.h
@@ -17,12 +17,20 @@ struct kvm_tdx {
u64 xfam;
int hkid;

+ bool finalized;
+
u64 tsc_offset;
};

struct vcpu_tdx {
struct kvm_vcpu vcpu;

+ unsigned long tdvpr_pa;
+ unsigned long *tdvpx_pa;
+ bool td_vcpu_created;
+
+ bool initialized;
+
/*
* Dummy to make pmu_intel not corrupt memory.
* TODO: Support PMU for TDX. Future work.
diff --git a/arch/x86/kvm/vmx/x86_ops.h b/arch/x86/kvm/vmx/x86_ops.h
index bb73a9b5b354..f5820f617b2e 100644
--- a/arch/x86/kvm/vmx/x86_ops.h
+++ b/arch/x86/kvm/vmx/x86_ops.h
@@ -150,6 +150,8 @@ int tdx_vm_ioctl(struct kvm *kvm, void __user *argp);
int tdx_vcpu_create(struct kvm_vcpu *vcpu);
void tdx_vcpu_free(struct kvm_vcpu *vcpu);
void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
+
+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp);
#else
static inline int tdx_hardware_setup(struct kvm_x86_ops *x86_ops) { return -EOPNOTSUPP; }
static inline void tdx_hardware_unsetup(void) {}
@@ -169,6 +171,8 @@ static inline int tdx_vm_ioctl(struct kvm *kvm, void __user *argp) { return -EOP
static inline int tdx_vcpu_create(struct kvm_vcpu *vcpu) { return -EOPNOTSUPP; }
static inline void tdx_vcpu_free(struct kvm_vcpu *vcpu) {}
static inline void tdx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) {}
+
+static inline int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp) { return -EOPNOTSUPP; }
#endif

#endif /* __KVM_X86_VMX_X86_OPS_H */
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index c002761bb662..2bd4b7c8fa51 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6274,6 +6274,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
case KVM_SET_DEVICE_ATTR:
r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
break;
+ case KVM_MEMORY_ENCRYPT_OP:
+ r = -ENOTTY;
+ if (!kvm_x86_ops.vcpu_mem_enc_ioctl)
+ goto out;
+ r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp);
+ break;
default:
r = -EINVAL;
}
--
2.25.1
Re: [PATCH v19 044/130] KVM: TDX: Do TDX specific vcpu initialization [ In reply to ]
>+/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
>+static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
>+{
>+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
>+ struct vcpu_tdx *tdx = to_tdx(vcpu);
>+ unsigned long *tdvpx_pa = NULL;
>+ unsigned long tdvpr_pa;
>+ unsigned long va;
>+ int ret, i;
>+ u64 err;
>+
>+ if (is_td_vcpu_created(tdx))
>+ return -EINVAL;
>+
>+ /*
>+ * vcpu_free method frees allocated pages. Avoid partial setup so
>+ * that the method can't handle it.
>+ */
>+ va = __get_free_page(GFP_KERNEL_ACCOUNT);
>+ if (!va)
>+ return -ENOMEM;
>+ tdvpr_pa = __pa(va);
>+
>+ tdvpx_pa = kcalloc(tdx_info->nr_tdvpx_pages, sizeof(*tdx->tdvpx_pa),
>+ GFP_KERNEL_ACCOUNT);
>+ if (!tdvpx_pa) {
>+ ret = -ENOMEM;
>+ goto free_tdvpr;
>+ }
>+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
>+ va = __get_free_page(GFP_KERNEL_ACCOUNT);
>+ if (!va) {
>+ ret = -ENOMEM;
>+ goto free_tdvpx;
>+ }
>+ tdvpx_pa[i] = __pa(va);
>+ }
>+
>+ err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa);
>+ if (KVM_BUG_ON(err, vcpu->kvm)) {
>+ ret = -EIO;
>+ pr_tdx_error(TDH_VP_CREATE, err, NULL);
>+ goto free_tdvpx;
>+ }
>+ tdx->tdvpr_pa = tdvpr_pa;
>+
>+ tdx->tdvpx_pa = tdvpx_pa;
>+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {

Can you merge the for-loop above into this one? then ...

>+ err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]);
>+ if (KVM_BUG_ON(err, vcpu->kvm)) {
>+ pr_tdx_error(TDH_VP_ADDCX, err, NULL);

>+ for (; i < tdx_info->nr_tdvpx_pages; i++) {
>+ free_page((unsigned long)__va(tdvpx_pa[i]));
>+ tdvpx_pa[i] = 0;
>+ }

.. no need to free remaining pages.

>+ /* vcpu_free method frees TDVPX and TDR donated to TDX */
>+ return -EIO;
>+ }
>+ }
>+
>+ err = tdh_vp_init(tdx->tdvpr_pa, vcpu_rcx);
>+ if (KVM_BUG_ON(err, vcpu->kvm)) {
>+ pr_tdx_error(TDH_VP_INIT, err, NULL);
>+ return -EIO;
>+ }
>+
>+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
>+ tdx->td_vcpu_created = true;
>+ return 0;
>+
>+free_tdvpx:
>+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
>+ if (tdvpx_pa[i])
>+ free_page((unsigned long)__va(tdvpx_pa[i]));
>+ tdvpx_pa[i] = 0;
>+ }
>+ kfree(tdvpx_pa);
>+ tdx->tdvpx_pa = NULL;
>+free_tdvpr:
>+ if (tdvpr_pa)
>+ free_page((unsigned long)__va(tdvpr_pa));
>+ tdx->tdvpr_pa = 0;
>+
>+ return ret;
>+}
>+
>+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
>+{
>+ struct msr_data apic_base_msr;
>+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
>+ struct vcpu_tdx *tdx = to_tdx(vcpu);
>+ struct kvm_tdx_cmd cmd;
>+ int ret;
>+
>+ if (tdx->initialized)
>+ return -EINVAL;
>+
>+ if (!is_hkid_assigned(kvm_tdx) || is_td_finalized(kvm_tdx))

These checks look random, e.g., I am not sure why is_td_created() isn't checked here.

A few helper functions and boolean variables are added to track which stage the
TD or TD vCPU is in. e.g.,

is_hkid_assigned()
is_td_finalized()
is_td_created()
tdx->initialized
td_vcpu_created

Insteading of doing this, I am wondering if adding two state machines for
TD and TD vCPU would make the implementation clear and easy to extend.

>+ return -EINVAL;
>+
>+ if (copy_from_user(&cmd, argp, sizeof(cmd)))
>+ return -EFAULT;
>+
>+ if (cmd.error)
>+ return -EINVAL;
>+
>+ /* Currently only KVM_TDX_INIT_VCPU is defined for vcpu operation. */
>+ if (cmd.flags || cmd.id != KVM_TDX_INIT_VCPU)
>+ return -EINVAL;

Even though KVM_TD_INIT_VCPU is the only supported command, it is worthwhile to
use a switch-case statement. New commands can be added easily without the need
to refactor this function first.

>+
>+ /*
>+ * As TDX requires X2APIC, set local apic mode to X2APIC. User space
>+ * VMM, e.g. qemu, is required to set CPUID[0x1].ecx.X2APIC=1 by
>+ * KVM_SET_CPUID2. Otherwise kvm_set_apic_base() will fail.
>+ */
>+ apic_base_msr = (struct msr_data) {
>+ .host_initiated = true,
>+ .data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
>+ (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0),
>+ };
>+ if (kvm_set_apic_base(vcpu, &apic_base_msr))
>+ return -EINVAL;

Exporting kvm_vcpu_is_reset_bsp() and kvm_set_apic_base() should be done
here (rather than in a previous patch).

>+
>+ ret = tdx_td_vcpu_init(vcpu, (u64)cmd.data);
>+ if (ret)
>+ return ret;
>+
>+ tdx->initialized = true;
>+ return 0;
>+}
>+

>diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
>index c002761bb662..2bd4b7c8fa51 100644
>--- a/arch/x86/kvm/x86.c
>+++ b/arch/x86/kvm/x86.c
>@@ -6274,6 +6274,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> case KVM_SET_DEVICE_ATTR:
> r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
> break;
>+ case KVM_MEMORY_ENCRYPT_OP:
>+ r = -ENOTTY;

Maybe -EINVAL is better. Because previously trying to call this on vCPU fd
failed with -EINVAL given ...

>+ if (!kvm_x86_ops.vcpu_mem_enc_ioctl)
>+ goto out;
>+ r = kvm_x86_ops.vcpu_mem_enc_ioctl(vcpu, argp);
>+ break;
> default:
> r = -EINVAL;

.. this.

> }
>--
>2.25.1
>
>
Re: [PATCH v19 044/130] KVM: TDX: Do TDX specific vcpu initialization [ In reply to ]
On Thu, Mar 21, 2024 at 01:43:14PM +0800,
Chao Gao <chao.gao@intel.com> wrote:

> >+/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
> >+static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
> >+{
> >+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> >+ struct vcpu_tdx *tdx = to_tdx(vcpu);
> >+ unsigned long *tdvpx_pa = NULL;
> >+ unsigned long tdvpr_pa;
> >+ unsigned long va;
> >+ int ret, i;
> >+ u64 err;
> >+
> >+ if (is_td_vcpu_created(tdx))
> >+ return -EINVAL;
> >+
> >+ /*
> >+ * vcpu_free method frees allocated pages. Avoid partial setup so
> >+ * that the method can't handle it.
> >+ */
> >+ va = __get_free_page(GFP_KERNEL_ACCOUNT);
> >+ if (!va)
> >+ return -ENOMEM;
> >+ tdvpr_pa = __pa(va);
> >+
> >+ tdvpx_pa = kcalloc(tdx_info->nr_tdvpx_pages, sizeof(*tdx->tdvpx_pa),
> >+ GFP_KERNEL_ACCOUNT);
> >+ if (!tdvpx_pa) {
> >+ ret = -ENOMEM;
> >+ goto free_tdvpr;
> >+ }
> >+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
> >+ va = __get_free_page(GFP_KERNEL_ACCOUNT);
> >+ if (!va) {
> >+ ret = -ENOMEM;
> >+ goto free_tdvpx;
> >+ }
> >+ tdvpx_pa[i] = __pa(va);
> >+ }
> >+
> >+ err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa);
> >+ if (KVM_BUG_ON(err, vcpu->kvm)) {
> >+ ret = -EIO;
> >+ pr_tdx_error(TDH_VP_CREATE, err, NULL);
> >+ goto free_tdvpx;
> >+ }
> >+ tdx->tdvpr_pa = tdvpr_pa;
> >+
> >+ tdx->tdvpx_pa = tdvpx_pa;
> >+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
>
> Can you merge the for-loop above into this one? then ...
>
> >+ err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]);
> >+ if (KVM_BUG_ON(err, vcpu->kvm)) {
> >+ pr_tdx_error(TDH_VP_ADDCX, err, NULL);
>
> >+ for (; i < tdx_info->nr_tdvpx_pages; i++) {
> >+ free_page((unsigned long)__va(tdvpx_pa[i]));
> >+ tdvpx_pa[i] = 0;
> >+ }
>
> ... no need to free remaining pages.

Makes sense. Let me clean up this.


> >+ /* vcpu_free method frees TDVPX and TDR donated to TDX */
> >+ return -EIO;
> >+ }
> >+ }
> >+
> >+ err = tdh_vp_init(tdx->tdvpr_pa, vcpu_rcx);
> >+ if (KVM_BUG_ON(err, vcpu->kvm)) {
> >+ pr_tdx_error(TDH_VP_INIT, err, NULL);
> >+ return -EIO;
> >+ }
> >+
> >+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> >+ tdx->td_vcpu_created = true;
> >+ return 0;
> >+
> >+free_tdvpx:
> >+ for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
> >+ if (tdvpx_pa[i])
> >+ free_page((unsigned long)__va(tdvpx_pa[i]));
> >+ tdvpx_pa[i] = 0;
> >+ }
> >+ kfree(tdvpx_pa);
> >+ tdx->tdvpx_pa = NULL;
> >+free_tdvpr:
> >+ if (tdvpr_pa)
> >+ free_page((unsigned long)__va(tdvpr_pa));
> >+ tdx->tdvpr_pa = 0;
> >+
> >+ return ret;
> >+}
> >+
> >+int tdx_vcpu_ioctl(struct kvm_vcpu *vcpu, void __user *argp)
> >+{
> >+ struct msr_data apic_base_msr;
> >+ struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> >+ struct vcpu_tdx *tdx = to_tdx(vcpu);
> >+ struct kvm_tdx_cmd cmd;
> >+ int ret;
> >+
> >+ if (tdx->initialized)
> >+ return -EINVAL;
> >+
> >+ if (!is_hkid_assigned(kvm_tdx) || is_td_finalized(kvm_tdx))
>
> These checks look random e.g., I am not sure why is_td_created() isn't check here.
>
> A few helper functions and boolean variables are added to track which stage the
> TD or TD vCPU is in. e.g.,
>
> is_hkid_assigned()
> is_td_finalized()
> is_td_created()
> tdx->initialized
> td_vcpu_created
>
> Insteading of doing this, I am wondering if adding two state machines for
> TD and TD vCPU would make the implementation clear and easy to extend.

Let me look into the state machine. Originally I hoped we don't need it, but
it seems to deserve the state machine..


> >+ return -EINVAL;
> >+
> >+ if (copy_from_user(&cmd, argp, sizeof(cmd)))
> >+ return -EFAULT;
> >+
> >+ if (cmd.error)
> >+ return -EINVAL;
> >+
> >+ /* Currently only KVM_TDX_INIT_VCPU is defined for vcpu operation. */
> >+ if (cmd.flags || cmd.id != KVM_TDX_INIT_VCPU)
> >+ return -EINVAL;
>
> Even though KVM_TD_INIT_VCPU is the only supported command, it is worthwhile to
> use a switch-case statement. New commands can be added easily without the need
> to refactor this function first.

Yes. For KVM_MAP_MEMORY, I will make KVM_TDX_INIT_MEM_REGION vcpu ioctl instead
of vm ioctl because it is consistent and scalable. We'll have switch statement
in the next respin.

> >+
> >+ /*
> >+ * As TDX requires X2APIC, set local apic mode to X2APIC. User space
> >+ * VMM, e.g. qemu, is required to set CPUID[0x1].ecx.X2APIC=1 by
> >+ * KVM_SET_CPUID2. Otherwise kvm_set_apic_base() will fail.
> >+ */
> >+ apic_base_msr = (struct msr_data) {
> >+ .host_initiated = true,
> >+ .data = APIC_DEFAULT_PHYS_BASE | LAPIC_MODE_X2APIC |
> >+ (kvm_vcpu_is_reset_bsp(vcpu) ? MSR_IA32_APICBASE_BSP : 0),
> >+ };
> >+ if (kvm_set_apic_base(vcpu, &apic_base_msr))
> >+ return -EINVAL;
>
> Exporting kvm_vcpu_is_reset_bsp() and kvm_set_apic_base() should be done
> here (rather than in a previous patch).

Sure.


> >+
> >+ ret = tdx_td_vcpu_init(vcpu, (u64)cmd.data);
> >+ if (ret)
> >+ return ret;
> >+
> >+ tdx->initialized = true;
> >+ return 0;
> >+}
> >+
>
> >diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> >index c002761bb662..2bd4b7c8fa51 100644
> >--- a/arch/x86/kvm/x86.c
> >+++ b/arch/x86/kvm/x86.c
> >@@ -6274,6 +6274,12 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
> > case KVM_SET_DEVICE_ATTR:
> > r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp);
> > break;
> >+ case KVM_MEMORY_ENCRYPT_OP:
> >+ r = -ENOTTY;
>
> Maybe -EINVAL is better. Because previously trying to call this on vCPU fd
> failed with -EINVAL given ...

Oh, ok. Will change it. I followed VM ioctl case as default value. But vcpu
ioctl seems to have -EINVAL as default value.
--
Isaku Yamahata <isaku.yamahata@intel.com>
Re: [PATCH v19 044/130] KVM: TDX: Do TDX specific vcpu initialization [ In reply to ]
On Mon, 2024-02-26 at 00:25 -0800, isaku.yamahata@intel.com wrote:
> +/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
> +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
> +{
> +       struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> +       struct vcpu_tdx *tdx = to_tdx(vcpu);
> +       unsigned long *tdvpx_pa = NULL;
> +       unsigned long tdvpr_pa;


I think we could drop these local variables and just use tdx->tdvpr_pa and tdx->tdvpx_pa. Then we
don't have to have the assignments later.

> +       unsigned long va;
> +       int ret, i;
> +       u64 err;
> +
> +       if (is_td_vcpu_created(tdx))
> +               return -EINVAL;
> +
> +       /*
> +        * vcpu_free method frees allocated pages.  Avoid partial setup so
> +        * that the method can't handle it.
> +        */
> +       va = __get_free_page(GFP_KERNEL_ACCOUNT);
> +       if (!va)
> +               return -ENOMEM;
> +       tdvpr_pa = __pa(va);
> +
> +       tdvpx_pa = kcalloc(tdx_info->nr_tdvpx_pages, sizeof(*tdx->tdvpx_pa),
> +                          GFP_KERNEL_ACCOUNT);
> +       if (!tdvpx_pa) {
> +               ret = -ENOMEM;
> +               goto free_tdvpr;
> +       }
> +       for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
> +               va = __get_free_page(GFP_KERNEL_ACCOUNT);
> +               if (!va) {
> +                       ret = -ENOMEM;
> +                       goto free_tdvpx;
> +               }
> +               tdvpx_pa[i] = __pa(va);
> +       }
> +
> +       err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa);
> +       if (KVM_BUG_ON(err, vcpu->kvm)) {
> +               ret = -EIO;
> +               pr_tdx_error(TDH_VP_CREATE, err, NULL);
> +               goto free_tdvpx;
> +       }
> +       tdx->tdvpr_pa = tdvpr_pa;
> +
> +       tdx->tdvpx_pa = tdvpx_pa;

Or alternatively let's move these to right before they are used (in the current branch):

> +       for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
> +               err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]);
> +               if (KVM_BUG_ON(err, vcpu->kvm)) {
> +                       pr_tdx_error(TDH_VP_ADDCX, err, NULL);
> +                       for (; i < tdx_info->nr_tdvpx_pages; i++) {
> +                               free_page((unsigned long)__va(tdvpx_pa[i]));
> +                               tdvpx_pa[i] = 0;
> +                       }
> +                       /* vcpu_free method frees TDVPX and TDR donated to TDX */
> +                       return -EIO;
> +               }
> +       }
>
>
In the current branch tdh_vp_init() takes struct vcpu_tdx, so they would be moved right here.

What do you think?

> +
> +       err = tdh_vp_init(tdx->tdvpr_pa, vcpu_rcx);
> +       if (KVM_BUG_ON(err, vcpu->kvm)) {
> +               pr_tdx_error(TDH_VP_INIT, err, NULL);
> +               return -EIO;
> +       }
> +
> +       vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
> +       tdx->td_vcpu_created = true;
> +       return 0;
> +
> +free_tdvpx:
> +       for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
> +               if (tdvpx_pa[i])
> +                       free_page((unsigned long)__va(tdvpx_pa[i]));
> +               tdvpx_pa[i] = 0;
> +       }
> +       kfree(tdvpx_pa);
> +       tdx->tdvpx_pa = NULL;
> +free_tdvpr:
> +       if (tdvpr_pa)
> +               free_page((unsigned long)__va(tdvpr_pa));
> +       tdx->tdvpr_pa = 0;
> +
> +       return ret;
> +}
Re: [PATCH v19 044/130] KVM: TDX: Do TDX specific vcpu initialization [ In reply to ]
On Wed, Mar 27, 2024 at 12:27:03AM +0000,
"Edgecombe, Rick P" <rick.p.edgecombe@intel.com> wrote:

> On Mon, 2024-02-26 at 00:25 -0800, isaku.yamahata@intel.com wrote:
> > +/* VMM can pass one 64bit auxiliary data to vcpu via RCX for guest BIOS. */
> > +static int tdx_td_vcpu_init(struct kvm_vcpu *vcpu, u64 vcpu_rcx)
> > +{
> > +       struct kvm_tdx *kvm_tdx = to_kvm_tdx(vcpu->kvm);
> > +       struct vcpu_tdx *tdx = to_tdx(vcpu);
> > +       unsigned long *tdvpx_pa = NULL;
> > +       unsigned long tdvpr_pa;
>
>
> I think we could drop these local variables and just use tdx->tdvpr_pa and tdx->tdvpx_pa. Then we
> don't have to have the assignments later.

Yes, let me clean it up. The old version acquired spin lock in the middle. Now
we don't have it.


> > +       unsigned long va;
> > +       int ret, i;
> > +       u64 err;
> > +
> > +       if (is_td_vcpu_created(tdx))
> > +               return -EINVAL;
> > +
> > +       /*
> > +        * vcpu_free method frees allocated pages.  Avoid partial setup so
> > +        * that the method can't handle it.
> > +        */
> > +       va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > +       if (!va)
> > +               return -ENOMEM;
> > +       tdvpr_pa = __pa(va);
> > +
> > +       tdvpx_pa = kcalloc(tdx_info->nr_tdvpx_pages, sizeof(*tdx->tdvpx_pa),
> > +                          GFP_KERNEL_ACCOUNT);
> > +       if (!tdvpx_pa) {
> > +               ret = -ENOMEM;
> > +               goto free_tdvpr;
> > +       }
> > +       for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
> > +               va = __get_free_page(GFP_KERNEL_ACCOUNT);
> > +               if (!va) {
> > +                       ret = -ENOMEM;
> > +                       goto free_tdvpx;
> > +               }
> > +               tdvpx_pa[i] = __pa(va);
> > +       }
> > +
> > +       err = tdh_vp_create(kvm_tdx->tdr_pa, tdvpr_pa);
> > +       if (KVM_BUG_ON(err, vcpu->kvm)) {
> > +               ret = -EIO;
> > +               pr_tdx_error(TDH_VP_CREATE, err, NULL);
> > +               goto free_tdvpx;
> > +       }
> > +       tdx->tdvpr_pa = tdvpr_pa;
> > +
> > +       tdx->tdvpx_pa = tdvpx_pa;
>
> Or alternatively let's move these to right before they are used. (in the current branch
>
> > +       for (i = 0; i < tdx_info->nr_tdvpx_pages; i++) {
> > +               err = tdh_vp_addcx(tdx->tdvpr_pa, tdvpx_pa[i]);
> > +               if (KVM_BUG_ON(err, vcpu->kvm)) {
> > +                       pr_tdx_error(TDH_VP_ADDCX, err, NULL);
> > +                       for (; i < tdx_info->nr_tdvpx_pages; i++) {
> > +                               free_page((unsigned long)__va(tdvpx_pa[i]));
> > +                               tdvpx_pa[i] = 0;
> > +                       }
> > +                       /* vcpu_free method frees TDVPX and TDR donated to TDX */
> > +                       return -EIO;
> > +               }
> > +       }
> >
> >
> In the current branch tdh_vp_init() takes struct vcpu_tdx, so they would be moved right here.
>
> What do you think?

Yes, I should revise the error recovery path.
--
Isaku Yamahata <isaku.yamahata@intel.com>