Mailing List Archive

[PATCH V11 10/10] arm/arm64: KVM: add guest SEA support
Currently external aborts are unsupported by the guest abort
handling. Add handling for SEAs so that the host kernel reports
SEAs which occur in the guest kernel.

Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
---
arch/arm/include/asm/kvm_arm.h | 1 +
arch/arm/include/asm/system_misc.h | 5 +++++
arch/arm/kvm/mmu.c | 18 ++++++++++++++++--
arch/arm64/include/asm/kvm_arm.h | 1 +
arch/arm64/include/asm/system_misc.h | 2 ++
arch/arm64/mm/fault.c | 18 ++++++++++++++++++
6 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
index e22089f..33a77509 100644
--- a/arch/arm/include/asm/kvm_arm.h
+++ b/arch/arm/include/asm/kvm_arm.h
@@ -187,6 +187,7 @@
#define FSC_FAULT (0x04)
#define FSC_ACCESS (0x08)
#define FSC_PERM (0x0c)
+#define FSC_EXTABT (0x10)

/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
#define HPFAR_MASK (~0xf)
diff --git a/arch/arm/include/asm/system_misc.h b/arch/arm/include/asm/system_misc.h
index a3d61ad..ea45d94 100644
--- a/arch/arm/include/asm/system_misc.h
+++ b/arch/arm/include/asm/system_misc.h
@@ -24,4 +24,9 @@

#endif /* !__ASSEMBLY__ */

+static inline int handle_guest_sea(unsigned long addr, unsigned int esr)
+{
+ return -1;
+}
+
#endif /* __ASM_ARM_SYSTEM_MISC_H */
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index a5265ed..04f1dd50 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -29,6 +29,7 @@
#include <asm/kvm_asm.h>
#include <asm/kvm_emulate.h>
#include <asm/virt.h>
+#include <asm/system_misc.h>

#include "trace.h"

@@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)

/* Check the stage-2 fault is trans. fault or write fault */
fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
- if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
- fault_status != FSC_ACCESS) {
+
+ /* The host kernel will handle the synchronous external abort. There
+ * is no need to pass the error into the guest.
+ */
+ if (fault_status == FSC_EXTABT) {
+ if(handle_guest_sea((unsigned long)fault_ipa,
+ kvm_vcpu_get_hsr(vcpu))) {
+ kvm_err("Failed to handle guest SEA, FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
+ kvm_vcpu_trap_get_class(vcpu),
+ (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
+ (unsigned long)kvm_vcpu_get_hsr(vcpu));
+ return -EFAULT;
+ }
+ } else if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
+ fault_status != FSC_ACCESS) {
kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
kvm_vcpu_trap_get_class(vcpu),
(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 2a2752b..2b11d59 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -201,6 +201,7 @@
#define FSC_FAULT ESR_ELx_FSC_FAULT
#define FSC_ACCESS ESR_ELx_FSC_ACCESS
#define FSC_PERM ESR_ELx_FSC_PERM
+#define FSC_EXTABT ESR_ELx_FSC_EXTABT

/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
#define HPFAR_MASK (~UL(0xf))
diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h
index bc81243..5b2cecd1 100644
--- a/arch/arm64/include/asm/system_misc.h
+++ b/arch/arm64/include/asm/system_misc.h
@@ -58,4 +58,6 @@ void hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned int,

#endif /* __ASSEMBLY__ */

+int handle_guest_sea(unsigned long addr, unsigned int esr);
+
#endif /* __ASM_SYSTEM_MISC_H */
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index b2d57fc..403277b 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -602,6 +602,24 @@ static const char *fault_name(unsigned int esr)
}

/*
+ * Handle Synchronous External Aborts that occur in a guest kernel.
+ */
+int handle_guest_sea(unsigned long addr, unsigned int esr)
+{
+ /*
+ * synchronize_rcu() will wait for nmi_exit(), so no need to
+ * rcu_read_lock().
+ */
+ if(IS_ENABLED(HAVE_ACPI_APEI_SEA)) {
+ nmi_enter();
+ ghes_notify_sea();
+ nmi_exit();
+ }
+
+ return 0;
+}
+
+/*
* Dispatch a data abort to the relevant handler.
*/
asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
Hi Tyler,

On 21/02/17 21:22, Tyler Baicar wrote:
> Currently external aborts are unsupported by the guest abort
> handling. Add handling for SEAs so that the host kernel reports
> SEAs which occur in the guest kernel.

> diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
> index e22089f..33a77509 100644
> --- a/arch/arm/include/asm/kvm_arm.h
> +++ b/arch/arm/include/asm/kvm_arm.h
> @@ -187,6 +187,7 @@
> #define FSC_FAULT (0x04)
> #define FSC_ACCESS (0x08)
> #define FSC_PERM (0x0c)
> +#define FSC_EXTABT (0x10)

arm64 has ESR_ELx_FSC_EXTABT which is used in inject_abt64(), but for matching
an external abort coming from hardware the range is wider.

Looking at the ARM-ARMs 'ISS encoding for an exception from an Instruction
Abort' in 'D7.2.27 ESR_ELx, Exception Syndrome Register (ELx)' (page D7-1954 of
version 'k'...iss10775), the ten flavours of you Synchronous abort you hooked
with do_sea() in patch 4 occupy 0x10 to 0x1f...


> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index a5265ed..04f1dd50 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -29,6 +29,7 @@
> #include <asm/kvm_asm.h>
> #include <asm/kvm_emulate.h>
> #include <asm/virt.h>
> +#include <asm/system_misc.h>
>
> #include "trace.h"
>
> @@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
>
> /* Check the stage-2 fault is trans. fault or write fault */
> fault_status = kvm_vcpu_trap_get_fault_type(vcpu);

... kvm_vcpu_trap_get_fault_type() on both arm and arm64 masks the HSR/ESR_EL2
with 0x3c ...


> - if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
> - fault_status != FSC_ACCESS) {
> +
> + /* The host kernel will handle the synchronous external abort. There
> + * is no need to pass the error into the guest.
> + */
> + if (fault_status == FSC_EXTABT) {

... but here we only check for 'Synchronous external abort, not on a translation
table walk'. Are the other types relevant?

If so we need some helper as this range is sparse and 'all other values are
reserved'. The aarch32 HSR format is slightly different. (G6-4411 ISS encoding
from an exception from a Data Abort).

If not, can we change patch 4 to check this type too so we don't call out to
APEI for a fault type we know isn't relevant.


> + if(handle_guest_sea((unsigned long)fault_ipa,
> + kvm_vcpu_get_hsr(vcpu))) {
> + kvm_err("Failed to handle guest SEA, FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
> + kvm_vcpu_trap_get_class(vcpu),
> + (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
> + (unsigned long)kvm_vcpu_get_hsr(vcpu));
> + return -EFAULT;
> + }
> + } else if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
> + fault_status != FSC_ACCESS) {
> kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
> kvm_vcpu_trap_get_class(vcpu),
> (unsigned long)kvm_vcpu_trap_get_fault(vcpu),

> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index b2d57fc..403277b 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -602,6 +602,24 @@ static const char *fault_name(unsigned int esr)
> }
>
> /*
> + * Handle Synchronous External Aborts that occur in a guest kernel.
> + */
> +int handle_guest_sea(unsigned long addr, unsigned int esr)
> +{

> + if(IS_ENABLED(HAVE_ACPI_APEI_SEA)) {
> + nmi_enter();
> + ghes_notify_sea();
> + nmi_exit();

This nmi stuff was needed for synchronous aborts that may have interrupted
APEI's interrupts-masked code. We want to avoid trying to take the same set of
locks, hence taking the in_nmi() path through APEI. Here we know we interrupted
a guest, so there is no risk that we have interrupted APEI on the host.
ghes_notify_sea() can safely take the normal path.


Thanks,

James
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
Hi Tyler,


On 2017/2/22 5:22, Tyler Baicar wrote:
> Currently external aborts are unsupported by the guest abort
> handling. Add handling for SEAs so that the host kernel reports
> SEAs which occur in the guest kernel.
>
> Signed-off-by: Tyler Baicar <tbaicar@codeaurora.org>
> ---
> arch/arm/include/asm/kvm_arm.h | 1 +
> arch/arm/include/asm/system_misc.h | 5 +++++
> arch/arm/kvm/mmu.c | 18 ++++++++++++++++--
> arch/arm64/include/asm/kvm_arm.h | 1 +
> arch/arm64/include/asm/system_misc.h | 2 ++
> arch/arm64/mm/fault.c | 18 ++++++++++++++++++
> 6 files changed, 43 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
> index e22089f..33a77509 100644
> --- a/arch/arm/include/asm/kvm_arm.h
> +++ b/arch/arm/include/asm/kvm_arm.h
> @@ -187,6 +187,7 @@
> #define FSC_FAULT (0x04)
> #define FSC_ACCESS (0x08)
> #define FSC_PERM (0x0c)
> +#define FSC_EXTABT (0x10)
>
> /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
> #define HPFAR_MASK (~0xf)
> diff --git a/arch/arm/include/asm/system_misc.h b/arch/arm/include/asm/system_misc.h
> index a3d61ad..ea45d94 100644
> --- a/arch/arm/include/asm/system_misc.h
> +++ b/arch/arm/include/asm/system_misc.h
> @@ -24,4 +24,9 @@
>
> #endif /* !__ASSEMBLY__ */
>
> +static inline int handle_guest_sea(unsigned long addr, unsigned int esr)
> +{
> + return -1;
> +}
> +
> #endif /* __ASM_ARM_SYSTEM_MISC_H */
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index a5265ed..04f1dd50 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -29,6 +29,7 @@
> #include <asm/kvm_asm.h>
> #include <asm/kvm_emulate.h>
> #include <asm/virt.h>
> +#include <asm/system_misc.h>
>
> #include "trace.h"
>
> @@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
>
> /* Check the stage-2 fault is trans. fault or write fault */
> fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
> - if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
> - fault_status != FSC_ACCESS) {
> +
> + /* The host kernel will handle the synchronous external abort. There
> + * is no need to pass the error into the guest.
> + */

Can we inject an sea into the guest, so that the guest can kill the
application which causes the error if the guest won't be terminated
later. I'm not sure whether ghes_handle_memory_failure() called in
ghes_do_proc() will kill the qemu process. I think it only kill user
processes marked with PF_MCE_PROCESS & PF_MCE_EARLY.

> + if (fault_status == FSC_EXTABT) {
> + if(handle_guest_sea((unsigned long)fault_ipa,
> + kvm_vcpu_get_hsr(vcpu))) {
> + kvm_err("Failed to handle guest SEA, FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
> + kvm_vcpu_trap_get_class(vcpu),
> + (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
> + (unsigned long)kvm_vcpu_get_hsr(vcpu));
> + return -EFAULT;
> + }
> + } else if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
> + fault_status != FSC_ACCESS) {
> kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
> kvm_vcpu_trap_get_class(vcpu),
> (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
> diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
> index 2a2752b..2b11d59 100644
> --- a/arch/arm64/include/asm/kvm_arm.h
> +++ b/arch/arm64/include/asm/kvm_arm.h
> @@ -201,6 +201,7 @@
> #define FSC_FAULT ESR_ELx_FSC_FAULT
> #define FSC_ACCESS ESR_ELx_FSC_ACCESS
> #define FSC_PERM ESR_ELx_FSC_PERM
> +#define FSC_EXTABT ESR_ELx_FSC_EXTABT
>
> /* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
> #define HPFAR_MASK (~UL(0xf))
> diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h
> index bc81243..5b2cecd1 100644
> --- a/arch/arm64/include/asm/system_misc.h
> +++ b/arch/arm64/include/asm/system_misc.h
> @@ -58,4 +58,6 @@ void hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned int,
>
> #endif /* __ASSEMBLY__ */
>
> +int handle_guest_sea(unsigned long addr, unsigned int esr);
> +
> #endif /* __ASM_SYSTEM_MISC_H */
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index b2d57fc..403277b 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -602,6 +602,24 @@ static const char *fault_name(unsigned int esr)
> }
>
> /*
> + * Handle Synchronous External Aborts that occur in a guest kernel.
> + */
> +int handle_guest_sea(unsigned long addr, unsigned int esr)
> +{
> + /*
> + * synchronize_rcu() will wait for nmi_exit(), so no need to
> + * rcu_read_lock().
> + */
> + if(IS_ENABLED(HAVE_ACPI_APEI_SEA)) {
> + nmi_enter();
> + ghes_notify_sea();
> + nmi_exit();
> + }
> +
> + return 0;
> +}
> +
> +/*
> * Dispatch a data abort to the relevant handler.
> */
> asmlinkage void __exception do_mem_abort(unsigned long addr, unsigned int esr,
>

Thanks,
Wang Xiongfeng
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
@@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu
*vcpu, struct kvm_run *run)

/* Check the stage-2 fault is trans. fault or write fault */
fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
- if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
- fault_status != FSC_ACCESS) {
+
+ /* The host kernel will handle the synchronous external abort. There
+ * is no need to pass the error into the guest.
+ */
+ if (fault_status == FSC_EXTABT) {
+ if(handle_guest_sea((unsigned long)fault_ipa,
+ kvm_vcpu_get_hsr(vcpu))) {
+ kvm_err("Failed to handle guest SEA, FSC:
EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
+ kvm_vcpu_trap_get_class(vcpu),
+ (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
+ (unsigned long)kvm_vcpu_get_hsr(vcpu));
+ return -EFAULT;
+ }
+ } else if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
+ fault_status != FSC_ACCESS) {
kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
kvm_vcpu_trap_get_class(vcpu),
(unsigned long)kvm_vcpu_trap_get_fault(vcpu),



if the error is SEA and we want to inject the sea to guest OK, after
finish the handle, whether we need to directly return? instead of
continuation? as shown below:

if (fault_status == FSC_EXTABT) {
if(handle_guest_sea((unsigned long)fault_ipa,
kvm_vcpu_get_hsr(vcpu))) {
kvm_err("Failed to handle guest SEA, FSC:
EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
kvm_vcpu_trap_get_class(vcpu),
(unsigned long)kvm_vcpu_trap_get_fault(vcpu),
(unsigned long)kvm_vcpu_get_hsr(vcpu));
return -EFAULT;
} else
return 1;






2017-02-24 18:42 GMT+08:00 James Morse <james.morse@arm.com>:
> Hi Tyler,
>
> On 21/02/17 21:22, Tyler Baicar wrote:
>> Currently external aborts are unsupported by the guest abort
>> handling. Add handling for SEAs so that the host kernel reports
>> SEAs which occur in the guest kernel.
>
>> diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
>> index e22089f..33a77509 100644
>> --- a/arch/arm/include/asm/kvm_arm.h
>> +++ b/arch/arm/include/asm/kvm_arm.h
>> @@ -187,6 +187,7 @@
>> #define FSC_FAULT (0x04)
>> #define FSC_ACCESS (0x08)
>> #define FSC_PERM (0x0c)
>> +#define FSC_EXTABT (0x10)
>
> arm64 has ESR_ELx_FSC_EXTABT which is used in inject_abt64(), but for matching
> an external abort coming from hardware the range is wider.
>
> Looking at the ARM-ARMs 'ISS encoding for an exception from an Instruction
> Abort' in 'D7.2.27 ESR_ELx, Exception Syndrome Register (ELx)' (page D7-1954 of
> version 'k'...iss10775), the ten flavours of you Synchronous abort you hooked
> with do_sea() in patch 4 occupy 0x10 to 0x1f...
>
>
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index a5265ed..04f1dd50 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -29,6 +29,7 @@
>> #include <asm/kvm_asm.h>
>> #include <asm/kvm_emulate.h>
>> #include <asm/virt.h>
>> +#include <asm/system_misc.h>
>>
>> #include "trace.h"
>>
>> @@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>
>> /* Check the stage-2 fault is trans. fault or write fault */
>> fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
>
> ... kvm_vcpu_trap_get_fault_type() on both arm and arm64 masks the HSR/ESR_EL2
> with 0x3c ...
>
>
>> - if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
>> - fault_status != FSC_ACCESS) {
>> +
>> + /* The host kernel will handle the synchronous external abort. There
>> + * is no need to pass the error into the guest.
>> + */
>> + if (fault_status == FSC_EXTABT) {
>
> ... but here we only check for 'Synchronous external abort, not on a translation
> table walk'. Are the other types relevant?
>
> If so we need some helper as this range is sparse and 'all other values are
> reserved'. The aarch32 HSR format is slightly different. (G6-4411 ISS encoding
> from an exception from a Data Abort).
>
> If not, can we change patch 4 to check this type too so we don't call out to
> APEI for a fault type we know isn't relevant.
>
>
>> + if(handle_guest_sea((unsigned long)fault_ipa,
>> + kvm_vcpu_get_hsr(vcpu))) {
>> + kvm_err("Failed to handle guest SEA, FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
>> + kvm_vcpu_trap_get_class(vcpu),
>> + (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
>> + (unsigned long)kvm_vcpu_get_hsr(vcpu));
>> + return -EFAULT;
>> + }
>> + } else if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
>> + fault_status != FSC_ACCESS) {
>> kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
>> kvm_vcpu_trap_get_class(vcpu),
>> (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
>
>> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
>> index b2d57fc..403277b 100644
>> --- a/arch/arm64/mm/fault.c
>> +++ b/arch/arm64/mm/fault.c
>> @@ -602,6 +602,24 @@ static const char *fault_name(unsigned int esr)
>> }
>>
>> /*
>> + * Handle Synchronous External Aborts that occur in a guest kernel.
>> + */
>> +int handle_guest_sea(unsigned long addr, unsigned int esr)
>> +{
>
>> + if(IS_ENABLED(HAVE_ACPI_APEI_SEA)) {
>> + nmi_enter();
>> + ghes_notify_sea();
>> + nmi_exit();
>
> This nmi stuff was needed for synchronous aborts that may have interrupted
> APEI's interrupts-masked code. We want to avoid trying to take the same set of
> locks, hence taking the in_nmi() path through APEI. Here we know we interrupted
> a guest, so there is no risk that we have interrupted APEI on the host.
> ghes_notify_sea() can safely take the normal path.
>
>
> Thanks,
>
> James
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
Hi Wang Xiongfeng,

On 25/02/17 07:15, Xiongfeng Wang wrote:
> On 2017/2/22 5:22, Tyler Baicar wrote:
>> Currently external aborts are unsupported by the guest abort
>> handling. Add handling for SEAs so that the host kernel reports
>> SEAs which occur in the guest kernel.

>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index a5265ed..04f1dd50 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>
>> /* Check the stage-2 fault is trans. fault or write fault */
>> fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
>> - if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
>> - fault_status != FSC_ACCESS) {
>> +
>> + /* The host kernel will handle the synchronous external abort. There
>> + * is no need to pass the error into the guest.
>> + */

> Can we inject an sea into the guest, so that the guest can kill the
> application which causes the error if the guest won't be terminated
> later. I'm not sure whether ghes_handle_memory_failure() called in
> ghes_do_proc() will kill the qemu process. I think it only kill user
> processes marked with PF_MCE_PROCESS & PF_MCE_EARLY.

My understanding is the pages will get unmapped and recovered where possible
(e.g. re-read from disk), the user space process will get SIGBUS/SIGSEV when it
next tries to access that page, which could be some time later.
These flags in find_early_kill_thread() are a way to make the memory-failure
code signal the process early, before it does any recovery. The 'MCE' makes me
think its x86 specific.
(early and late are described more in [0])


Guests are a special case as QEMU may never access the faulty memory itself, so
it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support
for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. I
have patches to add support for this which I intend to send at rc1.

[0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given
x86s KVM_PFN_ERR_HWPOISON, this may be out of date.


Either way, once QEMU gets a signal indicating the virtual address, it can
generate its own APEI CPER records and use the KVM APIs to mock up an
Synchronous External Abort, (or inject an IRQ or run the vcpu waiting for the
guest's polling thread to come round, whichever was described to the guest via
the HEST/GHES tables).

We can't hand the APEI CPER records we have in the kernel to the guest, as they
hold a host physical address, and maybe a host virtual address. We don't know
where in guest memory we could write new APEI CPER records as these locations
have to be reserved in the guests-UEFI memory map, and only QEMU knows where
they are.

To deliver RAS events to a guest we have to get QEMU involved.


Thanks,

James


[0] https://www.kernel.org/doc/Documentation/vm/hwpoison.txt
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
Hi James,

On 2017/2/27 21:58, James Morse wrote:
> Hi Wang Xiongfeng,
>
> On 25/02/17 07:15, Xiongfeng Wang wrote:
>> On 2017/2/22 5:22, Tyler Baicar wrote:
>>> Currently external aborts are unsupported by the guest abort
>>> handling. Add handling for SEAs so that the host kernel reports
>>> SEAs which occur in the guest kernel.
>
>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>> index a5265ed..04f1dd50 100644
>>> --- a/arch/arm/kvm/mmu.c
>>> +++ b/arch/arm/kvm/mmu.c
>>> @@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>>
>>> /* Check the stage-2 fault is trans. fault or write fault */
>>> fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
>>> - if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
>>> - fault_status != FSC_ACCESS) {
>>> +
>>> + /* The host kernel will handle the synchronous external abort. There
>>> + * is no need to pass the error into the guest.
>>> + */
>
>> Can we inject an sea into the guest, so that the guest can kill the
>> application which causes the error if the guest won't be terminated
>> later. I'm not sure whether ghes_handle_memory_failure() called in
>> ghes_do_proc() will kill the qemu process. I think it only kill user
>> processes marked with PF_MCE_PROCESS & PF_MCE_EARLY.
>
> My understanding is the pages will get unmapped and recovered where possible
> (e.g. re-read from disk), the user space process will get SIGBUS/SIGSEV when it
> next tries to access that page, which could be some time later.
> These flags in find_early_kill_thread() are a way to make the memory-failure
> code signal the process early, before it does any recovery. The 'MCE' makes me
> think its x86 specific.
> (early and late are described more in [0])
>
>
> Guests are a special case as QEMU may never access the faulty memory itself, so
> it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support
> for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. I
> have patches to add support for this which I intend to send at rc1.
>
> [0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given
> x86s KVM_PFN_ERR_HWPOISON, this may be out of date.
>
>
> Either way, once QEMU gets a signal indicating the virtual address, it can
> generate its own APEI CPER records and use the KVM APIs to mock up an
> Synchronous External Abort, (or inject an IRQ or run the vcpu waiting for the
> guest's polling thread to come round, whichever was described to the guest via
> the HEST/GHES tables).
>
> We can't hand the APEI CPER records we have in the kernel to the guest, as they
> hold a host physical address, and maybe a host virtual address. We don't know
> where in guest memory we could write new APEI CPER records as these locations
> have to be reserved in the guests-UEFI memory map, and only QEMU knows where
> they are.
>
> To deliver RAS events to a guest we have to get QEMU involved.

Thanks for your reply!

I have another idea about the handling procedure of SEA. Can we divide
the SEA handing procedure into two procedures? The first procedure does
the more urgent work, including sending SIGBUS to user process or panic,
just as PATCH 04/10 does. The second procedure does the APEI analysis
work, including calling memory_failure. The second procedure is executed
when actual errors detected in memory, such as a 2-bit ECC error is
detected on memory read or write, in which case, a fault handling
interrupt is generated by the memory controller, as RAS Extension
specification says.

We can route this fault handling interrupt into EL3. After BIOS has
filled the HEST table, it can notify OS with an IRQ. And the second
procedure is executed in the IRQ handler. The notification type of
HEST/GHES tables is GSIV.

When uncorrectable data error is detected on write data, a fault
handling interrupt is generated, and no SEA is generated, as RAS
extension specification 6.4.4 says. In this situation, the second
procedure should be executed since error occurs in memory.

In ARM/arm64 KVM situation, when an SEA takes place, an SEA is injected
into guest os directly in kvm_handle_guest_abort(). And the guest os can
execute the first procedure.

When the host OS executes the second procedure and analyses the HEST
table, it sends SIGBUS to qemu process in memory_failure(). And the qemu
process can mock up a HEST table with IPA of the error data. Then the
qemu process can notify the guest OS with an IRQ, and the second
procedure is executed in guest OS. Is this idea reasonable?


Thanks!
Wang Xiongfeng
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
Hi,

On 28/02/17 06:25, Xiongfeng Wang wrote:
> On 2017/2/27 21:58, James Morse wrote:
>> On 25/02/17 07:15, Xiongfeng Wang wrote:
>>> Can we inject an sea into the guest, so that the guest can kill the
>>> application which causes the error if the guest won't be terminated
>>> later. I'm not sure whether ghes_handle_memory_failure() called in
>>> ghes_do_proc() will kill the qemu process. I think it only kill user
>>> processes marked with PF_MCE_PROCESS & PF_MCE_EARLY.
>>
>> My understanding is the pages will get unmapped and recovered where possible
>> (e.g. re-read from disk), the user space process will get SIGBUS/SIGSEV when it
>> next tries to access that page, which could be some time later.
>> These flags in find_early_kill_thread() are a way to make the memory-failure
>> code signal the process early, before it does any recovery. The 'MCE' makes me
>> think its x86 specific.
>> (early and late are described more in [0])
>>
>>
>> Guests are a special case as QEMU may never access the faulty memory itself, so
>> it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support
>> for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. I
>> have patches to add support for this which I intend to send at rc1.
>>
>> [0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given
>> x86s KVM_PFN_ERR_HWPOISON, this may be out of date.
>>
>>
>> Either way, once QEMU gets a signal indicating the virtual address, it can
>> generate its own APEI CPER records and use the KVM APIs to mock up an
>> Synchronous External Abort, (or inject an IRQ or run the vcpu waiting for the
>> guest's polling thread to come round, whichever was described to the guest via
>> the HEST/GHES tables).
>>
>> We can't hand the APEI CPER records we have in the kernel to the guest, as they
>> hold a host physical address, and maybe a host virtual address. We don't know
>> where in guest memory we could write new APEI CPER records as these locations
>> have to be reserved in the guests-UEFI memory map, and only QEMU knows where
>> they are.
>>
>> To deliver RAS events to a guest we have to get QEMU involved.

> I have another idea about the handling procedure of SEA. Can we divide
> the SEA handing procedure into two procedures? The first procedure does
> the more urgent work, including sending SIGBUS to user process or panic,
> just as PATCH 04/10 does.

How do we know which user processes to signal? (There may be more than one - we
need a memory address to find them).
How do we know if this error is serious and we should panic?
This information is in the APEI CPER records.


> The second procedure does the APEI analysis
> work, including calling memory_failure. The second procedure is executed
> when actual errors detected in memory, such as a 2-bit ECC error is
> detected on memory read or write, in which case, a fault handling
> interrupt is generated by the memory controller, as RAS Extension
> specification says.

You are splitting the APEI notification and the processing of records. One has
to happen immediately after the other because we want to contain the error.


> We can route this fault handling interrupt into EL3. After BIOS has
> filled the HEST table, it can notify OS with an IRQ. And the second
> procedure is executed in the IRQ handler. The notification type of
> HEST/GHES tables is GSIV.
>
> When uncorrectable data error is detected on write data, a fault
> handling interrupt is generated, and no SEA is generated,

This sounds more like ACPI's firmware first error handling. Yes errors should be
routed to EL3 where firmware can do some platform-specific work, then describe
them to the host OS via CPER records.
By doing this, you could prevent a hardware-generated External Abort reaching
the host OS, but you still need to notify the OS via one of the mechanisms in
'18.3.2.9 Hardware Error Notification'.

If the error is synchronous (we read a bad page of memory instead of it being
detected on background DRAM scrub) we need to notify the OS synchronously. Using
SEA would be a firmware-generated External Abort delivered to EL2/EL1.

However the notification is done it needs to match one of the GHES records in
the HEST, so firmware has to decide which notification methods it will use
before we boot the OS.


> In ARM/arm64 KVM situation, when an SEA takes place, an SEA is injected
> into guest os directly in kvm_handle_guest_abort(). And the guest os can
> execute the first procedure.

What can the guest do with this? Without the APEI CPER records it doesn't know
what happened. Was it unrecoverable memory corruption? In which case killing the
running task is a start... Which memory ranges should we mark as unusable? Maybe
it was something more catastrophic for the running CPU, in which case we should
panic().


> When the host OS executes the second procedure and analyses the HEST
> table, it sends SIGBUS to qemu process in memory_failure(). And the qemu
> process can mock up a HEST table with IPA of the error data. Then the
> qemu process can notify the guest OS with an IRQ, and the second
> procedure is executed in guest OS. Is this idea reasonable?

So we tell the guest something happened, and it should wait a while to find out
what... I don't think this will work. It is best to not run the guest until Qemu
has done its work and called VCPU_RUN again. This way we only notify the guest
once the records are available for processing. This is how APEI's firmware-first
works between the host OS and EL3, it should be the same between a guest and
QEMU (which plays the part of firmware for the guest).


Can you share more details of the problem you are trying to solve? I don't think
we can get RAS support in a guest 'for free', somewhere along the line we need
support from Qemu.


Thanks,

James
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
Hello James,


On 2/24/2017 3:42 AM, James Morse wrote:
> On 21/02/17 21:22, Tyler Baicar wrote:
>> Currently external aborts are unsupported by the guest abort
>> handling. Add handling for SEAs so that the host kernel reports
>> SEAs which occur in the guest kernel.
>> diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
>> index e22089f..33a77509 100644
>> --- a/arch/arm/include/asm/kvm_arm.h
>> +++ b/arch/arm/include/asm/kvm_arm.h
>> @@ -187,6 +187,7 @@
>> #define FSC_FAULT (0x04)
>> #define FSC_ACCESS (0x08)
>> #define FSC_PERM (0x0c)
>> +#define FSC_EXTABT (0x10)
> arm64 has ESR_ELx_FSC_EXTABT which is used in inject_abt64(), but for matching
> an external abort coming from hardware the range is wider.
>
> Looking at the ARM-ARMs 'ISS encoding for an exception from an Instruction
> Abort' in 'D7.2.27 ESR_ELx, Exception Syndrome Register (ELx)' (page D7-1954 of
> version 'k'...iss10775), the ten flavours of you Synchronous abort you hooked
> with do_sea() in patch 4 occupy 0x10 to 0x1f...
>
>
>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>> index a5265ed..04f1dd50 100644
>> --- a/arch/arm/kvm/mmu.c
>> +++ b/arch/arm/kvm/mmu.c
>> @@ -29,6 +29,7 @@
>> #include <asm/kvm_asm.h>
>> #include <asm/kvm_emulate.h>
>> #include <asm/virt.h>
>> +#include <asm/system_misc.h>
>>
>> #include "trace.h"
>>
>> @@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>
>> /* Check the stage-2 fault is trans. fault or write fault */
>> fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
> ... kvm_vcpu_trap_get_fault_type() on both arm and arm64 masks the HSR/ESR_EL2
> with 0x3c ...
>
>
>> - if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
>> - fault_status != FSC_ACCESS) {
>> +
>> + /* The host kernel will handle the synchronous external abort. There
>> + * is no need to pass the error into the guest.
>> + */
>> + if (fault_status == FSC_EXTABT) {
> ... but here we only check for 'Synchronous external abort, not on a translation
> table walk'. Are the other types relevant?
I believe the others should be relevant here as well. I haven't been
able to test the other types within a guest though.
> If so we need some helper as this range is sparse and 'all other values are
> reserved'. The aarch32 HSR format is slightly different. (G6-4411 ISS encoding
> from an exception from a Data Abort).
I can add a helper so that this if statement matches any of the 10 FSC
values which are mapped to the do_sea in the host code.
> If not, can we change patch 4 to check this type too so we don't call out to
> APEI for a fault type we know isn't relevant.
>
>
>> + if(handle_guest_sea((unsigned long)fault_ipa,
>> + kvm_vcpu_get_hsr(vcpu))) {
>> + kvm_err("Failed to handle guest SEA, FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
>> + kvm_vcpu_trap_get_class(vcpu),
>> + (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
>> + (unsigned long)kvm_vcpu_get_hsr(vcpu));
>> + return -EFAULT;
>> + }
>> + } else if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
>> + fault_status != FSC_ACCESS) {
>> kvm_err("Unsupported FSC: EC=%#x xFSC=%#lx ESR_EL2=%#lx\n",
>> kvm_vcpu_trap_get_class(vcpu),
>> (unsigned long)kvm_vcpu_trap_get_fault(vcpu),
>> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
>> index b2d57fc..403277b 100644
>> --- a/arch/arm64/mm/fault.c
>> +++ b/arch/arm64/mm/fault.c
>> @@ -602,6 +602,24 @@ static const char *fault_name(unsigned int esr)
>> }
>>
>> /*
>> + * Handle Synchronous External Aborts that occur in a guest kernel.
>> + */
>> +int handle_guest_sea(unsigned long addr, unsigned int esr)
>> +{
>> + if(IS_ENABLED(HAVE_ACPI_APEI_SEA)) {
>> + nmi_enter();
>> + ghes_notify_sea();
>> + nmi_exit();
> This nmi stuff was needed for synchronous aborts that may have interrupted
> APEI's interrupts-masked code. We want to avoid trying to take the same set of
> locks, hence taking the in_nmi() path through APEI. Here we know we interrupted
> a guest, so there is no risk that we have interrupted APEI on the host.
> ghes_notify_sea() can safely take the normal path.
Makes sense, I can remove the nmi_* calls here.

Thanks,
Tyler

--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
Hi James,

On 2017/2/28 21:21, James Morse wrote:
> Hi,
>
> On 28/02/17 06:25, Xiongfeng Wang wrote:
>> On 2017/2/27 21:58, James Morse wrote:
>>> On 25/02/17 07:15, Xiongfeng Wang wrote:
>>>> Can we inject an sea into the guest, so that the guest can kill the
>>>> application which causes the error if the guest won't be terminated
>>>> later. I'm not sure whether ghes_handle_memory_failure() called in
>>>> ghes_do_proc() will kill the qemu process. I think it only kill user
>>>> processes marked with PF_MCE_PROCESS & PF_MCE_EARLY.
>>>
>>> My understanding is the pages will get unmapped and recovered where possible
>>> (e.g. re-read from disk), the user space process will get SIGBUS/SIGSEV when it
>>> next tries to access that page, which could be some time later.
>>> These flags in find_early_kill_thread() are a way to make the memory-failure
>>> code signal the process early, before it does any recovery. The 'MCE' makes me
>>> think its x86 specific.
>>> (early and late are described more in [0])
>>>
>>>
>>> Guests are a special case as QEMU may never access the faulty memory itself, so
>>> it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support
>>> for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. I
>>> have patches to add support for this which I intend to send at rc1.
>>>
>>> [0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given
>>> x86s KVM_PFN_ERR_HWPOISON, this may be out of date.
>>>
>>>
>>> Either way, once QEMU gets a signal indicating the virtual address, it can
>>> generate its own APEI CPER records and use the KVM APIs to mock up an
>>> Synchronous External Abort, (or inject an IRQ or run the vcpu waiting for the
>>> guest's polling thread to come round, whichever was described to the guest via
>>> the HEST/GHES tables).
>>>
>>> We can't hand the APEI CPER records we have in the kernel to the guest, as they
>>> hold a host physical address, and maybe a host virtual address. We don't know
>>> where in guest memory we could write new APEI CPER records as these locations
>>> have to be reserved in the guests-UEFI memory map, and only QEMU knows where
>>> they are.
>>>
>>> To deliver RAS events to a guest we have to get QEMU involved.
>
>> I have another idea about the handling procedure of SEA. Can we divide
>> the SEA handing procedure into two procedures? The first procedure does
>> the more urgent work, including sending SIGBUS to user process or panic,
>> just as PATCH 04/10 does.
>
> How do we know which user processes to signal? (There may be more than one - we
> need a memory address to find them).
> How do we know if this error is serious and we should panic?
> This information is in the APEI CPER records.
>
Since the SEA exception is synchronous, the current user process is the
one to be signaled if the exception is taken from EL0. Certainly, the
error memory may be mapped to several processes. When another process
read that area again, another SEA will be generated, and that process
will be signaled. Also we can get the virtual address of the error data
from FAR_EL1. When the user process is signaled, virtual address is
attached, and the process can register its own signal handler if it can
handle the error according to the virtual address of the error data.

We can determine where the exception is taken from according to CPSR
stored in the stack. And if the exception is taken from EL1, the error
is in the kernel space now, and we are going to consume it, so we need
to panic now.
>
>> The second procedure does the APEI analysis
>> work, including calling memory_failure. The second procedure is executed
>> when actual errors detected in memory, such as a 2-bit ECC error is
>> detected on memory read or write, in which case, a fault handling
>> interrupt is generated by the memory controller, as RAS Extension
>> specification says.
>
> You are splitting the APEI notification and the processing of records. One has
> to happen immediately after the other because we want to contain the error.
>
My understanding is that processing of records is not so urgent since
the process access the error data has been killed (The first procedure
is executed in SEA exception handler). Other codes won't access the
error data, so the error won't be consumed and propagated.
>
>> We can route this fault handling interrupt into EL3. After BIOS has
>> filled the HEST table, it can notify OS with an IRQ. And the second
>> procedure is executed in the IRQ handler. The notification type of
>> HEST/GHES tables is GSIV.
>>
>> When uncorrectable data error is detected on write data, a fault
>> handling interrupt is generated, and no SEA is generated,
>
> This sounds more like ACPI's firmware first error handling. Yes errors should be
> routed to EL3 where firmware can do some platform-specific work, then describe
> them to the host OS via CPER records.

Yes , I'm saying the ACPI's firmware first error handling.

> By doing this, you could prevent a hardware-generated External Abort reaching
> the host OS, but you still need to notify the OS via one of the mechanisms in
> '18.3.2.9 Hardware Error Notification'.

Yes, the BIOS will notify OS with GSIV notify type, which will rely on
Shiju's patch 'acpi: apei: handle GSIV notification type'

>
> If the error is synchronous (we read a bad page of memory instead of it being
> detected on background DRAM scrub) we need to notify the OS synchronously. Using
> SEA would be a firmware-generated External Abort delivered to EL2/EL1.

Yes, the first procedure is executed in SEA exception handler and is
synchronous. The second procedure won't access the error data and is not
so urgent, so it may not need to be synchronous.
>
> However the notification is done it needs to match one of the GHES records in
> the HEST, so firmware has to decide which notification methods it will use
> before we boot the OS.
>
>
>> In ARM/arm64 KVM situation, when an SEA takes place, an SEA is injected
>> into guest os directly in kvm_handle_guest_abort(). And the guest os can
>> execute the first procedure.
>
> What can the guest do with this? Without the APEI CPER records it doesn't know
> what happened. Was it unrecoverable memory corruption? In which case killing the
> running task is a start... Which memory ranges should we mark as unusable? Maybe
> it was something more catastrophic for the running CPU, in which case we should
> panic().
>
If an SEA is injected into guest OS, the guest OS will jump to the SEA
exception entry when the context switched to guest OS. And the CPSR and
FAR_EL1 are recovered according to the content of vcpu. Then the guest
OS can signal a process or panic. If another guest process read the
error data, another SEA will be generated and it will be single too.

Without QEMU involved, the drawback is that no APEI table can be mocked
up in guest OS, and no memory_failure() will be called. So the memory of
error data will be released into buddy system and assigned to another
process. If the error was caused by instantaneous radiation or
electromagnetic, the memory is usable again if it is written with a
correct data. If the memory has wore out and a correct data is written,
the ECC error may occurs again with high possibility. Before a 2-bit ECC
error is reported, much more 1-bit errors will be reported. This is
report to host os, the host os can determine the memory node has worn
out and hot-plug out the memory node, and guest os may be terminated if
its memory data can't be migrated.

Of course, it is better to get QEMU involved, so the memory_failure can
be executed in guest OS. But before that implemented, can we add SEA
injection in kvm_handle_guest_abort()?
>
>> When the host OS executes the second procedure and analyses the HEST
>> table, it sends SIGBUS to qemu process in memory_failure(). And the qemu
>> process can mock up a HEST table with IPA of the error data. Then the
>> qemu process can notify the guest OS with an IRQ, and the second
>> procedure is executed in guest OS. Is this idea reasonable?
>
> So we tell the guest something happened, and it should wait a while to find out
> what... I don't think this will work. It is best to not run the guest until Qemu
> has done its work and called VCPU_RUN again. This way we only notify the guest
> once the records are available for processing. This is how APEI's firmware-first
> works between the host OS and EL3, it should be the same between a guest and
> QEMU (which plays the part of firmware for the guest).
>
>
> Can you share more details of the problem you are trying to solve? I don't think
> we can get RAS support in a guest 'for free', somewhere along the line we need
> support from Qemu.
>


Thanks,

Wang Xiongfeng

.
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
On 01/03/17 02:31, Xiongfeng Wang wrote:

[lot of things]

> If an SEA is injected into guest OS, the guest OS will jump to the SEA
> exception entry when the context switched to guest OS. And the CPSR and
> FAR_EL1 are recovered according to the content of vcpu. Then the guest
> OS can signal a process or panic. If another guest process read the
> error data, another SEA will be generated and it will be single too.
>
> Without QEMU involved, the drawback is that no APEI table can be mocked
> up in guest OS, and no memory_failure() will be called. So the memory of
> error data will be released into buddy system and assigned to another
> process. If the error was caused by instantaneous radiation or
> electromagnetic, the memory is usable again if it is written with a
> correct data. If the memory has wore out and a correct data is written,
> the ECC error may occurs again with high possibility. Before a 2-bit ECC
> error is reported, much more 1-bit errors will be reported. This is
> report to host os, the host os can determine the memory node has worn
> out and hot-plug out the memory node, and guest os may be terminated if
> its memory data can't be migrated.
>
> Of course, it is better to get QEMU involved, so the memory_failure can
> be executed in guest OS. But before that implemented, can we add SEA
> injection in kvm_handle_guest_abort()?

No. I will strongly object to that. This is a platform decision to
forward SEAs, not an architectural one. The core KVM code is only
concerned about implementing the ARM architecture, and not something
that is firmware dependent.

Thanks,

M.
--
Jazz is not dead. It just smells funny...
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
Hi James,


> Hi Wang Xiongfeng,
>
> On 25/02/17 07:15, Xiongfeng Wang wrote:
>> On 2017/2/22 5:22, Tyler Baicar wrote:
>>> Currently external aborts are unsupported by the guest abort
>>> handling. Add handling for SEAs so that the host kernel reports
>>> SEAs which occur in the guest kernel.
>
>>> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
>>> index a5265ed..04f1dd50 100644
>>> --- a/arch/arm/kvm/mmu.c
>>> +++ b/arch/arm/kvm/mmu.c
>>> @@ -1444,8 +1445,21 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
>>>
>>> /* Check the stage-2 fault is trans. fault or write fault */
>>> fault_status = kvm_vcpu_trap_get_fault_type(vcpu);
>>> - if (fault_status != FSC_FAULT && fault_status != FSC_PERM &&
>>> - fault_status != FSC_ACCESS) {
>>> +
>>> + /* The host kernel will handle the synchronous external abort. There
>>> + * is no need to pass the error into the guest.
>>> + */
>
>> Can we inject an sea into the guest, so that the guest can kill the
>> application which causes the error if the guest won't be terminated
>> later. I'm not sure whether ghes_handle_memory_failure() called in
>> ghes_do_proc() will kill the qemu process. I think it only kill user
>> processes marked with PF_MCE_PROCESS & PF_MCE_EARLY.
>
> My understanding is the pages will get unmapped and recovered where possible
> (e.g. re-read from disk), the user space process will get SIGBUS/SIGSEV when it
> next tries to access that page, which could be some time later.
> These flags in find_early_kill_thread() are a way to make the memory-failure
> code signal the process early, before it does any recovery. The 'MCE' makes me
> think its x86 specific.
> (early and late are described more in [0])
>
>
> Guests are a special case as QEMU may never access the faulty memory itself, so
> it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support
> for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. I
> have patches to add support for this which I intend to send at rc1.

could you push this patch to opensource?


>
> [0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given
> x86s KVM_PFN_ERR_HWPOISON, this may be out of date.
>
>
> Either way, once QEMU gets a signal indicating the virtual address, it can
> generate its own APEI CPER records and use the KVM APIs to mock up an
> Synchronous External Abort, (or inject an IRQ or run the vcpu waiting for the
> guest's polling thread to come round, whichever was described to the guest via
> the HEST/GHES tables).
>
> We can't hand the APEI CPER records we have in the kernel to the guest, as they
> hold a host physical address, and maybe a host virtual address. We don't know
> where in guest memory we could write new APEI CPER records as these locations
> have to be reserved in the guests-UEFI memory map, and only QEMU knows where
> they are.
>
> To deliver RAS events to a guest we have to get QEMU involved.
>
>
> Thanks,
>
> James
>
>
> [0] https://www.kernel.org/doc/Documentation/vm/hwpoison.txt
>
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
Hi Marc,

On 2017/3/2 17:39, Marc Zyngier wrote:
> On 01/03/17 02:31, Xiongfeng Wang wrote:
>
> [lot of things]
>
>> If an SEA is injected into guest OS, the guest OS will jump to the SEA
>> exception entry when the context switched to guest OS. And the CPSR and
>> FAR_EL1 are recovered according to the content of vcpu. Then the guest
>> OS can signal a process or panic. If another guest process read the
>> error data, another SEA will be generated and it will be single too.
>>
>> Without QEMU involved, the drawback is that no APEI table can be mocked
>> up in guest OS, and no memory_failure() will be called. So the memory of
>> error data will be released into buddy system and assigned to another
>> process. If the error was caused by instantaneous radiation or
>> electromagnetic, the memory is usable again if it is written with a
>> correct data. If the memory has wore out and a correct data is written,
>> the ECC error may occurs again with high possibility. Before a 2-bit ECC
>> error is reported, much more 1-bit errors will be reported. This is
>> report to host os, the host os can determine the memory node has worn
>> out and hot-plug out the memory node, and guest os may be terminated if
>> its memory data can't be migrated.
>>
>> Of course, it is better to get QEMU involved, so the memory_failure can
>> be executed in guest OS. But before that implemented, can we add SEA
>> injection in kvm_handle_guest_abort()?
>
> No. I will strongly object to that. This is a platform decision to
> forward SEAs, not an architectural one. The core KVM code is only
> concerned about implementing the ARM architecture, and not something
> that is firmware dependent.
>
Thanks for the reply!
I'm not sure if there exists some misunderstanding here. I was saying
that APEI stuff is not included in the core KVM code, but SEA injection
can be included, just like the SEI injection in the core KVM code. Yes,
APEI is firmware dependent, but I think SEA is not. And the processing
for SEA doesn't depend on whether APEI is implemented.


Thanks,

Wang Xiongfeng
.
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
Hi Tyler,

On 28/02/17 19:43, Baicar, Tyler wrote:
> On 2/24/2017 3:42 AM, James Morse wrote:
>> On 21/02/17 21:22, Tyler Baicar wrote:
>>> Currently external aborts are unsupported by the guest abort
>>> handling. Add handling for SEAs so that the host kernel reports
>>> SEAs which occur in the guest kernel.

>>> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
>>> index b2d57fc..403277b 100644
>>> --- a/arch/arm64/mm/fault.c
>>> +++ b/arch/arm64/mm/fault.c
>>> @@ -602,6 +602,24 @@ static const char *fault_name(unsigned int esr)
>>> }
>>>
>>> /*
>>> + * Handle Synchronous External Aborts that occur in a guest kernel.
>>> + */
>>> +int handle_guest_sea(unsigned long addr, unsigned int esr)
>>> +{
>>> + if(IS_ENABLED(HAVE_ACPI_APEI_SEA)) {
>>> + nmi_enter();
>>> + ghes_notify_sea();
>>> + nmi_exit();

>> This nmi stuff was needed for synchronous aborts that may have interrupted
>> APEI's interrupts-masked code. We want to avoid trying to take the same set of
>> locks, hence taking the in_nmi() path through APEI. Here we know we interrupted
>> a guest, so there is no risk that we have interrupted APEI on the host.
>> ghes_notify_sea() can safely take the normal path.

> Makes sense, I can remove the nmi_* calls here.

Just occurs to me: if we do this we need to add the rcu_read_lock() in
ghes_notify_sea() as its not protected by the rcu/nmi weirdness.


Thanks,

James
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
Hello James,


On 3/6/2017 3:28 AM, James Morse wrote:
> On 28/02/17 19:43, Baicar, Tyler wrote:
>> On 2/24/2017 3:42 AM, James Morse wrote:
>>> On 21/02/17 21:22, Tyler Baicar wrote:
>>>> Currently external aborts are unsupported by the guest abort
>>>> handling. Add handling for SEAs so that the host kernel reports
>>>> SEAs which occur in the guest kernel.
>>>> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
>>>> index b2d57fc..403277b 100644
>>>> --- a/arch/arm64/mm/fault.c
>>>> +++ b/arch/arm64/mm/fault.c
>>>> @@ -602,6 +602,24 @@ static const char *fault_name(unsigned int esr)
>>>> }
>>>>
>>>> /*
>>>> + * Handle Synchronous External Aborts that occur in a guest kernel.
>>>> + */
>>>> +int handle_guest_sea(unsigned long addr, unsigned int esr)
>>>> +{
>>>> + if(IS_ENABLED(HAVE_ACPI_APEI_SEA)) {
>>>> + nmi_enter();
>>>> + ghes_notify_sea();
>>>> + nmi_exit();
>>> This nmi stuff was needed for synchronous aborts that may have interrupted
>>> APEI's interrupts-masked code. We want to avoid trying to take the same set of
>>> locks, hence taking the in_nmi() path through APEI. Here we know we interrupted
>>> a guest, so there is no risk that we have interrupted APEI on the host.
>>> ghes_notify_sea() can safely take the normal path.
>> Makes sense, I can remove the nmi_* calls here.
> Just occurs to me: if we do this we need to add the rcu_read_lock() in
> ghes_notify_sea() as its not protected by the rcu/nmi weirdness.
>
True, would you suggest leaving these nmi_* calls or adding the rcu_*
calls? And since that's only needed for this KVM case, shouldn't the
rcu_* calls just replace the nmi_* calls here (outside of ghes_notify_sea)?

Thanks,
Tyler

--
Qualcomm Datacenter Technologies, Inc. as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum,
a Linux Foundation Collaborative Project.
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
Hi James,


> Guests are a special case as QEMU may never access the faulty memory itself, so
> it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support
> for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. I
> have patches to add support for this which I intend to send at rc1.
>
> [0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given
> x86s KVM_PFN_ERR_HWPOISON, this may be out of date.
>
>
> Either way, once QEMU gets a signal indicating the virtual address, it can
> generate its own APEI CPER records and use the KVM APIs to mock up an
> Synchronous External Abort, (or inject an IRQ or run the vcpu waiting for the
> guest's polling thread to come round, whichever was described to the guest via
> the HEST/GHES tables).
>

I have another confusion about the SIGBUS signal. Can QEMU always get a SIGBUS when needed.
I know one circumstance which will send SIGBUS. The ghes_handle_memory_failure() in
ghes_do_proc() will send SIGBUS to QEMU, but this only happens when there exists memory section
in ghes, that is the section type is CPER_SEC_PLATFORM_MEM.
Suppose this case, an load error in guest application causes an SEA, and the firmware take it.
The firmware begin to scan the error record and fill the ghes. But the error record in memory node
has been read by other handler. The firmware won't add memory section in ghes, so
ghes_handle_memory_failure() won't be called.
I mean that we may not rely on ghes_handle_memory_failure() to send SIGBUS to QEMU. Whether we should
add some other code to send SIGBUS in handle_guest_abort(). I don't know whether the ARM/arm64
KVM_PFN_ERR_HWPOISON you mentioned above will cover all the cases.

Thanks,

Wang Xiongfeng
.
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
Hi Wang Xiongfeng,

On 22/03/17 02:46, Xiongfeng Wang wrote:
>> Guests are a special case as QEMU may never access the faulty memory itself, so
>> it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support
>> for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. I
>> have patches to add support for this which I intend to send at rc1.
>>
>> [0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given
>> x86s KVM_PFN_ERR_HWPOISON, this may be out of date.
>>
>>
>> Either way, once QEMU gets a signal indicating the virtual address, it can
>> generate its own APEI CPER records and use the KVM APIs to mock up an
>> Synchronous External Abort, (or inject an IRQ or run the vcpu waiting for the
>> guest's polling thread to come round, whichever was described to the guest via
>> the HEST/GHES tables).
>>
>
> I have another confusion about the SIGBUS signal. Can QEMU always get a SIGBUS when needed.
> I know one circumstance which will send SIGBUS. The ghes_handle_memory_failure() in
> ghes_do_proc() will send SIGBUS to QEMU, but this only happens when there exists memory section
> in ghes, that is the section type is CPER_SEC_PLATFORM_MEM.
> Suppose this case, an load error in guest application causes an SEA, and the firmware take it.
> The firmware begin to scan the error record and fill the ghes. But the error record in memory node
> has been read by other handler.

(this looks like a race)

> The firmware won't add memory section in ghes, so ghes_handle_memory_failure() won't be called.

I think this would be a firmware bug. Firmware can reserve as much memory as it
needs for writing CPER records, there should not be a case where 'the memory' is
currently being processed by another handler.

The memory firmware uses to write CPER records too shouldn't be published to the
OS until it has finished. Once firmware has finished writing the CPER records it
can update the memory pointed to by GHES->ErrorStatusAddress with the location
of the CPER records and invoke the Notification method for this GHES. (SEI, SEA,
IRQ etc). We should always get a complete set of CPER records to describe the error.

It firmware uses GHESv2 it can get an 'ack' write from APEI once it has finished
processing the records. Once it gets this firmware knows it can re-use the memory.

(Obviously each GHES entry can only process one error at a time. Firmware should
either handle this, or have one entry for each Error Source that can happen
independently)


> I mean that we may not rely on ghes_handle_memory_failure() to send SIGBUS to QEMU. Whether we should
> add some other code to send SIGBUS in handle_guest_abort(). I don't know whether the ARM/arm64
> KVM_PFN_ERR_HWPOISON you mentioned above will cover all the cases.

The SIGBUS routine is part of the kernel's recovery method for memory errors. It
should cover all the errors reported with this CPER_SEC_PLATFORM_MEM.

Back to the race you describe. It shouldn't matter if one CPU processes an error
for guest memory while a vcpu is running on another. This may happen if the
error was detected by DRAM's background scrub.
If we don't treat KVM/Qemu as anything special the memory_failure()->SIGBUS path
will happen regardless of whether the fault interrupted the guest or not.


There are other types of error such as PCIe, CPU, BUS error etc. If it's
possible to recover from these we may need additional code in the kernel. This
shouldn't necessarily treat KVM as a special case.


Thanks,

James
Re: [PATCH V11 10/10] arm/arm64: KVM: add guest SEA support [ In reply to ]
Hi James,

On 2017/3/22 19:14, James Morse wrote:
> Hi Wang Xiongfeng,
>
> On 22/03/17 02:46, Xiongfeng Wang wrote:
>>> Guests are a special case as QEMU may never access the faulty memory itself, so
>>> it won't receive the 'late' signal. It looks like ARM/arm64 KVM lacks support
>>> for KVM_PFN_ERR_HWPOISON which sends SIGBUS from KVM's fault-handling code. I
>>> have patches to add support for this which I intend to send at rc1.
>>>
>>> [0] suggests 'KVM qemu' sets these MCE flags to take the 'early' path, but given
>>> x86s KVM_PFN_ERR_HWPOISON, this may be out of date.
>>>
>>>
>>> Either way, once QEMU gets a signal indicating the virtual address, it can
>>> generate its own APEI CPER records and use the KVM APIs to mock up an
>>> Synchronous External Abort, (or inject an IRQ or run the vcpu waiting for the
>>> guest's polling thread to come round, whichever was described to the guest via
>>> the HEST/GHES tables).
>>>
>>
>> I have another confusion about the SIGBUS signal. Can QEMU always get a SIGBUS when needed.
>> I know one circumstance which will send SIGBUS. The ghes_handle_memory_failure() in
>> ghes_do_proc() will send SIGBUS to QEMU, but this only happens when there exists memory section
>> in ghes, that is the section type is CPER_SEC_PLATFORM_MEM.
>> Suppose this case, an load error in guest application causes an SEA, and the firmware take it.
>> The firmware begin to scan the error record and fill the ghes. But the error record in memory node
>> has been read by other handler.
>
> (this looks like a race)
>
>> The firmware won't add memory section in ghes, so ghes_handle_memory_failure() won't be called.
>
> I think this would be a firmware bug. Firmware can reserve as much memory as it
> needs for writing CPER records, there should not be a case where 'the memory' is
> currently being processed by another handler.

I have a question here:
Consider this case, the memory controller first detected a memory error,
but it has not been consumed. So it will not generate the SEA. Memory error
may be reported to the OS by IRQ with MEM section in CPER record; and
after for a while, the error data was loaded into the cache and consumed,
when the SEA is generated. Is it possible only processor section, and no
MEM section in CPER record?

Obviously there are two different GHES above, one for SEA and another for IRQ/GSIV.
Could we assume that there is mem section in the SEA ghes table?

>
> The memory firmware uses to write CPER records too shouldn't be published to the
> OS until it has finished. Once firmware has finished writing the CPER records it
> can update the memory pointed to by GHES->ErrorStatusAddress with the location
> of the CPER records and invoke the Notification method for this GHES. (SEI, SEA,
> IRQ etc). We should always get a complete set of CPER records to describe the error.
>

Does it mean that the BIOS has the responsibility to ensure that the GHES table has a
complete error info, including memory, bus, tlb, cache and other related error info?

--
Thanks,
Xie XiuQi

> It firmware uses GHESv2 it can get an 'ack' write from APEI once it has finished
> processing the records. Once it gets this firmware knows it can re-use the memory.
>
> (Obviously each GHES entry can only process one error at a time. Firmware should
> either handle this, or have one entry for each Error Source that can happen
> independently)
>
>
>> I mean that we may not rely on ghes_handle_memory_failure() to send SIGBUS to QEMU. Whether we should
>> add some other code to send SIGBUS in handle_guest_abort(). I don't know whether the ARM/arm64
>> KVM_PFN_ERR_HWPOISON you mentioned above will cover all the cases.
>
> The SIGBUS routine is part of the kernel's recovery method for memory errors. It
> should cover all the errors reported with this CPER_SEC_PLATFORM_MEM.
>
> Back to the race you describe. It shouldn't matter if one CPU processes an error
> for guest memory while a vcpu is running on another. This may happen if the
> error was detected by DRAM's background scrub.
> If we don't treat KVM/Qemu as anything special the memory_failure()->SIGBUS path
> will happen regardless of whether the fault interrupted the guest or not.
>
>
> There are other types of error such as PCIe, CPU, BUS error etc. If it's
> possible to recover from these we may need additional code in the kernel. This
> shouldn't necessarily treat KVM as a special case.
>
>
> Thanks,
>
> James
>
>
> .
>