Mailing List Archive

[PATCH v5 22/34] x86/fred: FRED initialization code
From: "H. Peter Anvin (Intel)" <hpa@zytor.com>

The code to initialize FRED when it's available and _not_ disabled.

cpu_init_fred_exceptions() is the core function to initialize FRED,
which
1. Sets up FRED entrypoints for events happening in ring 0 and 3.
2. Sets up a default stack for event handling.
3. Sets up dedicated event stacks for DB/NMI/MC/DF, equivalent to
the IDT IST stacks.
4. Forces 32-bit system calls to use "int $0x80" only.
5. Enables FRED and invalidtes IDT.

When the FRED is used, cpu_init_exception_handling() initializes FRED
through calling cpu_init_fred_exceptions(), otherwise it sets up TSS
IST and loads IDT.

As FRED uses the ring 3 FRED entrypoint for SYSCALL and SYSENTER,
it skips setting up SYSCALL/SYSENTER related MSRs, e.g., MSR_LSTAR.

Signed-off-by: H. Peter Anvin (Intel) <hpa@zytor.com>
Co-developed-by: Xin Li <xin3.li@intel.com>
Tested-by: Shan Kang <shan.kang@intel.com>
Signed-off-by: Xin Li <xin3.li@intel.com>
---
arch/x86/include/asm/fred.h | 14 +++++++
arch/x86/include/asm/traps.h | 2 +
arch/x86/kernel/Makefile | 1 +
arch/x86/kernel/cpu/common.c | 74 +++++++++++++++++++++++-------------
arch/x86/kernel/fred.c | 73 +++++++++++++++++++++++++++++++++++
arch/x86/kernel/irqinit.c | 7 +++-
arch/x86/kernel/traps.c | 16 +++++++-
7 files changed, 157 insertions(+), 30 deletions(-)
create mode 100644 arch/x86/kernel/fred.c

diff --git a/arch/x86/include/asm/fred.h b/arch/x86/include/asm/fred.h
index 54746e8c0a17..cd974edc8e8a 100644
--- a/arch/x86/include/asm/fred.h
+++ b/arch/x86/include/asm/fred.h
@@ -99,8 +99,22 @@ DECLARE_FRED_HANDLER(fred_exc_debug);
DECLARE_FRED_HANDLER(fred_exc_page_fault);
DECLARE_FRED_HANDLER(fred_exc_machine_check);

+/*
+ * The actual assembly entry and exit points
+ */
+extern __visible void fred_entrypoint_user(void);
+
+/*
+ * Initialization
+ */
+void cpu_init_fred_exceptions(void);
+void fred_setup_apic(void);
+
#endif /* __ASSEMBLY__ */

+#else
+#define cpu_init_fred_exceptions() BUG()
+#define fred_setup_apic() BUG()
#endif /* CONFIG_X86_FRED */

#endif /* ASM_X86_FRED_H */
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index da4c21ed68b4..69fafef1136e 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -56,6 +56,8 @@ void __noreturn handle_stack_overflow(struct pt_regs *regs,
void f (struct pt_regs *regs)
typedef DECLARE_SYSTEM_INTERRUPT_HANDLER((*system_interrupt_handler));

+system_interrupt_handler get_system_interrupt_handler(unsigned int i);
+
int external_interrupt(struct pt_regs *regs, unsigned int vector);

#endif /* _ASM_X86_TRAPS_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index dd61752f4c96..08d9c0a0bfbe 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -47,6 +47,7 @@ obj-y += platform-quirks.o
obj-y += process_$(BITS).o signal.o signal_$(BITS).o
obj-y += traps.o idt.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
obj-y += time.o ioport.o dumpstack.o nmi.o
+obj-$(CONFIG_X86_FRED) += fred.o
obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o
obj-y += setup.o x86_init.o i8259.o irqinit.o
obj-$(CONFIG_JUMP_LABEL) += jump_label.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e8cf6f4cfb52..eea41cb8722e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -58,6 +58,7 @@
#include <asm/microcode_intel.h>
#include <asm/intel-family.h>
#include <asm/cpu_device_id.h>
+#include <asm/fred.h>
#include <asm/uv/uv.h>
#include <asm/sigframe.h>
#include <asm/traps.h>
@@ -2054,28 +2055,6 @@ static void wrmsrl_cstar(unsigned long val)
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
- wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
- wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
-
-#ifdef CONFIG_IA32_EMULATION
- wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
- /*
- * This only works on Intel CPUs.
- * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
- * This does not cause SYSENTER to jump to the wrong location, because
- * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
- */
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
- (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
-#else
- wrmsrl_cstar((unsigned long)ignore_sysret);
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
-#endif
-
/*
* Flags to clear on syscall; clear as much as possible
* to minimize user space-kernel interference.
@@ -2086,6 +2065,41 @@ void syscall_init(void)
X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
X86_EFLAGS_AC|X86_EFLAGS_ID);
+
+ /*
+ * The default user and kernel segments
+ */
+ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
+
+ if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+ /* Both sysexit and sysret cause #UD when FRED is enabled */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+ } else {
+ wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
+
+#ifdef CONFIG_IA32_EMULATION
+ wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
+ /*
+ * This only works on Intel CPUs.
+ * On AMD CPUs these MSRs are 32-bit, CPU truncates
+ * MSR_IA32_SYSENTER_EIP.
+ * This does not cause SYSENTER to jump to the wrong
+ * location, because AMD doesn't allow SYSENTER in
+ * long mode (either 32- or 64-bit).
+ */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
+ (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
+#else
+ wrmsrl_cstar((unsigned long)ignore_sysret);
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+#endif
+ }
}

#else /* CONFIG_X86_64 */
@@ -2218,18 +2232,24 @@ void cpu_init_exception_handling(void)
/* paranoid_entry() gets the CPU number from the GDT */
setup_getcpu(cpu);

- /* IST vectors need TSS to be set up. */
- tss_setup_ist(tss);
+ /* Set up the TSS */
tss_setup_io_bitmap(tss);
set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss);
-
load_TR_desc();

/* GHCB needs to be setup to handle #VC. */
setup_ghcb();

- /* Finally load the IDT */
- load_current_idt();
+ if (cpu_feature_enabled(X86_FEATURE_FRED)) {
+ /* Set up FRED exception handling */
+ cpu_init_fred_exceptions();
+ } else {
+ /* IST vectors need TSS to be set up. */
+ tss_setup_ist(tss);
+
+ /* Finally load the IDT */
+ load_current_idt();
+ }
}

/*
diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c
new file mode 100644
index 000000000000..827b58fd98d4
--- /dev/null
+++ b/arch/x86/kernel/fred.c
@@ -0,0 +1,73 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/kernel.h>
+#include <asm/desc.h>
+#include <asm/fred.h>
+#include <asm/tlbflush.h> /* For cr4_set_bits() */
+#include <asm/traps.h>
+
+/*
+ * Initialize FRED on this CPU. This cannot be __init as it is called
+ * during CPU hotplug.
+ */
+void cpu_init_fred_exceptions(void)
+{
+ wrmsrl(MSR_IA32_FRED_CONFIG,
+ FRED_CONFIG_ENTRYPOINT(fred_entrypoint_user) |
+ FRED_CONFIG_REDZONE(8) | /* Reserve for CALL emulation */
+ FRED_CONFIG_INT_STKLVL(0));
+
+ wrmsrl(MSR_IA32_FRED_STKLVLS,
+ FRED_STKLVL(X86_TRAP_DB, 1) |
+ FRED_STKLVL(X86_TRAP_NMI, 2) |
+ FRED_STKLVL(X86_TRAP_MC, 2) |
+ FRED_STKLVL(X86_TRAP_DF, 3));
+
+ /* The FRED equivalents to IST stacks... */
+ wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB));
+ wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI));
+ wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF));
+
+ /* Not used with FRED */
+ wrmsrl(MSR_LSTAR, 0ULL);
+ wrmsrl(MSR_CSTAR, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
+
+ /* Enable FRED */
+ cr4_set_bits(X86_CR4_FRED);
+ idt_invalidate(); /* Any further IDT use is a bug */
+
+ /* Use int $0x80 for 32-bit system calls in FRED mode */
+ setup_clear_cpu_cap(X86_FEATURE_SYSENTER32);
+ setup_clear_cpu_cap(X86_FEATURE_SYSCALL32);
+}
+
+/*
+ * Initialize system vectors from a FRED perspective, so
+ * lapic_assign_system_vectors() can do its job.
+ */
+void __init fred_setup_apic(void)
+{
+ int i;
+
+ for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
+ set_bit(i, system_vectors);
+
+ /*
+ * Don't set the non assigned system vectors in the
+ * system_vectors bitmap. Otherwise they show up in
+ * /proc/interrupts.
+ */
+#ifdef CONFIG_SMP
+ set_bit(IRQ_MOVE_CLEANUP_VECTOR, system_vectors);
+#endif
+
+ for (i = 0; i < NR_SYSTEM_VECTORS; i++) {
+ if (get_system_interrupt_handler(i) != NULL) {
+ set_bit(i + FIRST_SYSTEM_VECTOR, system_vectors);
+ }
+ }
+
+ /* The rest are fair game... */
+}
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index c683666876f1..2a510f72dd11 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -28,6 +28,7 @@
#include <asm/setup.h>
#include <asm/i8259.h>
#include <asm/traps.h>
+#include <asm/fred.h>
#include <asm/prom.h>

/*
@@ -96,7 +97,11 @@ void __init native_init_IRQ(void)
/* Execute any quirks before the call gates are initialised: */
x86_init.irqs.pre_vector_init();

- idt_setup_apic_and_irq_gates();
+ if (cpu_feature_enabled(X86_FEATURE_FRED))
+ fred_setup_apic();
+ else
+ idt_setup_apic_and_irq_gates();
+
lapic_assign_system_vectors();

if (!acpi_ioapic && !of_ioapic && nr_legacy_irqs()) {
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4b0f63344526..c7253b4901f0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -1517,12 +1517,21 @@ static system_interrupt_handler system_interrupt_handlers[NR_SYSTEM_VECTORS] = {

#undef SYSV

+system_interrupt_handler get_system_interrupt_handler(unsigned int i)
+{
+ if (i >= NR_SYSTEM_VECTORS)
+ return NULL;
+
+ return system_interrupt_handlers[i];
+}
+
void __init install_system_interrupt_handler(unsigned int n, const void *asm_addr, const void *addr)
{
BUG_ON(n < FIRST_SYSTEM_VECTOR);

system_interrupt_handlers[n - FIRST_SYSTEM_VECTOR] = (system_interrupt_handler)addr;
- alloc_intr_gate(n, asm_addr);
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ alloc_intr_gate(n, asm_addr);
}

#ifndef CONFIG_X86_LOCAL_APIC
@@ -1590,7 +1599,10 @@ void __init trap_init(void)

/* Initialize TSS before setting up traps so ISTs work */
cpu_init_exception_handling();
+
/* Setup traps as cpu_init() might #GP */
- idt_setup_traps();
+ if (!cpu_feature_enabled(X86_FEATURE_FRED))
+ idt_setup_traps();
+
cpu_init();
}
--
2.34.1
Re: [PATCH v5 22/34] x86/fred: FRED initialization code [ In reply to ]
Hello


Comments in cpu_init_fred_exceptions() seem scarce for understanding.

On Tue, Mar 7, 2023 at 11:07?AM Xin Li <xin3.li@intel.com> wrote:

> +/*
> + * Initialize FRED on this CPU. This cannot be __init as it is called
> + * during CPU hotplug.
> + */
> +void cpu_init_fred_exceptions(void)
> +{
> + wrmsrl(MSR_IA32_FRED_CONFIG,
> + FRED_CONFIG_ENTRYPOINT(fred_entrypoint_user) |
> + FRED_CONFIG_REDZONE(8) | /* Reserve for CALL emulation */
> + FRED_CONFIG_INT_STKLVL(0));

What is it about "Reserve for CALL emulation"?

I guess it relates to X86_TRAP_BP. In entry_64.S:

.if \vector == X86_TRAP_BP
/*
* If coming from kernel space, create a 6-word gap to allow the
* int3 handler to emulate a call instruction.
*/

> +
> + wrmsrl(MSR_IA32_FRED_STKLVLS,
> + FRED_STKLVL(X86_TRAP_DB, 1) |
> + FRED_STKLVL(X86_TRAP_NMI, 2) |
> + FRED_STKLVL(X86_TRAP_MC, 2) |
> + FRED_STKLVL(X86_TRAP_DF, 3));

Why each exception here needs a stack level > 0?
Especially for X86_TRAP_DB and X86_TRAP_NMI.

Why does or why does not X86_TRAP_VE have a stack level > 0?

X86_TRAP_DF is the highest stack level, is it accidental
or deliberate?

Thanks
Lai
Re: [PATCH v5 22/34] x86/fred: FRED initialization code [ In reply to ]
On March 17, 2023 6:35:57 AM PDT, Lai Jiangshan <jiangshanlai@gmail.com> wrote:
>Hello
>
>
>Comments in cpu_init_fred_exceptions() seem scarce for understanding.
>
>On Tue, Mar 7, 2023 at 11:07?AM Xin Li <xin3.li@intel.com> wrote:
>
>> +/*
>> + * Initialize FRED on this CPU. This cannot be __init as it is called
>> + * during CPU hotplug.
>> + */
>> +void cpu_init_fred_exceptions(void)
>> +{
>> + wrmsrl(MSR_IA32_FRED_CONFIG,
>> + FRED_CONFIG_ENTRYPOINT(fred_entrypoint_user) |
>> + FRED_CONFIG_REDZONE(8) | /* Reserve for CALL emulation */
>> + FRED_CONFIG_INT_STKLVL(0));
>
>What is it about "Reserve for CALL emulation"?
>
>I guess it relates to X86_TRAP_BP. In entry_64.S:
>
> .if \vector == X86_TRAP_BP
> /*
> * If coming from kernel space, create a 6-word gap to allow the
> * int3 handler to emulate a call instruction.
> */
>
>> +
>> + wrmsrl(MSR_IA32_FRED_STKLVLS,
>> + FRED_STKLVL(X86_TRAP_DB, 1) |
>> + FRED_STKLVL(X86_TRAP_NMI, 2) |
>> + FRED_STKLVL(X86_TRAP_MC, 2) |
>> + FRED_STKLVL(X86_TRAP_DF, 3));
>
>Why each exception here needs a stack level > 0?
>Especially for X86_TRAP_DB and X86_TRAP_NMI.
>
>Why does or why does not X86_TRAP_VE have a stack level > 0?
>
>X86_TRAP_DF is the highest stack level, is it accidental
>or deliberate?
>
>Thanks
>Lai
>

Yes, the extra redzone space is there to allow for the call emulation without having to adjust the stack frame "manually".

In theory we could enable it only while code patching is in progress, but that would probably just result in stack overflows becoming utterly impossible to debug as we have to consider the worst case.

The purpose of separate stacks for NMI, #DB and #MC *in the kernel* (remember that user space faults are always taken on stack level 0) is to avoid overflowing the kernel stack. #DB in the kernel would imply the use of a kernel debugger.

#DF is the highest level because a #DF means "something went wrong *while delivering an exception*." The number of cases for which that can happen with FRED is drastically reduced and basically amount to "the stack you pointed me to is broken."

Thus, you basically always want to change stacks on #DF, which means it should be at the highest level.
Re: [PATCH v5 22/34] x86/fred: FRED initialization code [ In reply to ]
On Sat, Mar 18, 2023 at 5:32?AM H. Peter Anvin <hpa@zytor.com> wrote:
>
> On March 17, 2023 6:35:57 AM PDT, Lai Jiangshan <jiangshanlai@gmail.com> wrote:
> >Hello
> >
> >
> >Comments in cpu_init_fred_exceptions() seem scarce for understanding.
> >
> >On Tue, Mar 7, 2023 at 11:07?AM Xin Li <xin3.li@intel.com> wrote:
> >
> >> +/*
> >> + * Initialize FRED on this CPU. This cannot be __init as it is called
> >> + * during CPU hotplug.
> >> + */
> >> +void cpu_init_fred_exceptions(void)
> >> +{
> >> + wrmsrl(MSR_IA32_FRED_CONFIG,
> >> + FRED_CONFIG_ENTRYPOINT(fred_entrypoint_user) |
> >> + FRED_CONFIG_REDZONE(8) | /* Reserve for CALL emulation */
> >> + FRED_CONFIG_INT_STKLVL(0));
> >
> >What is it about "Reserve for CALL emulation"?
> >
> >I guess it relates to X86_TRAP_BP. In entry_64.S:
> >
> > .if \vector == X86_TRAP_BP
> > /*
> > * If coming from kernel space, create a 6-word gap to allow the
> > * int3 handler to emulate a call instruction.
> > */
> >
> >> +
> >> + wrmsrl(MSR_IA32_FRED_STKLVLS,
> >> + FRED_STKLVL(X86_TRAP_DB, 1) |
> >> + FRED_STKLVL(X86_TRAP_NMI, 2) |
> >> + FRED_STKLVL(X86_TRAP_MC, 2) |
> >> + FRED_STKLVL(X86_TRAP_DF, 3));
> >
> >Why each exception here needs a stack level > 0?
> >Especially for X86_TRAP_DB and X86_TRAP_NMI.
> >
> >Why does or why does not X86_TRAP_VE have a stack level > 0?
> >
> >X86_TRAP_DF is the highest stack level, is it accidental
> >or deliberate?
> >
> >Thanks
> >Lai
> >
>
> Yes, the extra redzone space is there to allow for the call emulation without having to adjust the stack frame "manually".
>
> In theory we could enable it only while code patching is in progress, but that would probably just result in stack overflows becoming utterly impossible to debug as we have to consider the worst case.
>
> The purpose of separate stacks for NMI, #DB and #MC *in the kernel* (remember that user space faults are always taken on stack level 0) is to avoid overflowing the kernel stack. #DB in the kernel would imply the use of a kernel debugger.

Could you add it to the code, please? I think it can help other reviewers.

If there is no other concrete reason other than overflowing for
assigning NMI and #DB with a stack level > 0, #VE should also
be assigned with a stack level > 0, and #BP too. #VE can happen
anytime and anywhere, so it is subject to overflowing too.

>
>
> #DF is the highest level because a #DF means "something went wrong *while delivering an exception*." The number of cases for which that can happen with FRED is drastically reduced and basically amount to "the stack you pointed me to is broken."
>
> Thus, you basically always want to change stacks on #DF, which means it should be at the highest level.
Re: [PATCH v5 22/34] x86/fred: FRED initialization code [ In reply to ]
On Fri, Mar 17, 2023 at 02:32:28PM -0700, H. Peter Anvin wrote:
> The purpose of separate stacks for NMI, #DB and #MC *in the kernel*
> (remember that user space faults are always taken on stack level 0) is
> to avoid overflowing the kernel stack. #DB in the kernel would imply
> the use of a kernel debugger.

Perf (and through it bpf) also has access to #DB. They can set
breakpoints on kernel instructions/memory just fine provided permission
etc.
Re: [PATCH v5 22/34] x86/fred: FRED initialization code [ In reply to ]
On Sat, Mar 18, 2023 at 02:33:30PM +0800, Lai Jiangshan wrote:
> If there is no other concrete reason other than overflowing for
> assigning NMI and #DB with a stack level > 0, #VE should also
> be assigned with a stack level > 0, and #BP too. #VE can happen
> anytime and anywhere, so it is subject to overflowing too.

So #BP needs the stack-gap (redzone) for text_poke_bp().

#BP can end up in kprobes which can then end up in ftrace/perf,
depending on how it's all wired up.

#VE is currently a trainwreck vs NMI/MCE, but I think FRED solves the
worst of that. I'm not exactly sure how deep the #VE handler goes.
RE: [PATCH v5 22/34] x86/fred: FRED initialization code [ In reply to ]
> > If there is no other concrete reason other than overflowing for
> > assigning NMI and #DB with a stack level > 0, #VE should also be
> > assigned with a stack level > 0, and #BP too. #VE can happen anytime
> > and anywhere, so it is subject to overflowing too.
>
> So #BP needs the stack-gap (redzone) for text_poke_bp().
>
> #BP can end up in kprobes which can then end up in ftrace/perf, depending on
> how it's all wired up.
>
> #VE is currently a trainwreck vs NMI/MCE, but I think FRED solves the worst of
> that. I'm not exactly sure how deep the #VE handler goes.
>

VE under IDT is *not* using an IST, we need some solid rationales here.

Thanks!
Xin
RE: [PATCH v5 22/34] x86/fred: FRED initialization code [ In reply to ]
> > The purpose of separate stacks for NMI, #DB and #MC *in the kernel*
> > (remember that user space faults are always taken on stack level 0) is
> > to avoid overflowing the kernel stack. #DB in the kernel would imply
> > the use of a kernel debugger.
>
> Perf (and through it bpf) also has access to #DB. They can set
> breakpoints on kernel instructions/memory just fine provided permission
> etc.

So they are still *kernel* debuggers :)
Re: [PATCH v5 22/34] x86/fred: FRED initialization code [ In reply to ]
On 21/03/2023 12:12 am, Li, Xin3 wrote:
>>> If there is no other concrete reason other than overflowing for
>>> assigning NMI and #DB with a stack level > 0, #VE should also be
>>> assigned with a stack level > 0, and #BP too. #VE can happen anytime
>>> and anywhere, so it is subject to overflowing too.
>> So #BP needs the stack-gap (redzone) for text_poke_bp().
>>
>> #BP can end up in kprobes which can then end up in ftrace/perf, depending on
>> how it's all wired up.
>>
>> #VE is currently a trainwreck vs NMI/MCE, but I think FRED solves the worst of
>> that. I'm not exactly sure how deep the #VE handler goes.
>>
> VE under IDT is *not* using an IST, we need some solid rationales here.

#VE, and #VC on AMD, are borderline unusable.  Both under IDT and FRED.

The reason #VE is not IST is because there are plenty of real cases
where a non-malicious outer hypervisor could create reentrant faults
that lose program state.  e.g. hitting an IO instruction, then hitting
an emulated MSR.

There are fewer cases where a non-IST #VE ends up in a re-entrant fault
(IIRC, you can still manage it by unmapping the entry stack), but you're
still trusting the outer hypervisor to not e.g. unmap the SYSCALL entry
point.

FRED gets rid of the "reentrant fault overwriting it on the stack" case,
and removes the syscall gap case, replacing them instead with a stack
overflow in the worst case because there is still no upper bound to how
many times #VE can actually be delivered in the course of servicing a
single #VE.

~Andrew

P.S. While I hate to cite myself, if you haven't read
https://docs.google.com/document/d/1hWejnyDkjRRAW-JEsRjA5c9CKLOPc6VKJQsuvODlQEI/edit?usp=sharing
yet, do so.  It did feed into some of the FRED design.
RE: [PATCH v5 22/34] x86/fred: FRED initialization code [ In reply to ]
> >>> If there is no other concrete reason other than overflowing for
> >>> assigning NMI and #DB with a stack level > 0, #VE should also be
> >>> assigned with a stack level > 0, and #BP too. #VE can happen anytime
> >>> and anywhere, so it is subject to overflowing too.
> >> So #BP needs the stack-gap (redzone) for text_poke_bp().
> >>
> >> #BP can end up in kprobes which can then end up in ftrace/perf, depending
> on
> >> how it's all wired up.
> >>
> >> #VE is currently a trainwreck vs NMI/MCE, but I think FRED solves the worst of
> >> that. I'm not exactly sure how deep the #VE handler goes.
> >>
> > VE under IDT is *not* using an IST, we need some solid rationales here.
>
> #VE, and #VC on AMD, are borderline unusable.  Both under IDT and FRED.

Oops!

> The reason #VE is not IST is because there are plenty of real cases
> where a non-malicious outer hypervisor could create reentrant faults
> that lose program state.  e.g. hitting an IO instruction, then hitting
> an emulated MSR.
>
> There are fewer cases where a non-IST #VE ends up in a re-entrant fault
> (IIRC, you can still manage it by unmapping the entry stack), but you're
> still trusting the outer hypervisor to not e.g. unmap the SYSCALL entry
> point.
>
> FRED gets rid of the "reentrant fault overwriting it on the stack" case,
> and removes the syscall gap case, replacing them instead with a stack
> overflow in the worst case because there is still no upper bound to how
> many times #VE can actually be delivered in the course of servicing a
> single #VE.

Exactly, FRED stack levels can make use of the whole regular stack space.

I guess you don't seem to support #VE on a higher stack level?

> ~Andrew
>
> P.S. While I hate to cite myself, if you haven't read
> https://docs.google.com/document/d/1hWejnyDkjRRAW-
> JEsRjA5c9CKLOPc6VKJQsuvODlQEI/edit?usp=sharing
> yet, do so.  It did feed into some of the FRED design.
RE: [PATCH v5 22/34] x86/fred: FRED initialization code [ In reply to ]
> If there is no other concrete reason other than overflowing for assigning NMI and
> #DB with a stack level > 0, #VE should also be assigned with a stack level > 0, and
> #BP too. #VE can happen anytime and anywhere, so it is subject to overflowing too.

With IDT, both #VE and #BP do not use IST, but NMI, #DB, #MC and #DF do.

Let's keep this "secret" logic for now, i.e., not change the stack levels
for #VE and #BP at this point. We can do "optimization", i.e., change them
later :).
Re: [PATCH v5 22/34] x86/fred: FRED initialization code [ In reply to ]
On 3/21/23 19:22, Li, Xin3 wrote:
>> If there is no other concrete reason other than overflowing for assigning NMI and
>> #DB with a stack level > 0, #VE should also be assigned with a stack level > 0, and
>> #BP too. #VE can happen anytime and anywhere, so it is subject to overflowing too.
> With IDT, both #VE and #BP do not use IST, but NMI, #DB, #MC and #DF do.
>
> Let's keep this "secret" logic for now, i.e., not change the stack levels
> for #VE and #BP at this point. We can do "optimization", i.e., change them
> later ????.

#VE also can't happen anywhere. There is some documentation about it in
here:

https://docs.kernel.org/x86/tdx.html#linux-ve-handler

But, basically, the only halfway sane thing a guest might do to hit a
#VE is touch some "MMIO". The host can *not* cause them in arbitrary
places because of the SEPT_VE_DISABLE attribute.

#VE's also can't nest until after the guest retrieves the "VE info".
That means that the #VE handler at _least_ reaches C code before it's
subject to another #VE and that second one would still need to be
induced by something the guest does explicitly.
RE: [PATCH v5 22/34] x86/fred: FRED initialization code [ In reply to ]
> >> If there is no other concrete reason other than overflowing for assigning NMI
> and
> >> #DB with a stack level > 0, #VE should also be assigned with a stack level > 0,
> and
> >> #BP too. #VE can happen anytime and anywhere, so it is subject to
> overflowing too.
> > With IDT, both #VE and #BP do not use IST, but NMI, #DB, #MC and #DF do.
> >
> > Let's keep this "secret" logic for now, i.e., not change the stack levels
> > for #VE and #BP at this point. We can do "optimization", i.e., change them
> > later ????.
>
> #VE also can't happen anywhere. There is some documentation about it in
> here:
>
> https://docs.kernel.org/x86/tdx.html#linux-ve-handler
>
> But, basically, the only halfway sane thing a guest might do to hit a
> #VE is touch some "MMIO". The host can *not* cause them in arbitrary
> places because of the SEPT_VE_DISABLE attribute.
>
> #VE's also can't nest until after the guest retrieves the "VE info".
> That means that the #VE handler at _least_ reaches C code before it's
> subject to another #VE and that second one would still need to be
> induced by something the guest does explicitly.

Thanks a lot for the detailed background!
Re: [PATCH v5 22/34] x86/fred: FRED initialization code [ In reply to ]
On 3/20/23 18:02, andrew.cooper3@citrix.com wrote:
> There are fewer cases where a non-IST #VE ends up in a re-entrant fault
> (IIRC, you can still manage it by unmapping the entry stack), but you're
> still trusting the outer hypervisor to not e.g. unmap the SYSCALL entry
> point.

This is a general weakness of #VE. But, the current Linux TDX guest
implementation is not vulnerable to it. If the host unmaps something
unexpectedly, the guest will just die because of ATTR_SEPT_VE_DISABLE.
No #VE:

> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/x86/coco/tdx/tdx.c#n216
Re: [PATCH v5 22/34] x86/fred: FRED initialization code [ In reply to ]
On 22/03/2023 2:22 am, Li, Xin3 wrote:
>> If there is no other concrete reason other than overflowing for assigning NMI and
>> #DB with a stack level > 0, #VE should also be assigned with a stack level > 0, and
>> #BP too. #VE can happen anytime and anywhere, so it is subject to overflowing too.
> With IDT, both #VE and #BP do not use IST, but NMI, #DB, #MC and #DF do.
>
> Let's keep this "secret" logic for now, i.e., not change the stack levels
> for #VE and #BP at this point. We can do "optimization", i.e., change them
> later :).

Fun fact.  #BP used to be IST, and used to share the same IST as #DF.

This was spoiled by CVE-2018-8897 and a MovSS-delayed breakpoint over
INT3, at which point hardware queued both a #BP and #DB on the same IST
stack and lost program state.

There's no need specific need for #BP to be IST to begin with, hence why
making it not-IST was the security fix.

~Andrew