Mailing List Archive

[PATCH v2 3/4] x86: Allow non-faulting accesses to non-emulated MSRs if policy permits this
Starting with commit 84e848fd7a16 ("x86/hvm: disallow access to unknown MSRs")
accesses to unhandled MSRs result in #GP sent to the guest. This caused a
regression for Solaris who tries to acccess MSR_RAPL_POWER_UNIT and (unlike,
for example, Linux) does not catch exceptions when accessing MSRs that
potentially may not be present.

Instead of special-casing RAPL registers we decide what to do when any
non-emulated MSR is accessed based on ignore_msrs field of msr_policy.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
Changes in v2:
* define x86_emul_guest_msr_access() and use it to determine whether emulated
instruction is rd/wrmsr.
* Don't use ignore_msrs for MSR accesses that are not guest's rd/wrmsr.
* Clear @val for writes too in guest_unhandled_msr()

xen/arch/x86/hvm/svm/svm.c | 10 ++++------
xen/arch/x86/hvm/vmx/vmx.c | 10 ++++------
xen/arch/x86/msr.c | 28 ++++++++++++++++++++++++++++
xen/arch/x86/pv/emul-priv-op.c | 10 ++++++----
xen/arch/x86/x86_emulate/x86_emulate.h | 6 ++++++
xen/include/asm-x86/msr.h | 3 +++
6 files changed, 51 insertions(+), 16 deletions(-)

diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
index b819897a4a9f..7b59885b2619 100644
--- a/xen/arch/x86/hvm/svm/svm.c
+++ b/xen/arch/x86/hvm/svm/svm.c
@@ -1965,8 +1965,8 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
break;

default:
- gdprintk(XENLOG_WARNING, "RDMSR 0x%08x unimplemented\n", msr);
- goto gpf;
+ if ( guest_unhandled_msr(v, msr, msr_content, false, true) )
+ goto gpf;
}

HVM_DBG_LOG(DBG_LEVEL_MSR, "returns: ecx=%x, msr_value=%"PRIx64,
@@ -2151,10 +2151,8 @@ static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
break;

default:
- gdprintk(XENLOG_WARNING,
- "WRMSR 0x%08x val 0x%016"PRIx64" unimplemented\n",
- msr, msr_content);
- goto gpf;
+ if ( guest_unhandled_msr(v, msr, &msr_content, true, true) )
+ goto gpf;
}

return X86EMUL_OKAY;
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 2d4475ee3de2..87baca57d33f 100644
--- a/xen/arch/x86/hvm/vmx/vmx.c
+++ b/xen/arch/x86/hvm/vmx/vmx.c
@@ -3017,8 +3017,8 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
break;
}

- gdprintk(XENLOG_WARNING, "RDMSR 0x%08x unimplemented\n", msr);
- goto gp_fault;
+ if ( guest_unhandled_msr(curr, msr, msr_content, false, true) )
+ goto gp_fault;
}

done:
@@ -3319,10 +3319,8 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
is_last_branch_msr(msr) )
break;

- gdprintk(XENLOG_WARNING,
- "WRMSR 0x%08x val 0x%016"PRIx64" unimplemented\n",
- msr, msr_content);
- goto gp_fault;
+ if ( guest_unhandled_msr(v, msr, &msr_content, true, true) )
+ goto gp_fault;
}

return X86EMUL_OKAY;
diff --git a/xen/arch/x86/msr.c b/xen/arch/x86/msr.c
index 433d16c80728..a57d838f642b 100644
--- a/xen/arch/x86/msr.c
+++ b/xen/arch/x86/msr.c
@@ -164,6 +164,34 @@ int init_vcpu_msr_policy(struct vcpu *v)
return 0;
}

+/* Returns true if policy requires #GP to the guest. */
+bool guest_unhandled_msr(const struct vcpu *v, uint32_t msr, uint64_t *val,
+ bool is_write, bool is_guest_msr_access)
+{
+ u8 ignore_msrs = v->domain->arch.msr->ignore_msrs;
+
+ /*
+ * Accesses to unimplemented MSRs as part of emulation of instructions
+ * other than guest's RDMSR/WRMSR should never succeed.
+ */
+ if ( !is_guest_msr_access )
+ ignore_msrs = MSR_UNHANDLED_NEVER;
+
+ if ( unlikely(ignore_msrs != MSR_UNHANDLED_NEVER) )
+ *val = 0;
+
+ if ( likely(ignore_msrs != MSR_UNHANDLED_SILENT) )
+ {
+ if ( is_write )
+ gdprintk(XENLOG_WARNING, "WRMSR 0x%08x val 0x%016"PRIx64
+ " unimplemented\n", msr, *val);
+ else
+ gdprintk(XENLOG_WARNING, "RDMSR 0x%08x unimplemented\n", msr);
+ }
+
+ return (ignore_msrs == MSR_UNHANDLED_NEVER);
+}
+
int guest_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val)
{
const struct vcpu *curr = current;
diff --git a/xen/arch/x86/pv/emul-priv-op.c b/xen/arch/x86/pv/emul-priv-op.c
index dbceed8a05fd..6b378dbe2239 100644
--- a/xen/arch/x86/pv/emul-priv-op.c
+++ b/xen/arch/x86/pv/emul-priv-op.c
@@ -984,7 +984,9 @@ static int read_msr(unsigned int reg, uint64_t *val,
}
/* fall through */
default:
- gdprintk(XENLOG_WARNING, "RDMSR 0x%08x unimplemented\n", reg);
+ if ( !guest_unhandled_msr(curr, reg, val, false,
+ x86_emul_guest_msr_access(ctxt)) )
+ return X86EMUL_OKAY;
break;

normal:
@@ -1146,9 +1148,9 @@ static int write_msr(unsigned int reg, uint64_t val,
}
/* fall through */
default:
- gdprintk(XENLOG_WARNING,
- "WRMSR 0x%08x val 0x%016"PRIx64" unimplemented\n",
- reg, val);
+ if ( !guest_unhandled_msr(curr, reg, &val, true,
+ x86_emul_guest_msr_access(ctxt)) )
+ return X86EMUL_OKAY;
break;

invalid:
diff --git a/xen/arch/x86/x86_emulate/x86_emulate.h b/xen/arch/x86/x86_emulate/x86_emulate.h
index d8fb3a990933..06e6b7479f37 100644
--- a/xen/arch/x86/x86_emulate/x86_emulate.h
+++ b/xen/arch/x86/x86_emulate/x86_emulate.h
@@ -850,4 +850,10 @@ static inline void x86_emul_reset_event(struct x86_emulate_ctxt *ctxt)
ctxt->event = (struct x86_event){};
}

+static inline bool x86_emul_guest_msr_access(struct x86_emulate_ctxt *ctxt)
+{
+ return ctxt->opcode == X86EMUL_OPC(0x0f, 0x32) || /* RDMSR */
+ ctxt->opcode == X86EMUL_OPC(0x0f, 0x30); /* WRMSR */
+}
+
#endif /* __X86_EMULATE_H__ */
diff --git a/xen/include/asm-x86/msr.h b/xen/include/asm-x86/msr.h
index 16f95e734428..e7d69ad5bf29 100644
--- a/xen/include/asm-x86/msr.h
+++ b/xen/include/asm-x86/msr.h
@@ -345,5 +345,8 @@ int init_vcpu_msr_policy(struct vcpu *v);
*/
int guest_rdmsr(struct vcpu *v, uint32_t msr, uint64_t *val);
int guest_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val);
+bool guest_unhandled_msr(const struct vcpu *v, uint32_t msr,
+ uint64_t *val, bool is_write,
+ bool is_guest_msr_access);

#endif /* __ASM_MSR_H */
--
1.8.3.1
Re: [PATCH v2 3/4] x86: Allow non-faulting accesses to non-emulated MSRs if policy permits this [ In reply to ]
On 20.01.2021 23:49, Boris Ostrovsky wrote:
> --- a/xen/arch/x86/msr.c
> +++ b/xen/arch/x86/msr.c
> @@ -164,6 +164,34 @@ int init_vcpu_msr_policy(struct vcpu *v)
> return 0;
> }
>
> +/* Returns true if policy requires #GP to the guest. */
> +bool guest_unhandled_msr(const struct vcpu *v, uint32_t msr, uint64_t *val,
> + bool is_write, bool is_guest_msr_access)

Would you mind dropping the "_msr" infix from the last
parameter's name? We're anyway aware we're talking about MSR
accesses here, from the function name.

As a nit - while I'm okay with the uint64_t *, I think the
uint32_t would better be unsigned int - see ./CODING_STYLE.

Finally, this being a policy function and policy being per-
domain here, I think the first parameter wants to be const
struct domain *.

> +{
> + u8 ignore_msrs = v->domain->arch.msr->ignore_msrs;

uint8_t please or, as per above, even better unsigned int.

> +
> + /*
> + * Accesses to unimplemented MSRs as part of emulation of instructions
> + * other than guest's RDMSR/WRMSR should never succeed.
> + */
> + if ( !is_guest_msr_access )
> + ignore_msrs = MSR_UNHANDLED_NEVER;

Wouldn't you better "return true" here? Such accesses also
shouldn't be logged imo (albeit I agree that's a change from
current behavior).

> + if ( unlikely(ignore_msrs != MSR_UNHANDLED_NEVER) )
> + *val = 0;

I don't understand the conditional here, even more so with
the respective changelog entry. In any event you don't
want to clobber the value ahead of ...

> + if ( likely(ignore_msrs != MSR_UNHANDLED_SILENT) )
> + {
> + if ( is_write )
> + gdprintk(XENLOG_WARNING, "WRMSR 0x%08x val 0x%016"PRIx64
> + " unimplemented\n", msr, *val);

... logging it.

> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
> @@ -850,4 +850,10 @@ static inline void x86_emul_reset_event(struct x86_emulate_ctxt *ctxt)
> ctxt->event = (struct x86_event){};
> }
>
> +static inline bool x86_emul_guest_msr_access(struct x86_emulate_ctxt *ctxt)

The parameter wants to be pointer-to-const. In addition I wonder
whether this wouldn't better be a sibling to
x86_insn_is_cr_access() (without a "state" parameter, which
would be unused and unavailable to the callers), which may end
up finding further uses down the road.

> +{
> + return ctxt->opcode == X86EMUL_OPC(0x0f, 0x32) || /* RDMSR */
> + ctxt->opcode == X86EMUL_OPC(0x0f, 0x30); /* WRMSR */
> +}

Personally I'd prefer if this was a single comparison:

return (ctxt->opcode | 2) == X86EMUL_OPC(0x0f, 0x32);

But maybe nowadays' compilers are capable of this
transformation?

I notice you use this function only from PV priv-op emulation.
What about the call paths through hvmemul_{read,write}_msr()?
(It's also questionable whether the write paths need this -
the only MSR written outside of WRMSR emulation is
MSR_SHADOW_GS_BASE, which can't possibly reach the "unhandled"
logic anywhere. But maybe better to be future proof here in
case new MSR writes appear in the emulator, down the road.)

Jan
Re: [PATCH v2 3/4] x86: Allow non-faulting accesses to non-emulated MSRs if policy permits this [ In reply to ]
On 1/22/21 7:51 AM, Jan Beulich wrote:
> On 20.01.2021 23:49, Boris Ostrovsky wrote:


>
>> +
>> + /*
>> + * Accesses to unimplemented MSRs as part of emulation of instructions
>> + * other than guest's RDMSR/WRMSR should never succeed.
>> + */
>> + if ( !is_guest_msr_access )
>> + ignore_msrs = MSR_UNHANDLED_NEVER;
>
> Wouldn't you better "return true" here? Such accesses also
> shouldn't be logged imo (albeit I agree that's a change from
> current behavior).


Yes, that's why I didn't return here. We will be here in !is_guest_msr_access case most likely due to a bug in the emulator so I think we do want to see the error logged.


>
>> + if ( unlikely(ignore_msrs != MSR_UNHANDLED_NEVER) )
>> + *val = 0;
>
> I don't understand the conditional here, even more so with
> the respective changelog entry. In any event you don't
> want to clobber the value ahead of ...
>
>> + if ( likely(ignore_msrs != MSR_UNHANDLED_SILENT) )
>> + {
>> + if ( is_write )
>> + gdprintk(XENLOG_WARNING, "WRMSR 0x%08x val 0x%016"PRIx64
>> + " unimplemented\n", msr, *val);
>
> ... logging it.


True. I dropped !is_write from v1 without considering this.

As far as the conditional --- dropping it too would be a behavior change.


>
>> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
>> @@ -850,4 +850,10 @@ static inline void x86_emul_reset_event(struct x86_emulate_ctxt *ctxt)
>> ctxt->event = (struct x86_event){};
>> }
>>
>> +static inline bool x86_emul_guest_msr_access(struct x86_emulate_ctxt *ctxt)
>
> The parameter wants to be pointer-to-const. In addition I wonder
> whether this wouldn't better be a sibling to
> x86_insn_is_cr_access() (without a "state" parameter, which
> would be unused and unavailable to the callers), which may end
> up finding further uses down the road.


"Sibling" in terms of name (yes, it would be) or something else?


>
>> +{
>> + return ctxt->opcode == X86EMUL_OPC(0x0f, 0x32) || /* RDMSR */
>> + ctxt->opcode == X86EMUL_OPC(0x0f, 0x30); /* WRMSR */
>> +}
>
> Personally I'd prefer if this was a single comparison:
>
> return (ctxt->opcode | 2) == X86EMUL_OPC(0x0f, 0x32);
>
> But maybe nowadays' compilers are capable of this
> transformation?

Here is what I've got (not an inline but shouldn't make much difference I'd think)

ffff82d040385960 <x86_emul_guest_msr_access_2>: # your code
ffff82d040385960: 8b 47 2c mov 0x2c(%rdi),%eax
ffff82d040385963: 83 e0 fd and $0xfffffffd,%eax
ffff82d040385966: 3d 30 00 0f 00 cmp $0xf0030,%eax
ffff82d04038596b: 0f 94 c0 sete %al
ffff82d04038596e: c3 retq

ffff82d04038596f <x86_emul_guest_msr_access_1>: # my code
ffff82d04038596f: 8b 47 2c mov 0x2c(%rdi),%eax
ffff82d040385972: 83 c8 02 or $0x2,%eax
ffff82d040385975: 3d 32 00 0f 00 cmp $0xf0032,%eax
ffff82d04038597a: 0f 94 c0 sete %al
ffff82d04038597d: c3 retq


So it's a wash in terms of generated code.

>
> I notice you use this function only from PV priv-op emulation.
> What about the call paths through hvmemul_{read,write}_msr()?
> (It's also questionable whether the write paths need this -
> the only MSR written outside of WRMSR emulation is
> MSR_SHADOW_GS_BASE, which can't possibly reach the "unhandled"
> logic anywhere. But maybe better to be future proof here in
> case new MSR writes appear in the emulator, down the road.)


Won't we end up in hvm_funcs.msr_write_intercept ops which do call it?


-boris
Re: [PATCH v2 3/4] x86: Allow non-faulting accesses to non-emulated MSRs if policy permits this [ In reply to ]
On 22.01.2021 20:52, Boris Ostrovsky wrote:
> On 1/22/21 7:51 AM, Jan Beulich wrote:
>> On 20.01.2021 23:49, Boris Ostrovsky wrote:
>>> +
>>> + /*
>>> + * Accesses to unimplemented MSRs as part of emulation of instructions
>>> + * other than guest's RDMSR/WRMSR should never succeed.
>>> + */
>>> + if ( !is_guest_msr_access )
>>> + ignore_msrs = MSR_UNHANDLED_NEVER;
>>
>> Wouldn't you better "return true" here? Such accesses also
>> shouldn't be logged imo (albeit I agree that's a change from
>> current behavior).
>
>
> Yes, that's why I didn't return here. We will be here in !is_guest_msr_access case most likely due to a bug in the emulator so I think we do want to see the error logged.

Why "most likely"?

>>> + if ( unlikely(ignore_msrs != MSR_UNHANDLED_NEVER) )
>>> + *val = 0;
>>
>> I don't understand the conditional here, even more so with
>> the respective changelog entry. In any event you don't
>> want to clobber the value ahead of ...
>>
>>> + if ( likely(ignore_msrs != MSR_UNHANDLED_SILENT) )
>>> + {
>>> + if ( is_write )
>>> + gdprintk(XENLOG_WARNING, "WRMSR 0x%08x val 0x%016"PRIx64
>>> + " unimplemented\n", msr, *val);
>>
>> ... logging it.
>
>
> True. I dropped !is_write from v1 without considering this.
>
> As far as the conditional --- dropping it too would be a behavior change.

Albeit an intentional one then? Plus I think I have trouble
seeing what behavior it would be that would change.

>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
>>> @@ -850,4 +850,10 @@ static inline void x86_emul_reset_event(struct x86_emulate_ctxt *ctxt)
>>> ctxt->event = (struct x86_event){};
>>> }
>>>
>>> +static inline bool x86_emul_guest_msr_access(struct x86_emulate_ctxt *ctxt)
>>
>> The parameter wants to be pointer-to-const. In addition I wonder
>> whether this wouldn't better be a sibling to
>> x86_insn_is_cr_access() (without a "state" parameter, which
>> would be unused and unavailable to the callers), which may end
>> up finding further uses down the road.
>
>
> "Sibling" in terms of name (yes, it would be) or something else?

Name and (possible) purpose - a validate hook could want to
make use of this, for example.

>>> +{
>>> + return ctxt->opcode == X86EMUL_OPC(0x0f, 0x32) || /* RDMSR */
>>> + ctxt->opcode == X86EMUL_OPC(0x0f, 0x30); /* WRMSR */
>>> +}
>>
>> Personally I'd prefer if this was a single comparison:
>>
>> return (ctxt->opcode | 2) == X86EMUL_OPC(0x0f, 0x32);
>>
>> But maybe nowadays' compilers are capable of this
>> transformation?
>
> Here is what I've got (not an inline but shouldn't make much difference I'd think)
>
> ffff82d040385960 <x86_emul_guest_msr_access_2>: # your code
> ffff82d040385960: 8b 47 2c mov 0x2c(%rdi),%eax
> ffff82d040385963: 83 e0 fd and $0xfffffffd,%eax
> ffff82d040385966: 3d 30 00 0f 00 cmp $0xf0030,%eax
> ffff82d04038596b: 0f 94 c0 sete %al
> ffff82d04038596e: c3 retq
>
> ffff82d04038596f <x86_emul_guest_msr_access_1>: # my code
> ffff82d04038596f: 8b 47 2c mov 0x2c(%rdi),%eax
> ffff82d040385972: 83 c8 02 or $0x2,%eax
> ffff82d040385975: 3d 32 00 0f 00 cmp $0xf0032,%eax
> ffff82d04038597a: 0f 94 c0 sete %al
> ffff82d04038597d: c3 retq
>
>
> So it's a wash in terms of generated code.

True, albeit I guess you got "your code" and "my code" the
wrong way round, as I don't expect the compiler to
translate | into "and".

>> I notice you use this function only from PV priv-op emulation.
>> What about the call paths through hvmemul_{read,write}_msr()?
>> (It's also questionable whether the write paths need this -
>> the only MSR written outside of WRMSR emulation is
>> MSR_SHADOW_GS_BASE, which can't possibly reach the "unhandled"
>> logic anywhere. But maybe better to be future proof here in
>> case new MSR writes appear in the emulator, down the road.)
>
>
> Won't we end up in hvm_funcs.msr_write_intercept ops which do call it?

Of course we will - the boolean will very likely need
propagating (a possible alternative being a per-vCPU flag
indicating "in emulator").

Jan
Re: [PATCH v2 3/4] x86: Allow non-faulting accesses to non-emulated MSRs if policy permits this [ In reply to ]
On 21-01-25 11:22:08, Jan Beulich wrote:
> On 22.01.2021 20:52, Boris Ostrovsky wrote:
> > On 1/22/21 7:51 AM, Jan Beulich wrote:
> >> On 20.01.2021 23:49, Boris Ostrovsky wrote:
> >>> +
> >>> + /*
> >>> + * Accesses to unimplemented MSRs as part of emulation of instructions
> >>> + * other than guest's RDMSR/WRMSR should never succeed.
> >>> + */
> >>> + if ( !is_guest_msr_access )
> >>> + ignore_msrs = MSR_UNHANDLED_NEVER;
> >>
> >> Wouldn't you better "return true" here? Such accesses also
> >> shouldn't be logged imo (albeit I agree that's a change from
> >> current behavior).
> >
> >
> > Yes, that's why I didn't return here. We will be here in !is_guest_msr_access case most likely due to a bug in the emulator so I think we do want to see the error logged.
>
> Why "most likely"?


OK, definitely ;-) But I still think logging these accesses would be helpful.

>
> >>> + if ( unlikely(ignore_msrs != MSR_UNHANDLED_NEVER) )
> >>> + *val = 0;
> >>
> >> I don't understand the conditional here, even more so with
> >> the respective changelog entry. In any event you don't
> >> want to clobber the value ahead of ...
> >>
> >>> + if ( likely(ignore_msrs != MSR_UNHANDLED_SILENT) )
> >>> + {
> >>> + if ( is_write )
> >>> + gdprintk(XENLOG_WARNING, "WRMSR 0x%08x val 0x%016"PRIx64
> >>> + " unimplemented\n", msr, *val);
> >>
> >> ... logging it.
> >
> >
> > True. I dropped !is_write from v1 without considering this.
> >
> > As far as the conditional --- dropping it too would be a behavior change.
>
> Albeit an intentional one then? Plus I think I have trouble
> seeing what behavior it would be that would change.


Currently callers of, say, read_msr() don't expect the argument that they pass in to change. Granted, they shouldn't (and AFAICS don't) look at it but it's a change nonetheless.

>
> >>> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
> >>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
> >>> @@ -850,4 +850,10 @@ static inline void x86_emul_reset_event(struct x86_emulate_ctxt *ctxt)
> >>> ctxt->event = (struct x86_event){};
> >>> }
> >>>
> >>> +static inline bool x86_emul_guest_msr_access(struct x86_emulate_ctxt *ctxt)
> >>
> >> The parameter wants to be pointer-to-const. In addition I wonder
> >> whether this wouldn't better be a sibling to
> >> x86_insn_is_cr_access() (without a "state" parameter, which
> >> would be unused and unavailable to the callers), which may end
> >> up finding further uses down the road.
> >
> >
> > "Sibling" in terms of name (yes, it would be) or something else?
>
> Name and (possible) purpose - a validate hook could want to
> make use of this, for example.

A validate hook?

>
> >>> +{
> >>> + return ctxt->opcode == X86EMUL_OPC(0x0f, 0x32) || /* RDMSR */
> >>> + ctxt->opcode == X86EMUL_OPC(0x0f, 0x30); /* WRMSR */
> >>> +}
> >>
> >> Personally I'd prefer if this was a single comparison:
> >>
> >> return (ctxt->opcode | 2) == X86EMUL_OPC(0x0f, 0x32);
> >>
> >> But maybe nowadays' compilers are capable of this
> >> transformation?
> >
> > Here is what I've got (not an inline but shouldn't make much difference I'd think)
> >
> > ffff82d040385960 <x86_emul_guest_msr_access_2>: # your code
> > ffff82d040385960: 8b 47 2c mov 0x2c(%rdi),%eax
> > ffff82d040385963: 83 e0 fd and $0xfffffffd,%eax
> > ffff82d040385966: 3d 30 00 0f 00 cmp $0xf0030,%eax
> > ffff82d04038596b: 0f 94 c0 sete %al
> > ffff82d04038596e: c3 retq
> >
> > ffff82d04038596f <x86_emul_guest_msr_access_1>: # my code
> > ffff82d04038596f: 8b 47 2c mov 0x2c(%rdi),%eax
> > ffff82d040385972: 83 c8 02 or $0x2,%eax
> > ffff82d040385975: 3d 32 00 0f 00 cmp $0xf0032,%eax
> > ffff82d04038597a: 0f 94 c0 sete %al
> > ffff82d04038597d: c3 retq
> >
> >
> > So it's a wash in terms of generated code.
>
> True, albeit I guess you got "your code" and "my code" the
> wrong way round, as I don't expect the compiler to
> translate | into "and".


Yes, looks like I did switch them.

>
> >> I notice you use this function only from PV priv-op emulation.
> >> What about the call paths through hvmemul_{read,write}_msr()?
> >> (It's also questionable whether the write paths need this -
> >> the only MSR written outside of WRMSR emulation is
> >> MSR_SHADOW_GS_BASE, which can't possibly reach the "unhandled"
> >> logic anywhere. But maybe better to be future proof here in
> >> case new MSR writes appear in the emulator, down the road.)
> >
> >
> > Won't we end up in hvm_funcs.msr_write_intercept ops which do call it?
>
> Of course we will - the boolean will very likely need
> propagating (a possible alternative being a per-vCPU flag
> indicating "in emulator").


Oh, I see what you mean. By per-vcpu flag you mean arch_vcpu field I assume?


-boris
Re: [PATCH v2 3/4] x86: Allow non-faulting accesses to non-emulated MSRs if policy permits this [ In reply to ]
On 25.01.2021 19:42, Boris Ostrovsky wrote:
> On 21-01-25 11:22:08, Jan Beulich wrote:
>> On 22.01.2021 20:52, Boris Ostrovsky wrote:
>>> On 1/22/21 7:51 AM, Jan Beulich wrote:
>>>> On 20.01.2021 23:49, Boris Ostrovsky wrote:
>>>>> +
>>>>> + /*
>>>>> + * Accesses to unimplemented MSRs as part of emulation of instructions
>>>>> + * other than guest's RDMSR/WRMSR should never succeed.
>>>>> + */
>>>>> + if ( !is_guest_msr_access )
>>>>> + ignore_msrs = MSR_UNHANDLED_NEVER;
>>>>
>>>> Wouldn't you better "return true" here? Such accesses also
>>>> shouldn't be logged imo (albeit I agree that's a change from
>>>> current behavior).
>>>
>>>
>>> Yes, that's why I didn't return here. We will be here in !is_guest_msr_access case most likely due to a bug in the emulator so I think we do want to see the error logged.
>>
>> Why "most likely"?
>
>
> OK, definitely ;-)

Oops - I was thinking the other way around, considering such
to possibly be legitimate. It just so happens that curently
we have no such path.

> But I still think logging these accesses would be helpful.

Because of the above I continue to question this.

>>>>> + if ( unlikely(ignore_msrs != MSR_UNHANDLED_NEVER) )
>>>>> + *val = 0;
>>>>
>>>> I don't understand the conditional here, even more so with
>>>> the respective changelog entry. In any event you don't
>>>> want to clobber the value ahead of ...
>>>>
>>>>> + if ( likely(ignore_msrs != MSR_UNHANDLED_SILENT) )
>>>>> + {
>>>>> + if ( is_write )
>>>>> + gdprintk(XENLOG_WARNING, "WRMSR 0x%08x val 0x%016"PRIx64
>>>>> + " unimplemented\n", msr, *val);
>>>>
>>>> ... logging it.
>>>
>>>
>>> True. I dropped !is_write from v1 without considering this.
>>>
>>> As far as the conditional --- dropping it too would be a behavior change.
>>
>> Albeit an intentional one then? Plus I think I have trouble
>> seeing what behavior it would be that would change.
>
>
> Currently callers of, say, read_msr() don't expect the argument that they pass in to change. Granted, they shouldn't (and AFAICS don't) look at it but it's a change nonetheless.

Hmm, I'm confused: The purpose of read_msr() is to change the
value pointed at by the passed in argument. And for write_msr()
the users of the hook pass the argument by value, i.e. wouldn't
observe the changed value (it would only possibly be
intermediate layers which might observe the change, but those
ought to not care).

>>>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
>>>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
>>>>> @@ -850,4 +850,10 @@ static inline void x86_emul_reset_event(struct x86_emulate_ctxt *ctxt)
>>>>> ctxt->event = (struct x86_event){};
>>>>> }
>>>>>
>>>>> +static inline bool x86_emul_guest_msr_access(struct x86_emulate_ctxt *ctxt)
>>>>
>>>> The parameter wants to be pointer-to-const. In addition I wonder
>>>> whether this wouldn't better be a sibling to
>>>> x86_insn_is_cr_access() (without a "state" parameter, which
>>>> would be unused and unavailable to the callers), which may end
>>>> up finding further uses down the road.
>>>
>>>
>>> "Sibling" in terms of name (yes, it would be) or something else?
>>
>> Name and (possible) purpose - a validate hook could want to
>> make use of this, for example.
>
> A validate hook?

Quoting from struct x86_emulate_ops:

/*
* validate: Post-decode, pre-emulate hook to allow caller controlled
* filtering.
*/
int (*validate)(
const struct x86_emulate_state *state,
struct x86_emulate_ctxt *ctxt);

Granted to be directly usable the function would need to have a
"state" parameter. As that's unused, having it have one and
passing NULL in your case might be acceptable. But I also could
see arguments towards this not being a good idea.

>>>> I notice you use this function only from PV priv-op emulation.
>>>> What about the call paths through hvmemul_{read,write}_msr()?
>>>> (It's also questionable whether the write paths need this -
>>>> the only MSR written outside of WRMSR emulation is
>>>> MSR_SHADOW_GS_BASE, which can't possibly reach the "unhandled"
>>>> logic anywhere. But maybe better to be future proof here in
>>>> case new MSR writes appear in the emulator, down the road.)
>>>
>>>
>>> Won't we end up in hvm_funcs.msr_write_intercept ops which do call it?
>>
>> Of course we will - the boolean will very likely need
>> propagating (a possible alternative being a per-vCPU flag
>> indicating "in emulator").
>
>
> Oh, I see what you mean. By per-vcpu flag you mean arch_vcpu field I assume?

Yes, a boolean in one of the arch-specific per-vCPU structs.
Whether that's arch_vcpu or perhaps something HVM specific is
another question.

Jan
Re: [PATCH v2 3/4] x86: Allow non-faulting accesses to non-emulated MSRs if policy permits this [ In reply to ]
On 21-01-26 10:05:47, Jan Beulich wrote:
> On 25.01.2021 19:42, Boris Ostrovsky wrote:
> > On 21-01-25 11:22:08, Jan Beulich wrote:
> >> On 22.01.2021 20:52, Boris Ostrovsky wrote:
> >>> On 1/22/21 7:51 AM, Jan Beulich wrote:
> >>>> On 20.01.2021 23:49, Boris Ostrovsky wrote:
> >>>>> +
> >>>>> + /*
> >>>>> + * Accesses to unimplemented MSRs as part of emulation of instructions
> >>>>> + * other than guest's RDMSR/WRMSR should never succeed.
> >>>>> + */
> >>>>> + if ( !is_guest_msr_access )
> >>>>> + ignore_msrs = MSR_UNHANDLED_NEVER;
> >>>>
> >>>> Wouldn't you better "return true" here? Such accesses also
> >>>> shouldn't be logged imo (albeit I agree that's a change from
> >>>> current behavior).
> >>>
> >>>
> >>> Yes, that's why I didn't return here. We will be here in !is_guest_msr_access case most likely due to a bug in the emulator so I think we do want to see the error logged.
> >>
> >> Why "most likely"?
> >
> >
> > OK, definitely ;-)
>
> Oops - I was thinking the other way around, considering such
> to possibly be legitimate. It just so happens that curently
> we have no such path.

I was imagining a case where we'd add have another "ops->read_msr(_regs.ecx, ...)"
in the emulator and some values of ecx may not be tested. (Or even passing
an explicit index that is not handled, although that is highly unlikely to
pass code review).

Yes, these cases do not currently exist.

>
> > But I still think logging these accesses would be helpful.
>
> Because of the above I continue to question this.
>
> >>>>> + if ( unlikely(ignore_msrs != MSR_UNHANDLED_NEVER) )
> >>>>> + *val = 0;
> >>>>
> >>>> I don't understand the conditional here, even more so with
> >>>> the respective changelog entry. In any event you don't
> >>>> want to clobber the value ahead of ...
> >>>>
> >>>>> + if ( likely(ignore_msrs != MSR_UNHANDLED_SILENT) )
> >>>>> + {
> >>>>> + if ( is_write )
> >>>>> + gdprintk(XENLOG_WARNING, "WRMSR 0x%08x val 0x%016"PRIx64
> >>>>> + " unimplemented\n", msr, *val);
> >>>>
> >>>> ... logging it.
> >>>
> >>>
> >>> True. I dropped !is_write from v1 without considering this.
> >>>
> >>> As far as the conditional --- dropping it too would be a behavior change.
> >>
> >> Albeit an intentional one then? Plus I think I have trouble
> >> seeing what behavior it would be that would change.
> >
> >
> > Currently callers of, say, read_msr() don't expect the argument that they pass in to change. Granted, they shouldn't (and AFAICS don't) look at it but it's a change nonetheless.
>
> Hmm, I'm confused: The purpose of read_msr() is to change the
> value pointed at by the passed in argument.

Only if MSR exists/handled. We add parameters that allow it to be set
to zero for unhandled case but default (which is existing behavior, i.e.
MSR_UNHANDLED_NEVER) would leave the (pointed to) argument unchanged.

> And for write_msr()
> the users of the hook pass the argument by value, i.e. wouldn't
> observe the changed value (it would only possibly be
> intermediate layers which might observe the change, but those
> ought to not care).

Yes, this would only be relevant to reads but since I dropped is_write check
the conditional covers both reads and writes.

In any case, this (and the one above) are minor issues and I don't mind
making the change.

>
> >>>>> --- a/xen/arch/x86/x86_emulate/x86_emulate.h
> >>>>> +++ b/xen/arch/x86/x86_emulate/x86_emulate.h
> >>>>> @@ -850,4 +850,10 @@ static inline void x86_emul_reset_event(struct x86_emulate_ctxt *ctxt)
> >>>>> ctxt->event = (struct x86_event){};
> >>>>> }
> >>>>>
> >>>>> +static inline bool x86_emul_guest_msr_access(struct x86_emulate_ctxt *ctxt)
> >>>>
> >>>> The parameter wants to be pointer-to-const. In addition I wonder
> >>>> whether this wouldn't better be a sibling to
> >>>> x86_insn_is_cr_access() (without a "state" parameter, which
> >>>> would be unused and unavailable to the callers), which may end
> >>>> up finding further uses down the road.
> >>>
> >>>
> >>> "Sibling" in terms of name (yes, it would be) or something else?
> >>
> >> Name and (possible) purpose - a validate hook could want to
> >> make use of this, for example.
> >
> > A validate hook?
>
> Quoting from struct x86_emulate_ops:
>
> /*
> * validate: Post-decode, pre-emulate hook to allow caller controlled
> * filtering.
> */
> int (*validate)(
> const struct x86_emulate_state *state,
> struct x86_emulate_ctxt *ctxt);
>
> Granted to be directly usable the function would need to have a
> "state" parameter. As that's unused, having it have one and
> passing NULL in your case might be acceptable. But I also could
> see arguments towards this not being a good idea.

I see. Where would we need to call this hook though? We are never directly
emulating MSR access (compared to, for example, CR accesses where
x86_insn_is_cr_access is used). And for PV we already validate it in
emul-priv-op.c():validate().

>
> >>>> I notice you use this function only from PV priv-op emulation.
> >>>> What about the call paths through hvmemul_{read,write}_msr()?
> >>>> (It's also questionable whether the write paths need this -
> >>>> the only MSR written outside of WRMSR emulation is
> >>>> MSR_SHADOW_GS_BASE, which can't possibly reach the "unhandled"
> >>>> logic anywhere. But maybe better to be future proof here in
> >>>> case new MSR writes appear in the emulator, down the road.)
> >>>
> >>>
> >>> Won't we end up in hvm_funcs.msr_write_intercept ops which do call it?
> >>
> >> Of course we will - the boolean will very likely need
> >> propagating (a possible alternative being a per-vCPU flag
> >> indicating "in emulator").
> >
> >
> > Oh, I see what you mean. By per-vcpu flag you mean arch_vcpu field I assume?
>
> Yes, a boolean in one of the arch-specific per-vCPU structs.
> Whether that's arch_vcpu or perhaps something HVM specific is
> another question.

Currently we only need this for HVM but using it for both will avoid the need to
check guest type.


-boris
Re: [PATCH v2 3/4] x86: Allow non-faulting accesses to non-emulated MSRs if policy permits this [ In reply to ]
On 26.01.2021 17:02, Boris Ostrovsky wrote:
> On 21-01-26 10:05:47, Jan Beulich wrote:
>> On 25.01.2021 19:42, Boris Ostrovsky wrote:
>>> On 21-01-25 11:22:08, Jan Beulich wrote:
>>>> On 22.01.2021 20:52, Boris Ostrovsky wrote:
>>>>> "Sibling" in terms of name (yes, it would be) or something else?
>>>>
>>>> Name and (possible) purpose - a validate hook could want to
>>>> make use of this, for example.
>>>
>>> A validate hook?
>>
>> Quoting from struct x86_emulate_ops:
>>
>> /*
>> * validate: Post-decode, pre-emulate hook to allow caller controlled
>> * filtering.
>> */
>> int (*validate)(
>> const struct x86_emulate_state *state,
>> struct x86_emulate_ctxt *ctxt);
>>
>> Granted to be directly usable the function would need to have a
>> "state" parameter. As that's unused, having it have one and
>> passing NULL in your case might be acceptable. But I also could
>> see arguments towards this not being a good idea.
>
> I see. Where would we need to call this hook though? We are never directly
> emulating MSR access (compared to, for example, CR accesses where
> x86_insn_is_cr_access is used). And for PV we already validate it in
> emul-priv-op.c():validate().

If you look at some of the functions used for this hook, you may
realize that what your function does could also fit a hypothetical
future hook. Hence I was suggesting to make the new function
suitable for such use right away (and in particular fit their
naming scheme). But it looks like this has ended up more confusing
than anything else, so never mind.

Jan
Re: [PATCH v2 3/4] x86: Allow non-faulting accesses to non-emulated MSRs if policy permits this [ In reply to ]
On Wed, Jan 20, 2021 at 05:49:11PM -0500, Boris Ostrovsky wrote:
> Starting with commit 84e848fd7a16 ("x86/hvm: disallow access to unknown MSRs")
> accesses to unhandled MSRs result in #GP sent to the guest. This caused a
> regression for Solaris who tries to acccess MSR_RAPL_POWER_UNIT and (unlike,
> for example, Linux) does not catch exceptions when accessing MSRs that
> potentially may not be present.
>
> Instead of special-casing RAPL registers we decide what to do when any
> non-emulated MSR is accessed based on ignore_msrs field of msr_policy.
>
> Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
> ---
> Changes in v2:
> * define x86_emul_guest_msr_access() and use it to determine whether emulated
> instruction is rd/wrmsr.
> * Don't use ignore_msrs for MSR accesses that are not guest's rd/wrmsr.
> * Clear @val for writes too in guest_unhandled_msr()
>
> xen/arch/x86/hvm/svm/svm.c | 10 ++++------
> xen/arch/x86/hvm/vmx/vmx.c | 10 ++++------
> xen/arch/x86/msr.c | 28 ++++++++++++++++++++++++++++
> xen/arch/x86/pv/emul-priv-op.c | 10 ++++++----
> xen/arch/x86/x86_emulate/x86_emulate.h | 6 ++++++
> xen/include/asm-x86/msr.h | 3 +++
> 6 files changed, 51 insertions(+), 16 deletions(-)
>
> diff --git a/xen/arch/x86/hvm/svm/svm.c b/xen/arch/x86/hvm/svm/svm.c
> index b819897a4a9f..7b59885b2619 100644
> --- a/xen/arch/x86/hvm/svm/svm.c
> +++ b/xen/arch/x86/hvm/svm/svm.c
> @@ -1965,8 +1965,8 @@ static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
> break;
>
> default:
> - gdprintk(XENLOG_WARNING, "RDMSR 0x%08x unimplemented\n", msr);
> - goto gpf;
> + if ( guest_unhandled_msr(v, msr, msr_content, false, true) )
> + goto gpf;
> }
>
> HVM_DBG_LOG(DBG_LEVEL_MSR, "returns: ecx=%x, msr_value=%"PRIx64,
> @@ -2151,10 +2151,8 @@ static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
> break;
>
> default:
> - gdprintk(XENLOG_WARNING,
> - "WRMSR 0x%08x val 0x%016"PRIx64" unimplemented\n",
> - msr, msr_content);
> - goto gpf;
> + if ( guest_unhandled_msr(v, msr, &msr_content, true, true) )
> + goto gpf;
> }
>
> return X86EMUL_OKAY;
> diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
> index 2d4475ee3de2..87baca57d33f 100644
> --- a/xen/arch/x86/hvm/vmx/vmx.c
> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> @@ -3017,8 +3017,8 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
> break;
> }
>
> - gdprintk(XENLOG_WARNING, "RDMSR 0x%08x unimplemented\n", msr);
> - goto gp_fault;
> + if ( guest_unhandled_msr(curr, msr, msr_content, false, true) )
> + goto gp_fault;
> }
>
> done:
> @@ -3319,10 +3319,8 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
> is_last_branch_msr(msr) )
> break;
>
> - gdprintk(XENLOG_WARNING,
> - "WRMSR 0x%08x val 0x%016"PRIx64" unimplemented\n",
> - msr, msr_content);
> - goto gp_fault;
> + if ( guest_unhandled_msr(v, msr, &msr_content, true, true) )
> + goto gp_fault;
> }

I think this could be done in hvm_msr_read_intercept instead of having
to call guest_unhandled_msr from each vendor specific handler?

Oh, I see, that's likely done to differentiate between guest MSR
accesses and emulator ones? I'm not sure we really need to make a
difference between guests MSR accesses and emulator ones, surely in
the past they would be treated equally?

Thanks, Roger.
Re: [PATCH v2 3/4] x86: Allow non-faulting accesses to non-emulated MSRs if policy permits this [ In reply to ]
On 18.02.2021 12:24, Roger Pau Monné wrote:
> On Wed, Jan 20, 2021 at 05:49:11PM -0500, Boris Ostrovsky wrote:
>> --- a/xen/arch/x86/hvm/vmx/vmx.c
>> +++ b/xen/arch/x86/hvm/vmx/vmx.c
>> @@ -3017,8 +3017,8 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
>> break;
>> }
>>
>> - gdprintk(XENLOG_WARNING, "RDMSR 0x%08x unimplemented\n", msr);
>> - goto gp_fault;
>> + if ( guest_unhandled_msr(curr, msr, msr_content, false, true) )
>> + goto gp_fault;
>> }
>>
>> done:
>> @@ -3319,10 +3319,8 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
>> is_last_branch_msr(msr) )
>> break;
>>
>> - gdprintk(XENLOG_WARNING,
>> - "WRMSR 0x%08x val 0x%016"PRIx64" unimplemented\n",
>> - msr, msr_content);
>> - goto gp_fault;
>> + if ( guest_unhandled_msr(v, msr, &msr_content, true, true) )
>> + goto gp_fault;
>> }
>
> I think this could be done in hvm_msr_read_intercept instead of having
> to call guest_unhandled_msr from each vendor specific handler?
>
> Oh, I see, that's likely done to differentiate between guest MSR
> accesses and emulator ones? I'm not sure we really need to make a
> difference between guests MSR accesses and emulator ones, surely in
> the past they would be treated equally?

We did discuss this before. Even if they were treated the same in
the past, that's not correct, and hence we shouldn't suppress the
distinction going forward. A guest explicitly asking to access an
MSR (via RDMSR/WRMSR) is entirely different from the emulator
perhaps just probing an MSR, falling back to some default behavior
if it's unavailable.

Jan
Re: [PATCH v2 3/4] x86: Allow non-faulting accesses to non-emulated MSRs if policy permits this [ In reply to ]
On Thu, Feb 18, 2021 at 12:57:13PM +0100, Jan Beulich wrote:
> On 18.02.2021 12:24, Roger Pau Monné wrote:
> > On Wed, Jan 20, 2021 at 05:49:11PM -0500, Boris Ostrovsky wrote:
> >> --- a/xen/arch/x86/hvm/vmx/vmx.c
> >> +++ b/xen/arch/x86/hvm/vmx/vmx.c
> >> @@ -3017,8 +3017,8 @@ static int vmx_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
> >> break;
> >> }
> >>
> >> - gdprintk(XENLOG_WARNING, "RDMSR 0x%08x unimplemented\n", msr);
> >> - goto gp_fault;
> >> + if ( guest_unhandled_msr(curr, msr, msr_content, false, true) )
> >> + goto gp_fault;
> >> }
> >>
> >> done:
> >> @@ -3319,10 +3319,8 @@ static int vmx_msr_write_intercept(unsigned int msr, uint64_t msr_content)
> >> is_last_branch_msr(msr) )
> >> break;
> >>
> >> - gdprintk(XENLOG_WARNING,
> >> - "WRMSR 0x%08x val 0x%016"PRIx64" unimplemented\n",
> >> - msr, msr_content);
> >> - goto gp_fault;
> >> + if ( guest_unhandled_msr(v, msr, &msr_content, true, true) )
> >> + goto gp_fault;
> >> }
> >
> > I think this could be done in hvm_msr_read_intercept instead of having
> > to call guest_unhandled_msr from each vendor specific handler?
> >
> > Oh, I see, that's likely done to differentiate between guest MSR
> > accesses and emulator ones? I'm not sure we really need to make a
> > difference between guests MSR accesses and emulator ones, surely in
> > the past they would be treated equally?
>
> We did discuss this before. Even if they were treated the same in
> the past, that's not correct, and hence we shouldn't suppress the
> distinction going forward. A guest explicitly asking to access an
> MSR (via RDMSR/WRMSR) is entirely different from the emulator
> perhaps just probing an MSR, falling back to some default behavior
> if it's unavailable.

Ack, then placing the calls to guest_unhandled_msr in vendor code
seems like the best option.

Thanks, Roger.