Mailing List Archive

[xen-unstable] x86 mce: Make mce_action usable for both delayed handler and urgent handler
# HG changeset patch
# User Keir Fraser <keir.fraser@citrix.com>
# Date 1276154291 -3600
# Node ID 2d2812de6792e51c722e51baf6b16e4b776f41b3
# Parent 2979e10a3ca8c3e9d3f172b92fdcf3a66b670b80
x86 mce: Make mce_action usable for both the delayed handler and the
urgent handler

Originally mce_action was called only by the delayed handler. Change it
so that it can be used by both the delayed handler and the urgent
handler. Wrap it with mce_delayed_action for the delayed handler.

Change the return value to be clearer: mce_action now returns an
enum mce_result instead of a bare 0/1.

Change the mca handler match from an mca_code value to a function
(owned_error) to be more flexible, and change the interface to
mce_handler to take a struct mca_binfo so that more information can
be passed.

Signed-off-by: Jiang, Yunhong <yunhong.jiang@intel.com>
---
xen/arch/x86/cpu/mcheck/mce_intel.c | 210 ++++++++++++++++++++++--------------
xen/arch/x86/cpu/mcheck/x86_mca.h | 15 +-
2 files changed, 140 insertions(+), 85 deletions(-)

diff -r 2979e10a3ca8 -r 2d2812de6792 xen/arch/x86/cpu/mcheck/mce_intel.c
--- a/xen/arch/x86/cpu/mcheck/mce_intel.c Thu Jun 10 08:17:38 2010 +0100
+++ b/xen/arch/x86/cpu/mcheck/mce_intel.c Thu Jun 10 08:18:11 2010 +0100
@@ -154,89 +154,124 @@ static void mce_barrier_enter(struct mce
static void mce_barrier_enter(struct mce_softirq_barrier *);
static void mce_barrier_exit(struct mce_softirq_barrier *);

-static void intel_UCR_handler(struct mcinfo_bank *bank,
- struct mcinfo_global *global,
- struct mcinfo_extended *extension,
- struct mca_handle_result *result);
-#define INTEL_MAX_RECOVERY 2
-struct mca_error_handler intel_recovery_handler[INTEL_MAX_RECOVERY] =
- {{0x017A, intel_UCR_handler}, {0x00C0, intel_UCR_handler}};
+struct mca_error_handler *mce_dhandlers, *mce_uhandlers;
+int mce_dhandler_num, mce_uhandler_num;
+
+enum mce_result
+{
+ MCER_NOERROR,
+ MCER_RECOVERED,
+ /* Not recoverd, but can continue */
+ MCER_CONTINUE,
+ MCER_RESET,
+};
+
+/* Maybe called in MCE context, no lock, no printk */
+static enum mce_result mce_action(struct cpu_user_regs *regs,
+ mctelem_cookie_t mctc)
+{
+ struct mc_info *local_mi;
+ enum mce_result ret = MCER_NOERROR;
+ uint32_t i;
+ struct mcinfo_common *mic = NULL;
+ struct mca_handle_result mca_res;
+ struct mca_binfo binfo;
+ struct mca_error_handler *handlers = mce_dhandlers;
+ int handler_num = mce_dhandler_num;
+
+ /* When in mce context, regs is valid */
+ if (regs)
+ {
+ handler_num = mce_uhandler_num;
+ handlers = mce_uhandlers;
+ }
+
+ /* At least a default handler should be registerd */
+ ASSERT(handler_num);
+
+ local_mi = (struct mc_info*)mctelem_dataptr(mctc);
+ x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
+ if (mic == NULL) {
+ printk(KERN_ERR "MCE: get local buffer entry failed\n ");
+ return MCER_CONTINUE;
+ }
+
+ memset(&binfo, 0, sizeof(binfo));
+ binfo.mig = (struct mcinfo_global *)mic;
+ binfo.mi = local_mi;
+
+ /* Processing bank information */
+ x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
+
+ for ( ; ret != MCER_RESET && mic && mic->size;
+ mic = x86_mcinfo_next(mic) )
+ {
+ if (mic->type != MC_TYPE_BANK) {
+ continue;
+ }
+ binfo.mib = (struct mcinfo_bank*)mic;
+ binfo.bank = binfo.mib->mc_bank;
+ memset(&mca_res, 0x0f, sizeof(mca_res));
+ for ( i = 0; i < handler_num; i++ ) {
+ if (handlers[i].owned_error(binfo.mib->mc_status))
+ {
+ handlers[i].recovery_handler(binfo.bank, &binfo, &mca_res);
+
+ if (mca_res.result & MCA_OWNER)
+ binfo.mib->mc_domid = mca_res.owner;
+
+ if (mca_res.result == MCA_NEED_RESET)
+ ret = MCER_RESET;
+ else if (mca_res.result == MCA_RECOVERED)
+ {
+ if (ret < MCER_RECOVERED)
+ ret = MCER_RECOVERED;
+ }
+ else if (mca_res.result == MCA_NO_ACTION)
+ {
+ if (ret < MCER_CONTINUE)
+ ret = MCER_CONTINUE;
+ }
+ break;
+ }
+ }
+ ASSERT(i != handler_num);
+ }
+
+ return ret;
+}

/*
* Called from mctelem_process_deferred. Return 1 if the telemetry
* should be committed for dom0 consumption, 0 if it should be
* dismissed.
*/
-static int mce_action(mctelem_cookie_t mctc)
-{
- struct mc_info *local_mi;
- uint32_t i;
- struct mcinfo_common *mic = NULL;
- struct mcinfo_global *mc_global;
- struct mcinfo_bank *mc_bank;
- struct mca_handle_result mca_res;
-
- local_mi = (struct mc_info*)mctelem_dataptr(mctc);
- x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
- if (mic == NULL) {
- printk(KERN_ERR "MCE: get local buffer entry failed\n ");
- return 0;
- }
-
- mc_global = (struct mcinfo_global *)mic;
-
- /* Processing bank information */
- x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
-
- for ( ; mic && mic->size; mic = x86_mcinfo_next(mic) ) {
- if (mic->type != MC_TYPE_BANK) {
- continue;
- }
- mc_bank = (struct mcinfo_bank*)mic;
-
- /* TODO: Add recovery actions here, such as page-offline, etc */
- memset(&mca_res, 0x0f, sizeof(mca_res));
- for ( i = 0; i < INTEL_MAX_RECOVERY; i++ ) {
- if ( ((mc_bank->mc_status & 0xffff) ==
- intel_recovery_handler[i].mca_code) ||
- ((mc_bank->mc_status & 0xfff0) ==
- intel_recovery_handler[i].mca_code)) {
- /* For SRAR, OVER = 1 should have caused reset
- * For SRAO, OVER = 1 skip recovery action, continue execution
- */
- if (!(mc_bank->mc_status & MCi_STATUS_OVER))
- intel_recovery_handler[i].recovery_handler
- (mc_bank, mc_global, NULL, &mca_res);
- else {
- if (!(mc_global->mc_gstatus & MCG_STATUS_RIPV))
- mca_res.result = MCA_NEED_RESET;
- else
- mca_res.result = MCA_NO_ACTION;
- }
- if (mca_res.result & MCA_OWNER)
- mc_bank->mc_domid = mca_res.owner;
- if (mca_res.result == MCA_NEED_RESET)
- /* DOMID_XEN*/
- mc_panic("MCE: Software recovery failed for the UCR "
- "error\n");
- else if (mca_res.result == MCA_RECOVERED)
- mce_printk(MCE_VERBOSE, "MCE: The UCR error is"
- "successfully recovered by software!\n");
- else if (mca_res.result == MCA_NO_ACTION)
- mce_printk(MCE_VERBOSE, "MCE: Overwrite SRAO error can't"
- "do recover action, RIPV=1, let it be.\n");
- break;
- }
- }
- /* For SRAR, no defined recovery action should have caused reset
- * in MCA Handler
- */
- if ( i >= INTEL_MAX_RECOVERY )
- mce_printk(MCE_VERBOSE, "MCE: No software recovery action"
- " found for this SRAO error\n");
-
- }
- return 1;
+static int mce_delayed_action(mctelem_cookie_t mctc)
+{
+ enum mce_result result;
+ int ret = 0;
+
+ result = mce_action(NULL, mctc);
+
+ switch (result)
+ {
+ case MCER_RESET:
+ panic("MCE: Software recovery failed for the UCR\n");
+ break;
+ case MCER_RECOVERED:
+ dprintk(XENLOG_INFO, "MCE: Error is successfully recovered\n");
+ ret = 1;
+ break;
+ case MCER_CONTINUE:
+ dprintk(XENLOG_INFO, "MCE: Error can't be recovered, "
+ "system is tainted\n");
+ ret = 1;
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+ return ret;
}

/* Softirq Handler for this MCE# processing */
@@ -274,7 +309,7 @@ static void mce_softirq(void)
* vMCE MSRs virtualization buffer
*/
for_each_online_cpu(workcpu) {
- mctelem_process_deferred(workcpu, mce_action);
+ mctelem_process_deferred(workcpu, mce_delayed_action);
}

/* Step2: Send Log to DOM0 through vIRQ */
@@ -466,11 +501,18 @@ intel_get_extended_msrs(struct mcinfo_gl
return mc_ext;
}

-static void intel_UCR_handler(struct mcinfo_bank *bank,
- struct mcinfo_global *global,
- struct mcinfo_extended *extension,
+#define INTEL_MAX_RECOVERY 2
+static int is_async_memerr(uint64_t status)
+{
+ return (status & 0xFFFF) == 0x17A || (status & 0xFFF0) == 0xC0;
+}
+
+static void intel_memerr_dhandler(int bnum,
+ struct mca_binfo *binfo,
struct mca_handle_result *result)
{
+ struct mcinfo_bank *bank = binfo->mib;
+ struct mcinfo_global *global = binfo->mig;
struct domain *d;
unsigned long mfn, gfn;
uint32_t status;
@@ -545,6 +587,9 @@ static void intel_UCR_handler(struct mci
}
}

+struct mca_error_handler intel_mce_dhandlers[] =
+ {{is_async_memerr, intel_memerr_dhandler}};
+
static void intel_machine_check(struct cpu_user_regs * regs, long error_code)
{
uint64_t gstatus;
@@ -1007,6 +1052,9 @@ static void intel_init_mce(void)
x86_mce_vector_register(intel_machine_check);
mce_recoverable_register(intel_recoverable_scan);
mce_need_clearbank_register(intel_need_clearbank_scan);
+
+ mce_dhandlers = intel_mce_dhandlers;
+ mce_dhandler_num = sizeof(intel_mce_dhandlers)/sizeof(struct mca_error_handler);
}

static int intel_init_mca_banks(void)
diff -r 2979e10a3ca8 -r 2d2812de6792 xen/arch/x86/cpu/mcheck/x86_mca.h
--- a/xen/arch/x86/cpu/mcheck/x86_mca.h Thu Jun 10 08:17:38 2010 +0100
+++ b/xen/arch/x86/cpu/mcheck/x86_mca.h Thu Jun 10 08:18:11 2010 +0100
@@ -151,6 +151,15 @@ struct mca_handle_result
struct recovery_action *action;
};

+/*Keep bank so that we can get staus even if mib is NULL */
+struct mca_binfo {
+ int bank;
+ struct mcinfo_global *mig;
+ struct mcinfo_bank *mib;
+ struct mc_info *mi;
+ struct cpu_user_regs *regs;
+};
+
extern void (*mca_prehandler)( struct cpu_user_regs *regs,
struct mca_handle_result *result);

@@ -161,10 +170,8 @@ struct mca_error_handler
* a seperate function to decode the corresponding actions
* for the particular mca error later.
*/
- uint16_t mca_code;
- void (*recovery_handler)( struct mcinfo_bank *bank,
- struct mcinfo_global *global,
- struct mcinfo_extended *extension,
+ int (*owned_error)(uint64_t status);
+ void (*recovery_handler)(int bank, struct mca_binfo *binfo,
struct mca_handle_result *result);
};


_______________________________________________
Xen-changelog mailing list
Xen-changelog@lists.xensource.com
http://lists.xensource.com/xen-changelog