[linux-yocto] [PATCH 2/6] drivers/edac: include/trace: EDAC improvements
Daniel Dragomir
daniel.dragomir at windriver.com
Thu Jul 26 03:57:42 PDT 2018
From: Marek Majtyka <marekx.majtyka at intel.com>
Changes and fixes:
- CMEM/SMEM driver trace capabilities added
- CMEM/SMEM cpu0 affinity used
- SMEM code maintainability improved
- CMEM multiple ca parity error event added
- CMEM mpr dump on signal bit [21]
- CMEM clear mr5 ca parity error flag added
- CMEM irq storm fix
- kernel 4.9 Axxia EDAC code alignment
Signed-off-by: Marek Majtyka <marekx.majtyka at intel.com>
---
arch/arm64/boot/dts/intel/axc67xx.dtsi | 44 ++--
drivers/edac/Kconfig | 7 +
drivers/edac/axxia_edac-cmc_56xx.c | 383 +++++++++++++++++++++++----------
drivers/edac/axxia_edac-l2_cpu_56xx.c | 3 +-
drivers/edac/axxia_edac-l3_56xx.c | 46 +++-
drivers/edac/axxia_edac-mc_56xx.c | 295 +++++++++++++++++--------
include/trace/events/edac_cmc.h | 101 +++++++++
include/trace/events/edac_mc.h | 104 +++++++++
8 files changed, 744 insertions(+), 239 deletions(-)
create mode 100644 include/trace/events/edac_cmc.h
create mode 100644 include/trace/events/edac_mc.h
diff --git a/arch/arm64/boot/dts/intel/axc67xx.dtsi b/arch/arm64/boot/dts/intel/axc67xx.dtsi
index 7bb4cd8..d4d3171 100644
--- a/arch/arm64/boot/dts/intel/axc67xx.dtsi
+++ b/arch/arm64/boot/dts/intel/axc67xx.dtsi
@@ -232,33 +232,33 @@
};
gpdma0: gpdma at 8005020000 {
- compatible = "lsi,dma32";
+ compatible = "lsi,dma32";
reg = <0x80 0x05020000 0 0x10000>;
interrupts = <GIC_SPI 44 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 45 IRQ_TYPE_LEVEL_HIGH>;
- channel0 {
- interrupts = <GIC_SPI 46 IRQ_TYPE_LEVEL_HIGH>;
- };
-
- channel1 {
- interrupts = <GIC_SPI 47 IRQ_TYPE_LEVEL_HIGH>;
- };
- };
-
- gpdma1: gpdma at 8005030000 {
- compatible = "lsi,dma32";
+ <GIC_SPI 45 IRQ_TYPE_LEVEL_HIGH>;
+ channel0 {
+ interrupts = <GIC_SPI 46 IRQ_TYPE_LEVEL_HIGH>;
+ };
+
+ channel1 {
+ interrupts = <GIC_SPI 47 IRQ_TYPE_LEVEL_HIGH>;
+ };
+ };
+
+ gpdma1: gpdma at 8005030000 {
+ compatible = "lsi,dma32";
reg = <0x80 0x05030000 0 0x10000>;
interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>,
- <GIC_SPI 49 IRQ_TYPE_LEVEL_HIGH>;
+ <GIC_SPI 49 IRQ_TYPE_LEVEL_HIGH>;
status = "disabled";
- channel0 {
- interrupts = <GIC_SPI 50 IRQ_TYPE_LEVEL_HIGH>;
- };
-
- channel1 {
- interrupts = <GIC_SPI 51 IRQ_TYPE_LEVEL_HIGH>;
- };
- };
+ channel0 {
+ interrupts = <GIC_SPI 50 IRQ_TYPE_LEVEL_HIGH>;
+ };
+
+ channel1 {
+ interrupts = <GIC_SPI 51 IRQ_TYPE_LEVEL_HIGH>;
+ };
+ };
i2c0: i2c at 8080600000 {
compatible = "lsi,api2c";
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig
index 856fd8f..5588930 100644
--- a/drivers/edac/Kconfig
+++ b/drivers/edac/Kconfig
@@ -413,6 +413,13 @@ config EDAC_AXXIA_SYSMEM_6700
the System Memory error detection. System Memory error
detection is interrupt driven.
+config DEBUG_EDAC_AXXIA_SYSMEM
+ depends on ARCH_AXXIA
+ bool "AXXIA EDAC SYSMEM error injection interface."
+ help
+ Support for configuration of SYSMEM edac tracing functionality.
+ It works for both 5600 and 6700 board families.
+
config EDAC_AXXIA_CMEM_5600
depends on ARCH_AXXIA
bool "AXXIA EDAC CMem Controller for 5600"
diff --git a/drivers/edac/axxia_edac-cmc_56xx.c b/drivers/edac/axxia_edac-cmc_56xx.c
index 884f746..f29e6926 100644
--- a/drivers/edac/axxia_edac-cmc_56xx.c
+++ b/drivers/edac/axxia_edac-cmc_56xx.c
@@ -1,7 +1,7 @@
/*
* drivers/edac/axxia_edac-mc.c
*
- * EDAC Driver for Intel's Axxia 5600 Configuration Memory Controller
+ * EDAC Driver for Intel's Axxia 5600/6700 Configuration Memory Controller
*
* Copyright (C) 2016 Intel Inc.
*
@@ -31,6 +31,9 @@
#include "edac_core.h"
#include "edac_module.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/edac_cmc.h>
+
#define FMT "%s: syscon lookup failed hence using hardcoded register address\n"
#define MPR_FMT2 "\n%3d %#010x %#010x"
@@ -53,7 +56,9 @@
#define CM_MPR_PAGE 0x1
#define CM_56XX_DENALI_CTL_00 0x0
+#define CM_56XX_DENALI_CTL_33 0x84
#define CM_56XX_DENALI_CTL_34 0x88
+#define CM_56XX_DENALI_CTL_45 0xb4
#define CM_56XX_DENALI_CTL_74 0x128
#define CM_56XX_DENALI_CTL_80 0x140
@@ -90,6 +95,7 @@
#define INT_BIT_8 (0x00000100)
#define INT_BIT_11 (0x00000800)
#define INT_BIT_21 (0x00200000)
+#define INT_BIT_23 (0x00800000)
#define INT_BIT_25 (0x02000000)
#define INT_BIT_30 (0x40000000)
#define INT_BIT_31 (0x80000000)
@@ -108,6 +114,7 @@
INT_BIT_7 |\
INT_BIT_11 |\
INT_BIT_21 |\
+ INT_BIT_23 |\
INT_BIT_31))
#define CM_INT_MASK_FULL (~(\
@@ -120,6 +127,7 @@
INT_BIT_7 |\
INT_BIT_11 |\
INT_BIT_21 |\
+ INT_BIT_23 |\
INT_BIT_25 |\
INT_BIT_30 |\
INT_BIT_31))
@@ -127,67 +135,66 @@
#define CM_INT_MASK_ALL (0x7fffffff)
#define ALIVE_NOTIFICATION_PERIOD (90*1000)
-static int log = 1;
-module_param(log, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(log, "Log each error to kernel log.");
+static cpumask_t only_cpu_0 = { CPU_BITS_CPU0 };
static int force_restart = 1;
-module_param(force_restart, int, S_IRUGO|S_IWUSR);
+module_param(force_restart, int, 0644);
MODULE_PARM_DESC(force_restart, "Machine restart on fatal error.");
static atomic64_t mc_counter = ATOMIC_INIT(0);
/*
- Bit [31] = Logical OR of all lower bits.
- Bit [30] = A CRC error occurred on the write data bus.
- Bit [29] = The user-initiated DLL resync has completed.
- Bit [28] = A state change has been detected on the dfi_init_complete
- signal after initialization.
- Bit [27] = The assertion of the INHIBIT_DRAM_CMD parameter has successfully
- inhibited the command queue.
- Bit [26] = The register interface-initiated mode register write has completed
- and another mode register write may be issued.
- Bit [25] = MPR read command, initiated with a software MPR_READ request, is
- complete.
- Bit [24] = Error received from the PHY on the DFI bus.
- Bit [23] = RESERVED
- Bit [22] = RESERVED
- Bit [21] = A parity error has been detected on the address/control bus on
- a registered DIMM.
- Bit [20] = The leveling operation has completed.
- Bit [19] = A read leveling gate training operation has been requested.
- Bit [18] = A read leveling operation has been requested.
- Bit [17] = A write leveling operation has been requested.
- Bit [16] = A DFI update error has occurred. Error information can be found in
- the UPDATE_ERROR_STATUS parameter.
- Bit [15] = A write leveling error has occurred. Error information can be found
- in the WRLVL_ERROR_STATUS parameter.
- Bit [14] = A read leveling gate training error has occurred. Error information
- can be found in the RDLVL_ERROR_STATUS parameter.
- Bit [13] = A read leveling error has occurred. Error information can be found
- in the RDLVL_ERROR_STATUS parameter.
- Bit [12] = The user has programmed an invalid setting associated with user
- words per burst.
- Examples:
- Setting param_reduc when burst length = 2.
- A 1:2 MC:PHY clock ratio with burst length = 2.
- Bit [11] = A wrap cycle crossing a DRAM page has been detected. This is
- unsupported & may result in memory data corruption.
- Bit [10] = The BIST operation has been completed.
- Bit [9] = The low power operation has been completed.
- Bit [8] = The MC initialization has been completed.
- Bit [7] = An error occurred on the port command channel.
- Bit [6] = Multiple uncorrectable ECC events have been detected.
- Bit [5] = An uncorrectable ECC event has been detected.
- Bit [4] = Multiple correctable ECC events have been detected.
- Bit [3] = A correctable ECC event has been detected.
- Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
- have occurred.
- Bit [1] = A memory access outside the defined PHYSICAL memory space
- has occurred.
- Bit [0] = The memory reset is valid on the DFI bus.
-
- Of these 1, 2, 3, 4, 5, 6, 7, 11, 21, 25, and 30 are of interest.
-*/
+ * Bit [31] = Logical OR of all lower bits.
+ * Bit [30] = A CRC error occurred on the write data bus.
+ * Bit [29] = The user-initiated DLL resync has completed.
+ * Bit [28] = A state change has been detected on the dfi_init_complete
+ * signal after initialization.
+ * Bit [27] = The assertion of the INHIBIT_DRAM_CMD parameter has successfully
+ * inhibited the command queue.
+ * Bit [26] = The register interface-initiated mode register write has completed
+ * and another mode register write may be issued.
+ * Bit [25] = MPR read command, initiated with a software MPR_READ request, is
+ * complete.
+ * Bit [24] = Error received from the PHY on the DFI bus.
+ * Bit [23] = Reported by this driver as a second parity error.
+ * Bit [22] = RESERVED
+ * Bit [21] = A parity error has been detected on the address/control bus on
+ * a registered DIMM.
+ * Bit [20] = The leveling operation has completed.
+ * Bit [19] = A read leveling gate training operation has been requested.
+ * Bit [18] = A read leveling operation has been requested.
+ * Bit [17] = A write leveling operation has been requested.
+ * Bit [16] = A DFI update error has occurred. Error information can be found
+ * in the UPDATE_ERROR_STATUS parameter.
+ * Bit [15] = A write leveling error has occurred. Error information can be
+ * found in the WRLVL_ERROR_STATUS parameter.
+ * Bit [14] = A read leveling gate training error has occurred. Error
+ * information can be found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [13] = A read leveling error has occurred. Error information can be
+ * found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [12] = The user has programmed an invalid setting associated with user
+ * words per burst.
+ * Examples:
+ * Setting param_reduc when burst length = 2.
+ * A 1:2 MC:PHY clock ratio with burst length = 2.
+ * Bit [11] = A wrap cycle crossing a DRAM page has been detected. This is
+ * unsupported & may result in memory data corruption.
+ * Bit [10] = The BIST operation has been completed.
+ * Bit [9] = The low power operation has been completed.
+ * Bit [8] = The MC initialization has been completed.
+ * Bit [7] = An error occurred on the port command channel.
+ * Bit [6] = Multiple uncorrectable ECC events have been detected.
+ * Bit [5] = An uncorrectable ECC event has been detected.
+ * Bit [4] = Multiple correctable ECC events have been detected.
+ * Bit [3] = A correctable ECC event has been detected.
+ * Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
+ * have occurred.
+ * Bit [1] = A memory access outside the defined PHYSICAL memory space
+ * has occurred.
+ * Bit [0] = The memory reset is valid on the DFI bus.
+ *
+ * Of these, bits 1, 2, 3, 4, 5, 6, 7, 11, 13, 14, 15, 16, 21, 23, 24, 25,
+ * and 30 are of interest.
+ */
/*
* MPR dump processing - overview.
@@ -196,10 +203,10 @@ static atomic64_t mc_counter = ATOMIC_INIT(0);
* one need to collect dumps for all available cs. Below given example
* for two cs0/cs1.
*
- * CMEM MC cmmon_isr_sw cmmon_wq
+ * CMEM MC cmmon_isr_sw wq_alerts
* | | |
* | | |
- * |ALERT_N - int_status bit [30] |
+ * |ALERT_N - int_status bit [30] or [21] |
* |------------------>| |
* | |schedule cmmon_wq |
* | |------------------>|
@@ -270,6 +277,19 @@ struct __packed cm_56xx_denali_ctl_00
#endif
};
+struct __packed cm_56xx_denali_ctl_33
+{
+#ifdef CPU_BIG_ENDIAN
+ unsigned reserved : 6;
+ unsigned write : 1;
+ unsigned write_modereg : 25;
+#else /* Little Endian */
+ unsigned write_modereg : 25;
+ unsigned write : 1;
+ unsigned reserved : 6;
+#endif
+};
+
/* Trigger MPR */
struct __packed cm_56xx_denali_ctl_34
{
@@ -292,6 +312,24 @@ struct __packed cm_56xx_denali_ctl_34
#endif
};
+/*
+ * The layout below is identical for all of the following registers, so
+ * a single definition is used: cm_56xx_denali_ctl_45,
+ * cm_56xx_denali_ctl_48, cm_56xx_denali_ctl_53, cm_56xx_denali_ctl_56.
+ */
+struct __packed cm_56xx_denali_ctl_45
+{
+#ifdef CPU_BIG_ENDIAN
+ unsigned absolete1 : 6;
+ unsigned reserved : 7;
+ unsigned mrsingle_data_0 : 17;
+#else /* Little Endian */
+ unsigned mrsingle_data_0 : 17;
+ unsigned reserved : 7;
+ unsigned absolete1 : 6;
+#endif
+};
+
#ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM
#define CM_56XX_DENALI_CTL_62 0xf8
@@ -395,6 +433,7 @@ enum events {
EV_PORT_ERROR,
EV_WRAP_ERROR,
EV_PARITY_ERROR,
+ EV_SEC_PARITY_ERROR,
NR_EVENTS
};
@@ -409,6 +448,7 @@ static char *block_name[] = {
"port_error",
"wrap_error",
"parity_error",
+ "second_parity_error",
"alert_n_cs0_dram0_ca_par_error",
"alert_n_cs0_dram0_crc_error",
"alert_n_cs0_dram1_ca_par_error",
@@ -437,6 +477,7 @@ static const u32 event_mask[NR_EVENTS] = {
[EV_PORT_ERROR] = INT_BIT_7,
[EV_WRAP_ERROR] = INT_BIT_11,
[EV_PARITY_ERROR] = INT_BIT_21,
+ [EV_SEC_PARITY_ERROR] = INT_BIT_23,
};
static const struct event_logging {
@@ -453,6 +494,7 @@ static const struct event_logging {
[EV_PORT_ERROR] = {0, KERN_CRIT, "Port error"},
[EV_WRAP_ERROR] = {0, KERN_CRIT, "Wrap error"},
[EV_PARITY_ERROR] = {0, KERN_CRIT, "Parity error"},
+ [EV_SEC_PARITY_ERROR] = {1, KERN_CRIT, "Second parity error"},
};
/* Private structure for common edac device */
@@ -513,28 +555,28 @@ static int setup_fault_injection(struct intel_edac_dev_info *dev_info,
struct cm_56xx_denali_ctl_62 denali_ctl_62;
if (ncr_read(dev_info->cm_region,
- CM_56XX_DENALI_CTL_62,
- 4, &denali_ctl_62))
- goto error_read;
+ CM_56XX_DENALI_CTL_62,
+ 4, &denali_ctl_62))
+ goto error_read;
denali_ctl_62.xor_check_bits = fault;
if (ncr_write(dev_info->cm_region,
- CM_56XX_DENALI_CTL_62,
- 4, (u32 *) &denali_ctl_62))
- goto error_write;
+ CM_56XX_DENALI_CTL_62,
+ 4, (u32 *) &denali_ctl_62))
+ goto error_write;
if (ncr_read(dev_info->cm_region,
- CM_56XX_DENALI_CTL_62,
- 4, &denali_ctl_62))
- goto error_read;
+ CM_56XX_DENALI_CTL_62,
+ 4, &denali_ctl_62))
+ goto error_read;
denali_ctl_62.fwc = (enable > 0 ? 0x1 : 0x0);
if (ncr_write(dev_info->cm_region,
- CM_56XX_DENALI_CTL_62,
- 4, (u32 *) &denali_ctl_62))
- goto error_write;
+ CM_56XX_DENALI_CTL_62,
+ 4, (u32 *) &denali_ctl_62))
+ goto error_write;
return 0;
error_read:
@@ -639,7 +681,7 @@ static struct edac_dev_sysfs_attribute device_block_attr[] = {
{
.attr = {
.name = "mpr_page1",
- .mode = (S_IRUGO | S_IWUSR)
+ .mode = (0644)
},
.show = mpr1_dump_show,
.store = NULL
@@ -670,7 +712,7 @@ handle_events(struct intel_edac_dev_info *edac_dev,
set_val = readl(
edac_dev->axi2ser3_region
+ SYSCON_PERSIST_SCRATCH);
- /* set bit 3 in pscratch reg */
+ /* set bit 7 in pscratch reg */
set_val = set_val
| CMEM_PERSIST_SCRATCH_BIT;
writel(set_val,
@@ -721,6 +763,48 @@ store_mpr_dump(struct intel_edac_dev_info *edac_dev, int cs)
MAX_DQ * MPR_PAGE_BYTES);
}
+static int clear_ca_parity_error(struct intel_edac_dev_info *dev_info, int cs)
+{
+
+ struct cm_56xx_denali_ctl_45 denali_ctl_45;
+ struct cm_56xx_denali_ctl_33 denali_ctl_33;
+
+ if (ncr_read(dev_info->cm_region,
+ CM_56XX_DENALI_CTL_45 + 0xc + 0x20 * cs,
+ 4, (u32 *) &denali_ctl_45))
+ goto error_read;
+
+ /*
+ * Always clear: we cannot get information about a state change
+ * from denali_ctl_45, so the check below would be unreliable:
+ * if (denali_ctl_45.mrsingle_data_0 & 0x10)
+ */
+ denali_ctl_45.mrsingle_data_0 &= 0x3FFEF; /* clear A4 bit */
+ denali_ctl_33.write = 1; /* write */
+ denali_ctl_33.write_modereg = 0x800005; /* MR5 write */
+ denali_ctl_33.write_modereg |= (cs << 8); /* chip select */
+
+ if (ncr_write(dev_info->cm_region,
+ CM_56XX_DENALI_CTL_45 + 0x20 * cs,
+ 4, (u32 *) &denali_ctl_45))
+ goto error_write;
+
+ if (ncr_write(dev_info->cm_region,
+ CM_56XX_DENALI_CTL_33,
+ 4, (u32 *) &denali_ctl_33))
+ goto error_write;
+ return 0;
+
+error_write:
+ printk_ratelimited("%s: Write error when clearing ca parity in mr5\n",
+ dev_name(&dev_info->pdev->dev));
+ return 1;
+error_read:
+ printk_ratelimited("%s: Read error when clearing ca parity in mr5\n",
+ dev_name(&dev_info->pdev->dev));
+ return 1;
+}
+
static inline void __attribute__((always_inline))
update_alert_counters(struct intel_edac_dev_info *edac_dev, int cs)
{
@@ -739,8 +823,11 @@ update_alert_counters(struct intel_edac_dev_info *edac_dev, int cs)
(u8 (*)[MPR_PAGE_BYTES]) (&edac_dev->data->mpr.dram_0_page[0]);
int i;
- for (i = 0; i < MAX_DQ; ++i)
+ for (i = 0; i < edac_dev->data->dram_count; ++i) {
inc_alert_counter(edac_dev->data->alerts, cs, i, dram[i][3]);
+ trace_edac_cmc_dump_processed(edac_dev->cm_region >> 16,
+ cs, i, (int) dram[i][3]);
+ }
}
@@ -751,6 +838,9 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs)
unsigned long flags;
u32 regval;
int i;
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM
+ u32 node = edac_dev->cm_region >> 16;
+#endif
mpr->mpr_page_id = page;
@@ -761,11 +851,32 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs)
goto error_read;
mpr->dram_0_page[i] = regval & 0xff;
+
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM
+ trace_edac_cmc_dump_collected(node, cs, i, 0,
+ (int) mpr->dram_0_page[i]);
+#endif
+
mpr->dram_1_page[i] = ((regval & 0xff00) >> 8);
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM
+ trace_edac_cmc_dump_collected(node, cs, i, 1,
+ (int) mpr->dram_1_page[i]);
+#endif
if (edac_dev->data->dram_count == MAX_DQ) {
mpr->dram_2_page[i] = ((regval & 0xff0000) >> 16);
+
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM
+ trace_edac_cmc_dump_collected(node, cs, i, 2,
+ (int) mpr->dram_2_page[i]);
+#endif
+
mpr->dram_3_page[i] = ((regval & 0xff000000) >> 24);
+
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_CMEM
+ trace_edac_cmc_dump_collected(node, cs, i, 3,
+ (int) mpr->dram_3_page[i]);
+#endif
}
}
raw_spin_lock_irqsave(&edac_dev->data->mpr_data_lock, flags);
@@ -773,6 +884,7 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs)
raw_spin_unlock_irqrestore(&edac_dev->data->mpr_data_lock, flags);
update_alert_counters(edac_dev, cs);
+ clear_ca_parity_error(edac_dev, cs);
return 0;
error_read:
@@ -804,13 +916,16 @@ cmmon_isr_sw(int interrupt, void *device)
/*
* NOTE:
* ISR function is only reading int_status, and write into int_act
- * registers.
+ * and int_mask registers (as well as rte config load)
*
- * - first handle critical events, which might require restart
+ * - first handle driver initialization if not configured in uboot;
+ * once initialized, mask irq bit 8 so it no longer raises an interrupt,
+ * as this bit must be acknowledged by rte,
+ * - second handle critical events, which might require restart
* (handle_events) and then to the job outside isr
- * - second collect MPR dump if any exists and then trigger new if
+ * - third collect MPR dump if any exists and then trigger new if
* needed - all outside isr,
- * - third wake up job outside isr to trigger mpr dump procedure when
+ * - finally wake up job outside isr to trigger mpr dump procedure when
* ALERT_N reported (bit [30] is on)
*/
@@ -818,8 +933,12 @@ cmmon_isr_sw(int interrupt, void *device)
4, (u32 *) &denali_ctl_84))
goto error_read;
- if (denali_ctl_84.int_status & INT_BIT_8) {
- if (dev_info->is_controller_configured == 0) {
+ trace_edac_cmc_int_status(dev_info->cm_region >> 16,
+ denali_ctl_84.int_status);
+
+ if (dev_info->is_controller_configured == 0) {
+ /* first init case */
+ if (denali_ctl_84.int_status & INT_BIT_8) {
ret = initialize(dev_info);
if (ret)
goto error_init;
@@ -829,28 +948,46 @@ cmmon_isr_sw(int interrupt, void *device)
goto error_init;
dev_info->is_controller_configured = 1;
- }
- if (dev_info->is_ddr4)
- denali_ctl_86.int_mask = CM_INT_MASK_FULL;
- else
- denali_ctl_86.int_mask = CM_INT_MASK_BASE;
+ denali_ctl_85.int_ack = INT_BIT_8;
+ if (ncr_write(dev_info->cm_region,
+ CM_56XX_DENALI_CTL_85,
+ 4, (u32 *) &denali_ctl_85))
+ goto error_write;
+
+ denali_ctl_86.int_mask = CM_INT_MASK_ALL;
+ if (ncr_write(dev_info->cm_region,
+ CM_56XX_DENALI_CTL_86,
+ 4, (u32 *) &denali_ctl_86))
+ goto error_write;
- if (ncr_write(dev_info->cm_region,
- CM_56XX_DENALI_CTL_86,
- 4, (u32 *) &denali_ctl_86)) {
- goto error_write;
}
+ /*
+ * SAFETY CHECK
+ * One cannot go further if driver is not fully functional!!!
+ */
return IRQ_HANDLED;
- }
- /*
- * SAFETY CHECK
- * one cannot go further if driver is not fully functional!!!
- */
- if (dev_info->is_controller_configured == 0)
- return IRQ_HANDLED;
+ } else {
+ /* reload config case */
+ if (denali_ctl_84.int_status & INT_BIT_8) {
+ denali_ctl_85.int_ack = INT_BIT_8;
+ if (ncr_write(dev_info->cm_region,
+ CM_56XX_DENALI_CTL_85,
+ 4, (u32 *) &denali_ctl_85))
+ goto error_write;
+
+ denali_ctl_86.int_mask = CM_INT_MASK_ALL;
+ if (ncr_write(dev_info->cm_region,
+ CM_56XX_DENALI_CTL_86,
+ 4, (u32 *) &denali_ctl_86))
+ goto error_write;
+
+
+ return IRQ_HANDLED;
+ }
+ }
handle_events(dev_info, &denali_ctl_84);
atomic_set(&dev_info->data->event_ready, 1);
@@ -867,10 +1004,9 @@ cmmon_isr_sw(int interrupt, void *device)
denali_ctl_85.int_ack |= INT_BIT_25;
}
- if (denali_ctl_84.int_status & INT_BIT_30) {
+ if (denali_ctl_84.int_status & (INT_BIT_30 | INT_BIT_21)) {
atomic_inc(&dev_info->data->dump_in_progress);
wake_up(&dev_info->data->dump_wq);
- denali_ctl_85.int_ack |= INT_BIT_30;
}
}
@@ -892,7 +1028,7 @@ error_init:
printk_ratelimited("%s: Error during driver initialization\n",
dev_name(&dev_info->pdev->dev));
uninitialize(dev_info, ret,
- 0 == dev_info->is_controller_configured ? 1 : 0);
+ dev_info->is_controller_configured == 0 ? 1 : 0);
return IRQ_HANDLED;
}
@@ -909,9 +1045,9 @@ static void intel_cm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
start:
/* keep hung up monitor happy 90 sec's */
- if (0 == wait_event_timeout(dev_info->data->dump_wq,
+ if (wait_event_timeout(dev_info->data->dump_wq,
atomic_read(&dev_info->data->dump_in_progress),
- msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+ msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
goto start;
if (dev_info->finish_alerts)
@@ -940,13 +1076,15 @@ start:
4, (u32 *) &denali_ctl_34))
goto error_write;
+ trace_edac_cmc_dump_triggered(dev_info->cm_region >> 16, i);
+
/* wait */
ret = wait_event_timeout(dev_info->data->dump_wq,
atomic_read(&dev_info->data->dump_ready),
msecs_to_jiffies(1000));
if (dev_info->finish_alerts)
goto finish;
- if (0 == ret)
+ if (ret == 0)
goto timeout_error;
atomic_set(&dev_info->data->dump_ready, 0);
@@ -1003,9 +1141,9 @@ static void intel_cm_events_error_check(struct edac_device_ctl_info *edac_dev)
u32 counter;
while (1) {
- if (0 == wait_event_timeout(dev_info->data->event_wq,
+ if (wait_event_timeout(dev_info->data->event_wq,
atomic_read(&dev_info->data->event_ready),
- msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+ msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
continue;
atomic_set(&dev_info->data->event_ready, 0);
@@ -1026,6 +1164,7 @@ static void intel_cm_events_error_check(struct edac_device_ctl_info *edac_dev)
case EV_MULT_ILLEGAL:
case EV_UNCORR_ECC:
case EV_MULT_UNCORR_ECC:
+ case EV_SEC_PARITY_ERROR:
edac_device_handle_multi_ue(edac_dev,
0, i, counter,
edac_dev->ctl_name);
@@ -1094,10 +1233,10 @@ static int get_active_dram(struct intel_edac_dev_info *dev_info)
return dram;
}
- if (0 == denali_ctl_74.bank_diff)
+ if (denali_ctl_74.bank_diff == 0)
dram = MAX_DQ/2;
- if (1 == denali_ctl_74.bank_diff)
+ if (denali_ctl_74.bank_diff == 1)
dram = MAX_DQ;
return dram;
@@ -1214,7 +1353,6 @@ static int initialize(struct intel_edac_dev_info *dev_info)
pr_err("Could not get dram version. Is config loaded?\n");
return ERR_STAGE_1;
}
- /*dev_info->is_ddr4 = 1;*/
dev_info->finish_alerts = 0;
dev_info->finish_events = 0;
@@ -1237,6 +1375,7 @@ static int initialize(struct intel_edac_dev_info *dev_info)
}
dev_info->edac_dev->log_ce = 0;
+
instance = &dev_info->edac_dev->instances[0];
/* It just gives more descriptive name. */
@@ -1303,14 +1442,14 @@ static int enable_workers(struct intel_edac_dev_info *dev_info)
atomic_set(&dev_info->data->event_ready, 0);
atomic_set(&dev_info->data->dump_in_progress, 0);
- dev_info->wq_events = alloc_workqueue("%s-events", WQ_MEM_RECLAIM, 1,
+ dev_info->wq_events = alloc_workqueue("%s-events", 0, 1,
(dev_info->ctl_name));
if (!dev_info->wq_events)
return ERR_STAGE_3;
if (dev_info->is_ddr4) {
dev_info->wq_alerts =
- alloc_workqueue("%s-alerts", WQ_MEM_RECLAIM, 1,
+ alloc_workqueue("%s-alerts", 0, 1,
(dev_info->ctl_name));
if (!dev_info->wq_alerts)
return ERR_STAGE_4;
@@ -1321,8 +1460,9 @@ static int enable_workers(struct intel_edac_dev_info *dev_info)
INIT_WORK(&dev_info->offload_events, axxia_events_work);
if (dev_info->is_ddr4)
- queue_work(dev_info->wq_alerts, &dev_info->offload_alerts);
- queue_work(dev_info->wq_events, &dev_info->offload_events);
+ queue_work_on(0, dev_info->wq_alerts,
+ &dev_info->offload_alerts);
+ queue_work_on(0, dev_info->wq_events, &dev_info->offload_events);
return 0;
}
@@ -1331,6 +1471,7 @@ static int enable_driver_irq(struct intel_edac_dev_info *dev_info)
{
int irq = -1, rc = 0;
struct cm_56xx_denali_ctl_86 denali_ctl_86;
+ struct irq_desc *desc;
snprintf(&dev_info->data->irq_name[0], IRQ_NAME_LEN,
"%s-mon", dev_info->ctl_name);
@@ -1381,6 +1522,10 @@ static int enable_driver_irq(struct intel_edac_dev_info *dev_info)
return ERR_STAGE_6;
}
+
+ desc = irq_to_desc(irq);
+ sched_setaffinity(desc->action->thread->pid, &only_cpu_0);
+
return 0;
}
@@ -1397,8 +1542,8 @@ axxia_cmem_read(struct file *filp, char *buffer, size_t length, loff_t *offset)
if (*offset > 0)
return 0;
- buf = kmalloc(PAGE_SIZE, __GFP_WAIT);
- if (NULL == buf)
+ buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+ if (buf == NULL)
goto no_mem_buffer;
mutex_lock(&dev_info->state_machine_lock);
@@ -1443,8 +1588,8 @@ axxia_cmem_write(struct file *file, const char __user *buffer,
struct intel_edac_dev_info *dev_info =
(struct intel_edac_dev_info *) file->private_data;
- buf = kmalloc(count + 1, __GFP_WAIT);
- if (NULL == buf)
+ buf = kmalloc(count + 1, GFP_KERNEL);
+ if (buf == NULL)
goto no_mem_buffer;
memset(buf, 0, count + 1);
@@ -1606,12 +1751,12 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
if (denali_ctl_00.start == 1) {
/* uboot has configured CMEM */
- if (0xa == denali_ctl_00.dram_class) {
+ if (denali_ctl_00.dram_class == 0xa) {
pr_info("%s supports mpr dump (DDR4).\n",
dev_info->ctl_name);
dev_info->is_ddr4 = 1;
}
- if (0x6 == denali_ctl_00.dram_class) {
+ if (denali_ctl_00.dram_class == 0x6) {
pr_info("%s doesn't support mpr dump (DDR3).\n",
dev_info->ctl_name);
}
@@ -1656,7 +1801,7 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
/* each instance shall know each private data */
dev_info->dir_entry =
- proc_create_data(dev_info->proc_name, S_IWUSR,
+ proc_create_data(dev_info->proc_name, 0200,
NULL, &axxia_edac_cmem_proc_ops,
dev_info);
@@ -1671,7 +1816,7 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
err_uninit:
uninitialize(dev_info, ret,
- 0 == dev_info->is_controller_configured ? 1 : 0);
+ dev_info->is_controller_configured == 0 ? 1 : 0);
err_init:
mutex_destroy(&dev_info->data->edac_sysfs_data_lock);
mutex_destroy(&dev_info->state_machine_lock);
@@ -1698,7 +1843,7 @@ static int intel_edac_mc_remove(struct platform_device *pdev)
#endif
uninitialize(dev_info, ERR_STAGE_8,
- 0 == dev_info->is_controller_configured ? 1 : 0);
+ dev_info->is_controller_configured == 0 ? 1 : 0);
if (dev_info->edac_dev != NULL) {
edac_device_del_device(&dev_info->pdev->dev);
diff --git a/drivers/edac/axxia_edac-l2_cpu_56xx.c b/drivers/edac/axxia_edac-l2_cpu_56xx.c
index a9400e8..a183c65 100644
--- a/drivers/edac/axxia_edac-l2_cpu_56xx.c
+++ b/drivers/edac/axxia_edac-l2_cpu_56xx.c
@@ -1,7 +1,7 @@
/*
* drivers/edac/axxia_edac-l2_cpu_56xx.c
*
- * EDAC Driver for Intel's Axxia 5600 System Memory Controller
+ * EDAC Driver for Intel's Axxia 5600/6700 System Memory Controller
*
* Copyright (C) 2016 Intel Inc.
*
@@ -291,6 +291,7 @@ static const struct of_device_id intel_edac_l2_match[] = {
},
#endif
+
{},
};
diff --git a/drivers/edac/axxia_edac-l3_56xx.c b/drivers/edac/axxia_edac-l3_56xx.c
index 32427c6..b630031 100644
--- a/drivers/edac/axxia_edac-l3_56xx.c
+++ b/drivers/edac/axxia_edac-l3_56xx.c
@@ -21,6 +21,8 @@
#include <linux/of_platform.h>
#include <linux/of.h>
#include <linux/of_address.h>
+#include <linux/of_irq.h>
+#include <linux/irq.h>
#include <linux/platform_device.h>
#include <linux/reboot.h>
#include <linux/mfd/syscon.h>
@@ -77,6 +79,8 @@
#define CCN_NODE_ERR_SYND_REG1 0x408
#define CCN_NODE_ERR_SYND_CLR 0x480
+static cpumask_t only_cpu_0 = { CPU_BITS_CPU0};
+
union dickens_hnf_err_syndrome_reg0 {
struct __packed {
#ifdef CPU_BIG_ENDIAN
@@ -250,17 +254,16 @@ static irqreturn_t ccn_irq_thread(int irq, void *device)
return IRQ_HANDLED;
}
-static irqreturn_t ccn_irq_handler(int irq, void *device)
+static irqreturn_t collect_and_clean(struct intel_edac_dev_info *dev_info,
+ int report_error)
{
- struct intel_edac_dev_info *dev_info = device;
void __iomem *ccn_base = dev_info->dickens_L3;
-
- irqreturn_t res = IRQ_NONE;
u64 err_sig_val[3];
u64 err_type_value[4];
u64 err_or;
u64 err_synd_reg0 = 0, err_synd_reg1 = 0;
int i;
+ irqreturn_t res = IRQ_NONE;
/* PMU overflow is a special case - for the future */
err_or = err_sig_val[0] = readq(ccn_base + CCN_MN_ERR_SIG_VAL_63_0);
@@ -351,11 +354,21 @@ static irqreturn_t ccn_irq_handler(int irq, void *device)
}
}
- if (err_or)
+ if (err_or && report_error)
dev_err(&dev_info->pdev->dev,
"Error reported in %016llx %016llx %016llx.\n",
err_sig_val[2], err_sig_val[1], err_sig_val[0]);
+ return res;
+}
+
+static irqreturn_t ccn_irq_handler(int irq, void *device)
+{
+ struct intel_edac_dev_info *dev_info = device;
+ irqreturn_t res = IRQ_NONE;
+
+ res = collect_and_clean(dev_info, 1);
+
/* HERE all error data collected, but interrupt not deasserted */
return IRQ_WAKE_THREAD;
}
@@ -414,8 +427,8 @@ static int intel_edac_l3_probe(struct platform_device *pdev)
struct intel_edac_dev_info *dev_info = NULL;
struct device_node *np = pdev->dev.of_node;
struct resource *r;
-
struct arm_smccc_res ret;
+ struct irq_desc *desc;
dev_info = devm_kzalloc(&pdev->dev, sizeof(*dev_info), GFP_KERNEL);
if (!dev_info)
@@ -457,6 +470,7 @@ static int intel_edac_l3_probe(struct platform_device *pdev)
}
dev_info->edac_dev->log_ce = 0;
+
r = platform_get_resource(pdev, IORESOURCE_IRQ, 0);
if (!r)
return -EINVAL;
@@ -467,10 +481,21 @@ static int intel_edac_l3_probe(struct platform_device *pdev)
* Once -1 return, it means old uboot without ccn service.
* Then only polling mechanism is allowed, as it was before.
*/
- if (ARM_SMCCC_UNKNOWN != __arm_smccc_smc(0xc4000027,
- CCN_MN_ERRINT_STATUS__PMU_EVENTS__DISABLE,
- 0, 0, &ret))
+ __arm_smccc_smc(0xc4000027, CCN_MN_ERRINT_STATUS__PMU_EVENTS__DISABLE,
+ 0, 0, &ret);
+ trace_edacl3_smc_results(&ret);
+
+ if (ret.a0 != ARM_SMCCC_UNKNOWN) {
+ irqreturn_t res;
+
dev_info->irq_used = 1;
+ /* clear all errors left over from earlier boot stages */
+ res = collect_and_clean(dev_info, 0);
+ __arm_smccc_smc(0xc4000027,
+ CCN_MN_ERRINT_STATUS__INTREQ__DESSERT,
+ 0, 0, &ret);
+ trace_edacl3_smc_results(&ret);
+ }
dev_info->edac_dev->pvt_info = dev_info;
dev_info->edac_dev->dev = &dev_info->pdev->dev;
@@ -499,6 +524,9 @@ static int intel_edac_l3_probe(struct platform_device *pdev)
goto err2;
}
+ desc = irq_to_desc(r->start);
+ sched_setaffinity(desc->action->thread->pid, &only_cpu_0);
+
return 0;
err2:
edac_device_free_ctl_info(dev_info->edac_dev);
diff --git a/drivers/edac/axxia_edac-mc_56xx.c b/drivers/edac/axxia_edac-mc_56xx.c
index 00f3462..fba04d1 100644
--- a/drivers/edac/axxia_edac-mc_56xx.c
+++ b/drivers/edac/axxia_edac-mc_56xx.c
@@ -1,7 +1,7 @@
/*
* drivers/edac/axxia_edac-mc.c
*
- * EDAC Driver for Intel's Axxia 5600 System Memory Controller
+ * EDAC Driver for Intel's Axxia 5600/6700 System Memory Controller
*
* Copyright (C) 2016 Intel Inc.
*
@@ -29,6 +29,9 @@
#include "edac_core.h"
#include "edac_module.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/edac_mc.h>
+
#define FMT "%s: syscon lookup failed hence using hardcoded register address\n"
#define MPR_FMT9 "\n%3d %#010x %#010x %#010x %#010x"\
@@ -91,75 +94,99 @@
#define MPR_PAGE_BYTES 4
#define MPR_ERRORS 2 /* CRC, CA Parity error */
-#define SM_INT_MASK_LOW (0xfbbfef01)
+#define INT_BIT_0 (0x00000001)
+#define INT_BIT_1 (0x00000002)
+#define INT_BIT_2 (0x00000004)
+#define INT_BIT_3 (0x00000008)
+#define INT_BIT_4 (0x00000010)
+#define INT_BIT_5 (0x00000020)
+#define INT_BIT_6 (0x00000040)
+#define INT_BIT_7 (0x00000080)
+#define INT_BIT_12 (0x00001000)
+#define INT_BIT_22 (0x00400000)
+#define INT_BIT_24 (0x01000000)
+#define INT_BIT_26 (0x04000000)
+
+
+#define SM_INT_MASK_LOW (~(\
+ INT_BIT_1 |\
+ INT_BIT_2 |\
+ INT_BIT_3 |\
+ INT_BIT_4 |\
+ INT_BIT_5 |\
+ INT_BIT_6 |\
+ INT_BIT_7 |\
+ INT_BIT_12 |\
+ INT_BIT_22 |\
+ INT_BIT_24 |\
+ INT_BIT_26))
+
#define SM_INT_MASK_ALL_LOW (0xffffffff)
-#define SM_INT_MASK_HIGH (0x1)
-#define SM_INT_MASK_ALL_HIGH (0x7)
+#define SM_INT_MASK_HIGH (INT_BIT_0)
+#define SM_INT_MASK_ALL_HIGH (INT_BIT_0|INT_BIT_1|INT_BIT_2)
#define ALIVE_NOTIFICATION_PERIOD (90*1000)
-static int log = 1;
-module_param(log, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(log, "Log each error to kernel log.");
+static cpumask_t only_cpu_0 = { CPU_BITS_CPU0};
static int force_restart = 1;
-module_param(force_restart, int, S_IRUGO|S_IWUSR);
+module_param(force_restart, int, 0644);
MODULE_PARM_DESC(force_restart, "Machine restart on fatal error.");
static atomic64_t mc_counter = ATOMIC_INIT(0);
/*
- Bit [34] = Logical OR of all lower bits.
- Bit [33] = A CRC error occurred on the write data bus.
- Bit [32] = The software-initiated control word write has completed.
- Bit [31] = The user-initiated DLL resync has completed.
- Bit [30] = A state change has been detected on the
- dfi_init_complete signal after initialization.
- Bit [29] = The assertion of the INHIBIT_DRAM_CMD parameter has
- successfully inhibited the command queue.
- Bit [28] = The register interface-initiated mode register write has
- completed and another mode register write may be issued.
- Bit [27] = A Low Power Interface (LPI) timeout error has occurred.
- Bit [26] = MPR read command, initiated with a software MPR_READ request,
- is complete.
- Bit [25] = Error received from the PHY on the DFI bus.
- Bit [24] = RESERVED
- Bit [23] = RESERVED
- Bit [22] = A parity error has been detected on the address/control bus
- on a registered DIMM.
- Bit [21] = The leveling operation has completed.
- Bit [20] = A read leveling gate training operation has been requested.
- Bit [19] = A read leveling operation has been requested.
- Bit [18] = A write leveling operation has been requested.
- Bit [17] = A DFI update error has occurred. Error information can be
- found in the UPDATE_ERROR_STATUS parameter.
- Bit [16] = A write leveling error has occurred. Error information can
- be found in the WRLVL_ERROR_STATUS parameter.
- Bit [15] = A read leveling gate training error has occurred. Error
- information can be found in the RDLVL_ERROR_STATUS parameter.
- Bit [14] = A read leveling error has occurred. Error information can be
- found in the RDLVL_ERROR_STATUS parameter.
- Bit [13] = The user has programmed an invalid setting associated with
- user words per burst.
- Examples: Setting param_reduc when burst length = 2. A 1:2
- MC:PHY clock ratio with burst length = 2.
- Bit [12] = A wrap cycle crossing a DRAM page has been detected. This
- is unsupported & may result in memory data corruption.
- Bit [11] = A write was attempted to a writeprotected region.
- Bit [10] = The BIST operation has been completed.
- Bit [9] = The low power operation has been completed.
- Bit [8] = The MC initialization has been completed.
- Bit [7] = An error occurred on the port command channel.
- Bit [6] = Multiple uncorrectable ECC events have been detected.
- Bit [5] = An uncorrectable ECC event has been detected.
- Bit [4] = Multiple correctable ECC events have been detected.
- Bit [3] = A correctable ECC event has been detected.
- Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
- have occurred.
- Bit [1] = A memory access outside the defined PHYSICAL memory space
- has occurred.
- Bit [0] = The memory reset is valid on the DFI bus.
-
- Of these 1, 2, 3, 4, 5, 6, 7, 12, 22 and 26 are of interest.
-*/
+ * Bit [34] = Logical OR of all lower bits.
+ * Bit [33] = A CRC error occurred on the write data bus.
+ * Bit [32] = The software-initiated control word write has completed.
+ * Bit [31] = The user-initiated DLL resync has completed.
+ * Bit [30] = A state change has been detected on the
+ * dfi_init_complete signal after initialization.
+ * Bit [29] = The assertion of the INHIBIT_DRAM_CMD parameter has
+ * successfully inhibited the command queue.
+ * Bit [28] = The register interface-initiated mode register write has
+ * completed and another mode register write may be issued.
+ * Bit [27] = A Low Power Interface (LPI) timeout error has occurred.
+ * Bit [26] = MPR read command, initiated with a software MPR_READ request,
+ * is complete.
+ * Bit [25] = Error received from the PHY on the DFI bus.
+ * Bit [24] = RESERVED
+ * Bit [23] = RESERVED
+ * Bit [22] = A parity error has been detected on the address/control bus
+ * on a registered DIMM.
+ * Bit [21] = The leveling operation has completed.
+ * Bit [20] = A read leveling gate training operation has been requested.
+ * Bit [19] = A read leveling operation has been requested.
+ * Bit [18] = A write leveling operation has been requested.
+ * Bit [17] = A DFI update error has occurred. Error information can be
+ * found in the UPDATE_ERROR_STATUS parameter.
+ * Bit [16] = A write leveling error has occurred. Error information can
+ * be found in the WRLVL_ERROR_STATUS parameter.
+ * Bit [15] = A read leveling gate training error has occurred. Error
+ * information can be found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [14] = A read leveling error has occurred. Error information can be
+ * found in the RDLVL_ERROR_STATUS parameter.
+ * Bit [13] = The user has programmed an invalid setting associated with
+ * user words per burst.
+ * Examples: Setting param_reduc when burst length = 2. A 1:2
+ * MC:PHY clock ratio with burst length = 2.
+ * Bit [12] = A wrap cycle crossing a DRAM page has been detected. This
+ * is unsupported & may result in memory data corruption.
+ * Bit [11] = A write was attempted to a write-protected region.
+ * Bit [10] = The BIST operation has been completed.
+ * Bit [9] = The low power operation has been completed.
+ * Bit [8] = The MC initialization has been completed.
+ * Bit [7] = An error occurred on the port command channel.
+ * Bit [6] = Multiple uncorrectable ECC events have been detected.
+ * Bit [5] = An uncorrectable ECC event has been detected.
+ * Bit [4] = Multiple correctable ECC events have been detected.
+ * Bit [3] = A correctable ECC event has been detected.
+ * Bit [2] = Multiple accesses outside the defined PHYSICAL memory space
+ * have occurred.
+ * Bit [1] = A memory access outside the defined PHYSICAL memory space
+ * has occurred.
+ * Bit [0] = The memory reset is valid on the DFI bus.
+ *
+ * Of these 1, 2, 3, 4, 5, 6, 7, 12, 22, 24 and 26 are of interest.
+ */
/*
* MPR dump processing - overview.
@@ -559,15 +586,15 @@ static char *block_name[] = {
static const u32 event_mask[NR_EVENTS] = {
- [EV_ILLEGAL] = 0x00000002,
- [EV_MULT_ILLEGAL] = 0x00000004,
- [EV_CORR_ECC] = 0x00000008,
- [EV_MULT_CORR_ECC] = 0x00000010,
- [EV_UNCORR_ECC] = 0x00000020,
- [EV_MULT_UNCORR_ECC] = 0x00000040,
- [EV_PORT_ERROR] = 0x00000080,
- [EV_WRAP_ERROR] = 0x00001000,
- [EV_PARITY_ERROR] = 0x00400000,
+ [EV_ILLEGAL] = INT_BIT_1,
+ [EV_MULT_ILLEGAL] = INT_BIT_2,
+ [EV_CORR_ECC] = INT_BIT_3,
+ [EV_MULT_CORR_ECC] = INT_BIT_4,
+ [EV_UNCORR_ECC] = INT_BIT_5,
+ [EV_MULT_UNCORR_ECC] = INT_BIT_6,
+ [EV_PORT_ERROR] = INT_BIT_7,
+ [EV_WRAP_ERROR] = INT_BIT_12,
+ [EV_PARITY_ERROR] = INT_BIT_22,
};
static const struct event_logging {
@@ -735,7 +762,7 @@ static struct edac_dev_sysfs_attribute device_block_attr[] = {
{
.attr = {
.name = "mpr_page1",
- .mode = (S_IRUGO | S_IWUSR)
+ .mode = (0644)
},
.show = mpr1_dump_show,
.store = NULL},
@@ -833,8 +860,11 @@ update_alert_counters(struct intel_edac_dev_info *edac_dev, int cs)
(u8 (*)[MPR_PAGE_BYTES]) (&edac_dev->data->mpr.dram_0_page[0]);
int i;
- for (i = 0; i < MAX_DQ; ++i)
+ for (i = 0; i < edac_dev->data->dram_count; ++i) {
inc_alert_counter(edac_dev->data->alerts, cs, i, dram[i][3]);
+ trace_edac_mc_dump_processed(edac_dev->sm_region >> 16,
+ cs, i, (int) dram[i][3]);
+ }
}
@@ -845,6 +875,9 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs)
unsigned long flags;
u32 regval;
int i;
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ u32 node = edac_dev->sm_region >> 16;
+#endif
mpr->mpr_page_id = page;
@@ -855,9 +888,25 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs)
goto error_read;
mpr->dram_0_page[i] = regval & 0xff;
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 0,
+ (int) mpr->dram_0_page[i]);
+#endif
mpr->dram_1_page[i] = ((regval & 0xff00) >> 8);
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 1,
+ (int) mpr->dram_1_page[i]);
+#endif
mpr->dram_2_page[i] = ((regval & 0xff0000) >> 16);
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 2,
+ (int) mpr->dram_2_page[i]);
+#endif
mpr->dram_3_page[i] = ((regval & 0xff000000) >> 24);
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 3,
+ (int) mpr->dram_3_page[i]);
+#endif
if (ncr_read(edac_dev->sm_region,
(SM_56XX_DENALI_CTL_59 + (0x14 * i)),
@@ -865,9 +914,25 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs)
goto error_read;
mpr->dram_4_page[i] = regval & 0xff;
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 4,
+ (int) mpr->dram_4_page[i]);
+#endif
mpr->dram_5_page[i] = ((regval & 0xff00) >> 8);
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 5,
+ (int) mpr->dram_5_page[i]);
+#endif
mpr->dram_6_page[i] = ((regval & 0xff0000) >> 16);
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 6,
+ (int) mpr->dram_6_page[i]);
+#endif
mpr->dram_7_page[i] = ((regval & 0xff000000) >> 24);
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 7,
+ (int) mpr->dram_7_page[i]);
+#endif
if (ncr_read(edac_dev->sm_region,
(SM_56XX_DENALI_CTL_60 + (0x14 * i)),
@@ -875,11 +940,27 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs)
goto error_read;
mpr->dram_8_page[i] = regval & 0xff;
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 8,
+ (int) mpr->dram_8_page[i]);
+#endif
if (edac_dev->data->dram_count == MAX_DQ) {
mpr->dram_9_page[i] = ((regval & 0xff00) >> 8);
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 9,
+ (int) mpr->dram_9_page[i]);
+#endif
mpr->dram_10_page[i] = ((regval & 0xff0000) >> 16);
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 10,
+ (int) mpr->dram_10_page[i]);
+#endif
mpr->dram_11_page[i] = ((regval & 0xff000000) >> 24);
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 11,
+ (int) mpr->dram_11_page[i]);
+#endif
if (ncr_read(edac_dev->sm_region,
(SM_56XX_DENALI_CTL_60 +
@@ -887,9 +968,25 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs)
goto error_read;
mpr->dram_12_page[i] = regval & 0xff;
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 12,
+ (int) mpr->dram_12_page[i]);
+#endif
mpr->dram_13_page[i] = ((regval & 0xff00) >> 8);
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 13,
+ (int) mpr->dram_13_page[i]);
+#endif
mpr->dram_14_page[i] = ((regval & 0xff0000) >> 16);
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 14,
+ (int) mpr->dram_14_page[i]);
+#endif
mpr->dram_15_page[i] = ((regval & 0xff000000) >> 24);
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 15,
+ (int) mpr->dram_15_page[i]);
+#endif
if (ncr_read(edac_dev->sm_region,
(SM_56XX_DENALI_CTL_61 +
@@ -897,7 +994,15 @@ collect_mpr_dump(struct intel_edac_dev_info *edac_dev, u8 page, int cs)
goto error_read;
mpr->dram_16_page[i] = regval & 0xff;
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 16,
+ (int) mpr->dram_16_page[i]);
+#endif
mpr->dram_17_page[i] = ((regval & 0xff00) >> 8);
+#ifdef CONFIG_DEBUG_EDAC_AXXIA_SYSMEM
+ trace_edac_mc_dump_collected(node, cs, i, 17,
+ (int) mpr->dram_17_page[i]);
+#endif
}
}
raw_spin_lock_irqsave(&edac_dev->data->mpr_data_lock, flags);
@@ -945,24 +1050,30 @@ smmon_isr_sw(int interrupt, void *device)
4, (u32 *) &denali_ctl_367))
goto error_read;
- if (denali_ctl_367.int_status & 0x4) {
+ trace_edac_mc_int_status(dev_info->sm_region >> 16, 0,
+ denali_ctl_367.int_status);
+
+ if (denali_ctl_367.int_status & INT_BIT_2) {
if (ncr_read(dev_info->sm_region, SM_56XX_DENALI_CTL_366,
4, (u32 *) &denali_ctl_366))
goto error_read;
+ trace_edac_mc_int_status(dev_info->sm_region >> 16, 1,
+ denali_ctl_366.int_status);
+
handle_events(dev_info, &denali_ctl_366);
atomic_set(&dev_info->data->event_ready, 1);
wake_up(&dev_info->data->event_wq);
denali_ctl_368.int_ack =
- (denali_ctl_366.int_status & 0xf8ffffff);
+ (denali_ctl_366.int_status & (~(INT_BIT_26)));
if (dev_info->is_ddr4) {
- if (denali_ctl_366.int_status & 0x4000000) {
+ if (denali_ctl_366.int_status & INT_BIT_26) {
atomic_set(&dev_info->data->dump_ready, 1);
wake_up(&dev_info->data->dump_wq);
- denali_ctl_368.int_ack |= 0x4000000;
+ denali_ctl_368.int_ack |= INT_BIT_26;
}
}
if (ncr_write(dev_info->sm_region, SM_56XX_DENALI_CTL_368,
@@ -970,12 +1081,12 @@ smmon_isr_sw(int interrupt, void *device)
goto error_write;
}
- if (denali_ctl_367.int_status & 0x2) {
+ if (denali_ctl_367.int_status & INT_BIT_1) {
if (dev_info->is_ddr4) {
atomic_inc(&dev_info->data->dump_in_progress);
wake_up(&dev_info->data->dump_wq);
}
- denali_ctl_369.int_ack = 0x2;
+ denali_ctl_369.int_ack = INT_BIT_1;
if (ncr_write(dev_info->sm_region, SM_56XX_DENALI_CTL_369,
4, (u32 *) &denali_ctl_369))
goto error_write;
@@ -1006,9 +1117,9 @@ static void intel_sm_alerts_error_check(struct edac_device_ctl_info *edac_dev)
start:
/* keep hung up monitor happy 90 sec's */
- if (0 == wait_event_timeout(dev_info->data->dump_wq,
+ if (wait_event_timeout(dev_info->data->dump_wq,
atomic_read(&dev_info->data->dump_in_progress),
- msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+ msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
goto start;
if (dev_info->finish_alerts)
@@ -1036,6 +1147,9 @@ start:
SM_56XX_DENALI_CTL_57,
4, (u32 *) &denali_ctl_57))
goto error_write;
+
+ trace_edac_mc_dump_triggered(dev_info->sm_region >> 16, i);
+
/* wait */
wait_event(dev_info->data->dump_wq,
atomic_read(&dev_info->data->dump_ready));
@@ -1091,9 +1205,9 @@ static void intel_sm_events_error_check(struct edac_device_ctl_info *edac_dev)
u32 counter;
while (1) {
- if (0 == wait_event_timeout(dev_info->data->event_wq,
+ if (wait_event_timeout(dev_info->data->event_wq,
atomic_read(&dev_info->data->event_ready),
- msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)))
+ msecs_to_jiffies(ALIVE_NOTIFICATION_PERIOD)) == 0)
continue;
atomic_set(&dev_info->data->event_ready, 0);
@@ -1221,6 +1335,7 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
struct sm_56xx_denali_ctl_371 denali_ctl_371;
int cs_count = MAX_CS;
int dram_count = MAX_DQ;
+ struct irq_desc *desc;
count = atomic64_inc_return(&mc_counter);
if ((count - 1) == MEMORY_CONTROLLERS)
@@ -1342,6 +1457,7 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
}
dev_info->edac_dev->log_ce = 0;
+
instance = &dev_info->edac_dev->instances[0];
/* It just gives more descriptive name. */
@@ -1401,14 +1517,14 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
"%s-mon", dev_info->ctl_name);
dev_info->wq_events =
- alloc_workqueue("%s-events", WQ_MEM_RECLAIM, 1,
+ alloc_workqueue("%s-events", 0, 1,
(dev_info->ctl_name));
if (!dev_info->wq_events)
goto err_nosysfs;
if (dev_info->is_ddr4) {
dev_info->wq_alerts =
- alloc_workqueue("%s-alerts", WQ_MEM_RECLAIM, 1,
+ alloc_workqueue("%s-alerts", 0, 1,
(dev_info->ctl_name));
if (!dev_info->wq_alerts)
@@ -1420,8 +1536,9 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
INIT_WORK(&dev_info->offload_events, axxia_events_work);
if (dev_info->is_ddr4)
- queue_work(dev_info->wq_alerts, &dev_info->offload_alerts);
- queue_work(dev_info->wq_events, &dev_info->offload_events);
+ queue_work_on(0, dev_info->wq_alerts,
+ &dev_info->offload_alerts);
+ queue_work_on(0, dev_info->wq_events, &dev_info->offload_events);
irq = platform_get_irq(pdev, 0);
if (irq < 0) {
@@ -1435,8 +1552,7 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
if (dev_info->is_ddr4)
denali_ctl_370.int_mask = SM_INT_MASK_LOW;
else
- denali_ctl_370.int_mask = SM_INT_MASK_LOW |
- 0x04000000;
+ denali_ctl_370.int_mask = SM_INT_MASK_LOW | INT_BIT_26;
if (ncr_write(dev_info->sm_region, SM_56XX_DENALI_CTL_370,
4, (u32 *) &denali_ctl_370)) {
@@ -1487,6 +1603,9 @@ static int intel_edac_mc_probe(struct platform_device *pdev)
}
goto err_noirq;
}
+ desc = irq_to_desc(irq);
+ sched_setaffinity(desc->action->thread->pid, &only_cpu_0);
+
return 0;
err_noirq:
diff --git a/include/trace/events/edac_cmc.h b/include/trace/events/edac_cmc.h
new file mode 100644
index 0000000..143d58f
--- /dev/null
+++ b/include/trace/events/edac_cmc.h
@@ -0,0 +1,101 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM edac_cmc
+
+#if !defined(_TRACE_EDAC_CMC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_EDAC_CMC_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+/* Logs the raw CMEM interrupt status word together with the node id. */
+TRACE_EVENT(edac_cmc_int_status,
+ TP_PROTO(u32 node, u32 int_status),
+
+ TP_ARGS(node, int_status),
+
+ TP_STRUCT__entry(
+ __field(u32, node)
+ __field(u32, int_status)
+ ),
+
+ TP_fast_assign(
+ __entry->node = node;
+ __entry->int_status = int_status;
+ ),
+
+ TP_printk("CMEM(node=0x%x) int_status=0x%08x",
+ (u32) __entry->node, (u32) __entry->int_status)
+);
+/* Logs one processed CMEM dump value (node, cs, dram index, value); u32s print as %u. */
+TRACE_EVENT(edac_cmc_dump_processed,
+ TP_PROTO(u32 node, u32 cs, u32 dram, u32 val),
+
+ TP_ARGS(node, cs, dram, val),
+
+ TP_STRUCT__entry(
+ __field(u32, node)
+ __field(u32, cs)
+ __field(u32, dram)
+ __field(u32, val)
+ ),
+
+ TP_fast_assign(
+ __entry->node = node;
+ __entry->cs = cs;
+ __entry->dram = dram;
+ __entry->val = val;
+ ),
+
+ TP_printk("CMEM(node=0x%x) cs=%u dram=%u value=0x%x",
+ (u32) __entry->node, (u32) __entry->cs,
+ (u32) __entry->dram, (u32) __entry->val)
+);
+/* Logs one collected CMEM dump byte: the value of dram_<dram>_page[<byte>] for a cs. */
+TRACE_EVENT(edac_cmc_dump_collected,
+ TP_PROTO(u32 node, u32 cs, u32 byte, u32 dram, u32 val),
+
+ TP_ARGS(node, cs, byte, dram, val),
+
+ TP_STRUCT__entry(
+ __field(u32, node)
+ __field(u32, cs)
+ __field(u32, byte)
+ __field(u32, dram)
+ __field(u32, val)
+ ),
+
+ TP_fast_assign(
+ __entry->node = node;
+ __entry->cs = cs;
+ __entry->byte = byte;
+ __entry->dram = dram;
+ __entry->val = val;
+ ),
+
+ TP_printk("CMEM(node=0x%x) cs=%u dram_%u_page[byte=%u]=0x%x",
+ (u32) __entry->node, (u32) __entry->cs, (u32) __entry->dram,
+ (u32) __entry->byte, (u32) __entry->val)
+);
+/* Logs that a CMEM dump was triggered for the given cs on the given node. */
+TRACE_EVENT(edac_cmc_dump_triggered,
+ TP_PROTO(u32 node, u32 cs),
+
+ TP_ARGS(node, cs),
+
+ TP_STRUCT__entry(
+ __field(u32, node)
+ __field(u32, cs)
+ ),
+
+ TP_fast_assign(
+ __entry->node = node;
+ __entry->cs = cs;
+ ),
+
+ TP_printk("CMEM(node=0x%x) cs=%u",
+ (u32) __entry->node, (u32) __entry->cs)
+);
+#endif /* _TRACE_EDAC_CMC_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/edac_mc.h b/include/trace/events/edac_mc.h
new file mode 100644
index 0000000..fa09fda
--- /dev/null
+++ b/include/trace/events/edac_mc.h
@@ -0,0 +1,104 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM edac_mc
+
+#if !defined(_TRACE_EDAC_MC_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_EDAC_MC_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+/* Logs an SMEM interrupt status word; idx tells which status word the caller read. */
+TRACE_EVENT(edac_mc_int_status,
+ TP_PROTO(u32 node, u32 idx, u32 int_status),
+
+ TP_ARGS(node, idx, int_status),
+
+ TP_STRUCT__entry(
+ __field(u32, node)
+ __field(u32, idx)
+ __field(u32, int_status)
+ ),
+
+ TP_fast_assign(
+ __entry->node = node;
+ __entry->idx = idx;
+ __entry->int_status = int_status;
+ ),
+
+ TP_printk("SMEM(node=0x%x) int_status[%u]=0x%08x",
+ (u32) __entry->node, (u32) __entry->idx,
+ (u32) __entry->int_status)
+);
+/* Logs one processed SMEM dump value (node, cs, dram index, value); u32s print as %u. */
+TRACE_EVENT(edac_mc_dump_processed,
+ TP_PROTO(u32 node, u32 cs, u32 dram, u32 val),
+
+ TP_ARGS(node, cs, dram, val),
+
+ TP_STRUCT__entry(
+ __field(u32, node)
+ __field(u32, cs)
+ __field(u32, dram)
+ __field(u32, val)
+ ),
+
+ TP_fast_assign(
+ __entry->node = node;
+ __entry->cs = cs;
+ __entry->dram = dram;
+ __entry->val = val;
+ ),
+
+ TP_printk("SMEM(node=0x%x) cs=%u dram=%u value=0x%x",
+ (u32) __entry->node, (u32) __entry->cs,
+ (u32) __entry->dram, (u32) __entry->val)
+);
+/* Logs one collected SMEM dump byte: the value of dram_<dram>_page[<byte>] for a cs. */
+TRACE_EVENT(edac_mc_dump_collected,
+ TP_PROTO(u32 node, u32 cs, u32 byte, u32 dram, u32 val),
+
+ TP_ARGS(node, cs, byte, dram, val),
+
+ TP_STRUCT__entry(
+ __field(u32, node)
+ __field(u32, cs)
+ __field(u32, byte)
+ __field(u32, dram)
+ __field(u32, val)
+ ),
+
+ TP_fast_assign(
+ __entry->node = node;
+ __entry->cs = cs;
+ __entry->byte = byte;
+ __entry->dram = dram;
+ __entry->val = val;
+ ),
+
+ TP_printk("SMEM(node=0x%x) cs=%u dram_%u_page[byte=%u]=0x%x",
+ (u32) __entry->node, (u32) __entry->cs, (u32) __entry->dram,
+ (u32) __entry->byte, (u32) __entry->val)
+);
+/* Logs that an SMEM dump was triggered for the given cs on the given node. */
+TRACE_EVENT(edac_mc_dump_triggered,
+ TP_PROTO(u32 node, u32 cs),
+
+ TP_ARGS(node, cs),
+
+ TP_STRUCT__entry(
+ __field(u32, node)
+ __field(u32, cs)
+ ),
+
+ TP_fast_assign(
+ __entry->node = node;
+ __entry->cs = cs;
+ ),
+
+ TP_printk("SMEM(node=0x%x) cs=%u",
+ (u32) __entry->node, (u32) __entry->cs)
+);
+#endif /* _TRACE_EDAC_MC_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
--
2.7.4
More information about the linux-yocto
mailing list