| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610 |
- // SPDX-License-Identifier: GPL-2.0-only
- /*
- * Ampere Computing SoC's SMpro Error Monitoring Driver
- *
- * Copyright (c) 2022, Ampere Computing LLC
- *
- */
- #include <linux/mod_devicetable.h>
- #include <linux/module.h>
- #include <linux/platform_device.h>
- #include <linux/regmap.h>
/*
 * GPI RAS error status register.
 * Bit 0 is checked for the SMpro channel and bit 1 for the PMpro channel
 * before the internal error/warning registers are read.
 */
#define GPI_RAS_ERR			0x7E

/* Core and L2C error registers: count, data length, data */
#define CORE_CE_ERR_CNT			0x80
#define CORE_CE_ERR_LEN			0x81
#define CORE_CE_ERR_DATA		0x82
#define CORE_UE_ERR_CNT			0x83
#define CORE_UE_ERR_LEN			0x84
#define CORE_UE_ERR_DATA		0x85

/* Memory error registers: count, data length, data */
#define MEM_CE_ERR_CNT			0x90
#define MEM_CE_ERR_LEN			0x91
#define MEM_CE_ERR_DATA			0x92
#define MEM_UE_ERR_CNT			0x93
#define MEM_UE_ERR_LEN			0x94
#define MEM_UE_ERR_DATA			0x95

/* RAS internal error/warning registers (SMpro) */
#define ERR_SMPRO_TYPE			0xA0
#define ERR_PMPRO_TYPE			0xA1
#define ERR_SMPRO_INFO_LO		0xA2
#define ERR_SMPRO_INFO_HI		0xA3
#define ERR_SMPRO_DATA_LO		0xA4
#define ERR_SMPRO_DATA_HI		0xA5
#define WARN_SMPRO_INFO_LO		0xAA
#define WARN_SMPRO_INFO_HI		0xAB

/* RAS internal error/warning registers (PMpro) */
#define ERR_PMPRO_INFO_LO		0xA6
#define ERR_PMPRO_INFO_HI		0xA7
#define ERR_PMPRO_DATA_LO		0xA8
#define ERR_PMPRO_DATA_HI		0xA9
#define WARN_PMPRO_INFO_LO		0xAC
#define WARN_PMPRO_INFO_HI		0xAD

/* Boot stage register */
#define BOOTSTAGE			0xB0

/*
 * DIMM syndrome registers: the slot is written to the select register and
 * the syndrome is read back from the error register. The syndrome is only
 * read while the boot stage field equals DIMM_SYNDROME_STAGE.
 */
#define DIMM_SYNDROME_SEL		0xB4
#define DIMM_SYNDROME_ERR		0xB5
#define DIMM_SYNDROME_STAGE		4

/* PCIe error registers: count, data length, data */
#define PCIE_CE_ERR_CNT			0xC0
#define PCIE_CE_ERR_LEN			0xC1
#define PCIE_CE_ERR_DATA		0xC2
#define PCIE_UE_ERR_CNT			0xC3
#define PCIE_UE_ERR_LEN			0xC4
#define PCIE_UE_ERR_DATA		0xC5

/* Other error registers: count, data length, data */
#define OTHER_CE_ERR_CNT		0xD0
#define OTHER_CE_ERR_LEN		0xD1
#define OTHER_CE_ERR_DATA		0xD2
#define OTHER_UE_ERR_CNT		0xD8
#define OTHER_UE_ERR_LEN		0xD9
#define OTHER_UE_ERR_DATA		0xDA

/* Event data registers */
#define VRD_WARN_FAULT_EVENT_DATA	0x78
#define VRD_HOT_EVENT_DATA		0x79
#define DIMM_HOT_EVENT_DATA		0x7A
#define DIMM_2X_REFRESH_EVENT_DATA	0x96

/* Size in bytes of one CE/UE error data record */
#define MAX_READ_BLOCK_LENGTH		48

/* Channel indices of the RAS internal error sources */
#define RAS_SMPRO_ERR			0
#define RAS_PMPRO_ERR			1
/*
 * Kinds of errors reported as 48-byte records. The values are used as
 * indices into smpro_error_table[].
 */
enum RAS_48BYTES_ERR_TYPES {
	CORE_CE_ERR = 0,
	CORE_UE_ERR,
	MEM_CE_ERR,
	MEM_UE_ERR,
	PCIE_CE_ERR,
	PCIE_UE_ERR,
	OTHER_CE_ERR,
	OTHER_UE_ERR,
	NUM_48BYTES_ERR_TYPE,	/* number of entries above, keep last */
};
- struct smpro_error_hdr {
- u8 count; /* Number of the RAS errors */
- u8 len; /* Number of data bytes */
- u8 data; /* Start of 48-byte data */
- u8 max_cnt; /* Max num of errors */
- };
- /*
- * Included Address of registers to get Count, Length of data and Data
- * of the 48 bytes error data
- */
- static struct smpro_error_hdr smpro_error_table[] = {
- [CORE_CE_ERR] = {
- .count = CORE_CE_ERR_CNT,
- .len = CORE_CE_ERR_LEN,
- .data = CORE_CE_ERR_DATA,
- .max_cnt = 32
- },
- [CORE_UE_ERR] = {
- .count = CORE_UE_ERR_CNT,
- .len = CORE_UE_ERR_LEN,
- .data = CORE_UE_ERR_DATA,
- .max_cnt = 32
- },
- [MEM_CE_ERR] = {
- .count = MEM_CE_ERR_CNT,
- .len = MEM_CE_ERR_LEN,
- .data = MEM_CE_ERR_DATA,
- .max_cnt = 16
- },
- [MEM_UE_ERR] = {
- .count = MEM_UE_ERR_CNT,
- .len = MEM_UE_ERR_LEN,
- .data = MEM_UE_ERR_DATA,
- .max_cnt = 16
- },
- [PCIE_CE_ERR] = {
- .count = PCIE_CE_ERR_CNT,
- .len = PCIE_CE_ERR_LEN,
- .data = PCIE_CE_ERR_DATA,
- .max_cnt = 96
- },
- [PCIE_UE_ERR] = {
- .count = PCIE_UE_ERR_CNT,
- .len = PCIE_UE_ERR_LEN,
- .data = PCIE_UE_ERR_DATA,
- .max_cnt = 96
- },
- [OTHER_CE_ERR] = {
- .count = OTHER_CE_ERR_CNT,
- .len = OTHER_CE_ERR_LEN,
- .data = OTHER_CE_ERR_DATA,
- .max_cnt = 8
- },
- [OTHER_UE_ERR] = {
- .count = OTHER_UE_ERR_CNT,
- .len = OTHER_UE_ERR_LEN,
- .data = OTHER_UE_ERR_DATA,
- .max_cnt = 8
- },
- };
- /*
- * List of SCP registers which are used to get
- * one type of RAS Internal errors.
- */
- struct smpro_int_error_hdr {
- u8 type;
- u8 info_l;
- u8 info_h;
- u8 data_l;
- u8 data_h;
- u8 warn_l;
- u8 warn_h;
- };
- static struct smpro_int_error_hdr list_smpro_int_error_hdr[] = {
- [RAS_SMPRO_ERR] = {
- .type = ERR_SMPRO_TYPE,
- .info_l = ERR_SMPRO_INFO_LO,
- .info_h = ERR_SMPRO_INFO_HI,
- .data_l = ERR_SMPRO_DATA_LO,
- .data_h = ERR_SMPRO_DATA_HI,
- .warn_l = WARN_SMPRO_INFO_LO,
- .warn_h = WARN_SMPRO_INFO_HI,
- },
- [RAS_PMPRO_ERR] = {
- .type = ERR_PMPRO_TYPE,
- .info_l = ERR_PMPRO_INFO_LO,
- .info_h = ERR_PMPRO_INFO_HI,
- .data_l = ERR_PMPRO_DATA_LO,
- .data_h = ERR_PMPRO_DATA_HI,
- .warn_l = WARN_PMPRO_INFO_LO,
- .warn_h = WARN_PMPRO_INFO_HI,
- },
- };
/* Per-device driver data: the regmap used for every register access. */
struct smpro_errmon {
	struct regmap *regmap;	/* obtained from the parent device in probe */
};
/* Event kinds; the values are used as indices into smpro_event_table[]. */
enum EVENT_TYPES {
	VRD_WARN_FAULT_EVENT = 0,
	VRD_HOT_EVENT,
	DIMM_HOT_EVENT,
	DIMM_2X_REFRESH_EVENT,
	NUM_EVENTS_TYPE,	/* number of entries above, keep last */
};
- /* Included Address of event source and data registers */
- static u8 smpro_event_table[NUM_EVENTS_TYPE] = {
- VRD_WARN_FAULT_EVENT_DATA,
- VRD_HOT_EVENT_DATA,
- DIMM_HOT_EVENT_DATA,
- DIMM_2X_REFRESH_EVENT_DATA,
- };
- static ssize_t smpro_event_data_read(struct device *dev,
- struct device_attribute *da, char *buf,
- int channel)
- {
- struct smpro_errmon *errmon = dev_get_drvdata(dev);
- s32 event_data;
- int ret;
- ret = regmap_read(errmon->regmap, smpro_event_table[channel], &event_data);
- if (ret)
- return ret;
- /* Clear event after read */
- if (event_data != 0)
- regmap_write(errmon->regmap, smpro_event_table[channel], event_data);
- return sysfs_emit(buf, "%04x\n", event_data);
- }
- static ssize_t smpro_overflow_data_read(struct device *dev, struct device_attribute *da,
- char *buf, int channel)
- {
- struct smpro_errmon *errmon = dev_get_drvdata(dev);
- struct smpro_error_hdr *err_info;
- s32 err_count;
- int ret;
- err_info = &smpro_error_table[channel];
- ret = regmap_read(errmon->regmap, err_info->count, &err_count);
- if (ret)
- return ret;
- /* Bit 8 indicates the overflow status */
- return sysfs_emit(buf, "%d\n", (err_count & BIT(8)) ? 1 : 0);
- }
- static ssize_t smpro_error_data_read(struct device *dev, struct device_attribute *da,
- char *buf, int channel)
- {
- struct smpro_errmon *errmon = dev_get_drvdata(dev);
- unsigned char err_data[MAX_READ_BLOCK_LENGTH];
- struct smpro_error_hdr *err_info;
- s32 err_count, err_length;
- int ret;
- err_info = &smpro_error_table[channel];
- ret = regmap_read(errmon->regmap, err_info->count, &err_count);
- /* Error count is the low byte */
- err_count &= 0xff;
- if (ret || !err_count || err_count > err_info->max_cnt)
- return ret;
- ret = regmap_read(errmon->regmap, err_info->len, &err_length);
- if (ret || err_length <= 0)
- return ret;
- if (err_length > MAX_READ_BLOCK_LENGTH)
- err_length = MAX_READ_BLOCK_LENGTH;
- memset(err_data, 0x00, MAX_READ_BLOCK_LENGTH);
- ret = regmap_noinc_read(errmon->regmap, err_info->data, err_data, err_length);
- if (ret < 0)
- return ret;
- /* clear the error */
- ret = regmap_write(errmon->regmap, err_info->count, 0x100);
- if (ret)
- return ret;
- /*
- * The output of Core/Memory/PCIe/Others UE/CE errors follows the format
- * specified in section 5.8.1 CE/UE Error Data record in
- * Altra SOC BMC Interface specification.
- */
- return sysfs_emit(buf, "%*phN\n", MAX_READ_BLOCK_LENGTH, err_data);
- }
- /*
- * Output format:
- * <4-byte hex value of error info><4-byte hex value of error extensive data>
- * Where:
- * + error info : The error information
- * + error data : Extensive data (32 bits)
- * Reference to section 5.10 RAS Internal Error Register Definition in
- * Altra SOC BMC Interface specification
- */
- static ssize_t smpro_internal_err_read(struct device *dev, struct device_attribute *da,
- char *buf, int channel)
- {
- struct smpro_errmon *errmon = dev_get_drvdata(dev);
- struct smpro_int_error_hdr *err_info;
- unsigned int err[4] = { 0 };
- unsigned int err_type;
- unsigned int val;
- int ret;
- /* read error status */
- ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val);
- if (ret)
- return ret;
- if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) ||
- (channel == RAS_PMPRO_ERR && !(val & BIT(1))))
- return 0;
- err_info = &list_smpro_int_error_hdr[channel];
- ret = regmap_read(errmon->regmap, err_info->type, &val);
- if (ret)
- return ret;
- err_type = (val & BIT(1)) ? BIT(1) :
- (val & BIT(2)) ? BIT(2) : 0;
- if (!err_type)
- return 0;
- ret = regmap_read(errmon->regmap, err_info->info_l, err + 1);
- if (ret)
- return ret;
- ret = regmap_read(errmon->regmap, err_info->info_h, err);
- if (ret)
- return ret;
- if (err_type & BIT(2)) {
- /* Error with data type */
- ret = regmap_read(errmon->regmap, err_info->data_l, err + 3);
- if (ret)
- return ret;
- ret = regmap_read(errmon->regmap, err_info->data_h, err + 2);
- if (ret)
- return ret;
- }
- /* clear the read errors */
- ret = regmap_write(errmon->regmap, err_info->type, err_type);
- if (ret)
- return ret;
- return sysfs_emit(buf, "%*phN\n", (int)sizeof(err), err);
- }
- /*
- * Output format:
- * <4-byte hex value of warining info>
- * Reference to section 5.10 RAS Internal Error Register Definition in
- * Altra SOC BMC Interface specification
- */
- static ssize_t smpro_internal_warn_read(struct device *dev, struct device_attribute *da,
- char *buf, int channel)
- {
- struct smpro_errmon *errmon = dev_get_drvdata(dev);
- struct smpro_int_error_hdr *err_info;
- unsigned int warn[2] = { 0 };
- unsigned int val;
- int ret;
- /* read error status */
- ret = regmap_read(errmon->regmap, GPI_RAS_ERR, &val);
- if (ret)
- return ret;
- if ((channel == RAS_SMPRO_ERR && !(val & BIT(0))) ||
- (channel == RAS_PMPRO_ERR && !(val & BIT(1))))
- return 0;
- err_info = &list_smpro_int_error_hdr[channel];
- ret = regmap_read(errmon->regmap, err_info->type, &val);
- if (ret)
- return ret;
- if (!(val & BIT(0)))
- return 0;
- ret = regmap_read(errmon->regmap, err_info->warn_l, warn + 1);
- if (ret)
- return ret;
- ret = regmap_read(errmon->regmap, err_info->warn_h, warn);
- if (ret)
- return ret;
- /* clear the warning */
- ret = regmap_write(errmon->regmap, err_info->type, BIT(0));
- if (ret)
- return ret;
- return sysfs_emit(buf, "%*phN\n", (int)sizeof(warn), warn);
- }
- #define ERROR_OVERFLOW_RO(_error, _index) \
- static ssize_t overflow_##_error##_show(struct device *dev, \
- struct device_attribute *da, \
- char *buf) \
- { \
- return smpro_overflow_data_read(dev, da, buf, _index); \
- } \
- static DEVICE_ATTR_RO(overflow_##_error)
- ERROR_OVERFLOW_RO(core_ce, CORE_CE_ERR);
- ERROR_OVERFLOW_RO(core_ue, CORE_UE_ERR);
- ERROR_OVERFLOW_RO(mem_ce, MEM_CE_ERR);
- ERROR_OVERFLOW_RO(mem_ue, MEM_UE_ERR);
- ERROR_OVERFLOW_RO(pcie_ce, PCIE_CE_ERR);
- ERROR_OVERFLOW_RO(pcie_ue, PCIE_UE_ERR);
- ERROR_OVERFLOW_RO(other_ce, OTHER_CE_ERR);
- ERROR_OVERFLOW_RO(other_ue, OTHER_UE_ERR);
- #define ERROR_RO(_error, _index) \
- static ssize_t error_##_error##_show(struct device *dev, \
- struct device_attribute *da, \
- char *buf) \
- { \
- return smpro_error_data_read(dev, da, buf, _index); \
- } \
- static DEVICE_ATTR_RO(error_##_error)
- ERROR_RO(core_ce, CORE_CE_ERR);
- ERROR_RO(core_ue, CORE_UE_ERR);
- ERROR_RO(mem_ce, MEM_CE_ERR);
- ERROR_RO(mem_ue, MEM_UE_ERR);
- ERROR_RO(pcie_ce, PCIE_CE_ERR);
- ERROR_RO(pcie_ue, PCIE_UE_ERR);
- ERROR_RO(other_ce, OTHER_CE_ERR);
- ERROR_RO(other_ue, OTHER_UE_ERR);
- static ssize_t error_smpro_show(struct device *dev, struct device_attribute *da, char *buf)
- {
- return smpro_internal_err_read(dev, da, buf, RAS_SMPRO_ERR);
- }
- static DEVICE_ATTR_RO(error_smpro);
- static ssize_t error_pmpro_show(struct device *dev, struct device_attribute *da, char *buf)
- {
- return smpro_internal_err_read(dev, da, buf, RAS_PMPRO_ERR);
- }
- static DEVICE_ATTR_RO(error_pmpro);
- static ssize_t warn_smpro_show(struct device *dev, struct device_attribute *da, char *buf)
- {
- return smpro_internal_warn_read(dev, da, buf, RAS_SMPRO_ERR);
- }
- static DEVICE_ATTR_RO(warn_smpro);
- static ssize_t warn_pmpro_show(struct device *dev, struct device_attribute *da, char *buf)
- {
- return smpro_internal_warn_read(dev, da, buf, RAS_PMPRO_ERR);
- }
- static DEVICE_ATTR_RO(warn_pmpro);
- #define EVENT_RO(_event, _index) \
- static ssize_t event_##_event##_show(struct device *dev, \
- struct device_attribute *da, \
- char *buf) \
- { \
- return smpro_event_data_read(dev, da, buf, _index); \
- } \
- static DEVICE_ATTR_RO(event_##_event)
- EVENT_RO(vrd_warn_fault, VRD_WARN_FAULT_EVENT);
- EVENT_RO(vrd_hot, VRD_HOT_EVENT);
- EVENT_RO(dimm_hot, DIMM_HOT_EVENT);
- EVENT_RO(dimm_2x_refresh, DIMM_2X_REFRESH_EVENT);
- static ssize_t smpro_dimm_syndrome_read(struct device *dev, struct device_attribute *da,
- char *buf, unsigned int slot)
- {
- struct smpro_errmon *errmon = dev_get_drvdata(dev);
- unsigned int data;
- int ret;
- ret = regmap_read(errmon->regmap, BOOTSTAGE, &data);
- if (ret)
- return ret;
- /* check for valid stage */
- data = (data >> 8) & 0xff;
- if (data != DIMM_SYNDROME_STAGE)
- return ret;
- /* Write the slot ID to retrieve Error Syndrome */
- ret = regmap_write(errmon->regmap, DIMM_SYNDROME_SEL, slot);
- if (ret)
- return ret;
- /* Read the Syndrome error */
- ret = regmap_read(errmon->regmap, DIMM_SYNDROME_ERR, &data);
- if (ret || !data)
- return ret;
- return sysfs_emit(buf, "%04x\n", data);
- }
- #define EVENT_DIMM_SYNDROME(_slot) \
- static ssize_t event_dimm##_slot##_syndrome_show(struct device *dev, \
- struct device_attribute *da, \
- char *buf) \
- { \
- return smpro_dimm_syndrome_read(dev, da, buf, _slot); \
- } \
- static DEVICE_ATTR_RO(event_dimm##_slot##_syndrome)
- EVENT_DIMM_SYNDROME(0);
- EVENT_DIMM_SYNDROME(1);
- EVENT_DIMM_SYNDROME(2);
- EVENT_DIMM_SYNDROME(3);
- EVENT_DIMM_SYNDROME(4);
- EVENT_DIMM_SYNDROME(5);
- EVENT_DIMM_SYNDROME(6);
- EVENT_DIMM_SYNDROME(7);
- EVENT_DIMM_SYNDROME(8);
- EVENT_DIMM_SYNDROME(9);
- EVENT_DIMM_SYNDROME(10);
- EVENT_DIMM_SYNDROME(11);
- EVENT_DIMM_SYNDROME(12);
- EVENT_DIMM_SYNDROME(13);
- EVENT_DIMM_SYNDROME(14);
- EVENT_DIMM_SYNDROME(15);
- static struct attribute *smpro_errmon_attrs[] = {
- &dev_attr_overflow_core_ce.attr,
- &dev_attr_overflow_core_ue.attr,
- &dev_attr_overflow_mem_ce.attr,
- &dev_attr_overflow_mem_ue.attr,
- &dev_attr_overflow_pcie_ce.attr,
- &dev_attr_overflow_pcie_ue.attr,
- &dev_attr_overflow_other_ce.attr,
- &dev_attr_overflow_other_ue.attr,
- &dev_attr_error_core_ce.attr,
- &dev_attr_error_core_ue.attr,
- &dev_attr_error_mem_ce.attr,
- &dev_attr_error_mem_ue.attr,
- &dev_attr_error_pcie_ce.attr,
- &dev_attr_error_pcie_ue.attr,
- &dev_attr_error_other_ce.attr,
- &dev_attr_error_other_ue.attr,
- &dev_attr_error_smpro.attr,
- &dev_attr_error_pmpro.attr,
- &dev_attr_warn_smpro.attr,
- &dev_attr_warn_pmpro.attr,
- &dev_attr_event_vrd_warn_fault.attr,
- &dev_attr_event_vrd_hot.attr,
- &dev_attr_event_dimm_hot.attr,
- &dev_attr_event_dimm_2x_refresh.attr,
- &dev_attr_event_dimm0_syndrome.attr,
- &dev_attr_event_dimm1_syndrome.attr,
- &dev_attr_event_dimm2_syndrome.attr,
- &dev_attr_event_dimm3_syndrome.attr,
- &dev_attr_event_dimm4_syndrome.attr,
- &dev_attr_event_dimm5_syndrome.attr,
- &dev_attr_event_dimm6_syndrome.attr,
- &dev_attr_event_dimm7_syndrome.attr,
- &dev_attr_event_dimm8_syndrome.attr,
- &dev_attr_event_dimm9_syndrome.attr,
- &dev_attr_event_dimm10_syndrome.attr,
- &dev_attr_event_dimm11_syndrome.attr,
- &dev_attr_event_dimm12_syndrome.attr,
- &dev_attr_event_dimm13_syndrome.attr,
- &dev_attr_event_dimm14_syndrome.attr,
- &dev_attr_event_dimm15_syndrome.attr,
- NULL
- };
- ATTRIBUTE_GROUPS(smpro_errmon);
- static int smpro_errmon_probe(struct platform_device *pdev)
- {
- struct smpro_errmon *errmon;
- errmon = devm_kzalloc(&pdev->dev, sizeof(struct smpro_errmon), GFP_KERNEL);
- if (!errmon)
- return -ENOMEM;
- platform_set_drvdata(pdev, errmon);
- errmon->regmap = dev_get_regmap(pdev->dev.parent, NULL);
- if (!errmon->regmap)
- return -ENODEV;
- return 0;
- }
- static struct platform_driver smpro_errmon_driver = {
- .probe = smpro_errmon_probe,
- .driver = {
- .name = "smpro-errmon",
- .dev_groups = smpro_errmon_groups,
- },
- };
- module_platform_driver(smpro_errmon_driver);
- MODULE_AUTHOR("Tung Nguyen <tung.nguyen@amperecomputing.com>");
- MODULE_AUTHOR("Thinh Pham <thinh.pham@amperecomputing.com>");
- MODULE_AUTHOR("Hoang Nguyen <hnguyen@amperecomputing.com>");
- MODULE_AUTHOR("Thu Nguyen <thu@os.amperecomputing.com>");
- MODULE_AUTHOR("Quan Nguyen <quan@os.amperecomputing.com>");
- MODULE_DESCRIPTION("Ampere Altra SMpro driver");
- MODULE_LICENSE("GPL");
|