@@ -136,6 +136,7 @@ enum hnae3_reset_type {
HNAE3_CORE_RESET,
HNAE3_GLOBAL_RESET,
HNAE3_IMP_RESET,
+ HNAE3_UNKNOWN_RESET,
HNAE3_NONE_RESET,
};
@@ -220,6 +220,9 @@ enum hclge_opcode_type {
HCLGE_QUERY_RAS_INT_STS_BD_NUM = 0x1510,
HCLGE_QUERY_CLEAR_MPF_RAS_INT = 0x1511,
HCLGE_QUERY_CLEAR_PF_RAS_INT = 0x1512,
+ HCLGE_QUERY_MSIX_INT_STS_BD_NUM = 0x1513,
+ HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT = 0x1514,
+ HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT = 0x1515,
HCLGE_IGU_EGU_TNL_INT_EN = 0x1803,
HCLGE_IGU_COMMON_INT_EN = 0x1806,
HCLGE_TM_QCN_MEM_INT_CFG = 0x1A14,
@@ -727,3 +727,96 @@ pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev)
return PCI_ERS_RESULT_RECOVERED;
}
+
+int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
+ unsigned long *reset_requests)
+{
+ struct device *dev = &hdev->pdev->dev;
+ u32 mpf_bd_num, pf_bd_num, bd_num;
+ struct hclge_desc desc_bd;
+ struct hclge_desc *desc;
+ int ret = 0;
+
+ /* set default handling */
+ set_bit(HNAE3_FUNC_RESET, reset_requests);
+
+ /* query the number of bds for the MSIx int status */
+ hclge_cmd_setup_basic_desc(&desc_bd, HCLGE_QUERY_MSIX_INT_STS_BD_NUM,
+ true);
+ ret = hclge_cmd_send(&hdev->hw, &desc_bd, 1);
+ if (ret) {
+ dev_err(dev, "fail(%d) to query msix int status bd num\n",
+ ret);
+ /* reset everything for now */
+ set_bit(HNAE3_GLOBAL_RESET, reset_requests);
+ return ret;
+ }
+
+ mpf_bd_num = le32_to_cpu(desc_bd.data[0]);
+ pf_bd_num = le32_to_cpu(desc_bd.data[1]);
+ bd_num = max_t(u32, mpf_bd_num, pf_bd_num);
+
+ desc = kcalloc(bd_num, sizeof(struct hclge_desc), GFP_KERNEL);
+ if (!desc)
+ goto out;
+
+ /* query all main PF MSIx errors */
+ hclge_cmd_setup_basic_desc(&desc[0], HCLGE_QUERY_CLEAR_ALL_MPF_MSIX_INT,
+ true);
+ desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
+
+ ret = hclge_cmd_send(&hdev->hw, &desc[0], mpf_bd_num);
+ if (ret) {
+ dev_err(dev, "query all mpf msix int cmd failed (%d)\n",
+ ret);
+ /* reset everything for now */
+ set_bit(HNAE3_GLOBAL_RESET, reset_requests);
+ goto msi_error;
+ }
+
+ /* clear all main PF MSIx errors */
+ hclge_cmd_reuse_desc(&desc[0], false);
+ desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
+
+ ret = hclge_cmd_send(&hdev->hw, &desc[0], mpf_bd_num);
+ if (ret) {
+ dev_err(dev, "clear all mpf msix int cmd failed (%d)\n",
+ ret);
+ /* reset everything for now */
+ set_bit(HNAE3_GLOBAL_RESET, reset_requests);
+ goto msi_error;
+ }
+
+ /* query all PF MSIx errors */
+ memset(desc, 0, bd_num * sizeof(struct hclge_desc));
+ hclge_cmd_setup_basic_desc(&desc[0], HCLGE_QUERY_CLEAR_ALL_PF_MSIX_INT,
+ true);
+ desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
+
+ ret = hclge_cmd_send(&hdev->hw, &desc[0], pf_bd_num);
+ if (ret) {
+ dev_err(dev, "query all pf msix int cmd failed (%d)\n",
+ ret);
+ /* reset everything for now */
+ set_bit(HNAE3_GLOBAL_RESET, reset_requests);
+ goto msi_error;
+ }
+
+ /* clear all PF MSIx errors */
+ hclge_cmd_reuse_desc(&desc[0], false);
+ desc[0].flag |= cpu_to_le16(HCLGE_CMD_FLAG_NEXT);
+
+ ret = hclge_cmd_send(&hdev->hw, &desc[0], pf_bd_num);
+ if (ret) {
+ dev_err(dev, "clear all pf msix int cmd failed (%d)\n",
+ ret);
+ /* reset everything for now */
+ set_bit(HNAE3_GLOBAL_RESET, reset_requests);
+ }
+
+msi_error:
+ kfree(desc);
+out:
+ return ret;
+}
+
@@ -9,6 +9,9 @@
#define HCLGE_RAS_PF_OTHER_INT_STS_REG 0x20B00
#define HCLGE_RAS_REG_NFE_MASK 0xFF00
+#define HCLGE_VECTOR0_PF_OTHER_INT_STS_REG 0x20800
+#define HCLGE_VECTOR0_REG_MSIX_MASK 0x1FF00
+
#define HCLGE_IMP_TCM_ECC_ERR_INT_EN 0xFFFF0000
#define HCLGE_IMP_TCM_ECC_ERR_INT_EN_MASK 0xFFFF0000
#define HCLGE_IMP_ITCM4_ECC_ERR_INT_EN 0x300
@@ -69,4 +72,6 @@ struct hclge_hw_error {
int hclge_hw_error_set_state(struct hclge_dev *hdev, bool state);
pci_ers_result_t hclge_handle_hw_ras_error(struct hnae3_ae_dev *ae_dev);
+int hclge_handle_hw_msix_error(struct hclge_dev *hdev,
+ unsigned long *reset_requests);
#endif
@@ -2200,12 +2200,13 @@ static void hclge_service_complete(struct hclge_dev *hdev)
static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
{
- u32 rst_src_reg;
- u32 cmdq_src_reg;
+ u32 rst_src_reg, cmdq_src_reg, msix_src_reg;
/* fetch the events from their corresponding regs */
rst_src_reg = hclge_read_dev(&hdev->hw, HCLGE_MISC_VECTOR_INT_STS);
cmdq_src_reg = hclge_read_dev(&hdev->hw, HCLGE_VECTOR0_CMDQ_SRC_REG);
+ msix_src_reg = hclge_read_dev(&hdev->hw,
+ HCLGE_VECTOR0_PF_OTHER_INT_STS_REG);
/* Assumption: If by any chance reset and mailbox events are reported
* together then we will only process reset event in this go and will
@@ -2239,6 +2240,10 @@ static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
return HCLGE_VECTOR0_EVENT_RST;
}
+ /* check for vector0 msix event source */
+ if (msix_src_reg & HCLGE_VECTOR0_REG_MSIX_MASK)
+ return HCLGE_VECTOR0_EVENT_ERR;
+
/* check for vector0 mailbox(=CMDQ RX) event source */
if (BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B) & cmdq_src_reg) {
cmdq_src_reg &= ~BIT(HCLGE_VECTOR0_RX_CMDQ_INT_B);
@@ -2289,6 +2294,19 @@ static irqreturn_t hclge_misc_irq_handle(int irq, void *data)
/* vector 0 interrupt is shared with reset and mailbox source events.*/
switch (event_cause) {
+ case HCLGE_VECTOR0_EVENT_ERR:
+ /* we do not know what type of reset is required now. This could
+ * only be decided after we fetch the type of errors which
+ * caused this event. Therefore, we will do below for now:
+ * 1. Assert HNAE3_UNKNOWN_RESET type of reset. This means we
+ * have defered type of reset to be used.
+ * 2. Schedule the reset serivce task.
+ * 3. When service task receives HNAE3_UNKNOWN_RESET type it
+ * will fetch the correct type of reset. This would be done
+ * by first decoding the types of errors.
+ */
+ set_bit(HNAE3_UNKNOWN_RESET, &hdev->reset_request);
+ /* fall through */
case HCLGE_VECTOR0_EVENT_RST:
hclge_reset_task_schedule(hdev);
break;
@@ -2593,6 +2611,23 @@ static enum hnae3_reset_type hclge_get_reset_level(struct hclge_dev *hdev,
{
enum hnae3_reset_type rst_level = HNAE3_NONE_RESET;
+ /* first, resolve any unknown reset type to the known type(s) */
+ if (test_bit(HNAE3_UNKNOWN_RESET, addr)) {
+ /* we will intentionally ignore any errors from this function
+ * as we will end up in *some* reset request in any case
+ */
+ hclge_handle_hw_msix_error(hdev, addr);
+ clear_bit(HNAE3_UNKNOWN_RESET, addr);
+ /* We defered the clearing of the error event which caused
+ * interrupt since it was not posssible to do that in
+ * interrupt context (and this is the reason we introduced
+ * new UNKNOWN reset type). Now, the errors have been
+ * handled and cleared in hardware we can safely enable
+ * interrupts. This is an exception to the norm.
+ */
+ hclge_enable_vector(&hdev->misc_vector, true);
+ }
+
/* return the highest priority reset level amongst all */
if (test_bit(HNAE3_IMP_RESET, addr)) {
rst_level = HNAE3_IMP_RESET;
@@ -205,6 +205,7 @@ enum HCLGE_DEV_STATE {
enum hclge_evt_cause {
HCLGE_VECTOR0_EVENT_RST,
HCLGE_VECTOR0_EVENT_MBX,
+ HCLGE_VECTOR0_EVENT_ERR,
HCLGE_VECTOR0_EVENT_OTHER,
};