diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index c6fa4d62a3a1397acadad8d82b65990e1a3766b3..1216d7f0422624095a746e2c2b1b7a39f6cf0c9a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4998,6 +4998,22 @@ pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev) bool need_full_reset = true; u32 memsize; struct list_head device_list; + struct amdgpu_hive_info *hive; + int hive_ras_recovery = 0; + struct amdgpu_ras *ras; + int major, minor, revision; + + /* PCI error slot reset should be skipped During RAS recovery */ + hive = amdgpu_get_xgmi_hive(adev); + if (hive) { + hive_ras_recovery = atomic_read(&hive->ras_recovery); + amdgpu_put_xgmi_hive(hive); + } + ras = amdgpu_ras_get_context(adev); + amdgpu_discovery_get_ip_version(adev, GC_HWIP, &major, &minor, &revision); + if ((major == 9 && minor == 4 && revision == 3) && + ras && (atomic_read(&ras->in_recovery) || hive_ras_recovery)) + return PCI_ERS_RESULT_RECOVERED; DRM_INFO("PCI error: slot reset callback!!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 3638f0e12a2b8fae3523858c4d5469060c7db7e7..454473c9db6c828a3a1cb79526bab1cab4571641 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1568,9 +1568,11 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) struct amdgpu_device *remote_adev = NULL; struct amdgpu_device *adev = ras->adev; struct list_head device_list, *device_list_handle = NULL; + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + if (hive) + atomic_set(&hive->ras_recovery, 1); if (!ras->disable_ras_err_cnt_harvest) { - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); /* Build list of devices to query RAS related errors */ if (hive && adev->gmc.xgmi.num_physical_nodes > 1) { @@ -1587,12 +1589,15 @@ static void amdgpu_ras_do_recovery(struct work_struct *work) amdgpu_ras_log_on_err_counter(remote_adev); } - amdgpu_put_xgmi_hive(hive); } if (amdgpu_device_should_recover_gpu(ras->adev)) amdgpu_device_gpu_recover(ras->adev, NULL); atomic_set(&ras->in_recovery, 0); + if (hive) { + atomic_set(&hive->ras_recovery, 0); + amdgpu_put_xgmi_hive(hive); + } } /* alloc/realloc bps array */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h index 148560d63554364e7cc1485c57c0fe00724f0440..b06858fc00c06fe01d3bf8fa7ce93089d244b2dc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.h @@ -42,6 +42,8 @@ struct amdgpu_hive_info { AMDGPU_XGMI_PSTATE_MAX_VEGA20, AMDGPU_XGMI_PSTATE_UNKNOWN } pstate; + + KABI_EXTEND(atomic_t ras_recovery) }; struct amdgpu_pcs_ras_field {