From 14e98c861d6890728c8ccaa42aafa93684c64d08 Mon Sep 17 00:00:00 2001 From: Mukul Joshi Date: Tue, 11 Jun 2024 09:16:23 +0000 Subject: [PATCH 1/2] drm/amdkfd: Rework kfd_locked handling mainline inclusion from mainline-v6.5-rc1 commit fe1f05df5919c67c3add49efb55e251a8d78ee4e category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9U8NU CVE: CVE-2024-36949 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=fe1f05df5919c67c3add49efb55e251a8d78ee4e -------------------------------- Currently, even if kfd_locked is set, a process is first created and then removed to work around a race condition in updating the kfd_locked flag. Rework kfd_locked handling to ensure no process is created if kfd_locked is set. This is achieved by updating kfd_locked under kfd_processes_mutex. With this, there is no need for kfd_locked to be an atomic counter. Instead, it can be a regular integer. Signed-off-by: Mukul Joshi Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Conflicts: drivers/gpu/drm/amd/amdkfd/kfd_device.c drivers/gpu/drm/amd/amdkfd/kfd_process.c [Some contexts are different. No functional impact.] 
Signed-off-by: Zheng Zucheng --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 7 ------- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 21 +++++++++++++++------ drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 2 ++ drivers/gpu/drm/amd/amdkfd/kfd_process.c | 8 +++++++- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 799a91a064a1..2a99edf8666e 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -128,13 +128,6 @@ static int kfd_open(struct inode *inode, struct file *filep) if (IS_ERR(process)) return PTR_ERR(process); - if (kfd_is_locked()) { - dev_dbg(kfd_device, "kfd is locked!\n" - "process %d unreferenced", process->pasid); - kfd_unref_process(process); - return -EAGAIN; - } - /* filep now owns the reference returned by kfd_create_process */ filep->private_data = process; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index 148e43dee657..bf8474d40f10 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -38,7 +38,7 @@ * once locked, kfd driver will stop any further GPU execution. * create process (open) will return -EAGAIN. 
*/ -static atomic_t kfd_locked = ATOMIC_INIT(0); +static int kfd_locked; #ifdef CONFIG_DRM_AMDGPU_CIK extern const struct kfd2kgd_calls gfx_v7_kfd2kgd; @@ -842,8 +842,9 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) ret = kfd_resume(kfd); if (ret) return ret; - atomic_dec(&kfd_locked); - + mutex_lock(&kfd_processes_mutex); + --kfd_locked; + mutex_unlock(&kfd_processes_mutex); atomic_set(&kfd->sram_ecc_flag, 0); kfd_smi_event_update_gpu_reset(kfd, true); @@ -853,18 +854,23 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd) bool kfd_is_locked(void) { - return (atomic_read(&kfd_locked) > 0); + lockdep_assert_held(&kfd_processes_mutex); + return (kfd_locked > 0); } void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) { + int count; if (!kfd->init_complete) return; /* for runtime suspend, skip locking kfd */ if (!run_pm) { + mutex_lock(&kfd_processes_mutex); + count = ++kfd_locked; + mutex_unlock(&kfd_processes_mutex); /* For first KFD device suspend all the KFD processes */ - if (atomic_inc_return(&kfd_locked) == 1) + if (count == 1) kfd_suspend_all_processes(); } @@ -885,7 +891,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) /* for runtime resume, skip unlocking kfd */ if (!run_pm) { - count = atomic_dec_return(&kfd_locked); + mutex_lock(&kfd_processes_mutex); + count = --kfd_locked; + mutex_unlock(&kfd_processes_mutex); + WARN_ONCE(count < 0, "KFD suspend / resume ref. 
error"); if (count == 0) ret = kfd_resume_all_processes(); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 057c48a9b53a..b3b4da634f71 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -172,6 +172,8 @@ extern int queue_preemption_timeout_ms; /* Enable eviction debug messages */ extern bool debug_evictions; +extern struct mutex kfd_processes_mutex; + enum cache_policy { cache_policy_coherent, cache_policy_noncoherent diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index d243e60c6eef..aa1e9d7a2507 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -48,7 +48,7 @@ struct mm_struct; * Unique/indexed by mm_struct* */ DEFINE_HASHTABLE(kfd_processes_table, KFD_PROCESS_TABLE_SIZE); -static DEFINE_MUTEX(kfd_processes_mutex); +DEFINE_MUTEX(kfd_processes_mutex); DEFINE_SRCU(kfd_processes_srcu); @@ -761,6 +761,12 @@ struct kfd_process *kfd_create_process(struct file *filep) */ mutex_lock(&kfd_processes_mutex); + if (kfd_is_locked()) { + mutex_unlock(&kfd_processes_mutex); + pr_debug("KFD is locked! Cannot create process"); + return ERR_PTR(-EINVAL); + } + /* A prior open of /dev/kfd could have already created the process. 
*/ process = find_process(thread); if (process) { -- Gitee From cf1bee5d09595003aedca6a2e2ab7288b627842a Mon Sep 17 00:00:00 2001 From: Zhigang Luo Date: Tue, 11 Jun 2024 09:16:24 +0000 Subject: [PATCH 2/2] amd/amdkfd: sync all devices to wait all processes being evicted mainline inclusion from mainline-v6.10-rc1 commit dfb15c4ab58658aaa6161b546e7eb852ae7cc132 category: bugfix bugzilla: https://gitee.com/src-openeuler/kernel/issues/I9U8NU CVE: CVE-2024-36949 Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=dfb15c4ab58658aaa6161b546e7eb852ae7cc132 -------------------------------- If more than one device is doing a reset in parallel, the first device will call kfd_suspend_all_processes() to evict all processes on all devices; this call takes time to finish. The other devices will start reset and recovery without waiting. If a process has not been evicted before recovery starts, it will be restored, which then causes a page fault. Signed-off-by: Zhigang Luo Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Conflicts: drivers/gpu/drm/amd/amdkfd/kfd_device.c [Some contexts are different. No functional impact.] 
Signed-off-by: Zheng Zucheng --- drivers/gpu/drm/amd/amdkfd/kfd_device.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index bf8474d40f10..04126ffbc973 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -860,18 +860,16 @@ bool kfd_is_locked(void) void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) { - int count; if (!kfd->init_complete) return; /* for runtime suspend, skip locking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = ++kfd_locked; - mutex_unlock(&kfd_processes_mutex); /* For first KFD device suspend all the KFD processes */ - if (count == 1) + if (++kfd_locked == 1) kfd_suspend_all_processes(); + mutex_unlock(&kfd_processes_mutex); } kfd->dqm->ops.stop(kfd->dqm); @@ -880,7 +878,7 @@ void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm) int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) { - int ret, count; + int ret; if (!kfd->init_complete) return 0; @@ -892,12 +890,10 @@ int kgd2kfd_resume(struct kfd_dev *kfd, bool run_pm) /* for runtime resume, skip unlocking kfd */ if (!run_pm) { mutex_lock(&kfd_processes_mutex); - count = --kfd_locked; - mutex_unlock(&kfd_processes_mutex); - - WARN_ONCE(count < 0, "KFD suspend / resume ref. error"); - if (count == 0) + if (--kfd_locked == 0) ret = kfd_resume_all_processes(); + WARN_ONCE(kfd_locked < 0, "KFD suspend / resume ref. error"); + mutex_unlock(&kfd_processes_mutex); } return ret; -- Gitee