From fd3847e04f8493890b1732645c543300cdf38efb Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:30 +0000 Subject: [PATCH 01/21] perf/amd/ibs: Remove IBS_{FETCH|OP}_CONFIG_MASK macros mainline inclusion from mainline-v6.15-rc1 commit 003c0414318a1829a1a5b195ad81e8a7960c3f5d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/003c0414318a1829a1a5b195ad81e8a7960c3f5d -------------------------------- commit 003c0414318a1829a1a5b195ad81e8a7960c3f5d upstream Definition of these macros are very simple and they are used at only one place. Get rid of unnecessary redirection. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-2-ravi.bangoria@amd.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/events/amd/ibs.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index c3a2f6f57770..063e8f83bc57 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -28,9 +28,6 @@ static u32 ibs_caps; #include #include -#define IBS_FETCH_CONFIG_MASK (IBS_FETCH_RAND_EN | IBS_FETCH_MAX_CNT) -#define IBS_OP_CONFIG_MASK IBS_OP_MAX_CNT - /* * IBS states: @@ -670,7 +667,7 @@ static struct perf_ibs perf_ibs_fetch = { .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }, .msr = MSR_AMD64_IBSFETCHCTL, - .config_mask = IBS_FETCH_CONFIG_MASK, + .config_mask = IBS_FETCH_MAX_CNT | IBS_FETCH_RAND_EN, .cnt_mask = IBS_FETCH_MAX_CNT, .enable_mask = IBS_FETCH_ENABLE, .valid_mask = IBS_FETCH_VAL, @@ -694,7 +691,7 @@ static struct perf_ibs perf_ibs_op = { .capabilities = PERF_PMU_CAP_NO_EXCLUDE, }, .msr = MSR_AMD64_IBSOPCTL, - .config_mask = IBS_OP_CONFIG_MASK, + .config_mask = IBS_OP_MAX_CNT, .cnt_mask = IBS_OP_MAX_CNT | IBS_OP_CUR_CNT | IBS_OP_CUR_CNT_RAND, .enable_mask = IBS_OP_ENABLE, -- Gitee From 
f10c19c2174ce3ddcbe392c836f81900a7a9f9a8 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:31 +0000 Subject: [PATCH 02/21] perf/amd/ibs: Remove pointless sample period check mainline inclusion from mainline-v6.15-rc1 commit 88c7bcad71c83f52f24108dedcecae0d18dbc627 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/88c7bcad71c83f52f24108dedcecae0d18dbc627 -------------------------------- commit 88c7bcad71c83f52f24108dedcecae0d18dbc627 upstream The valid perf event sample period value for IBS PMUs (both Fetch and Op) is limited to multiples of 0x10. perf_ibs_init() has this check: if (!event->attr.sample_freq && hwc->sample_period & 0x0f) return -EINVAL; But it's broken since hwc->sample_period will always be 0 when event->attr.sample_freq is 0 (irrespective of the event->attr.freq value). One option to fix this is to change the condition: - if (!event->attr.sample_freq && hwc->sample_period & 0x0f) + if (!event->attr.freq && hwc->sample_period & 0x0f) However, that will break all userspace tools which have been using IBS events with a sample_period that is not a multiple of 0x10. Another option is to remove the condition altogether and mask the lower nibble _silently_, the same as what the current code is inadvertently doing. I prefer this approach as it keeps the existing behavior. 
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-3-ravi.bangoria@amd.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/events/amd/ibs.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 063e8f83bc57..67b9d619a908 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -295,13 +295,8 @@ static int perf_ibs_init(struct perf_event *event) if (config & perf_ibs->cnt_mask) /* raw max_cnt may not be set */ return -EINVAL; - if (!event->attr.sample_freq && hwc->sample_period & 0x0f) - /* - * lower 4 bits can not be set in ibs max cnt, - * but allowing it in case we adjust the - * sample period to set a frequency. - */ - return -EINVAL; + + /* Silently mask off lower nibble. IBS hw mandates it. */ hwc->sample_period &= ~0x0FULL; if (!hwc->sample_period) hwc->sample_period = 0x10; -- Gitee From 15bae6fd0ed4e5de9abc881e2160c3d66c50bd3c Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:32 +0000 Subject: [PATCH 03/21] perf/amd/ibs: Fix ->config to sample period calculation for OP PMU mainline inclusion from mainline-v6.15-rc1 commit 598bdf4fefff5af4ce6d26d16f7b2a20808fc4cb category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/598bdf4fefff5af4ce6d26d16f7b2a20808fc4cb -------------------------------- commit 598bdf4fefff5af4ce6d26d16f7b2a20808fc4cb upstream Instead of using standard perf_event_attr->freq=0 and ->sample_period fields, IBS event in 'sample period mode' can also be opened by setting period value directly in perf_event_attr->config in a MaxCnt bit-field format. 
IBS OP MaxCnt bits are defined as: (high bits) IbsOpCtl[26:20] = IbsOpMaxCnt[26:20] (low bits) IbsOpCtl[15:0] = IbsOpMaxCnt[19:4] Perf event sample period can be derived from MaxCnt bits as: sample_period = (high bits) | ((low_bits) << 4); However, current code just masks MaxCnt bits and shifts all of them, including high bits, which is incorrect. Fix it. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-4-ravi.bangoria@amd.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/events/amd/ibs.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 67b9d619a908..bd12b891ad43 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -269,7 +269,7 @@ static int perf_ibs_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; struct perf_ibs *perf_ibs; - u64 max_cnt, config; + u64 config; int ret; perf_ibs = get_ibs_pmu(event->attr.type); @@ -301,10 +301,19 @@ static int perf_ibs_init(struct perf_event *event) if (!hwc->sample_period) hwc->sample_period = 0x10; } else { - max_cnt = config & perf_ibs->cnt_mask; + u64 period = 0; + + if (perf_ibs == &perf_ibs_op) { + period = (config & IBS_OP_MAX_CNT) << 4; + if (ibs_caps & IBS_CAPS_OPCNTEXT) + period |= config & IBS_OP_MAX_CNT_EXT_MASK; + } else { + period = (config & IBS_FETCH_MAX_CNT) << 4; + } + config &= ~perf_ibs->cnt_mask; - event->attr.sample_period = max_cnt << 4; - hwc->sample_period = event->attr.sample_period; + event->attr.sample_period = period; + hwc->sample_period = period; } if (!hwc->sample_period) -- Gitee From 84dec671c780a2c7c1a67bfb11bde810abc36b9d Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:33 +0000 Subject: [PATCH 04/21] perf/amd/ibs: Fix perf_ibs_op.cnt_mask for CurCnt mainline inclusion from mainline-v6.15-rc1 commit 
46dcf85566170d4528b842bf83ffc350d71771fa category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/46dcf85566170d4528b842bf83ffc350d71771fa -------------------------------- commit 46dcf85566170d4528b842bf83ffc350d71771fa upstream IBS Op uses two counters: MaxCnt and CurCnt. MaxCnt is programmed with the desired sample period. IBS hw generates a sample when CurCnt reaches MaxCnt. The size of these counters used to be 20 bits but later they were extended to 27 bits. The 7 bit extension is indicated by CPUID Fn8000_001B_EAX[6 / OpCntExt]. The perf_ibs->cnt_mask variable contains bit masks for MaxCnt and CurCnt. But the IBS driver does not set the upper 7 bits of CurCnt in cnt_mask even when the OpCntExt CPUID bit is set. Fix this. The IBS driver uses the cnt_mask[CurCnt] bits only while disabling an event. Fortunately, the CurCnt bits are not read from the MSR while re-enabling the event; instead MaxCnt is programmed with the desired period and CurCnt is set to 0. Hence, we did not see any issues so far. 
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-5-ravi.bangoria@amd.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/events/amd/ibs.c | 3 ++- arch/x86/include/asm/perf_event.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index bd12b891ad43..bf45239b4c52 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -1223,7 +1223,8 @@ static __init int perf_ibs_op_init(void) if (ibs_caps & IBS_CAPS_OPCNTEXT) { perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK; - perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.cnt_mask |= (IBS_OP_MAX_CNT_EXT_MASK | + IBS_OP_CUR_CNT_EXT_MASK); } if (ibs_caps & IBS_CAPS_ZEN4) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 0335e7263419..4a9d67a3a8fe 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -564,6 +564,7 @@ struct pebs_cntr_header { */ #define IBS_OP_CUR_CNT (0xFFF80ULL<<32) #define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32) +#define IBS_OP_CUR_CNT_EXT_MASK (0x7FULL<<52) #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) -- Gitee From 3b396392a77780855cf002b591f74da6c7eea382 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:34 +0000 Subject: [PATCH 05/21] perf/amd/ibs: Don't allow freq mode event creation through ->config interface mainline inclusion from mainline-v6.15-rc1 commit e1e7844ced88f9558a48579390a7d4eaac6a28eb category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/e1e7844ced88f9558a48579390a7d4eaac6a28eb -------------------------------- commit e1e7844ced88f9558a48579390a7d4eaac6a28eb upstream Most perf_event_attr->config bits 
directly maps to IBS_{FETCH|OP}_CTL MSR. Since the sample period is programmed in these control registers, IBS PMU driver allows opening an IBS event by setting sample period value directly in perf_event_attr->config instead of using explicit perf_event_attr->sample_period interface. However, this logic is not applicable for freq mode events since the semantics of control register fields are applicable only to fixed sample period whereas the freq mode event adjusts sample period after each and every sample. Currently, IBS driver (unintentionally) allows creating freq mode event via ->config interface, which is semantically wrong as well as detrimental because it can be misused to bypass perf_event_max_sample_rate checks. Don't allow freq mode event creation through perf_event_attr->config interface. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-6-ravi.bangoria@amd.com Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/events/amd/ibs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index bf45239b4c52..401db361acfa 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -303,6 +303,9 @@ static int perf_ibs_init(struct perf_event *event) } else { u64 period = 0; + if (event->attr.freq) + return -EINVAL; + if (perf_ibs == &perf_ibs_op) { period = (config & IBS_OP_MAX_CNT) << 4; if (ibs_caps & IBS_CAPS_OPCNTEXT) -- Gitee From d693951b27340dcf23d06761fe51f048870f5665 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:35 +0000 Subject: [PATCH 06/21] perf/amd/ibs: Add PMU specific minimum period mainline inclusion from mainline-v6.15-rc1 commit b2fc7b282bf7c1253b01c8da84e894539a3e709d category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: 
https://github.com/torvalds/linux/commit/b2fc7b282bf7c1253b01c8da84e894539a3e709d -------------------------------- commit b2fc7b282bf7c1253b01c8da84e894539a3e709d upstream 0x10 is the minimum sample period for IBS Fetch and 0x90 for IBS Op. Current IBS PMU driver uses 0x10 for both the PMUs, which is incorrect. Fix it by adding PMU specific minimum period values in struct perf_ibs. Also, bail out opening a 'sample period mode' event if the user requested sample period is less than PMU supported minimum value. For a 'freq mode' event, start calibrating sample period from PMU specific minimum period. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-7-ravi.bangoria@amd.com Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/events/amd/ibs.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 401db361acfa..4eb84d3e8124 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -84,6 +84,7 @@ struct perf_ibs { u64 cnt_mask; u64 enable_mask; u64 valid_mask; + u16 min_period; u64 max_period; unsigned long offset_mask[1]; int offset_max; @@ -296,10 +297,14 @@ static int perf_ibs_init(struct perf_event *event) /* raw max_cnt may not be set */ return -EINVAL; - /* Silently mask off lower nibble. IBS hw mandates it. */ - hwc->sample_period &= ~0x0FULL; - if (!hwc->sample_period) - hwc->sample_period = 0x10; + if (event->attr.freq) { + hwc->sample_period = perf_ibs->min_period; + } else { + /* Silently mask off lower nibble. IBS hw mandates it. 
*/ + hwc->sample_period &= ~0x0FULL; + if (hwc->sample_period < perf_ibs->min_period) + return -EINVAL; + } } else { u64 period = 0; @@ -317,10 +322,10 @@ static int perf_ibs_init(struct perf_event *event) config &= ~perf_ibs->cnt_mask; event->attr.sample_period = period; hwc->sample_period = period; - } - if (!hwc->sample_period) - return -EINVAL; + if (hwc->sample_period < perf_ibs->min_period) + return -EINVAL; + } /* * If we modify hwc->sample_period, we also need to update @@ -341,7 +346,8 @@ static int perf_ibs_set_period(struct perf_ibs *perf_ibs, int overflow; /* ignore lower 4 bits in min count: */ - overflow = perf_event_set_period(hwc, 1<<4, perf_ibs->max_period, period); + overflow = perf_event_set_period(hwc, perf_ibs->min_period, + perf_ibs->max_period, period); local64_set(&hwc->prev_count, 0); return overflow; @@ -678,6 +684,7 @@ static struct perf_ibs perf_ibs_fetch = { .cnt_mask = IBS_FETCH_MAX_CNT, .enable_mask = IBS_FETCH_ENABLE, .valid_mask = IBS_FETCH_VAL, + .min_period = 0x10, .max_period = IBS_FETCH_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSFETCH_REG_MASK }, .offset_max = MSR_AMD64_IBSFETCH_REG_COUNT, @@ -703,6 +710,7 @@ static struct perf_ibs perf_ibs_op = { IBS_OP_CUR_CNT_RAND, .enable_mask = IBS_OP_ENABLE, .valid_mask = IBS_OP_VAL, + .min_period = 0x90, .max_period = IBS_OP_MAX_CNT << 4, .offset_mask = { MSR_AMD64_IBSOP_REG_MASK }, .offset_max = MSR_AMD64_IBSOP_REG_COUNT, -- Gitee From bbf87dc33fbd442a3ba8af6eee72526a0654fb5e Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:36 +0000 Subject: [PATCH 07/21] perf/amd/ibs: Add ->check_period() callback mainline inclusion from mainline-v6.15-rc1 commit 1afbdd970f50f2e0431fae26b25d4e54e561fa7f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/1afbdd970f50f2e0431fae26b25d4e54e561fa7f -------------------------------- commit 1afbdd970f50f2e0431fae26b25d4e54e561fa7f upstream IBS Fetch 
and IBS Op PMUs have constraints on sample period. The sample period is verified at the time of opening an event but not at the ioctl() interface. Hence, a user can open an event with valid period but change it later with ioctl(). Add a ->check_period() callback to verify the period provided at ioctl() is also valid. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-8-ravi.bangoria@amd.com Signed-off-by: PvsNarasimha --- arch/x86/events/amd/ibs.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 4eb84d3e8124..6839cceb96f2 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -552,6 +552,28 @@ static void perf_ibs_del(struct perf_event *event, int flags) static void perf_ibs_read(struct perf_event *event) { } +static int perf_ibs_check_period(struct perf_event *event, u64 value) +{ + struct perf_ibs *perf_ibs; + u64 low_nibble; + + if (event->attr.freq) + return 0; + + perf_ibs = container_of(event->pmu, struct perf_ibs, pmu); + low_nibble = value & 0xFULL; + + /* + * This contradicts with perf_ibs_init() which allows sample period + * with lower nibble bits set but silently masks them off. Whereas + * this returns error. + */ + if (low_nibble || value < perf_ibs->min_period) + return -EINVAL; + + return 0; +} + /* * We need to initialize with empty group if all attributes in the * group are dynamic. 
@@ -678,6 +700,7 @@ static struct perf_ibs perf_ibs_fetch = { .stop = perf_ibs_stop, .read = perf_ibs_read, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .check_period = perf_ibs_check_period, }, .msr = MSR_AMD64_IBSFETCHCTL, .config_mask = IBS_FETCH_MAX_CNT | IBS_FETCH_RAND_EN, @@ -703,6 +726,7 @@ static struct perf_ibs perf_ibs_op = { .stop = perf_ibs_stop, .read = perf_ibs_read, .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .check_period = perf_ibs_check_period, }, .msr = MSR_AMD64_IBSOPCTL, .config_mask = IBS_OP_MAX_CNT, -- Gitee From fa5fd3674586cc4728243af063af46792fd1575f Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:37 +0000 Subject: [PATCH 08/21] perf/amd/ibs: Ceil sample_period to min_period mainline inclusion from mainline-v6.15-rc1 commit fa5d0a824e3bbd1f793d962f9e012ab0a8ee11c5 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/fa5d0a824e3bbd1f793d962f9e012ab0a8ee11c5 -------------------------------- commit fa5d0a824e3bbd1f793d962f9e012ab0a8ee11c5 upstream The sample_period needs to be recalibrated after every sample to match the desired sampling freq for a 'freq mode event'. Since the next sample_period is calculated by generic kernel, PMU specific constraints are not (explicitly) reckoned. The sample_period value is programmed in a MaxCnt field of IBS PMUs, and the MaxCnt field has following constraints: 1) MaxCnt must be multiple of 0x10. Kernel keeps track of residual / over-counted period into period_left, which should take care of this constraint by programming MaxCnt with (sample_period & ~0xF) and adding remaining period into the next sample. 2) MaxCnt must be >= 0x10 for IBS Fetch PMU and >= 0x90 for IBS Op PMU. Currently, IBS PMU driver allows sample_period below min_period, which is an undefined HW behavior. Reset sample_period to min_period whenever it's less than that. 
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250115054438.1021-9-ravi.bangoria@amd.com Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/events/amd/ibs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 6839cceb96f2..38215fd06a46 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -445,6 +445,9 @@ static void perf_ibs_start(struct perf_event *event, int flags) WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); hwc->state = 0; + if (event->attr.freq && hwc->sample_period < perf_ibs->min_period) + hwc->sample_period = perf_ibs->min_period; + perf_ibs_set_period(perf_ibs, hwc, &period); if (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_OPCNTEXT)) { config |= period & IBS_OP_MAX_CNT_EXT_MASK; @@ -1169,6 +1172,10 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_sample_save_callchain(&data, event, iregs); throttle = perf_event_overflow(event, &data, ®s); + + if (event->attr.freq && hwc->sample_period < perf_ibs->min_period) + hwc->sample_period = perf_ibs->min_period; + out: if (throttle) { perf_ibs_stop(event, 0); -- Gitee From b7cd824888486abdcd78991ff81db7a4526ca538 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 5 Feb 2025 06:05:41 +0000 Subject: [PATCH 09/21] perf/amd/ibs: Add support for OP Load Latency Filtering mainline inclusion from mainline-v6.15-rc1 commit d20610c19b4a22bc69085b7eb7a02741d51de30e category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/d20610c19b4a22bc69085b7eb7a02741d51de30e -------------------------------- commit d20610c19b4a22bc69085b7eb7a02741d51de30e upstream IBS Op PMU on Zen5 uarch added new Load Latency filtering capability. It's advertised by CPUID_Fn8000001B_EAX bit 12. 
When enabled, IBS HW will raise interrupt only for sample that had an IbsDcMissLat value greater than N cycles, where N is a programmable value defined as multiples of 128 (i.e. 128, 256, 384 etc.) from 128-2048 cycles. Similar to L3MissOnly, IBS HW internally drops the sample and restarts if the sample does not meet the filtering criteria. Add support for LdLat filtering in IBS Op PMU. Since hardware supports threshold in multiple of 128, add a software filter on top to support latency threshold with the granularity of 1 cycle between [128-2048]. Example usage: # perf record -a -e ibs_op/ldlat=128/ -- sleep 5 Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250205060547.1337-2-ravi.bangoria@amd.com Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/events/amd/ibs.c | 93 ++++++++++++++++++++++++++++--- arch/x86/include/asm/amd-ibs.h | 3 +- arch/x86/include/asm/perf_event.h | 3 + 3 files changed, 90 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 38215fd06a46..0aacedecf08a 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -266,6 +266,14 @@ static int validate_group(struct perf_event *event) return 0; } +static bool perf_ibs_ldlat_event(struct perf_ibs *perf_ibs, + struct perf_event *event) +{ + return perf_ibs == &perf_ibs_op && + (ibs_caps & IBS_CAPS_OPLDLAT) && + (event->attr.config1 & 0xFFF); +} + static int perf_ibs_init(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -327,6 +335,17 @@ static int perf_ibs_init(struct perf_event *event) return -EINVAL; } + if (perf_ibs_ldlat_event(perf_ibs, event)) { + u64 ldlat = event->attr.config1 & 0xFFF; + + if (ldlat < 128 || ldlat > 2048) + return -EINVAL; + ldlat >>= 7; + + config |= (ldlat - 1) << 59; + config |= IBS_OP_L3MISSONLY | IBS_OP_LDLAT_EN; + } + /* * If we modify hwc->sample_period, we also need to update * hwc->last_period and 
hwc->period_left. @@ -605,7 +624,9 @@ PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); +PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11"); PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); +PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_cap, "1"); static umode_t zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) @@ -613,6 +634,12 @@ zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int return ibs_caps & IBS_CAPS_ZEN4 ? attr->mode : 0; } +static umode_t +ibs_op_ldlat_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPLDLAT ? attr->mode : 0; +} + static struct attribute *rand_en_attrs[] = { &format_attr_rand_en.attr, NULL, @@ -628,6 +655,11 @@ static struct attribute *zen4_ibs_extensions_attrs[] = { NULL, }; +static struct attribute *ibs_op_ldlat_cap_attrs[] = { + &ibs_op_ldlat_cap.attr.attr, + NULL, +}; + static struct attribute_group group_rand_en = { .name = "format", .attrs = rand_en_attrs, @@ -645,6 +677,12 @@ static struct attribute_group group_zen4_ibs_extensions = { .is_visible = zen4_ibs_extensions_is_visible, }; +static struct attribute_group group_ibs_op_ldlat_cap = { + .name = "caps", + .attrs = ibs_op_ldlat_cap_attrs, + .is_visible = ibs_op_ldlat_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_rand_en, &empty_caps_group, @@ -673,6 +711,11 @@ static struct attribute *op_l3missonly_attrs[] = { NULL, }; +static struct attribute *ibs_op_ldlat_format_attrs[] = { + &ibs_op_ldlat_format.attr.attr, + NULL, +}; + static struct attribute_group group_cnt_ctl = { .name = "format", .attrs = cnt_ctl_attrs, @@ -685,10 +728,18 @@ static struct attribute_group group_op_l3missonly = { .is_visible = zen4_ibs_extensions_is_visible, }; +static 
struct attribute_group group_ibs_op_ldlat_format = { + .name = "format", + .attrs = ibs_op_ldlat_format_attrs, + .is_visible = ibs_op_ldlat_is_visible, +}; + static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, &group_op_l3missonly, &group_zen4_ibs_extensions, + &group_ibs_op_ldlat_cap, + &group_ibs_op_ldlat_format, NULL, }; @@ -1043,15 +1094,25 @@ static void perf_ibs_parse_ld_st_data(__u64 sample_type, } } -static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, u64 sample_type, +static bool perf_ibs_is_mem_sample_type(struct perf_ibs *perf_ibs, + struct perf_event *event) +{ + u64 sample_type = event->attr.sample_type; + + return perf_ibs == &perf_ibs_op && + sample_type & (PERF_SAMPLE_DATA_SRC | + PERF_SAMPLE_WEIGHT_TYPE | + PERF_SAMPLE_ADDR | + PERF_SAMPLE_PHYS_ADDR); +} + +static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, + struct perf_event *event, int check_rip) { - if (sample_type & PERF_SAMPLE_RAW || - (perf_ibs == &perf_ibs_op && - (sample_type & PERF_SAMPLE_DATA_SRC || - sample_type & PERF_SAMPLE_WEIGHT_TYPE || - sample_type & PERF_SAMPLE_ADDR || - sample_type & PERF_SAMPLE_PHYS_ADDR))) + if (event->attr.sample_type & PERF_SAMPLE_RAW || + perf_ibs_is_mem_sample_type(perf_ibs, event) || + perf_ibs_ldlat_event(perf_ibs, event)) return perf_ibs->offset_max; else if (check_rip) return 3; @@ -1106,7 +1167,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) offset = 1; check_rip = (perf_ibs == &perf_ibs_op && (ibs_caps & IBS_CAPS_RIPINVALIDCHK)); - offset_max = perf_ibs_get_offset_max(perf_ibs, event->attr.sample_type, check_rip); + offset_max = perf_ibs_get_offset_max(perf_ibs, event, check_rip); do { rdmsrl(msr + offset, *buf++); @@ -1115,6 +1176,22 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_ibs->offset_max, offset + 1); } while (offset < offset_max); + + if (perf_ibs_ldlat_event(perf_ibs, event)) { + union ibs_op_data3 op_data3; + + 
op_data3.val = ibs_data.regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; + /* + * Opening event is errored out if load latency threshold is + * outside of [128, 2048] range. Since the event has reached + * interrupt handler, we can safely assume the threshold is + * within [128, 2048] range. + */ + if (!op_data3.ld_op || !op_data3.dc_miss || + op_data3.dc_miss_lat <= (event->attr.config1 & 0xFFF)) + goto out; + } + /* * Read IbsBrTarget, IbsOpData4, and IbsExtdCtl separately * depending on their availability. diff --git a/arch/x86/include/asm/amd-ibs.h b/arch/x86/include/asm/amd-ibs.h index cb2a5e113daa..77f3a589a99a 100644 --- a/arch/x86/include/asm/amd-ibs.h +++ b/arch/x86/include/asm/amd-ibs.h @@ -64,7 +64,8 @@ union ibs_op_ctl { opmaxcnt_ext:7, /* 20-26: upper 7 bits of periodic op maximum count */ reserved0:5, /* 27-31: reserved */ opcurcnt:27, /* 32-58: periodic op counter current count */ - reserved1:5; /* 59-63: reserved */ + ldlat_thrsh:4, /* 59-62: Load Latency threshold */ + ldlat_en:1; /* 63: Load Latency enabled */ }; }; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 4a9d67a3a8fe..856d80eed122 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -537,6 +537,7 @@ struct pebs_cntr_header { #define IBS_CAPS_FETCHCTLEXTD (1U<<9) #define IBS_CAPS_OPDATA4 (1U<<10) #define IBS_CAPS_ZEN4 (1U<<11) +#define IBS_CAPS_OPLDLAT (1U<<12) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ @@ -562,6 +563,8 @@ struct pebs_cntr_header { * The lower 7 bits of the current count are random bits * preloaded by hardware and ignored in software */ +#define IBS_OP_LDLAT_EN (1ULL<<63) +#define IBS_OP_LDLAT_THRSH (0xFULL<<59) #define IBS_OP_CUR_CNT (0xFFF80ULL<<32) #define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32) #define IBS_OP_CUR_CNT_EXT_MASK (0x7FULL<<52) -- Gitee From 44859a512720f52c9a2105e48c9243e0e921f182 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 5 Feb 2025 06:05:42 +0000 
Subject: [PATCH 10/21] perf/amd/ibs: Update DTLB/PageSize decode logic mainline inclusion from mainline-v6.15-rc1 commit 0b347a4218da08b1eb400c259d193bff463dae87 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/0b347a4218da08b1eb400c259d193bff463dae87 -------------------------------- commit 0b347a4218da08b1eb400c259d193bff463dae87 upstream IBS Op PMU on Zen5 reports DTLB and page size information differently compared to the prior generation. The change is enumerated by CPUID_Fn8000001B_EAX[19].

  IBS_OP_DATA3 bit    Zen3/4             Zen5
  ----------------------------------------------------------------
  19                  IbsDcL2TlbHit1G    Reserved
  ----------------------------------------------------------------
  6                   IbsDcL2tlbHit2M    Reserved
  ----------------------------------------------------------------
  5                   IbsDcL1TlbHit1G    PageSize:
  4                   IbsDcL1TlbHit2M      0 - 4K
                                           1 - 2M
                                           2 - 1G
                                           3 - Reserved
                                         Valid only if
                                         IbsDcPhyAddrValid = 1
  ----------------------------------------------------------------
  3                   IbsDcL2TlbMiss     IbsDcL2TlbMiss
                                         Valid only if
                                         IbsDcPhyAddrValid = 1
  ----------------------------------------------------------------
  2                   IbsDcL1tlbMiss     IbsDcL1tlbMiss
                                         Valid only if
                                         IbsDcPhyAddrValid = 1
  ----------------------------------------------------------------

o Currently, only bits 2 and 3 are interpreted by the IBS NMI handler for PERF_SAMPLE_DATA_SRC. Add a dependency on IbsDcPhyAddrValid for those bits.
o Introduce a new IBS Op PMU capability and expose it to userspace via the PMU's sysfs directory. 
Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250205060547.1337-3-ravi.bangoria@amd.com Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/events/amd/ibs.c | 23 +++++++++++++++++++++++ arch/x86/include/asm/perf_event.h | 1 + 2 files changed, 24 insertions(+) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 0aacedecf08a..64d7b125d0f6 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -627,6 +627,7 @@ PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11"); PMU_EVENT_ATTR_STRING(zen4_ibs_extensions, zen4_ibs_extensions, "1"); PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_cap, "1"); +PMU_EVENT_ATTR_STRING(dtlb_pgsize, ibs_op_dtlb_pgsize_cap, "1"); static umode_t zen4_ibs_extensions_is_visible(struct kobject *kobj, struct attribute *attr, int i) @@ -640,6 +641,12 @@ ibs_op_ldlat_is_visible(struct kobject *kobj, struct attribute *attr, int i) return ibs_caps & IBS_CAPS_OPLDLAT ? attr->mode : 0; } +static umode_t +ibs_op_dtlb_pgsize_is_visible(struct kobject *kobj, struct attribute *attr, int i) +{ + return ibs_caps & IBS_CAPS_OPDTLBPGSIZE ? 
attr->mode : 0; +} + static struct attribute *rand_en_attrs[] = { &format_attr_rand_en.attr, NULL, @@ -660,6 +667,11 @@ static struct attribute *ibs_op_ldlat_cap_attrs[] = { NULL, }; +static struct attribute *ibs_op_dtlb_pgsize_cap_attrs[] = { + &ibs_op_dtlb_pgsize_cap.attr.attr, + NULL, +}; + static struct attribute_group group_rand_en = { .name = "format", .attrs = rand_en_attrs, @@ -683,6 +695,12 @@ static struct attribute_group group_ibs_op_ldlat_cap = { .is_visible = ibs_op_ldlat_is_visible, }; +static struct attribute_group group_ibs_op_dtlb_pgsize_cap = { + .name = "caps", + .attrs = ibs_op_dtlb_pgsize_cap_attrs, + .is_visible = ibs_op_dtlb_pgsize_is_visible, +}; + static const struct attribute_group *fetch_attr_groups[] = { &group_rand_en, &empty_caps_group, @@ -740,6 +758,7 @@ static const struct attribute_group *op_attr_update[] = { &group_zen4_ibs_extensions, &group_ibs_op_ldlat_cap, &group_ibs_op_ldlat_format, + &group_ibs_op_dtlb_pgsize_cap, NULL, }; @@ -990,6 +1009,10 @@ static void perf_ibs_get_tlb_lvl(union ibs_op_data3 *op_data3, if (!op_data3->dc_lin_addr_valid) return; + if ((ibs_caps & IBS_CAPS_OPDTLBPGSIZE) && + !op_data3->dc_phy_addr_valid) + return; + if (!op_data3->dc_l1tlb_miss) { data_src->mem_dtlb = PERF_MEM_TLB_L1 | PERF_MEM_TLB_HIT; return; diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 856d80eed122..25e84329c21e 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -538,6 +538,7 @@ struct pebs_cntr_header { #define IBS_CAPS_OPDATA4 (1U<<10) #define IBS_CAPS_ZEN4 (1U<<11) #define IBS_CAPS_OPLDLAT (1U<<12) +#define IBS_CAPS_OPDTLBPGSIZE (1U<<19) #define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \ | IBS_CAPS_FETCHSAM \ -- Gitee From 3d5e1de19a8abf21a75afdc724b57ae63871a4d5 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:30 -0700 Subject: [PATCH 11/21] x86/bugs: Rename entry_ibpb() to write_ibpb() mainline inclusion from mainline-v6.15-rc2 commit 
13235d6d50bba99931c4392c0f813cfae0de3eac category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/13235d6d50bba99931c4392c0f813cfae0de3eac -------------------------------- commit 13235d6d50bba99931c4392c0f813cfae0de3eac upstream There's nothing entry-specific about entry_ibpb(). In preparation for calling it from elsewhere, rename it to write_ibpb(). Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/1e54ace131e79b760de3fe828264e26d0896e3ac.1744148254.git.jpoimboe@kernel.org Signed-off-by: PvsNarasimha --- arch/x86/entry/entry.S | 6 ++++-- arch/x86/include/asm/nospec-branch.h | 6 +++--- arch/x86/kernel/cpu/bugs.c | 6 +++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/arch/x86/entry/entry.S b/arch/x86/entry/entry.S index 3b34db70e07e..f68f5b49e95d 100644 --- a/arch/x86/entry/entry.S +++ b/arch/x86/entry/entry.S @@ -15,6 +15,8 @@ .pushsection .noinstr.text, "ax" SYM_FUNC_START(entry_ibpb) +/* Clobbers AX, CX, DX */ +SYM_FUNC_START(write_ibpb) movl $MSR_IA32_PRED_CMD, %ecx movl _ASM_RIP(x86_pred_cmd), %eax xorl %edx, %edx @@ -23,9 +25,9 @@ SYM_FUNC_START(entry_ibpb) /* Make sure IBPB clears return stack preductions too. */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET RET -SYM_FUNC_END(entry_ibpb) +SYM_FUNC_END(write_ibpb) /* For KVM */ -EXPORT_SYMBOL_GPL(entry_ibpb); +EXPORT_SYMBOL_GPL(write_ibpb); .popsection diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index edc15e1029f0..62eee074c8a1 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -292,7 +292,7 @@ * typically has NO_MELTDOWN). * * While retbleed_untrain_ret() doesn't clobber anything but requires stack, - * entry_ibpb() will clobber AX, CX, DX. + * write_ibpb() will clobber AX, CX, DX. 
* * As such, this must be placed after every *SWITCH_TO_KERNEL_CR3 at a point * where we have a stack but before any RET instruction. @@ -302,7 +302,7 @@ VALIDATE_UNRET_END CALL_UNTRAIN_RET ALTERNATIVE_2 "", \ - "call entry_ibpb", \ibpb_feature, \ + "call write_ibpb", \ibpb_feature, \ __stringify(\call_depth_insns), X86_FEATURE_CALL_DEPTH #endif .endm @@ -407,7 +407,7 @@ extern void srso_untrain_ret(void); extern void srso_alias_untrain_ret(void); extern void entry_untrain_ret(void); -extern void entry_ibpb(void); +extern void write_ibpb(void); #ifdef CONFIG_X86_64 extern void clear_bhb_loop(void); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 87ac97323b70..bc654294ee08 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1129,7 +1129,7 @@ static void __init retbleed_select_mitigation(void) setup_clear_cpu_cap(X86_FEATURE_RETHUNK); /* - * There is no need for RSB filling: entry_ibpb() ensures + * There is no need for RSB filling: write_ibpb() ensures * all predictions, including the RSB, are invalidated, * regardless of IBPB implementation. */ @@ -2748,7 +2748,7 @@ static void __init srso_select_mitigation(void) setup_clear_cpu_cap(X86_FEATURE_RETHUNK); /* - * There is no need for RSB filling: entry_ibpb() ensures + * There is no need for RSB filling: write_ibpb() ensures * all predictions, including the RSB, are invalidated, * regardless of IBPB implementation. */ @@ -2767,7 +2767,7 @@ static void __init srso_select_mitigation(void) srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; /* - * There is no need for RSB filling: entry_ibpb() ensures + * There is no need for RSB filling: write_ibpb() ensures * all predictions, including the RSB, are invalidated, * regardless of IBPB implementation. 
*/ -- Gitee From 58ad5fc054c663f924443e90d026bbe658aca722 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:32 -0700 Subject: [PATCH 12/21] x86/bugs: Fix RSB clearing in indirect_branch_prediction_barrier() mainline inclusion from mainline-v6.15-rc2 commit b1b19cfcf4656c75088dc06b7499f493e0dec3e5 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/b1b19cfcf4656c75088dc06b7499f493e0dec3e5 -------------------------------- commit b1b19cfcf4656c75088dc06b7499f493e0dec3e5 upstream IBPB is expected to clear the RSB. However, if X86_BUG_IBPB_NO_RET is set, that doesn't happen. Make indirect_branch_prediction_barrier() take that into account by calling write_ibpb() which clears RSB on X86_BUG_IBPB_NO_RET: /* Make sure IBPB clears return stack preductions too. */ FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET Note that, as of the previous patch, write_ibpb() also reads 'x86_pred_cmd' in order to use SBPB when applicable: movl _ASM_RIP(x86_pred_cmd), %eax Therefore that existing behavior in indirect_branch_prediction_barrier() is not lost. [Backport changes] In arch/x86/include/asm/nospec-branch.h, within the function indirect_branch_prediction_barrier(), current upstream commit b1b19cfcf465 replaced the call to alternative_msr_write() with an asm_inline volatile(ALTERNATIVE(...)) construct and use the feature flag from X86_FEATURE_USE_IBPB to X86_FEATURE_IBPB. However, since commit 549435aab49a, which renamed X86_FEATURE_USE_IBPB to X86_FEATURE_IBPB, is not present in the current source, the existing X86_FEATURE_USE_IBPB definition has been retained in this backport. This change preserves functionality while ensuring compatibility with the current codebase. 
Fixes: 50e4b3b94090 ("x86/entry: Have entry_ibpb() invalidate return predictions") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Reviewed-by: Nikolay Borisov Link: https://lore.kernel.org/r/bba68888c511743d4cd65564d1fc41438907523f.1744148254.git.jpoimboe@kernel.org Signed-off-by: chaithanyaLagisetty Signed-off-by: PvsNarasimha --- arch/x86/include/asm/nospec-branch.h | 6 +++--- arch/x86/kernel/cpu/bugs.c | 1 - 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 62eee074c8a1..ebaeef50effe 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -551,11 +551,11 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) : "memory"); } -extern u64 x86_pred_cmd; - static inline void indirect_branch_prediction_barrier(void) { - alternative_msr_write(MSR_IA32_PRED_CMD, x86_pred_cmd, X86_FEATURE_USE_IBPB); + asm_inline volatile(ALTERNATIVE("", "call write_ibpb", X86_FEATURE_USE_IBPB) + : ASM_CALL_CONSTRAINT + :: "rax", "rcx", "rdx", "memory"); } /* The Intel SPEC CTRL MSR base value cache */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bc654294ee08..3dedb36e05cc 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -61,7 +61,6 @@ DEFINE_PER_CPU(u64, x86_spec_ctrl_current); EXPORT_SYMBOL_GPL(x86_spec_ctrl_current); u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; -EXPORT_SYMBOL_GPL(x86_pred_cmd); static u64 __ro_after_init x86_arch_cap_msr; -- Gitee From da6fcacd788c5bce25306e4ec22d1eceb42ccc23 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 8 Apr 2025 14:47:35 -0700 Subject: [PATCH 13/21] x86/bugs: Add RSB mitigation document mainline inclusion from mainline-v6.15-rc2 commit 83f6665a49c3d44ad0c08f837d352dd290f5d10b category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: 
https://github.com/torvalds/linux/commit/83f6665a49c3d44ad0c08f837d352dd290f5d10b -------------------------------- commit 83f6665a49c3d44ad0c08f837d352dd290f5d10b upstream Create a document to summarize hard-earned knowledge about RSB-related mitigations, with references, and replace the overly verbose yet incomplete comments with a reference to the document. Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/ab73f4659ba697a974759f07befd41ae605e33dd.1744148254.git.jpoimboe@kernel.org Signed-off-by: PvsNarasimha --- Documentation/admin-guide/hw-vuln/index.rst | 1 + Documentation/admin-guide/hw-vuln/rsb.rst | 268 ++++++++++++++++++++ arch/x86/kernel/cpu/bugs.c | 64 +---- 3 files changed, 282 insertions(+), 51 deletions(-) create mode 100644 Documentation/admin-guide/hw-vuln/rsb.rst diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index ff0b440ef2dc..451874b8135d 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -22,3 +22,4 @@ are configurable at compile, boot or run time. srso gather_data_sampling reg-file-data-sampling + rsb diff --git a/Documentation/admin-guide/hw-vuln/rsb.rst b/Documentation/admin-guide/hw-vuln/rsb.rst new file mode 100644 index 000000000000..21dbf9cf25f8 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/rsb.rst @@ -0,0 +1,268 @@ +.. SPDX-License-Identifier: GPL-2.0 + +======================= +RSB-related mitigations +======================= + +.. warning:: + Please keep this document up-to-date, otherwise you will be + volunteered to update it and convert it to a very long comment in + bugs.c! + +Since 2018 there have been many Spectre CVEs related to the Return Stack +Buffer (RSB) (sometimes referred to as the Return Address Stack (RAS) or +Return Address Predictor (RAP) on AMD). 
+ +Information about these CVEs and how to mitigate them is scattered +amongst a myriad of microarchitecture-specific documents. + +This document attempts to consolidate all the relevant information in +one place and clarify the reasoning behind the current RSB-related +mitigations. It's meant to be as concise as possible, focused only on +the current kernel mitigations: what are the RSB-related attack vectors +and how are they currently being mitigated? + +It's *not* meant to describe how the RSB mechanism operates or how the +exploits work. More details about those can be found in the references +below. + +Rather, this is basically a glorified comment, but too long to actually +be one. So when the next CVE comes along, a kernel developer can +quickly refer to this as a refresher to see what we're actually doing +and why. + +At a high level, there are two classes of RSB attacks: RSB poisoning +(Intel and AMD) and RSB underflow (Intel only). They must each be +considered individually for each attack vector (and microarchitecture +where applicable). + +---- + +RSB poisoning (Intel and AMD) +============================= + +SpectreRSB +~~~~~~~~~~ + +RSB poisoning is a technique used by SpectreRSB [#spectre-rsb]_ where +an attacker poisons an RSB entry to cause a victim's return instruction +to speculate to an attacker-controlled address. This can happen when +there are unbalanced CALLs/RETs after a context switch or VMEXIT. + +* All attack vectors can potentially be mitigated by flushing out any + poisoned RSB entries using an RSB filling sequence + [#intel-rsb-filling]_ [#amd-rsb-filling]_ when transitioning between + untrusted and trusted domains. But this has a performance impact and + should be avoided whenever possible. + + .. DANGER:: + **FIXME**: Currently we're flushing 32 entries. However, some CPU + models have more than 32 entries. The loop count needs to be + increased for those. More detailed information is needed about RSB + sizes.
+ +* On context switch, the user->user mitigation requires ensuring the + RSB gets filled or cleared whenever IBPB gets written [#cond-ibpb]_ + during a context switch: + + * AMD: + On Zen 4+, IBPB (or SBPB [#amd-sbpb]_ if used) clears the RSB. + This is indicated by IBPB_RET in CPUID [#amd-ibpb-rsb]_. + + On Zen < 4, the RSB filling sequence [#amd-rsb-filling]_ must + always be done in addition to IBPB [#amd-ibpb-no-rsb]_. This is + indicated by X86_BUG_IBPB_NO_RET. + + * Intel: + IBPB always clears the RSB: + + "Software that executed before the IBPB command cannot control + the predicted targets of indirect branches executed after the + command on the same logical processor. The term indirect branch + in this context includes near return instructions, so these + predicted targets may come from the RSB." [#intel-ibpb-rsb]_ + +* On context switch, user->kernel attacks are prevented by SMEP. User + space can only insert user space addresses into the RSB. Even + non-canonical addresses can't be inserted due to the page gap at the + end of the user canonical address space reserved by TASK_SIZE_MAX. + A SMEP #PF at instruction fetch prevents the kernel from speculatively + executing user space. + + * AMD: + "Finally, branches that are predicted as 'ret' instructions get + their predicted targets from the Return Address Predictor (RAP). + AMD recommends software use a RAP stuffing sequence (mitigation + V2-3 in [2]) and/or Supervisor Mode Execution Protection (SMEP) + to ensure that the addresses in the RAP are safe for + speculation. Collectively, we refer to these mitigations as "RAP + Protection"." [#amd-smep-rsb]_ + + * Intel: + "On processors with enhanced IBRS, an RSB overwrite sequence may + not suffice to prevent the predicted target of a near return + from using an RSB entry created in a less privileged predictor + mode.
Software can prevent this by enabling SMEP (for + transitions from user mode to supervisor mode) and by having + IA32_SPEC_CTRL.IBRS set during VM exits." [#intel-smep-rsb]_ + +* On VMEXIT, guest->host attacks are mitigated by eIBRS (and PBRSB + mitigation if needed): + + * AMD: + "When Automatic IBRS is enabled, the internal return address + stack used for return address predictions is cleared on VMEXIT." + [#amd-eibrs-vmexit]_ + + * Intel: + "On processors with enhanced IBRS, an RSB overwrite sequence may + not suffice to prevent the predicted target of a near return + from using an RSB entry created in a less privileged predictor + mode. Software can prevent this by enabling SMEP (for + transitions from user mode to supervisor mode) and by having + IA32_SPEC_CTRL.IBRS set during VM exits. Processors with + enhanced IBRS still support the usage model where IBRS is set + only in the OS/VMM for OSes that enable SMEP. To do this, such + processors will ensure that guest behavior cannot control the + RSB after a VM exit once IBRS is set, even if IBRS was not set + at the time of the VM exit." [#intel-eibrs-vmexit]_ + + Note that some Intel CPUs are susceptible to Post-barrier Return + Stack Buffer Predictions (PBRSB) [#intel-pbrsb]_, where the last + CALL from the guest can be used to predict the first unbalanced RET. + In this case the PBRSB mitigation is needed in addition to eIBRS. + +AMD RETBleed / SRSO / Branch Type Confusion +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +On AMD, poisoned RSB entries can also be created by the AMD RETBleed +variant [#retbleed-paper]_ [#amd-btc]_ or by Speculative Return Stack +Overflow [#amd-srso]_ (Inception [#inception-paper]_). The kernel +protects itself by replacing every RET in the kernel with a branch to a +single safe RET. 
+ +---- + +RSB underflow (Intel only) +========================== + +RSB Alternate (RSBA) ("Intel Retbleed") +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some Intel Skylake-generation CPUs are susceptible to the Intel variant +of RETBleed [#retbleed-paper]_ (Return Stack Buffer Underflow +[#intel-rsbu]_). If a RET is executed when the RSB buffer is empty due +to mismatched CALLs/RETs or returning from a deep call stack, the branch +predictor can fall back to using the Branch Target Buffer (BTB). If a +user forces a BTB collision then the RET can speculatively branch to a +user-controlled address. + +* Note that RSB filling doesn't fully mitigate this issue. If there + are enough unbalanced RETs, the RSB may still underflow and fall back + to using a poisoned BTB entry. + +* On context switch, user->user underflow attacks are mitigated by the + conditional IBPB [#cond-ibpb]_ on context switch which effectively + clears the BTB: + + * "The indirect branch predictor barrier (IBPB) is an indirect branch + control mechanism that establishes a barrier, preventing software + that executed before the barrier from controlling the predicted + targets of indirect branches executed after the barrier on the same + logical processor." [#intel-ibpb-btb]_ + +* On context switch and VMEXIT, user->kernel and guest->host RSB + underflows are mitigated by IBRS or eIBRS: + + * "Enabling IBRS (including enhanced IBRS) will mitigate the "RSBU" + attack demonstrated by the researchers. As previously documented, + Intel recommends the use of enhanced IBRS, where supported. This + includes any processor that enumerates RRSBA but not RRSBA_DIS_S." + [#intel-rsbu]_ + + However, note that eIBRS and IBRS do not mitigate intra-mode attacks. + Like RRSBA below, this is mitigated by clearing the BHB on kernel + entry. + + As an alternative to classic IBRS, call depth tracking (combined with + retpolines) can be used to track kernel returns and fill the RSB when + it gets close to being empty. 
+ +Restricted RSB Alternate (RRSBA) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Some newer Intel CPUs have Restricted RSB Alternate (RRSBA) behavior, +which, similar to RSBA described above, also falls back to using the BTB +on RSB underflow. The only difference is that the predicted targets are +restricted to the current domain when eIBRS is enabled: + +* "Restricted RSB Alternate (RRSBA) behavior allows alternate branch + predictors to be used by near RET instructions when the RSB is + empty. When eIBRS is enabled, the predicted targets of these + alternate predictors are restricted to those belonging to the + indirect branch predictor entries of the current prediction domain." + [#intel-eibrs-rrsba]_ + +When a CPU with RRSBA is vulnerable to Branch History Injection +[#bhi-paper]_ [#intel-bhi]_, an RSB underflow could be used for an +intra-mode BTI attack. This is mitigated by clearing the BHB on +kernel entry. + +However if the kernel uses retpolines instead of eIBRS, it needs to +disable RRSBA: + +* "Where software is using retpoline as a mitigation for BHI or + intra-mode BTI, and the processor both enumerates RRSBA and + enumerates RRSBA_DIS controls, it should disable this behavior." + [#intel-retpoline-rrsba]_ + +---- + +References +========== + +.. [#spectre-rsb] `Spectre Returns! Speculation Attacks using the Return Stack Buffer `_ + +.. [#intel-rsb-filling] "Empty RSB Mitigation on Skylake-generation" in `Retpoline: A Branch Target Injection Mitigation `_ + +.. [#amd-rsb-filling] "Mitigation V2-3" in `Software Techniques for Managing Speculation `_ + +.. [#cond-ibpb] Whether IBPB is written depends on whether the prev and/or next task is protected from Spectre attacks. It typically requires opting in per task or system-wide. For more details see the documentation for the ``spectre_v2_user`` cmdline option in Documentation/admin-guide/kernel-parameters.txt. + +.. [#amd-sbpb] IBPB without flushing of branch type predictions. Only exists for AMD. + +..
[#amd-ibpb-rsb] "Function 8000_0008h -- Processor Capacity Parameters and Extended Feature Identification" in `AMD64 Architecture Programmer's Manual Volume 3: General-Purpose and System Instructions `_. SBPB behaves the same way according to `this email `_. + +.. [#amd-ibpb-no-rsb] `Spectre Attacks: Exploiting Speculative Execution `_ + +.. [#intel-ibpb-rsb] "Introduction" in `Post-barrier Return Stack Buffer Predictions / CVE-2022-26373 / INTEL-SA-00706 `_ + +.. [#amd-smep-rsb] "Existing Mitigations" in `Technical Guidance for Mitigating Branch Type Confusion `_ + +.. [#intel-smep-rsb] "Enhanced IBRS" in `Indirect Branch Restricted Speculation `_ + +.. [#amd-eibrs-vmexit] "Extended Feature Enable Register (EFER)" in `AMD64 Architecture Programmer's Manual Volume 2: System Programming `_ + +.. [#intel-eibrs-vmexit] "Enhanced IBRS" in `Indirect Branch Restricted Speculation `_ + +.. [#intel-pbrsb] `Post-barrier Return Stack Buffer Predictions / CVE-2022-26373 / INTEL-SA-00706 `_ + +.. [#retbleed-paper] `RETBleed: Arbitrary Speculative Code Execution with Return Instruction `_ + +.. [#amd-btc] `Technical Guidance for Mitigating Branch Type Confusion `_ + +.. [#amd-srso] `Technical Update Regarding Speculative Return Stack Overflow `_ + +.. [#inception-paper] `Inception: Exposing New Attack Surfaces with Training in Transient Execution `_ + +.. [#intel-rsbu] `Return Stack Buffer Underflow / CVE-2022-29901, CVE-2022-28693 / INTEL-SA-00702 `_ + +.. [#intel-ibpb-btb] `Indirect Branch Predictor Barrier `_ + +.. [#intel-eibrs-rrsba] "Guidance for RSBU" in `Return Stack Buffer Underflow / CVE-2022-29901, CVE-2022-28693 / INTEL-SA-00702 `_ + +.. [#bhi-paper] `Branch History Injection: On the Effectiveness of Hardware Mitigations Against Cross-Privilege Spectre-v2 Attacks `_ + +..
[#intel-bhi] `Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598 `_ + +.. [#intel-retpoline-rrsba] "Retpoline" in `Branch History Injection and Intra-mode Branch Target Injection / CVE-2022-0001, CVE-2022-0002 / INTEL-SA-00598 `_ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 3dedb36e05cc..4c3538c629fb 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1668,25 +1668,25 @@ static void __init spec_ctrl_disable_kernel_rrsba(void) static void __init spectre_v2_select_rsb_mitigation(enum spectre_v2_mitigation mode) { /* - * Similar to context switches, there are two types of RSB attacks - * after VM exit: + * WARNING! There are many subtleties to consider when changing *any* + * code related to RSB-related mitigations. Before doing so, carefully + * read the following document, and update if necessary: * - * 1) RSB underflow + * Documentation/admin-guide/hw-vuln/rsb.rst * - * 2) Poisoned RSB entry + * In an overly simplified nutshell: * - * When retpoline is enabled, both are mitigated by filling/clearing - * the RSB. + * - User->user RSB attacks are conditionally mitigated during + * context switches by cond_mitigation -> write_ibpb(). * - * When IBRS is enabled, while #1 would be mitigated by the IBRS branch - * prediction isolation protections, RSB still needs to be cleared - * because of #2. Note that SMEP provides no protection here, unlike - * user-space-poisoned RSB entries. + * - User->kernel and guest->host attacks are mitigated by eIBRS or + * RSB filling. * - * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB - * bug is present then a LITE version of RSB protection is required, - * just a single call needs to retire before a RET is executed. + * Though, depending on config, note that other alternative + * mitigations may end up getting used instead, e.g., IBPB on + * entry/vmexit, call depth tracking, or return thunks. 
*/ + switch (mode) { case SPECTRE_V2_NONE: break; @@ -1900,44 +1900,6 @@ static void __init spectre_v2_select_mitigation(void) spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); - /* - * If Spectre v2 protection has been enabled, fill the RSB during a - * context switch. In general there are two types of RSB attacks - * across context switches, for which the CALLs/RETs may be unbalanced. - * - * 1) RSB underflow - * - * Some Intel parts have "bottomless RSB". When the RSB is empty, - * speculated return targets may come from the branch predictor, - * which could have a user-poisoned BTB or BHB entry. - * - * AMD has it even worse: *all* returns are speculated from the BTB, - * regardless of the state of the RSB. - * - * When IBRS or eIBRS is enabled, the "user -> kernel" attack - * scenario is mitigated by the IBRS branch prediction isolation - * properties, so the RSB buffer filling wouldn't be necessary to - * protect against this type of attack. - * - * The "user -> user" attack scenario is mitigated by RSB filling. - * - * 2) Poisoned RSB entry - * - * If the 'next' in-kernel return stack is shorter than 'prev', - * 'next' could be tricked into speculating with a user-poisoned RSB - * entry. - * - * The "user -> kernel" attack scenario is mitigated by SMEP and - * eIBRS. - * - * The "user -> user" scenario, also known as SpectreBHB, requires - * RSB clearing. - * - * So to mitigate all cases, unconditionally fill RSB on context - * switches. - * - * FIXME: Is this pointless for retbleed-affected AMD? 
- */ spectre_v2_select_rsb_mitigation(mode); /* -- Gitee From 7b3e96d8ced7c9efe767ff11df7a752d1a6b2438 Mon Sep 17 00:00:00 2001 From: Chao Peng Date: Fri, 27 Oct 2023 11:22:01 -0700 Subject: [PATCH 14/21] KVM: x86: Disallow hugepages when memory attributes are mixed mainline inclusion from mainline-v6.8-rc1 commit 90b4fe17981e155432c4dbc490606d0c2e9c2199 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/90b4fe17981e155432c4dbc490606d0c2e9c2199 -------------------------------- commit 90b4fe17981e155432c4dbc490606d0c2e9c2199 upstream Disallow creating hugepages with mixed memory attributes, e.g. shared versus private, as mapping a hugepage in this case would allow the guest to access memory with the wrong attributes, e.g. overlaying private memory with a shared hugepage. Tracking whether or not attributes are mixed via the existing disallow_lpage field, but use the most significant bit in 'disallow_lpage' to indicate a hugepage has mixed attributes instead using the normal refcounting. Whether or not attributes are mixed is binary; either they are or they aren't. Attempting to squeeze that info into the refcount is unnecessarily complex as it would require knowing the previous state of the mixed count when updating attributes. Using a flag means KVM just needs to ensure the current status is reflected in the memslots. 
Signed-off-by: Chao Peng Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Message-Id: <20231027182217.3615211-20-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Malathi Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 3 + arch/x86/kvm/mmu/mmu.c | 154 +++++++++++++++++++++++++++++++- arch/x86/kvm/x86.c | 4 + 3 files changed, 159 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3936da6febce..174ae3458a13 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1876,6 +1876,9 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu); void kvm_mmu_init_vm(struct kvm *kvm); void kvm_mmu_uninit_vm(struct kvm *kvm); +void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, + struct kvm_memory_slot *slot); + void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index c54c8385b16d..dda56a31f585 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -795,16 +795,26 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } +/* + * The most significant bit in disallow_lpage tracks whether or not memory + * attributes are mixed, i.e. not identical for all gfns at the current level. + * The lower order bits are used to refcount other cases where a hugepage is + * disallowed, e.g. if KVM has shadow a page table at the gfn. 
+ */ +#define KVM_LPAGE_MIXED_FLAG BIT(31) + static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, gfn_t gfn, int count) { struct kvm_lpage_info *linfo; - int i; + int old, i; for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { linfo = lpage_info_slot(gfn, slot, i); + + old = linfo->disallow_lpage; linfo->disallow_lpage += count; - WARN_ON_ONCE(linfo->disallow_lpage < 0); + WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG); } } @@ -7172,3 +7182,143 @@ void kvm_mmu_pre_destroy_vm(struct kvm *kvm) if (kvm->arch.nx_huge_page_recovery_thread) kthread_stop(kvm->arch.nx_huge_page_recovery_thread); } + +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES +static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG; +} + +static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, + int level) +{ + lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; +} + +static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, + gfn_t gfn, int level, unsigned long attrs) +{ + const unsigned long start = gfn; + const unsigned long end = start + KVM_PAGES_PER_HPAGE(level); + + if (level == PG_LEVEL_2M) + return kvm_range_has_memory_attributes(kvm, start, end, attrs); + + for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - 1)) { + if (hugepage_test_mixed(slot, gfn, level - 1) || + attrs != kvm_get_memory_attributes(kvm, gfn)) + return false; + } + return true; +} + +bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, + struct kvm_gfn_range *range) +{ + unsigned long attrs = range->arg.attributes; + struct kvm_memory_slot *slot = range->slot; + int level; + + 
lockdep_assert_held_write(&kvm->mmu_lock); + lockdep_assert_held(&kvm->slots_lock); + + /* + * Calculate which ranges can be mapped with hugepages even if the slot + * can't map memory PRIVATE. KVM mustn't create a SHARED hugepage over + * a range that has PRIVATE GFNs, and conversely converting a range to + * SHARED may now allow hugepages. + */ + if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm))) + return false; + + /* + * The sequence matters here: upper levels consume the result of lower + * level's scanning. + */ + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + gfn_t gfn = gfn_round_for_level(range->start, level); + + /* Process the head page if it straddles the range. */ + if (gfn != range->start || gfn + nr_pages > range->end) { + /* + * Skip mixed tracking if the aligned gfn isn't covered + * by the memslot, KVM can't use a hugepage due to the + * misaligned address regardless of memory attributes. + */ + if (gfn >= slot->base_gfn) { + if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) + hugepage_clear_mixed(slot, gfn, level); + else + hugepage_set_mixed(slot, gfn, level); + } + gfn += nr_pages; + } + + /* + * Pages entirely covered by the range are guaranteed to have + * only the attributes which were just set. + */ + for ( ; gfn + nr_pages <= range->end; gfn += nr_pages) + hugepage_clear_mixed(slot, gfn, level); + + /* + * Process the last tail page if it straddles the range and is + * contained by the memslot. Like the head page, KVM can't + * create a hugepage if the slot size is misaligned. 
+ */ + if (gfn < range->end && + (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) { + if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) + hugepage_clear_mixed(slot, gfn, level); + else + hugepage_set_mixed(slot, gfn, level); + } + } + return false; +} + +void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, + struct kvm_memory_slot *slot) +{ + int level; + + if (!kvm_arch_has_private_mem(kvm)) + return; + + for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) { + /* + * Don't bother tracking mixed attributes for pages that can't + * be huge due to alignment, i.e. process only pages that are + * entirely contained by the memslot. + */ + gfn_t end = gfn_round_for_level(slot->base_gfn + slot->npages, level); + gfn_t start = gfn_round_for_level(slot->base_gfn, level); + gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level); + gfn_t gfn; + + if (start < slot->base_gfn) + start += nr_pages; + + /* + * Unlike setting attributes, every potential hugepage needs to + * be manually checked as the attributes may already be mixed. 
+ */ + for (gfn = start; gfn < end; gfn += nr_pages) { + unsigned long attrs = kvm_get_memory_attributes(kvm, gfn); + + if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) + hugepage_clear_mixed(slot, gfn, level); + else + hugepage_set_mixed(slot, gfn, level); + } + } +} +#endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2139f728aecc..761e3974edad 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -12832,6 +12832,10 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, } } +#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES + kvm_mmu_init_memslot_memory_attributes(kvm, slot); +#endif + if (kvm_page_track_create_memslot(kvm, slot, npages)) goto out_free; -- Gitee From 7dc7f6fbf64c9f27d10169867f96704ef582f7dd Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 3 Dec 2024 10:04:41 -0800 Subject: [PATCH 15/21] perf/x86: Relax privilege filter restriction on AMD IBS mainline inclusion from mainline-v6.14-rc1 commit d29e744c71673a71da8f8522799ee02744cad6c9 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/d29e744c71673a71da8f8522799ee02744cad6c9 -------------------------------- commit d29e744c71673a71da8f8522799ee02744cad6c9 upstream While IBS is available for per-thread profiling, still regular users cannot open an event due to the default paranoid setting (2) which doesn't allow unprivileged users to get kernel samples. That means it needs to set exclude_kernel bit in the attribute but IBS driver would reject it since it has PERF_PMU_CAP_NO_EXCLUDE. This is not what we want and I've been getting requests to fix this issue. This should be done in the hardware, but until we get the HW fix we may allow exclude_{kernel,user,hv} in the attribute and silently drop the samples in the PMU IRQ handler. It won't guarantee the sampling frequency or even it'd miss some with fixed period too. Not ideal, but that'd still be helpful to regular users. 
To minimize the confusion, let's add 'swfilt' bit to attr.config2 which is exposed in the sysfs format directory so that users can figure out if the kernel support the privilege filters by software. $ perf record -e ibs_op/swfilt=1/u true This uses perf_exclude_event() which checks regs->cs. But it should be fine because set_linear_ip() also updates the CS according to the RIP provided by IBS. Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Tested-by: Ravi Bangoria Reviewed-by: Ravi Bangoria Link: https://lore.kernel.org/r/20241203180441.1634709-3-namhyung@kernel.org Signed-off-by: PvsNarasimha --- arch/x86/events/amd/ibs.c | 59 +++++++++++++++++++++++++++------------ 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 64d7b125d0f6..454d2aa61ae0 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -28,6 +28,8 @@ static u32 ibs_caps; #include #include +/* attr.config2 */ +#define IBS_SW_FILTER_MASK 1 /* * IBS states: @@ -296,6 +298,16 @@ static int perf_ibs_init(struct perf_event *event) if (has_branch_stack(event)) return -EOPNOTSUPP; + /* handle exclude_{user,kernel} in the IRQ handler */ + if (event->attr.exclude_host || event->attr.exclude_guest || + event->attr.exclude_idle) + return -EINVAL; + + if (!(event->attr.config2 & IBS_SW_FILTER_MASK) && + (event->attr.exclude_kernel || event->attr.exclude_user || + event->attr.exclude_hv)) + return -EINVAL; + ret = validate_group(event); if (ret) return ret; @@ -604,24 +616,14 @@ static struct attribute *attrs_empty[] = { NULL, }; -static struct attribute_group empty_format_group = { - .name = "format", - .attrs = attrs_empty, -}; - static struct attribute_group empty_caps_group = { .name = "caps", .attrs = attrs_empty, }; -static const struct attribute_group *empty_attr_groups[] = { - &empty_format_group, - &empty_caps_group, - NULL, -}; - PMU_FORMAT_ATTR(rand_en, "config:57"); PMU_FORMAT_ATTR(cnt_ctl, "config:19"); 
+PMU_FORMAT_ATTR(swfilt, "config2:0"); PMU_EVENT_ATTR_STRING(l3missonly, fetch_l3missonly, "config:59"); PMU_EVENT_ATTR_STRING(l3missonly, op_l3missonly, "config:16"); PMU_EVENT_ATTR_STRING(ldlat, ibs_op_ldlat_format, "config1:0-11"); @@ -647,8 +649,9 @@ ibs_op_dtlb_pgsize_is_visible(struct kobject *kobj, struct attribute *attr, int return ibs_caps & IBS_CAPS_OPDTLBPGSIZE ? attr->mode : 0; } -static struct attribute *rand_en_attrs[] = { +static struct attribute *fetch_attrs[] = { &format_attr_rand_en.attr, + &format_attr_swfilt.attr, NULL, }; @@ -672,9 +675,9 @@ static struct attribute *ibs_op_dtlb_pgsize_cap_attrs[] = { NULL, }; -static struct attribute_group group_rand_en = { +static struct attribute_group group_fetch_formats = { .name = "format", - .attrs = rand_en_attrs, + .attrs = fetch_attrs, }; static struct attribute_group group_fetch_l3missonly = { @@ -702,7 +705,7 @@ static struct attribute_group group_ibs_op_dtlb_pgsize_cap = { }; static const struct attribute_group *fetch_attr_groups[] = { - &group_rand_en, + &group_fetch_formats, &empty_caps_group, NULL, }; @@ -719,6 +722,11 @@ cnt_ctl_is_visible(struct kobject *kobj, struct attribute *attr, int i) return ibs_caps & IBS_CAPS_OPCNT ? 
attr->mode : 0; } +static struct attribute *op_attrs[] = { + &format_attr_swfilt.attr, + NULL, +}; + static struct attribute *cnt_ctl_attrs[] = { &format_attr_cnt_ctl.attr, NULL, @@ -734,6 +742,11 @@ static struct attribute *ibs_op_ldlat_format_attrs[] = { NULL, }; +static struct attribute_group group_op_formats = { + .name = "format", + .attrs = op_attrs, +}; + static struct attribute_group group_cnt_ctl = { .name = "format", .attrs = cnt_ctl_attrs, @@ -752,6 +765,12 @@ static struct attribute_group group_ibs_op_ldlat_format = { .is_visible = ibs_op_ldlat_is_visible, }; +static const struct attribute_group *op_attr_groups[] = { + &group_op_formats, + &empty_caps_group, + NULL, +}; + static const struct attribute_group *op_attr_update[] = { &group_cnt_ctl, &group_op_l3missonly, @@ -772,7 +791,6 @@ static struct perf_ibs perf_ibs_fetch = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, - .capabilities = PERF_PMU_CAP_NO_EXCLUDE, .check_period = perf_ibs_check_period, }, .msr = MSR_AMD64_IBSFETCHCTL, @@ -798,7 +816,6 @@ static struct perf_ibs perf_ibs_op = { .start = perf_ibs_start, .stop = perf_ibs_stop, .read = perf_ibs_read, - .capabilities = PERF_PMU_CAP_NO_EXCLUDE, .check_period = perf_ibs_check_period, }, .msr = MSR_AMD64_IBSOPCTL, @@ -1250,6 +1267,12 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) regs.flags |= PERF_EFLAGS_EXACT; } + if ((event->attr.config2 & IBS_SW_FILTER_MASK) && + perf_exclude_event(event, &regs)) { + throttle = perf_event_account_interrupt(event); + goto out; + } + if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw = (struct perf_raw_record){ .frag = { @@ -1372,7 +1395,7 @@ static __init int perf_ibs_op_init(void) if (ibs_caps & IBS_CAPS_ZEN4) perf_ibs_op.config_mask |= IBS_OP_L3MISSONLY; - perf_ibs_op.pmu.attr_groups = empty_attr_groups; + perf_ibs_op.pmu.attr_groups = op_attr_groups; perf_ibs_op.pmu.attr_update = op_attr_update; return perf_ibs_pmu_init(&perf_ibs_op, "ibs_op");
-- Gitee From 05ba2214ceb2402a13ddf673eb046cc4ff024330 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Mon, 17 Mar 2025 09:37:55 -0700 Subject: [PATCH 16/21] perf/x86: Check data address for IBS software filter mainline inclusion from mainline-v6.14 commit 65a99264f5e5a2bcc8c905f7b2d633e8991672ac category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/65a99264f5e5a2bcc8c905f7b2d633e8991672ac -------------------------------- commit 65a99264f5e5a2bcc8c905f7b2d633e8991672ac upstream The IBS software filter is filtering kernel samples for regular users in the PMI handler. It checks the instruction address in the IBS register to determine if it was in kernel mode or not. But it turns out that it's possible to report a kernel data address even if the instruction address belongs to user-space. Matteo Rizzo found that when an instruction raises an exception, IBS can report some kernel data addresses like IDT while holding the faulting instruction's RIP. To prevent an information leak, it should double check if the data address in PERF_SAMPLE_DATA is in the kernel space as well. 
[ mingo: Clarified the changelog ] Suggested-by: Matteo Rizzo Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250317163755.1842589-1-namhyung@kernel.org Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/events/amd/ibs.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 454d2aa61ae0..1053fdc97a6e 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -1267,8 +1267,13 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) regs.flags |= PERF_EFLAGS_EXACT; } + if (perf_ibs == &perf_ibs_op) + perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); + if ((event->attr.config2 & IBS_SW_FILTER_MASK) && - perf_exclude_event(event, &regs)) { + (perf_exclude_event(event, &regs) || + ((data.sample_flags & PERF_SAMPLE_ADDR) && + event->attr.exclude_kernel && kernel_ip(data.addr)))) { throttle = perf_event_account_interrupt(event); goto out; } @@ -1283,9 +1288,6 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_sample_save_raw_data(&data, event, &raw); } - if (perf_ibs == &perf_ibs_op) - perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); - /* * rip recorded by IbsOpRip will not be consistent with rsp and rbp * recorded as part of interrupt regs.
Thus we need to use rip from -- Gitee From dd25aa63fc43bee7ab99346a32cde717bb84f741 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Sat, 22 Mar 2025 08:13:01 +0100 Subject: [PATCH 17/21] perf/amd/ibs: Prevent leaking sensitive data to userspace mainline inclusion from mainline-v6.14 commit 50a53b60e141d7e31368a87e222e4dd5597bd4ae category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/50a53b60e141d7e31368a87e222e4dd5597bd4ae -------------------------------- commit 50a53b60e141d7e31368a87e222e4dd5597bd4ae upstream Although IBS "swfilt" can prevent leaking samples with kernel RIP to the userspace, there are few subtle cases where a 'data' address and/or a 'branch target' address can fall under kernel address range although RIP is from userspace. Prevent leaking kernel 'data' addresses by discarding such samples when {exclude_kernel=1,swfilt=1}. IBS can now be invoked by unprivileged user with the introduction of "swfilt". However, this creates a loophole in the interface where an unprivileged user can get physical address of the userspace virtual addresses through IBS register raw dump (PERF_SAMPLE_RAW). Prevent this as well. This upstream commit fixed the most obvious leak: 65a99264f5e5 perf/x86: Check data address for IBS software filter Follow that up with a more complete fix. 
Fixes: d29e744c7167 ("perf/x86: Relax privilege filter restriction on AMD IBS") Suggested-by: Matteo Rizzo Co-developed-by: Ravi Bangoria Signed-off-by: Namhyung Kim Signed-off-by: Ravi Bangoria Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250321161251.1033-1-ravi.bangoria@amd.com Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/x86/events/amd/ibs.c | 84 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 78 insertions(+), 6 deletions(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 1053fdc97a6e..aef64b02d86a 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -1054,6 +1054,8 @@ static void perf_ibs_get_mem_lock(union ibs_op_data3 *op_data3, data_src->mem_lock = PERF_MEM_LOCK_LOCKED; } +/* Be careful. Works only for contiguous MSRs. */ +#define ibs_fetch_msr_idx(msr) (msr - MSR_AMD64_IBSFETCHCTL) #define ibs_op_msr_idx(msr) (msr - MSR_AMD64_IBSOPCTL) static void perf_ibs_get_data_src(struct perf_ibs_data *ibs_data, @@ -1159,6 +1161,67 @@ static int perf_ibs_get_offset_max(struct perf_ibs *perf_ibs, return 1; } +static bool perf_ibs_is_kernel_data_addr(struct perf_event *event, + struct perf_ibs_data *ibs_data) +{ + u64 sample_type_mask = PERF_SAMPLE_ADDR | PERF_SAMPLE_RAW; + union ibs_op_data3 op_data3; + u64 dc_lin_addr; + + op_data3.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)]; + dc_lin_addr = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCLINAD)]; + + return unlikely((event->attr.sample_type & sample_type_mask) && + op_data3.dc_lin_addr_valid && kernel_ip(dc_lin_addr)); +} + +static bool perf_ibs_is_kernel_br_target(struct perf_event *event, + struct perf_ibs_data *ibs_data, + int br_target_idx) +{ + union ibs_op_data op_data; + u64 br_target; + + op_data.val = ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA)]; + br_target = ibs_data->regs[br_target_idx]; + + return unlikely((event->attr.sample_type & PERF_SAMPLE_RAW) && + 
op_data.op_brn_ret && kernel_ip(br_target)); +} + +static bool perf_ibs_swfilt_discard(struct perf_ibs *perf_ibs, struct perf_event *event, + struct pt_regs *regs, struct perf_ibs_data *ibs_data, + int br_target_idx) +{ + if (perf_exclude_event(event, regs)) + return true; + + if (perf_ibs != &perf_ibs_op || !event->attr.exclude_kernel) + return false; + + if (perf_ibs_is_kernel_data_addr(event, ibs_data)) + return true; + + if (br_target_idx != -1 && + perf_ibs_is_kernel_br_target(event, ibs_data, br_target_idx)) + return true; + + return false; +} + +static void perf_ibs_phyaddr_clear(struct perf_ibs *perf_ibs, + struct perf_ibs_data *ibs_data) +{ + if (perf_ibs == &perf_ibs_op) { + ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSOPDATA3)] &= ~(1ULL << 18); + ibs_data->regs[ibs_op_msr_idx(MSR_AMD64_IBSDCPHYSAD)] = 0; + return; + } + + ibs_data->regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHCTL)] &= ~(1ULL << 52); + ibs_data->regs[ibs_fetch_msr_idx(MSR_AMD64_IBSFETCHPHYSAD)] = 0; +} + static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) { struct cpu_perf_ibs *pcpu = this_cpu_ptr(perf_ibs->pcpu); @@ -1171,6 +1234,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) int offset, size, check_rip, offset_max, throttle = 0; unsigned int msr; u64 *buf, *config, period, new_config = 0; + int br_target_idx = -1; if (!test_bit(IBS_STARTED, pcpu->state)) { fail: @@ -1241,6 +1305,7 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) if (perf_ibs == &perf_ibs_op) { if (ibs_caps & IBS_CAPS_BRNTRGT) { rdmsrl(MSR_AMD64_IBSBRTARGET, *buf++); + br_target_idx = size; size++; } if (ibs_caps & IBS_CAPS_OPDATA4) { @@ -1267,16 +1332,20 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) regs.flags |= PERF_EFLAGS_EXACT; } - if (perf_ibs == &perf_ibs_op) - perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); - if ((event->attr.config2 & IBS_SW_FILTER_MASK) && 
- (perf_exclude_event(event, &regs) || - ((data.sample_flags & PERF_SAMPLE_ADDR) && - event->attr.exclude_kernel && kernel_ip(data.addr)))) { + perf_ibs_swfilt_discard(perf_ibs, event, &regs, &ibs_data, br_target_idx)) { throttle = perf_event_account_interrupt(event); goto out; } + /* + * Prevent leaking physical addresses to unprivileged users. Skip + * PERF_SAMPLE_PHYS_ADDR check since generic code prevents it for + * unprivileged users. + */ + if ((event->attr.sample_type & PERF_SAMPLE_RAW) && + perf_allow_kernel(&event->attr)) { + perf_ibs_phyaddr_clear(perf_ibs, &ibs_data); + } if (event->attr.sample_type & PERF_SAMPLE_RAW) { raw = (struct perf_raw_record){ @@ -1288,6 +1357,9 @@ static int perf_ibs_handle_irq(struct perf_ibs *perf_ibs, struct pt_regs *iregs) perf_sample_save_raw_data(&data, event, &raw); } + if (perf_ibs == &perf_ibs_op) + perf_ibs_parse_ld_st_data(event->attr.sample_type, &ibs_data, &data); + /* * rip recorded by IbsOpRip will not be consistent with rsp and rbp * recorded as part of interrupt regs. Thus we need to use rip from -- Gitee From 208f562cb3b4312d68b1793ad1742a44171f5c4e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 27 Feb 2025 14:24:09 -0800 Subject: [PATCH 18/21] KVM: SVM: Manually context switch DEBUGCTL if LBR virtualization is disabled mainline inclusion from mainline-v6.14-rc6 commit 433265870ab3455b418885bff48fa5fd02f7e448 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/433265870ab3455b418885bff48fa5fd02f7e448 -------------------------------- commit 433265870ab3455b418885bff48fa5fd02f7e448 upstream Manually load the guest's DEBUGCTL prior to VMRUN (and restore the host's value on #VMEXIT) if it diverges from the host's value and LBR virtualization is disabled, as hardware only context switches DEBUGCTL if LBR virtualization is fully enabled.
Running the guest with the host's value has likely been mildly problematic for quite some time, e.g. it will result in undesirable behavior if BTF diverges (with the caveat that KVM now suppresses guest BTF due to lack of support). But the bug became fatal with the introduction of Bus Lock Trap ("Detect" in kernel parlance) support for AMD (commit 408eb7417a92 ("x86/bus_lock: Add support for AMD")), as a bus lock in the guest will trigger an unexpected #DB. Note, suppressing the bus lock #DB, i.e. simply resuming the guest without injecting a #DB, is not an option. It wouldn't address the general issue with DEBUGCTL, e.g. for things like BTF, and there are other guest-visible side effects if BusLockTrap is left enabled. If BusLockTrap is disabled, then DR6.BLD is reserved-to-1; any attempts to clear it by software are ignored. But if BusLockTrap is enabled, software can clear DR6.BLD: Software enables bus lock trap by setting DebugCtl MSR[BLCKDB] (bit 2) to 1. When bus lock trap is enabled, ... The processor indicates that this #DB was caused by a bus lock by clearing DR6[BLD] (bit 11). DR6[11] previously had been defined to be always 1. and clearing DR6.BLD is "sticky" in that it's not set (i.e. lowered) by other #DBs: All other #DB exceptions leave DR6[BLD] unmodified E.g. leaving BusLockTrap enabled can confuse a legacy guest that writes '0' to reset DR6.
[Backport_change] To fix the compilation error, add the host_debugctl member to the struct kvm_vcpu_arch in arch/x86/include/asm/kvm_host.h Reported-by: rangemachine@gmail.com Reported-by: whanos@sergal.fun Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219787 Closes: https://lore.kernel.org/all/bug-219787-28872@https.bugzilla.kernel.org%2F Cc: Ravi Bangoria Cc: stable@vger.kernel.org Reviewed-and-tested-by: Ravi Bangoria Link: https://lore.kernel.org/r/20250227222411.3490595-5-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/svm/svm.c | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 174ae3458a13..227f3ad4a06d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -734,6 +734,7 @@ struct kvm_vcpu_arch { u32 pkru; u32 hflags; u64 efer; + u64 host_debugctl; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ bool load_eoi_exitmap_pending; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 7bf68a753a97..9652c2b96d5a 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4300,6 +4300,16 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) clgi(); kvm_load_guest_xsave_state(vcpu); + /* + * Hardware only context switches DEBUGCTL if LBR virtualization is + * enabled. Manually load DEBUGCTL if necessary (and restore it after + * VM-Exit), as running with the host's DEBUGCTL can negatively affect + * guest state and can even be fatal, e.g. due to Bus Lock Detect. 
+ */ + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(svm->vmcb->save.dbgctl); + kvm_wait_lapic_expire(vcpu); /* @@ -4327,6 +4337,10 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); + if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && + vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(vcpu->arch.host_debugctl); + kvm_load_host_xsave_state(vcpu); stgi(); -- Gitee From 47dbd85e31c88f4aa4bf030c58fe9fed259a2565 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 14 Mar 2024 14:29:02 -0700 Subject: [PATCH 19/21] KVM: x86/mmu: x86: Don't overflow lpage_info when checking attributes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit mainline inclusion from mainline-v6.9-rc5 commit 992b54bd083c5bee24ff7cc35991388ab08598c4 category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/992b54bd083c5bee24ff7cc35991388ab08598c4 -------------------------------- commit 992b54bd083c5bee24ff7cc35991388ab08598c4 upstream Fix KVM_SET_MEMORY_ATTRIBUTES to not overflow lpage_info array and trigger KASAN splat, as seen in the private_mem_conversions_test selftest. When memory attributes are set on a GFN range, that range will have specific properties applied to the TDP. A huge page cannot be used when the attributes are inconsistent, so they are disabled for those the specific huge pages. For internal KVM reasons, huge pages are also not allowed to span adjacent memslots regardless of whether the backing memory could be mapped as huge. What GFNs support which huge page sizes is tracked by an array of arrays 'lpage_info' on the memslot, of ‘kvm_lpage_info’ structs. 
Each index of lpage_info contains a vmalloc allocated array of these for a specific supported page size. The kvm_lpage_info denotes whether a specific huge page (GFN and page size) on the memslot is supported. These arrays include indices for unaligned head and tail huge pages. Preventing huge pages from spanning adjacent memslot is covered by incrementing the count in head and tail kvm_lpage_info when the memslot is allocated, but disallowing huge pages for memory that has mixed attributes has to be done in a more complicated way. During the KVM_SET_MEMORY_ATTRIBUTES ioctl KVM updates lpage_info for each memslot in the range that has mismatched attributes. KVM does this a memslot at a time, and marks a special bit, KVM_LPAGE_MIXED_FLAG, in the kvm_lpage_info for any huge page. This bit is essentially a permanently elevated count. So huge pages will not be mapped for the GFN at that page size if the count is elevated in either case: a huge head or tail page unaligned to the memslot or if KVM_LPAGE_MIXED_FLAG is set because it has mixed attributes. To determine whether a huge page has consistent attributes, the KVM_SET_MEMORY_ATTRIBUTES operation checks an xarray to make sure it consistently has the incoming attribute. Since level - 1 huge pages are aligned to level huge pages, it employs an optimization. As long as the level - 1 huge pages are checked first, it can just check these and assume that if each level - 1 huge page contained within the level sized huge page is not mixed, then the level size huge page is not mixed. This optimization happens in the helper hugepage_has_attrs(). Unfortunately, although the kvm_lpage_info array representing page size 'level' will contain an entry for an unaligned tail page of size level, the array for level - 1 will not contain an entry for each GFN at page size level. The level - 1 array will only contain an index for any unaligned region covered by level - 1 huge page size, which can be a smaller region. 
So this causes the optimization to overflow the level - 1 kvm_lpage_info and perform a vmalloc out of bounds read. In some cases of head and tail pages where an overflow could happen, callers skip the operation completely as KVM_LPAGE_MIXED_FLAG is not required to prevent huge pages as discussed earlier. But for memslots that are smaller than the 1GB page size, it does call hugepage_has_attrs(). In this case the huge page is both the head and tail page. The issue can be observed simply by compiling the kernel with CONFIG_KASAN_VMALLOC and running the selftest “private_mem_conversions_test”, which produces the output like the following: BUG: KASAN: vmalloc-out-of-bounds in hugepage_has_attrs+0x7e/0x110 Read of size 4 at addr ffffc900000a3008 by task private_mem_con/169 Call Trace: dump_stack_lvl print_report ? __virt_addr_valid ? hugepage_has_attrs ? hugepage_has_attrs kasan_report ? hugepage_has_attrs hugepage_has_attrs kvm_arch_post_set_memory_attributes kvm_vm_ioctl It is a little ambiguous whether the unaligned head page (in the bug case also the tail page) should be expected to have KVM_LPAGE_MIXED_FLAG set. It is not functionally required, as the unaligned head/tail pages will already have their kvm_lpage_info count incremented. The comments imply not setting it on unaligned head pages is intentional, so fix the callers to skip trying to set KVM_LPAGE_MIXED_FLAG in this case, and in doing so not call hugepage_has_attrs(). 
Cc: stable@vger.kernel.org Fixes: 90b4fe17981e ("KVM: x86: Disallow hugepages when memory attributes are mixed") Signed-off-by: Rick Edgecombe Reviewed-by: Kai Huang Reviewed-by: Chao Peng Link: https://lore.kernel.org/r/20240314212902.2762507-1-rick.p.edgecombe@intel.com Signed-off-by: Sean Christopherson Signed-off-by: PvsNarasimha --- arch/x86/kvm/mmu/mmu.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index dda56a31f585..83b3b8130c4b 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -7253,7 +7253,8 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, * by the memslot, KVM can't use a hugepage due to the * misaligned address regardless of memory attributes. */ - if (gfn >= slot->base_gfn) { + if (gfn >= slot->base_gfn && + gfn + nr_pages <= slot->base_gfn + slot->npages) { if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) hugepage_clear_mixed(slot, gfn, level); else -- Gitee From f01c4f7a1c0bba262b3ab4e602c04414a3c8de18 Mon Sep 17 00:00:00 2001 From: "PVS.NarasimhaRao" Date: Wed, 29 Oct 2025 14:43:00 +0800 Subject: [PATCH 20/21] kvm: Fix KABI breakage caused by addition of host_debugctl member hulk inclusion category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA -------------------------------- Commit 7ecc7cfb129963 introduced a new member `host_debugctl` in `struct kvm_vcpu_arch`, which resulted in a KABI (Kernel ABI) compatibility breakage due to changes in the structure layout. To restore KABI compatibility, this patch introduces an extended structure `struct kvm_vcpu_arch_ext` to contain the `host_debugctl` member. The extended structure is referenced via a pointer (`struct kvm_vcpu_arch_ext *arch_ext`) inside `struct kvm_vcpu` defined in `include/linux/kvm_host.h`. The `KABI_EXTEND` macro is applied to ensure binary compatibility is preserved across kernel versions without altering the existing KABI layout. 
This change resolves the build and KABI check failures observed on non-x86 architectures. Fixes: KVM: SVM: Manually context switch DEBUGCTL if LBR virtualization is disabled ("kvm: Add host_debugctl member to kvm_vcpu_arch") Signed-off-by: PVS.NarasimhaRao --- arch/x86/include/asm/kvm_host.h | 5 ++++- arch/x86/kvm/svm/svm.c | 6 +++--- include/linux/kvm_host.h | 1 + 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 227f3ad4a06d..1554cd123c72 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -734,7 +734,6 @@ struct kvm_vcpu_arch { u32 pkru; u32 hflags; u64 efer; - u64 host_debugctl; u64 apic_base; struct kvm_lapic *apic; /* kernel irqchip context */ bool load_eoi_exitmap_pending; @@ -1020,6 +1019,10 @@ struct kvm_vcpu_arch { #endif }; +struct kvm_vcpu_arch_ext { + u64 host_debugctl; +}; + struct kvm_lpage_info { int disallow_lpage; }; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9652c2b96d5a..40225de3322f 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4307,7 +4307,7 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) * guest state and can even be fatal, e.g. due to Bus Lock Detect. 
*/ if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && - vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) + vcpu->arch_ext->host_debugctl != svm->vmcb->save.dbgctl) update_debugctlmsr(svm->vmcb->save.dbgctl); kvm_wait_lapic_expire(vcpu); @@ -4338,8 +4338,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); if (!(svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK) && - vcpu->arch.host_debugctl != svm->vmcb->save.dbgctl) - update_debugctlmsr(vcpu->arch.host_debugctl); + vcpu->arch_ext->host_debugctl != svm->vmcb->save.dbgctl) + update_debugctlmsr(vcpu->arch_ext->host_debugctl); kvm_load_host_xsave_state(vcpu); stgi(); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 070fda6c98ca..b5782c95eacc 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -399,6 +399,7 @@ struct kvm_vcpu { */ struct kvm_memory_slot *last_used_slot; u64 last_used_slot_gen; + KABI_EXTEND(struct kvm_vcpu_arch_ext *arch_ext) }; /* -- Gitee From 61a8ca29b5605894bb4fdf48aa880dbb03b9f96b Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 3 Dec 2024 10:04:40 -0800 Subject: [PATCH 21/21] perf/core: Export perf_exclude_event() mainline inclusion from mainline-v6.14-rc1 commit 6057b90ecc84f232dd32a047a086a4c4c271765f category: feature bugzilla: https://gitee.com/openeuler/kernel/issues/ID0OQX CVE: NA Reference: https://github.com/torvalds/linux/commit/6057b90ecc84f232dd32a047a086a4c4c271765f -------------------------------- commit 6057b90ecc84f232dd32a047a086a4c4c271765f upstream While at it, rename the same function in s390 cpum_sf PMU. 
Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Tested-by: Ravi Bangoria Reviewed-by: Ravi Bangoria Acked-by: Thomas Richter Link: https://lore.kernel.org/r/20241203180441.1634709-2-namhyung@kernel.org Signed-off-by: Arukonda Rahul Signed-off-by: PvsNarasimha --- arch/s390/kernel/perf_cpum_sf.c | 6 +++--- include/linux/perf_event.h | 6 ++++++ kernel/events/core.c | 3 +-- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index e52c89739bc9..928333a0a8cf 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -1074,7 +1074,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu) cpuhw->flags &= ~PMU_F_ENABLED; } -/* perf_exclude_event() - Filter event +/* perf_event_exclude() - Filter event * @event: The perf event * @regs: pt_regs structure * @sde_regs: Sample-data-entry (sde) regs structure @@ -1083,7 +1083,7 @@ static void cpumsf_pmu_disable(struct pmu *pmu) * * Return non-zero if the event shall be excluded. 
*/ -static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs, +static int perf_event_exclude(struct perf_event *event, struct pt_regs *regs, struct perf_sf_sde_regs *sde_regs) { if (event->attr.exclude_user && user_mode(regs)) @@ -1166,7 +1166,7 @@ static int perf_push_sample(struct perf_event *event, data.tid_entry.pid = basic->hpp & LPP_PID_MASK; overflow = 0; - if (perf_exclude_event(event, &regs, sde_regs)) + if (perf_event_exclude(event, &regs, sde_regs)) goto out; if (perf_event_overflow(event, &data, &regs)) { overflow = 1; diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 593fa001b092..cb500df1aa7c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1703,6 +1703,8 @@ static inline int perf_allow_tracepoint(struct perf_event_attr *attr) return security_perf_event_open(attr, PERF_SECURITY_TRACEPOINT); } +extern int perf_exclude_event(struct perf_event *event, struct pt_regs *regs); + extern void perf_event_init(void); extern void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, struct pt_regs *regs, @@ -1878,6 +1880,10 @@ static inline u64 perf_event_pause(struct perf_event *event, bool reset) { return 0; } +static inline int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) +{ + return 0; +} #endif #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_CPU_SUP_INTEL) diff --git a/kernel/events/core.c b/kernel/events/core.c index d9f2c02e9b00..6522b0a3f25f 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -9799,8 +9799,7 @@ static void perf_swevent_event(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, data, regs); } -static int perf_exclude_event(struct perf_event *event, - struct pt_regs *regs) +int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { if (event->hw.state & PERF_HES_STOPPED) return 1; -- Gitee