From 459fcf8352b7e18a8c50a9149a80cc81fa198394 Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Thu, 1 Oct 2020 06:57:46 -0700
Subject: [PATCH 01/13] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE

ANBZ: #1117

commit 8d97e71811aaafe4abf611dc24822fd6e73df1a1 upstream

Current perf can report both virtual addresses and physical addresses,
but not the MMU page size. Without the MMU page size information of the
utilized page, users cannot decide whether to promote/demote large
pages to optimize memory usage.

Add a new sample type for the data MMU page size.

Current perf already has a facility to collect data virtual addresses.
A page walker is required to walk the page tables and calculate the MMU
page size from a given virtual address.

On some platforms, e.g., X86, the page walker is invoked in an NMI
handler. So the page walker must be NMI-safe and low overhead. Besides,
the page walker should work for both user and kernel virtual addresses.
The existing generic page walker, e.g., walk_page_range_novma(), is a
little bit complex and doesn't guarantee NMI safety. follow_page() only
works on user virtual addresses.

Add a new function perf_get_page_size() to walk the page tables and
calculate the MMU page size. In the function:
- Interrupts have to be disabled to prevent any teardown of the page
  tables.
- For user space threads, current->mm is used for the page walker. For
  kernel threads and the like, current->mm is NULL, so init_mm is used
  for the page walker instead. The active_mm is not used here, because
  it can be NULL.
  Quote from Peter Zijlstra,
  "context_switch() can set prev->active_mm to NULL when it transfers
   it to @next. It does this before @current is updated. So an NMI that
   comes in between this active_mm swizzling and updating @current will
   see !active_mm."
- The MMU page size is calculated from the page table level.

The method should work for all architectures, but it has only been
verified on X86. Should there be an architecture which supports perf
but on which the method doesn't work, it can be fixed later separately.
Reporting the wrong page size would not be fatal for the architecture.

Some features still under discussion may impact the method in the
future.
Quote from Dave Hansen,
  "There are lots of weird things folks are trying to do with the page
   tables, like Address Space Isolation. For instance, if you get a
   perf NMI when running userspace, current->mm->pgd is *different*
   than the PGD that was in use when userspace was running. It's close
   enough today, but it might not stay that way."
If that case happens later, lots of consecutive page walk errors will
occur. The worst case is that lots of page-size '0' samples are
returned, which would not be fatal. In the perf tool, a check is
implemented to detect this case. Once it happens, a kernel patch can be
implemented accordingly.
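For illustration only (not part of this patch), here is a minimal
user-space sketch of requesting the new sample type, assuming a uapi
header with this patch applied; the raw event 0x1cd and the sample
period are hypothetical placeholders. PERF_SAMPLE_ADDR is set so the
sampled data address is dumped as well; per the perf_prepare_sample()
comment below, the kernel resolves data->addr internally either way:

    #include <linux/perf_event.h>
    #include <string.h>
    #include <sys/syscall.h>
    #include <unistd.h>

    static int open_data_page_size_event(void)
    {
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_RAW;
        attr.config = 0x1cd;        /* hypothetical mem-load event, CPU specific */
        attr.sample_period = 10007;
        attr.precise_ip = 2;
        attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_ADDR |
                           PERF_SAMPLE_DATA_PAGE_SIZE;

        /* one u64 MMU page size then follows each PERF_RECORD_SAMPLE */
        return syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
    }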
Suggested-by: Peter Zijlstra Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201001135749.2804-2-kan.liang@linux.intel.com Signed-off-by: Guanjun Acked-by: Zelin Deng --- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 2 + kernel/events/core.c | 103 ++++++++++++++++++++++++++++++++ 3 files changed, 106 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index dc71ab9fa646..bd19771b3bea 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1023,6 +1023,7 @@ struct perf_sample_data { u64 phys_addr; u64 cgroup; + u64 data_page_size; } ____cacheline_aligned; /* default value for data source */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index 1105da8d63a6..fb2e82946075 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -143,6 +143,7 @@ enum perf_event_sample_format { PERF_SAMPLE_PHYS_ADDR = 1U << 19, PERF_SAMPLE_AUX = 1U << 20, PERF_SAMPLE_CGROUP = 1U << 21, + PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ @@ -915,6 +916,7 @@ enum perf_event_type { * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR * { u64 size; * char data[size]; } && PERF_SAMPLE_AUX + * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE * }; */ PERF_RECORD_SAMPLE = 9, diff --git a/kernel/events/core.c b/kernel/events/core.c index 604dbe2ab969..42bb89a8f167 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "internal.h" @@ -1948,6 +1949,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_CGROUP) size += sizeof(data->cgroup); + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + size += sizeof(data->data_page_size); + event->header_size = size; } @@ -7038,6 +7042,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_CGROUP) perf_output_put(handle, data->cgroup); + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + perf_output_put(handle, data->data_page_size); + if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); @@ -7095,6 +7102,94 @@ static u64 perf_virt_to_phys(u64 virt) return phys_addr; } +#ifdef CONFIG_MMU + +/* + * Return the MMU page size of a given virtual address + */ +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset(mm, addr); + if (pgd_none(*pgd)) + return 0; + + p4d = p4d_offset(pgd, addr); + if (!p4d_present(*p4d)) + return 0; + + if (p4d_leaf(*p4d)) + return 1ULL << P4D_SHIFT; + + pud = pud_offset(p4d, addr); + if (!pud_present(*pud)) + return 0; + + if (pud_leaf(*pud)) + return 1ULL << PUD_SHIFT; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + return 0; + + if (pmd_leaf(*pmd)) + return 1ULL << PMD_SHIFT; + + pte = pte_offset_map(pmd, addr); + if (!pte_present(*pte)) { + pte_unmap(pte); + return 0; + } + + pte_unmap(pte); + return PAGE_SIZE; +} + +#else + +static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr) +{ + return 0; +} + +#endif + +static u64 perf_get_page_size(unsigned long addr) +{ + struct mm_struct *mm; + unsigned long flags; + u64 size; + + if (!addr) + return 0; + + /* + * Software page-table walkers must disable IRQs, + * which prevents any tear down of the page tables. 
+ */ + local_irq_save(flags); + + mm = current->mm; + if (!mm) { + /* + * For kernel threads and the like, use init_mm so that + * we can find kernel memory. + */ + mm = &init_mm; + } + + size = __perf_get_page_size(mm, addr); + + local_irq_restore(flags); + + return size; +} + static struct perf_callchain_entry __empty_callchain = { .nr = 0, }; struct perf_callchain_entry * @@ -7249,6 +7344,14 @@ void perf_prepare_sample(struct perf_event_header *header, } #endif + /* + * PERF_DATA_PAGE_SIZE requires PERF_SAMPLE_ADDR. If the user doesn't + * require PERF_SAMPLE_ADDR, kernel implicitly retrieve the data->addr, + * but the value will not dump to the userspace. + */ + if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) + data->data_page_size = perf_get_page_size(data->addr); + if (sample_type & PERF_SAMPLE_AUX) { u64 size; -- Gitee From 9c52b1fc6c3bd4bbcb856b7b2fb7c163741040c8 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 1 Oct 2020 06:57:49 -0700 Subject: [PATCH 02/13] perf/core: Add support for PERF_SAMPLE_CODE_PAGE_SIZE ANBZ: #1117 commit 995f088efebe1eba0282a6ffa12411b37f8990c2 upstream When studying code layout, it is useful to capture the page size of the sampled code address. Add a new sample type for code page size. The new sample type requires collecting the ip. The code page size can be calculated from the NMI-safe perf_get_page_size(). For large PEBS, it's very unlikely that the mapping is gone for the earlier PEBS records. Enable the feature for the large PEBS. The worst case is that page-size '0' is returned. Signed-off-by: Kan Liang Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20201001135749.2804-5-kan.liang@linux.intel.com Signed-off-by: Guanjun Acked-by: Zelin Deng --- arch/x86/events/perf_event.h | 2 +- include/linux/perf_event.h | 1 + include/uapi/linux/perf_event.h | 2 ++ kernel/events/core.c | 11 ++++++++++- 4 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 1f4ed45608ef..644298b66879 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -134,7 +134,7 @@ struct amd_nb { PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \ - PERF_SAMPLE_PERIOD) + PERF_SAMPLE_PERIOD | PERF_SAMPLE_CODE_PAGE_SIZE) #define PEBS_GP_REGS \ ((1ULL << PERF_REG_X86_AX) | \ diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index bd19771b3bea..adcbf44ea947 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -1024,6 +1024,7 @@ struct perf_sample_data { u64 phys_addr; u64 cgroup; u64 data_page_size; + u64 code_page_size; } ____cacheline_aligned; /* default value for data source */ diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index fb2e82946075..e87bbe763d6a 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -144,6 +144,7 @@ enum perf_event_sample_format { PERF_SAMPLE_AUX = 1U << 20, PERF_SAMPLE_CGROUP = 1U << 21, PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, + PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ @@ -917,6 +918,7 @@ enum perf_event_type { * { u64 size; * char data[size]; } && PERF_SAMPLE_AUX * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE + * { u64 code_page_size;} && PERF_SAMPLE_CODE_PAGE_SIZE * }; */ PERF_RECORD_SAMPLE = 9, diff --git 
a/kernel/events/core.c b/kernel/events/core.c index 42bb89a8f167..339a84e83bdd 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1952,6 +1952,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type) if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) size += sizeof(data->data_page_size); + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + size += sizeof(data->code_page_size); + event->header_size = size; } @@ -7045,6 +7048,9 @@ void perf_output_sample(struct perf_output_handle *handle, if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) perf_output_put(handle, data->data_page_size); + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + perf_output_put(handle, data->code_page_size); + if (sample_type & PERF_SAMPLE_AUX) { perf_output_put(handle, data->aux_size); @@ -7225,7 +7231,7 @@ void perf_prepare_sample(struct perf_event_header *header, __perf_event_header__init_id(header, data, event); - if (sample_type & PERF_SAMPLE_IP) + if (sample_type & (PERF_SAMPLE_IP | PERF_SAMPLE_CODE_PAGE_SIZE)) data->ip = perf_instruction_pointer(regs); if (sample_type & PERF_SAMPLE_CALLCHAIN) { @@ -7352,6 +7358,9 @@ void perf_prepare_sample(struct perf_event_header *header, if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE) data->data_page_size = perf_get_page_size(data->addr); + if (sample_type & PERF_SAMPLE_CODE_PAGE_SIZE) + data->code_page_size = perf_get_page_size(data->ip); + if (sample_type & PERF_SAMPLE_AUX) { u64 size; -- Gitee From 2ed6fdd7dbaf030d947ea1faf25ca6f2cfeca1ac Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Mon, 30 Nov 2020 09:27:52 -0800 Subject: [PATCH 03/13] tools headers UAPI: Update tools's copy of linux/perf_event.h ANBZ: #1117 commit 47d982202f8cfaac6f208c9109fa15cb6a0181f7 upstream To get the changes in: commit 459fcf8352b7 ("perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE") commit 9c52b1fc6c3b ("perf/core: Add support for PERF_SAMPLE_CODE_PAGE_SIZE") This silences this perf tools build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/perf_event.h' differs from latest version at 'include/uapi/linux/perf_event.h' diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Signed-off-by: Kan Liang Acked-by: Namhyung Kim Cc: Andi Kleen Cc: Jiri Olsa Cc: Mark Rutland Cc: Michael Ellerman Cc: Stephane Eranian Cc: Will Deacon Link: http://lore.kernel.org/lkml/20201130172803.2676-2-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Guanjun Acked-by: Zelin Deng --- tools/include/uapi/linux/perf_event.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index b95d3c485d27..b15e3447cd9f 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -143,8 +143,10 @@ enum perf_event_sample_format { PERF_SAMPLE_PHYS_ADDR = 1U << 19, PERF_SAMPLE_AUX = 1U << 20, PERF_SAMPLE_CGROUP = 1U << 21, + PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, + PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, - PERF_SAMPLE_MAX = 1U << 22, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 24, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; @@ -896,6 +898,8 @@ enum perf_event_type { * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR * { u64 size; * char data[size]; } && PERF_SAMPLE_AUX + * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE + * { u64 code_page_size;} && PERF_SAMPLE_CODE_PAGE_SIZE * }; */ PERF_RECORD_SAMPLE = 9, -- Gitee From eae773688dae1c0fe58a6d30c8eeb72498c60901 Mon Sep 17 00:00:00 
2001 From: Kan Liang Date: Tue, 2 Feb 2021 12:09:05 -0800 Subject: [PATCH 04/13] tools headers uapi: Update tools's copy of linux/perf_event.h ANBZ: #1117 commit 81898ef1303d8fb5a3256b09b3140b4eee83dad8 upstream To get the changes in these csets: 09a6c94014383d3b ("perf/core: Add PERF_SAMPLE_WEIGHT_STRUCT") 8adb5167276855b8 ("perf/x86/intel: Add perf core PMU support for Sapphire Rapids") This cures the following warning during perf's build: Warning: Kernel ABI header at 'tools/include/uapi/linux/perf_event.h' differs from latest version at 'include/uapi/linux/perf_event.h' diff -u tools/include/uapi/linux/perf_event.h include/uapi/linux/perf_event.h Committer notes: Picked by hand as I had already merged the MMAP buildid patch that also touches perf_event.h and is also only in {acme,tip}/perf/core, not yet upstream. Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Jin Yao Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/1612296553-21962-2-git-send-email-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Guanjun Acked-by: Zelin Deng --- tools/include/uapi/linux/perf_event.h | 54 +++++++++++++++++++++++++-- 1 file changed, 50 insertions(+), 4 deletions(-) diff --git a/tools/include/uapi/linux/perf_event.h b/tools/include/uapi/linux/perf_event.h index b15e3447cd9f..7d292de51410 100644 --- a/tools/include/uapi/linux/perf_event.h +++ b/tools/include/uapi/linux/perf_event.h @@ -145,12 +145,14 @@ enum perf_event_sample_format { PERF_SAMPLE_CGROUP = 1U << 21, PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, + PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, - PERF_SAMPLE_MAX = 1U << 24, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ __PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */ }; +#define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) /* * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set * @@ -890,7 +892,24 @@ enum perf_event_type { * char data[size]; * u64 dyn_size; } && PERF_SAMPLE_STACK_USER * - * { u64 weight; } && PERF_SAMPLE_WEIGHT + * { union perf_sample_weight + * { + * u64 full; && PERF_SAMPLE_WEIGHT + * #if defined(__LITTLE_ENDIAN_BITFIELD) + * struct { + * u32 var1_dw; + * u16 var2_w; + * u16 var3_w; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #elif defined(__BIG_ENDIAN_BITFIELD) + * struct { + * u16 var3_w; + * u16 var2_w; + * u32 var1_dw; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #endif + * } + * } * { u64 data_src; } && PERF_SAMPLE_DATA_SRC * { u64 transaction; } && PERF_SAMPLE_TRANSACTION * { u64 abi; # enum perf_sample_regs_abi @@ -1127,14 +1146,16 @@ union perf_mem_data_src { mem_lvl_num:4, /* memory hierarchy level number */ mem_remote:1, /* remote */ mem_snoopx:2, /* snoop mode, ext */ - mem_rsvd:24; + mem_blk:3, /* access blocked */ + mem_rsvd:21; }; }; #elif defined(__BIG_ENDIAN_BITFIELD) union perf_mem_data_src { __u64 val; struct { - __u64 mem_rsvd:24, + __u64 mem_rsvd:21, + mem_blk:3, /* access blocked */ mem_snoopx:2, /* snoop mode, ext */ mem_remote:1, /* remote */ mem_lvl_num:4, /* memory hierarchy level number */ @@ -1217,6 +1238,12 @@ union perf_mem_data_src { #define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ #define PERF_MEM_TLB_SHIFT 26 +/* Access blocked */ +#define PERF_MEM_BLK_NA 0x01 /* not available */ +#define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */ +#define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ +#define PERF_MEM_BLK_SHIFT 40 + 
 #define PERF_MEM_S(a, s) \
	(((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT)
@@ -1248,4 +1275,23 @@ struct perf_branch_entry {
 		reserved:40;
 };
 
+union perf_sample_weight {
+	__u64		full;
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+	struct {
+		__u32	var1_dw;
+		__u16	var2_w;
+		__u16	var3_w;
+	};
+#elif defined(__BIG_ENDIAN_BITFIELD)
+	struct {
+		__u16	var3_w;
+		__u16	var2_w;
+		__u32	var1_dw;
+	};
+#else
+#error "Unknown endianness"
+#endif
+};
+
 #endif /* _UAPI_LINUX_PERF_EVENT_H */
-- 
Gitee

From 4e843e479840a8bb9d141093400e2005564bb55f Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Tue, 2 Feb 2021 12:09:06 -0800
Subject: [PATCH 05/13] perf tools: Support the auxiliary event

ANBZ: #1117

commit 2a57d40832dc8366bc517bcbbfdb1d7fb583735b upstream

On the Intel Sapphire Rapids server, an auxiliary event has to be
enabled simultaneously with the load latency event to retrieve complete
Memory Info.

Add an X86-specific perf_mem_events__name() to handle the auxiliary
event.
- Users are only interested in the samples of the mem-loads event, so
  the auxiliary event is read via sample read.
- The auxiliary event must be in front of the load latency event in a
  group. If the auxiliary event is the group leader, assume the second
  event in the group is the one to sample.
- Add a weak is_mem_loads_aux_event() to check for the auxiliary event
  on X86. For other architectures, it always returns false.

Parse the unique event name, mem-loads-aux, for the auxiliary event.

Committer notes:

According to 61b985e3e775a3a7 ("perf/x86/intel: Add perf core PMU
support for Sapphire Rapids"), ENODATA is only returned by
sys_perf_event_open() when used with these auxiliary events, with this
in evsel__open_strerror():

	case ENODATA:
		return scnprintf(msg, size, "Cannot collect data source with the load latency event alone. "
				 "Please add an auxiliary event in front of the load latency event.");

This is Ok at this point in time, but fragile long term; I pointed this
out in the e-mail thread, requesting a follow-up patch to check if
ENODATA is really for this specific case.

Fixed up sizeof(MEM_LOADS_AUX_NAME) bug pointed out by Namhyung.
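As a usage illustration (this command is not part of the patch, and
ldlat=30 assumes the tool's default load-latency threshold), the group
string built by MEM_LOADS_AUX_NAME expands to an event spec that can
also be opened by hand:

    $ perf record -e '{cpu/mem-loads-aux/,cpu/mem-loads,ldlat=30/pp}:S' -a -- sleep 1

The ':S' modifier enables sample read on the group, so mem-loads-aux
leads the group but the samples come from the second event, matching
the evsel__read_sampler() change below.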
Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Jin Yao Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210205152648.GC920417@kernel.org Link: http://lore.kernel.org/lkml/1612296553-21962-3-git-send-email-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo [guanjun: fix conflicts in tools/perf/arch/x86/util/Build, tools/perf/util/mem-events.h] Signed-off-by: Guanjun Acked-by: Zelin Deng --- tools/perf/arch/x86/util/Build | 1 + tools/perf/arch/x86/util/mem-events.c | 44 +++++++++++++++++++++++++++ tools/perf/util/evsel.c | 3 ++ tools/perf/util/mem-events.c | 5 +++ tools/perf/util/mem-events.h | 2 ++ tools/perf/util/parse-events.l | 1 + tools/perf/util/record.c | 5 ++- 7 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 tools/perf/arch/x86/util/mem-events.c diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build index 347c39b960eb..d73f548a6282 100644 --- a/tools/perf/arch/x86/util/Build +++ b/tools/perf/arch/x86/util/Build @@ -6,6 +6,7 @@ perf-y += perf_regs.o perf-y += topdown.o perf-y += machine.o perf-y += event.o +perf-y += mem-events.o perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o diff --git a/tools/perf/arch/x86/util/mem-events.c b/tools/perf/arch/x86/util/mem-events.c new file mode 100644 index 000000000000..588110fd8904 --- /dev/null +++ b/tools/perf/arch/x86/util/mem-events.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "util/pmu.h" +#include "map_symbol.h" +#include "mem-events.h" + +static char mem_loads_name[100]; +static bool mem_loads_name__init; + +#define MEM_LOADS_AUX 0x8203 +#define MEM_LOADS_AUX_NAME "{cpu/mem-loads-aux/,cpu/mem-loads,ldlat=%u/pp}:S" + +bool is_mem_loads_aux_event(struct evsel *leader) +{ + if (!pmu_have_event("cpu", "mem-loads-aux")) + return false; + + return leader->core.attr.config == MEM_LOADS_AUX; +} + +char *perf_mem_events__name(int i) +{ + struct perf_mem_event *e = perf_mem_events__ptr(i); + + if (!e) + return NULL; + + if (i == PERF_MEM_EVENTS__LOAD) { + if (mem_loads_name__init) + return mem_loads_name; + + mem_loads_name__init = true; + + if (pmu_have_event("cpu", "mem-loads-aux")) { + scnprintf(mem_loads_name, sizeof(mem_loads_name), + MEM_LOADS_AUX_NAME, perf_mem_events__loads_ldlat); + } else { + scnprintf(mem_loads_name, sizeof(mem_loads_name), + e->name, perf_mem_events__loads_ldlat); + } + return mem_loads_name; + } + + return (char *)e->name; +} diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 1a1cbd16d76d..65c3e82981c1 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2692,6 +2692,9 @@ int evsel__open_strerror(struct evsel *evsel, struct target *target, if (perf_missing_features.aux_output) return scnprintf(msg, size, "The 'aux_output' feature is not supported, update the kernel."); break; + case ENODATA: + return scnprintf(msg, size, "Cannot collect data source with the load latency event alone. 
" + "Please add an auxiliary event in front of the load latency event."); default: break; } diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index b174591479bf..a4e7b6cfeffe 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -56,6 +56,11 @@ char * __weak perf_mem_events__name(int i) return (char *)e->name; } +__weak bool is_mem_loads_aux_event(struct evsel *leader __maybe_unused) +{ + return false; +} + int perf_mem_events__parse(const char *str) { char *tok, *saveptr = NULL; diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index 4fb9870971e0..55d992f6430b 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -9,6 +9,7 @@ #include #include #include "stat.h" +#include "evsel.h" struct perf_mem_event { bool record; @@ -39,6 +40,7 @@ int perf_mem_events__init(void); char *perf_mem_events__name(int i); struct perf_mem_event *perf_mem_events__ptr(int i); +bool is_mem_loads_aux_event(struct evsel *leader); void perf_mem_events__list(void); diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 9db5097317f4..0b36285a9435 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -356,6 +356,7 @@ bpf-output { return sym(yyscanner, PERF_TYPE_SOFTWARE, PERF_COUNT_SW_BPF_OUT cycles-ct | cycles-t | mem-loads | +mem-loads-aux | mem-stores | topdown-[a-z-]+ | tx-capacity-[a-z-]+ | diff --git a/tools/perf/util/record.c b/tools/perf/util/record.c index 07e4b96a6625..c68800fae9ca 100644 --- a/tools/perf/util/record.c +++ b/tools/perf/util/record.c @@ -15,6 +15,8 @@ #include "record.h" #include "../perf-sys.h" #include "topdown.h" +#include "map_symbol.h" +#include "mem-events.h" /* * evsel__config_leader_sampling() uses special rules for leader sampling. @@ -25,7 +27,8 @@ static struct evsel *evsel__read_sampler(struct evsel *evsel, struct evlist *evl { struct evsel *leader = evsel->leader; - if (evsel__is_aux_event(leader) || arch_topdown_sample_read(leader)) { + if (evsel__is_aux_event(leader) || arch_topdown_sample_read(leader) || + is_mem_loads_aux_event(leader)) { evlist__for_each_entry(evlist, evsel) { if (evsel->leader == leader && evsel != evsel->leader) return evsel; -- Gitee From 02f01e687e263ca42a38b01169049e374acb76dd Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Feb 2021 12:09:07 -0800 Subject: [PATCH 06/13] perf tools: Support data block and addr block ANBZ: #1117 commit d9d5d767b2c006bbc1993ba3f2124d23ff515e32 upstream Two new data source fields, to indicate the block reasons of a load instruction, are introduced on the Intel Sapphire Rapids server. The fields can be used by the memory profiling. Add a new sort function, SORT_MEM_BLOCKED, for the two fields. For the previous platforms or the block reason is unknown, print "N/A" for the block reason. Add blocked as a default mem sort key for perf report and perf mem report. Committer testing: So in machines without this capability we get a "N/A" filling the new "Blocked" column: $ perf mem record ls arch certs CREDITS Documentation include ipc Kconfig lib MAINTAINERS mm samples security usr block COPYING crypto drivers fs init Kbuild kernel LICENSES Makefile net README scripts sound tools virt [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.008 MB perf.data (17 samples) ] $ $ perf mem report --stdio # To display the perf.data header info, please use --header/--header-only options. 
#
# Total Lost Samples: 0
#
# Samples: 6  of event 'cpu/mem-loads,ldlat=30/Pu'
# Total weight : 1381
# Sort order   : local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked
#
# Overhead  Samples  Local Weight  Memory access         Symbol                   Shared Object  Data Symbol             Data Object   Snoop  TLB access    Locked  Blocked
# ........  .......  ............  ....................  .......................  .............  ......................  ............  .....  ............  ......  .......
#
    32.87%        1  454           Local RAM or RAM hit  [.] _dl_relocate_object  ld-2.31.so     [.] 0x00007fe91cef3078  libc-2.31.so  Hit    L1 or L2 hit  No      N/A
    25.56%        1  353           LFB or LFB hit        [.] strcmp               ld-2.31.so     [.] 0x00005586973855ca  ls            None   L1 or L2 hit  No      N/A
    22.59%        1  312           LFB or LFB hit        [.] _dl_cache_libcmp     ld-2.31.so     [.] 0x00007fe91d0e3b18  ld.so.cache   None   L1 or L2 hit  No      N/A
     8.47%        1  117           LFB or LFB hit        [.] _dl_relocate_object  ld-2.31.so     [.] 0x00007fe91ceee570  libc-2.31.so  None   L1 or L2 hit  No      N/A
     6.88%        1  95            LFB or LFB hit        [.] _dl_relocate_object  ld-2.31.so     [.] 0x00007fe91ceed490  libc-2.31.so  None   L1 or L2 hit  No      N/A
     3.62%        1  50            LFB or LFB hit        [.] _dl_cache_libcmp     ld-2.31.so     [.] 0x00007fe91d0ebe60  ld.so.cache   None   L1 or L2 hit  No      N/A

# Samples: 11  of event 'cpu/mem-stores/Pu'
# Total weight : 11
# Sort order   : local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked
#
# Overhead  Samples  Local Weight  Memory access  Symbol                   Shared Object  Data Symbol             Data Object  Snoop  TLB access  Locked  Blocked
# ........  .......  ............  .............  .......................  .............  ......................  ...........  .....  ..........  ......  .......
#
     9.09%        1  0             L1 hit         [.] __strcoll_l          libc-2.31.so   [.] 0x00007fffe5648fc8  [stack]      N/A    N/A         N/A     N/A
     9.09%        1  0             L1 hit         [.] _dl_lookup_symbol_x  ld-2.31.so     [.] 0x00007fffe56490b8  [stack]      N/A    N/A         N/A     N/A
     9.09%        1  0             L1 hit         [.] _dl_name_match_p     ld-2.31.so     [.] 0x00007fffe56487d8  [stack]      N/A    N/A         N/A     N/A
     9.09%        1  0             L1 hit         [.] _dl_start            ld-2.31.so     [.] start_time+0x0      ld-2.31.so   N/A    N/A         N/A     N/A
     9.09%        1  0             L1 hit         [.] _dl_sysdep_start     ld-2.31.so     [.] 0x00007fffe56494b8  [stack]      N/A    N/A         N/A     N/A
     9.09%        1  0             L1 hit         [.] do_lookup_x          ld-2.31.so     [.] 0x00007fffe5648ff8  [stack]      N/A    N/A         N/A     N/A
     9.09%        1  0             L1 hit         [.] do_lookup_x          ld-2.31.so     [.] 0x00007fffe5649064  [stack]      N/A    N/A         N/A     N/A
     9.09%        1  0             L1 hit         [.] do_lookup_x          ld-2.31.so     [.] 0x00007fffe5649130  [stack]      N/A    N/A         N/A     N/A
     9.09%        1  0             L1 miss        [.] _dl_start            ld-2.31.so     [.] _rtld_global+0xaf8  ld-2.31.so   N/A    N/A         N/A     N/A
     9.09%        1  0             L1 miss        [.] _dl_start            ld-2.31.so     [.] _rtld_global+0xc28  ld-2.31.so   N/A    N/A         N/A     N/A
     9.09%        1  0             L1 miss        [.] _dl_start            ld-2.31.so     [.] 0x00007fffe56495b8  [stack]      N/A    N/A         N/A     N/A

# (Tip: Show user configuration overrides: perf config --user --list)
$

Signed-off-by: Kan Liang
Tested-by: Arnaldo Carvalho de Melo
Cc: Andi Kleen
Cc: Jin Yao
Cc: Jiri Olsa
Cc: Madhavan Srinivasan
Cc: Namhyung Kim
Cc: Peter Zijlstra
Cc: Stephane Eranian
Link: http://lore.kernel.org/lkml/1612296553-21962-4-git-send-email-kan.liang@linux.intel.com
Signed-off-by: Arnaldo Carvalho de Melo
[guanjun: fix conflicts due to unbackported perf feature of data page size]
Signed-off-by: Guanjun
Acked-by: Zelin Deng
---
 tools/perf/Documentation/perf-report.txt |  5 ++--
 tools/perf/builtin-mem.c                 |  2 +-
 tools/perf/util/hist.c                   |  1 +
 tools/perf/util/hist.h                   |  1 +
 tools/perf/util/mem-events.c             | 25 ++++++++++++++++
 tools/perf/util/mem-events.h             |  1 +
 tools/perf/util/sort.c                   | 38 +++++++++++++++++++++++-
 tools/perf/util/sort.h                   |  1 +
 8 files changed, 70 insertions(+), 4 deletions(-)

diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index d068103690cc..c0d7c9f5c93d 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -139,7 +139,7 @@ OPTIONS
 	If the --mem-mode option is used, the following sort keys are also available
 	(incompatible with --branch-stack):
-	symbol_daddr, dso_daddr, locked, tlb, mem, snoop, dcacheline.
+	symbol_daddr, dso_daddr, locked, tlb, mem, snoop, dcacheline, blocked.
 
 	- symbol_daddr: name of data symbol being executed on at the time of sample
 	- dso_daddr: name of library or module containing the data being executed
@@ -150,9 +150,10 @@ OPTIONS
 	- snoop: type of snoop (if any) for the data at the time of the sample
 	- dcacheline: the cacheline the data address is on at the time of the sample
 	- phys_daddr: physical address of data being executed on at the time of sample
+	- blocked: reason of blocked load access for the data at the time of the sample
 
 	And the default sort keys are changed to local_weight, mem, sym, dso,
-	symbol_daddr, dso_daddr, snoop, tlb, locked, see '--mem-mode'.
+	symbol_daddr, dso_daddr, snoop, tlb, locked, blocked, see '--mem-mode'.
If the data file has tracepoint event(s), following (dynamic) sort keys are also available: diff --git a/tools/perf/builtin-mem.c b/tools/perf/builtin-mem.c index fdfbff7592f4..274091e0cd29 100644 --- a/tools/perf/builtin-mem.c +++ b/tools/perf/builtin-mem.c @@ -329,7 +329,7 @@ static int report_events(int argc, const char **argv, struct perf_mem *mem) "dso_daddr,tlb,locked"; } else if (mem->phys_addr) rep_argv[i++] = "--sort=local_weight,mem,sym,dso,symbol_daddr," - "dso_daddr,snoop,tlb,locked,phys_daddr"; + "dso_daddr,snoop,tlb,locked,blocked,phys_daddr"; for (j = 1; j < argc; j++, i++) rep_argv[i] = argv[j]; diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 8a793e4c9400..9c4c06f618d6 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -205,6 +205,7 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) hists__new_col_len(hists, HISTC_MEM_LVL, 21 + 3); hists__new_col_len(hists, HISTC_LOCAL_WEIGHT, 12); hists__new_col_len(hists, HISTC_GLOBAL_WEIGHT, 12); + hists__new_col_len(hists, HISTC_MEM_BLOCKED, 10); if (symbol_conf.nanosecs) hists__new_col_len(hists, HISTC_TIME, 16); else diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 919f2c6c4814..fe40c1a4ec81 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -70,6 +70,7 @@ enum hist_column { HISTC_SYM_SIZE, HISTC_DSO_SIZE, HISTC_SYMBOL_IPC, + HISTC_MEM_BLOCKED, HISTC_NR_COLS, /* Last entry */ }; diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index a4e7b6cfeffe..f66edfa9a806 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -337,6 +337,29 @@ int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info) return l; } +int perf_mem__blk_scnprintf(char *out, size_t sz, struct mem_info *mem_info) +{ + size_t l = 0; + u64 mask = PERF_MEM_BLK_NA; + + sz -= 1; /* -1 for null termination */ + out[0] = '\0'; + + if (mem_info) + mask = mem_info->data_src.mem_blk; + + if (!mask || (mask & PERF_MEM_BLK_NA)) { + l += scnprintf(out + l, sz - l, " N/A"); + return l; + } + if (mask & PERF_MEM_BLK_DATA) + l += scnprintf(out + l, sz - l, " Data"); + if (mask & PERF_MEM_BLK_ADDR) + l += scnprintf(out + l, sz - l, " Addr"); + + return l; +} + int perf_script__meminfo_scnprintf(char *out, size_t sz, struct mem_info *mem_info) { int i = 0; @@ -348,6 +371,8 @@ int perf_script__meminfo_scnprintf(char *out, size_t sz, struct mem_info *mem_in i += perf_mem__tlb_scnprintf(out + i, sz - i, mem_info); i += scnprintf(out + i, sz - i, "|LCK "); i += perf_mem__lck_scnprintf(out + i, sz - i, mem_info); + i += scnprintf(out + i, sz - i, "|BLK "); + i += perf_mem__blk_scnprintf(out + i, sz - i, mem_info); return i; } diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h index 55d992f6430b..1d945d96f07b 100644 --- a/tools/perf/util/mem-events.h +++ b/tools/perf/util/mem-events.h @@ -49,6 +49,7 @@ int perf_mem__tlb_scnprintf(char *out, size_t sz, struct mem_info *mem_info); int perf_mem__lvl_scnprintf(char *out, size_t sz, struct mem_info *mem_info); int perf_mem__snp_scnprintf(char *out, size_t sz, struct mem_info *mem_info); int perf_mem__lck_scnprintf(char *out, size_t sz, struct mem_info *mem_info); +int perf_mem__blk_scnprintf(char *out, size_t sz, struct mem_info *mem_info); int perf_script__meminfo_scnprintf(char *bf, size_t size, struct mem_info *mem_info); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 5e9e96452b9e..7a3323fad4c1 100644 --- a/tools/perf/util/sort.c +++ 
b/tools/perf/util/sort.c
@@ -36,7 +36,7 @@ const char default_parent_pattern[] = "^sys_|^do_page_fault";
 const char *parent_pattern = default_parent_pattern;
 const char *default_sort_order = "comm,dso,symbol";
 const char default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles";
-const char default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked";
+const char default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked";
 const char default_top_sort_order[] = "dso,symbol";
 const char default_diff_sort_order[] = "dso,symbol";
 const char default_tracepoint_sort_order[] = "trace";
@@ -1421,6 +1421,41 @@ struct sort_entry sort_mem_dcacheline = {
 	.se_width_idx	= HISTC_MEM_DCACHELINE,
 };
 
+static int64_t
+sort__blocked_cmp(struct hist_entry *left, struct hist_entry *right)
+{
+	union perf_mem_data_src data_src_l;
+	union perf_mem_data_src data_src_r;
+
+	if (left->mem_info)
+		data_src_l = left->mem_info->data_src;
+	else
+		data_src_l.mem_blk = PERF_MEM_BLK_NA;
+
+	if (right->mem_info)
+		data_src_r = right->mem_info->data_src;
+	else
+		data_src_r.mem_blk = PERF_MEM_BLK_NA;
+
+	return (int64_t)(data_src_r.mem_blk - data_src_l.mem_blk);
+}
+
+static int hist_entry__blocked_snprintf(struct hist_entry *he, char *bf,
+					size_t size, unsigned int width)
+{
+	char out[16];
+
+	perf_mem__blk_scnprintf(out, sizeof(out), he->mem_info);
+	return repsep_snprintf(bf, size, "%.*s", width, out);
+}
+
+struct sort_entry sort_mem_blocked = {
+	.se_header	= "Blocked",
+	.se_cmp		= sort__blocked_cmp,
+	.se_snprintf	= hist_entry__blocked_snprintf,
+	.se_width_idx	= HISTC_MEM_BLOCKED,
+};
+
 static int64_t
 sort__phys_daddr_cmp(struct hist_entry *left, struct hist_entry *right)
 {
@@ -1740,6 +1775,7 @@ static struct sort_dimension memory_sort_dimensions[] = {
 	DIM(SORT_MEM_SNOOP, "snoop", sort_mem_snoop),
 	DIM(SORT_MEM_DCACHELINE, "dcacheline", sort_mem_dcacheline),
 	DIM(SORT_MEM_PHYS_DADDR, "phys_daddr", sort_mem_phys_daddr),
+	DIM(SORT_MEM_BLOCKED, "blocked", sort_mem_blocked),
 };
 
 #undef DIM
diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h
index fc94dcd67abc..983795ab1faf 100644
--- a/tools/perf/util/sort.h
+++ b/tools/perf/util/sort.h
@@ -255,6 +255,7 @@ enum sort_type {
 	SORT_MEM_DCACHELINE,
 	SORT_MEM_IADDR_SYMBOL,
 	SORT_MEM_PHYS_DADDR,
+	SORT_MEM_BLOCKED,
 };
 
 /*
-- 
Gitee

From c33d18eb359afe86e4bc8978a2ea75e05cbb634f Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Tue, 2 Feb 2021 12:09:08 -0800
Subject: [PATCH 07/13] perf c2c: Support data block and addr block

ANBZ: #1117

commit d9d5d767b2c006bbc1993ba3f2124d23ff515e32 upstream

'perf c2c' is also a memory profiling tool. Apply the two new data
source fields to 'perf c2c' as well.

Extend 'perf c2c' to display the number of loads that were blocked by
data or address conflict.
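For illustration, a hedged consumer-side sketch (assuming a uapi header
with the mem_blk bits added earlier in this series) that mirrors the
perf_mem__blk_scnprintf() and c2c_decode_stats() checks below:

    #include <linux/perf_event.h>
    #include <stdio.h>

    /* Print the block reason carried in a sample's data_src value. */
    static void print_blk_reason(union perf_mem_data_src ds)
    {
        if (!ds.mem_blk || (ds.mem_blk & PERF_MEM_BLK_NA)) {
            puts("blocked: N/A");
            return;
        }
        if (ds.mem_blk & PERF_MEM_BLK_DATA)
            puts("blocked: data (data could not be forwarded)");
        if (ds.mem_blk & PERF_MEM_BLK_ADDR)
            puts("blocked: address conflict");
    }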
Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Don Zickus Cc: Jin Yao Cc: Jiri Olsa Cc: Joe Mario Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/1612296553-21962-5-git-send-email-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Guanjun Acked-by: Zelin Deng --- tools/perf/builtin-c2c.c | 3 +++ tools/perf/util/mem-events.c | 6 ++++++ tools/perf/util/mem-events.h | 2 ++ 3 files changed, 11 insertions(+) diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index a693abdfa9ee..bf624701678b 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -2237,6 +2237,8 @@ static void print_c2c__display_stats(FILE *out) fprintf(out, " Load MESI State Exclusive : %10d\n", stats->ld_excl); fprintf(out, " Load MESI State Shared : %10d\n", stats->ld_shared); fprintf(out, " Load LLC Misses : %10d\n", llc_misses); + fprintf(out, " Load access blocked by data : %10d\n", stats->blk_data); + fprintf(out, " Load access blocked by address : %10d\n", stats->blk_addr); fprintf(out, " LLC Misses to Local DRAM : %10.1f%%\n", ((double)stats->lcl_dram/(double)llc_misses) * 100.); fprintf(out, " LLC Misses to Remote DRAM : %10.1f%%\n", ((double)stats->rmt_dram/(double)llc_misses) * 100.); fprintf(out, " LLC Misses to Remote cache (HIT) : %10.1f%%\n", ((double)stats->rmt_hit /(double)llc_misses) * 100.); @@ -2265,6 +2267,7 @@ static void print_shared_cacheline_info(FILE *out) fprintf(out, " L2D hits on shared lines : %10d\n", stats->ld_l2hit); fprintf(out, " LLC hits on shared lines : %10d\n", stats->ld_llchit + stats->lcl_hitm); fprintf(out, " Locked Access on shared lines : %10d\n", stats->locks); + fprintf(out, " Blocked Access on shared lines : %10d\n", stats->blk_data + stats->blk_addr); fprintf(out, " Store HITs on shared lines : %10d\n", stats->store); fprintf(out, " Store L1D hits on shared lines : %10d\n", stats->st_l1hit); fprintf(out, " Total Merged records : %10d\n", hitm_cnt + stats->store); diff --git a/tools/perf/util/mem-events.c b/tools/perf/util/mem-events.c index f66edfa9a806..47316d32b914 100644 --- a/tools/perf/util/mem-events.c +++ b/tools/perf/util/mem-events.c @@ -386,6 +386,7 @@ int c2c_decode_stats(struct c2c_stats *stats, struct mem_info *mi) u64 lvl = data_src->mem_lvl; u64 snoop = data_src->mem_snoop; u64 lock = data_src->mem_lock; + u64 blk = data_src->mem_blk; /* * Skylake might report unknown remote level via this * bit, consider it when evaluating remote HITMs. 
@@ -411,6 +412,9 @@ do {				\
 	if (lock & P(LOCK, LOCKED)) stats->locks++;
 
+	if (blk & P(BLK, DATA)) stats->blk_data++;
+	if (blk & P(BLK, ADDR)) stats->blk_addr++;
+
 	if (op & P(OP, LOAD)) {
 		/* load */
 		stats->load++;
@@ -522,6 +526,8 @@ void c2c_add_stats(struct c2c_stats *stats, struct c2c_stats *add)
 	stats->rmt_hit		+= add->rmt_hit;
 	stats->lcl_dram		+= add->lcl_dram;
 	stats->rmt_dram		+= add->rmt_dram;
+	stats->blk_data		+= add->blk_data;
+	stats->blk_addr		+= add->blk_addr;
 	stats->nomap		+= add->nomap;
 	stats->noparse		+= add->noparse;
 	stats->tot_lat		+= add->tot_lat;
diff --git a/tools/perf/util/mem-events.h b/tools/perf/util/mem-events.h
index 1d945d96f07b..4484cbefe639 100644
--- a/tools/perf/util/mem-events.h
+++ b/tools/perf/util/mem-events.h
@@ -79,6 +79,8 @@ struct c2c_stats {
 	u32	rmt_hit;	/* count of loads with remote hit clean; */
 	u32	lcl_dram;	/* count of loads miss to local DRAM */
 	u32	rmt_dram;	/* count of loads miss to remote DRAM */
+	u32	blk_data;	/* count of loads blocked by data */
+	u32	blk_addr;	/* count of loads blocked by address conflict */
 	u32	nomap;		/* count of load/stores with no phys adrs */
 	u32	noparse;	/* count of unparsable data sources */
 	u64	tot_lat;	/* Cycle count to complete operation */
-- 
Gitee

From 0083645760308b1741eb7b5ea0215472a1436765 Mon Sep 17 00:00:00 2001
From: Kan Liang
Date: Tue, 2 Feb 2021 12:09:09 -0800
Subject: [PATCH 08/13] perf tools: Support PERF_SAMPLE_WEIGHT_STRUCT

ANBZ: #1117

commit ea8d0ed6eae37b01953a29bca98112d9e2507a84 upstream

The new sample type, PERF_SAMPLE_WEIGHT_STRUCT, is an alternative to
the PERF_SAMPLE_WEIGHT sample type. Users can apply either the
PERF_SAMPLE_WEIGHT sample type or the PERF_SAMPLE_WEIGHT_STRUCT sample
type to retrieve the sample weight, but they cannot apply both sample
types simultaneously.

The new sample type shares the same space as the PERF_SAMPLE_WEIGHT
sample type. The lower 32 bits are exactly the same for both sample
types. The higher 32 bits may differ between architectures.

Add an arch-specific arch_evsel__set_sample_weight() to set the new
sample type for X86. Only store the lower 32 bits for sample->weight
if the new sample type is applied. In practice, no memory access could
last longer than 4G cycles, so no data will be lost.

If the kernel doesn't support the new sample type, fall back to the
PERF_SAMPLE_WEIGHT sample type. There is no impact on other
architectures.

Committer notes:

Fixup related to PERF_SAMPLE_CODE_PAGE_SIZE, present in acme/perf/core
but not upstream yet.
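For illustration only, a sketch of how the shared u64 slot decomposes
on a little-endian host, matching the union added to the tools uapi
header earlier in this series (the field comments reflect the Sapphire
Rapids usage described in the intel-pt change below; var3_w is not used
by this series):

    #include <stdint.h>
    #include <stdio.h>

    /* Unpack one PERF_SAMPLE_WEIGHT_STRUCT slot without the uapi union. */
    static void unpack_weight(uint64_t raw)
    {
        uint32_t var1_dw = raw & 0xffffffff;     /* weight, e.g. cache latency */
        uint16_t var2_w  = (raw >> 32) & 0xffff; /* e.g. instruction latency */
        uint16_t var3_w  = raw >> 48;            /* unused in this series */

        printf("weight=%u var2_w=%u var3_w=%u\n",
               (unsigned)var1_dw, (unsigned)var2_w, (unsigned)var3_w);
    }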
Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Jin Yao Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/1612296553-21962-6-git-send-email-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo [guanjun: fix conflicts due to unbackported perf feature of code page size] Signed-off-by: Guanjun Acked-by: Zelin Deng --- tools/perf/arch/x86/util/Build | 1 + tools/perf/arch/x86/util/evsel.c | 8 +++++++ tools/perf/util/evsel.c | 28 +++++++++++++++++++---- tools/perf/util/evsel.h | 3 +++ tools/perf/util/intel-pt.c | 22 +++++++++++++++--- tools/perf/util/perf_event_attr_fprintf.c | 1 + tools/perf/util/session.c | 2 +- tools/perf/util/synthetic-events.c | 6 +++-- 8 files changed, 61 insertions(+), 10 deletions(-) create mode 100644 tools/perf/arch/x86/util/evsel.c diff --git a/tools/perf/arch/x86/util/Build b/tools/perf/arch/x86/util/Build index d73f548a6282..18848b3040cb 100644 --- a/tools/perf/arch/x86/util/Build +++ b/tools/perf/arch/x86/util/Build @@ -7,6 +7,7 @@ perf-y += topdown.o perf-y += machine.o perf-y += event.o perf-y += mem-events.o +perf-y += evsel.o perf-$(CONFIG_DWARF) += dwarf-regs.o perf-$(CONFIG_BPF_PROLOGUE) += dwarf-regs.o diff --git a/tools/perf/arch/x86/util/evsel.c b/tools/perf/arch/x86/util/evsel.c new file mode 100644 index 000000000000..2f733cdc8dbb --- /dev/null +++ b/tools/perf/arch/x86/util/evsel.c @@ -0,0 +1,8 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include "util/evsel.h" + +void arch_evsel__set_sample_weight(struct evsel *evsel) +{ + evsel__set_sample_bit(evsel, WEIGHT_STRUCT); +} diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index 65c3e82981c1..ffe4eee834e2 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1014,6 +1014,11 @@ struct evsel_config_term *__evsel__get_config_term(struct evsel *evsel, enum evs return found_term; } +void __weak arch_evsel__set_sample_weight(struct evsel *evsel) +{ + evsel__set_sample_bit(evsel, WEIGHT); +} + static void evsel__set_default_freq_period(struct record_opts *opts, struct perf_event_attr *attr) { @@ -1177,7 +1182,7 @@ void evsel__config(struct evsel *evsel, struct record_opts *opts, } if (opts->sample_weight) - evsel__set_sample_bit(evsel, WEIGHT); + arch_evsel__set_sample_weight(evsel); attr->task = track; attr->mmap = track; @@ -1751,6 +1756,10 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, } fallback_missing_features: + if (perf_missing_features.weight_struct) { + evsel__set_sample_bit(evsel, WEIGHT); + evsel__reset_sample_bit(evsel, WEIGHT_STRUCT); + } if (perf_missing_features.clockid_wrong) evsel->core.attr.clockid = CLOCK_MONOTONIC; /* should always work */ if (perf_missing_features.clockid) { @@ -1889,7 +1898,12 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, * Must probe features in the order they were added to the * perf_event_attr interface. 
*/ - if (!perf_missing_features.cgroup && evsel->core.attr.cgroup) { + if (!perf_missing_features.weight_struct && + (evsel->core.attr.sample_type & PERF_SAMPLE_WEIGHT_STRUCT)) { + perf_missing_features.weight_struct = true; + pr_debug2("switching off weight struct support\n"); + goto fallback_missing_features; + } else if (!perf_missing_features.cgroup && evsel->core.attr.cgroup) { perf_missing_features.cgroup = true; pr_debug2_peo("Kernel has no cgroup sampling support, bailing out\n"); goto out_close; @@ -2327,9 +2341,15 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event, } } - if (type & PERF_SAMPLE_WEIGHT) { + if (type & PERF_SAMPLE_WEIGHT_TYPE) { + union perf_sample_weight weight; + OVERFLOW_CHECK_u64(array); - data->weight = *array; + weight.full = *array; + if (type & PERF_SAMPLE_WEIGHT) + data->weight = weight.full; + else + data->weight = weight.var1_dw; array++; } diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 79a860d8e3ee..469745394c2d 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -144,6 +144,7 @@ struct perf_missing_features { bool aux_output; bool branch_hw_idx; bool cgroup; + bool weight_struct; }; extern struct perf_missing_features perf_missing_features; @@ -238,6 +239,8 @@ void __evsel__reset_sample_bit(struct evsel *evsel, enum perf_event_sample_forma void evsel__set_sample_id(struct evsel *evsel, bool use_sample_identifier); +void arch_evsel__set_sample_weight(struct evsel *evsel); + int evsel__set_filter(struct evsel *evsel, const char *filter); int evsel__append_tp_filter(struct evsel *evsel, const char *filter); int evsel__append_addr_filter(struct evsel *evsel, const char *filter); diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index 5163d2ffea70..af7768fb8722 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -1871,13 +1871,29 @@ static int intel_pt_synth_pebs_sample(struct intel_pt_queue *ptq) if (sample_type & PERF_SAMPLE_ADDR && items->has_mem_access_address) sample.addr = items->mem_access_address; - if (sample_type & PERF_SAMPLE_WEIGHT) { + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { /* * Refer kernel's setup_pebs_adaptive_sample_data() and * intel_hsw_weight(). */ - if (items->has_mem_access_latency) - sample.weight = items->mem_access_latency; + if (items->has_mem_access_latency) { + u64 weight = items->mem_access_latency >> 32; + + /* + * Starts from SPR, the mem access latency field + * contains both cache latency [47:32] and instruction + * latency [15:0]. The cache latency is the same as the + * mem access latency on previous platforms. + * + * In practice, no memory access could last than 4G + * cycles. Use latency >> 32 to distinguish the + * different format of the mem access latency field. 
+ */ + if (weight > 0) + sample.weight = weight & 0xffff; + else + sample.weight = items->mem_access_latency; + } if (!sample.weight && items->has_tsx_aux_info) { /* Cycles last block */ sample.weight = (u32)items->tsx_aux_info; diff --git a/tools/perf/util/perf_event_attr_fprintf.c b/tools/perf/util/perf_event_attr_fprintf.c index e67a227c0ce7..a7482f1edb07 100644 --- a/tools/perf/util/perf_event_attr_fprintf.c +++ b/tools/perf/util/perf_event_attr_fprintf.c @@ -36,6 +36,7 @@ static void __p_sample_type(char *buf, size_t size, u64 value) bit_name(IDENTIFIER), bit_name(REGS_INTR), bit_name(DATA_SRC), bit_name(WEIGHT), bit_name(PHYS_ADDR), bit_name(AUX), bit_name(CGROUP), + bit_name(WEIGHT_STRUCT), { .name = NULL, } }; #undef bit_name diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 354e1e04a266..344b311a67fb 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1302,7 +1302,7 @@ static void dump_sample(struct evsel *evsel, union perf_event *event, if (sample_type & PERF_SAMPLE_STACK_USER) stack_user__printf(&sample->user_stack); - if (sample_type & PERF_SAMPLE_WEIGHT) + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) printf("... weight: %" PRIu64 "\n", sample->weight); if (sample_type & PERF_SAMPLE_DATA_SRC) diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index b4cf6dd57dd6..af472a620759 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1385,7 +1385,7 @@ size_t perf_event__sample_event_size(const struct perf_sample *sample, u64 type, } } - if (type & PERF_SAMPLE_WEIGHT) + if (type & PERF_SAMPLE_WEIGHT_TYPE) result += sizeof(u64); if (type & PERF_SAMPLE_DATA_SRC) @@ -1553,8 +1553,10 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo } } - if (type & PERF_SAMPLE_WEIGHT) { + if (type & PERF_SAMPLE_WEIGHT_TYPE) { *array = sample->weight; + if (type & PERF_SAMPLE_WEIGHT_STRUCT) + *array &= 0xffffffff; array++; } -- Gitee From b6b720e427e42c5b529e4333d2517d6ae190c74d Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Feb 2021 12:09:10 -0800 Subject: [PATCH 09/13] perf report: Support instruction latency ANBZ: #1117 commit 590db42de068a1d11e51bd0796a9044621aeed2e upstream The instruction latency information can be recorded on some platforms, e.g., the Intel Sapphire Rapids server. With both memory latency (weight) and the new instruction latency information, users can easily locate the expensive load instructions, and also understand the time spent in different stages. The users can optimize their applications in different pipeline stages. The 'weight' field is shared among different architectures. Reusing the 'weight' field may impacts other architectures. Add a new field to store the instruction latency. Like the 'weight' support, introduce a 'ins_lat' for the global instruction latency, and a 'local_ins_lat' for the local instruction latency version. Add new sort functions, INSTR Latency and Local INSTR Latency, accordingly. Add local_ins_lat to the default_mem_sort_order[]. 
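As a hedged usage sketch (./workload is a placeholder), the new keys
can also be requested explicitly rather than relying on the updated
default mem sort order:

    $ perf mem record -- ./workload
    $ perf report --mem-mode --sort=local_weight,local_ins_lat,sym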
Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Jin Yao Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/1612296553-21962-7-git-send-email-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo [guanjun: fix conflicts due to unbackported perf feature of code page size] Signed-off-by: Guanjun Acked-by: Zelin Deng --- tools/perf/Documentation/perf-report.txt | 6 ++- tools/perf/util/event.h | 1 + tools/perf/util/evsel.c | 4 +- tools/perf/util/hist.c | 12 ++++-- tools/perf/util/hist.h | 2 + tools/perf/util/intel-pt.c | 5 ++- tools/perf/util/session.c | 8 +++- tools/perf/util/sort.c | 47 +++++++++++++++++++++++- tools/perf/util/sort.h | 3 ++ tools/perf/util/synthetic-events.c | 4 +- 10 files changed, 81 insertions(+), 11 deletions(-) diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt index c0d7c9f5c93d..cff34d7c4ca0 100644 --- a/tools/perf/Documentation/perf-report.txt +++ b/tools/perf/Documentation/perf-report.txt @@ -108,6 +108,9 @@ OPTIONS - period: Raw number of event count of sample - time: Separate the samples by time stamp with the resolution specified by --time-quantum (default 100ms). Specify with overhead and before it. + - ins_lat: Instruction latency in core cycles. This is the global instruction + latency + - local_ins_lat: Local instruction latency version By default, comm, dso and symbol keys are used. (i.e. --sort comm,dso,symbol) @@ -153,7 +156,8 @@ OPTIONS - blocked: reason of blocked load access for the data at the time of the sample And the default sort keys are changed to local_weight, mem, sym, dso, - symbol_daddr, dso_daddr, snoop, tlb, locked, blocked, see '--mem-mode'. + symbol_daddr, dso_daddr, snoop, tlb, locked, blocked, local_ins_lat, + see '--mem-mode'. 
If the data file has tracepoint event(s), following (dynamic) sort keys are also available: diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index b828b99176f4..6ac2cd5b1067 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -140,6 +140,7 @@ struct perf_sample { u16 insn_len; u8 cpumode; u16 misc; + u16 ins_lat; bool no_hw_idx; /* No hw_idx collected in branch_stack */ char insn[MAX_INSN]; void *raw_data; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index ffe4eee834e2..d10e93109ae4 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -2348,8 +2348,10 @@ int evsel__parse_sample(struct evsel *evsel, union perf_event *event, weight.full = *array; if (type & PERF_SAMPLE_WEIGHT) data->weight = weight.full; - else + else { data->weight = weight.var1_dw; + data->ins_lat = weight.var2_w; + } array++; } diff --git a/tools/perf/util/hist.c b/tools/perf/util/hist.c index 9c4c06f618d6..25a2ae1c6b79 100644 --- a/tools/perf/util/hist.c +++ b/tools/perf/util/hist.c @@ -206,6 +206,8 @@ void hists__calc_col_len(struct hists *hists, struct hist_entry *h) hists__new_col_len(hists, HISTC_LOCAL_WEIGHT, 12); hists__new_col_len(hists, HISTC_GLOBAL_WEIGHT, 12); hists__new_col_len(hists, HISTC_MEM_BLOCKED, 10); + hists__new_col_len(hists, HISTC_LOCAL_INS_LAT, 13); + hists__new_col_len(hists, HISTC_GLOBAL_INS_LAT, 13); if (symbol_conf.nanosecs) hists__new_col_len(hists, HISTC_TIME, 16); else @@ -283,12 +285,13 @@ static long hist_time(unsigned long htime) } static void he_stat__add_period(struct he_stat *he_stat, u64 period, - u64 weight) + u64 weight, u64 ins_lat) { he_stat->period += period; he_stat->weight += weight; he_stat->nr_events += 1; + he_stat->ins_lat += ins_lat; } static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src) @@ -300,6 +303,7 @@ static void he_stat__add_stat(struct he_stat *dest, struct he_stat *src) dest->period_guest_us += src->period_guest_us; dest->nr_events += src->nr_events; dest->weight += src->weight; + dest->ins_lat += src->ins_lat; } static void he_stat__decay(struct he_stat *he_stat) @@ -588,6 +592,7 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, int64_t cmp; u64 period = entry->stat.period; u64 weight = entry->stat.weight; + u64 ins_lat = entry->stat.ins_lat; bool leftmost = true; p = &hists->entries_in->rb_root.rb_node; @@ -606,11 +611,11 @@ static struct hist_entry *hists__findnew_entry(struct hists *hists, if (!cmp) { if (sample_self) { - he_stat__add_period(&he->stat, period, weight); + he_stat__add_period(&he->stat, period, weight, ins_lat); hist_entry__add_callchain_period(he, period); } if (symbol_conf.cumulate_callchain) - he_stat__add_period(he->stat_acc, period, weight); + he_stat__add_period(he->stat_acc, period, weight, ins_lat); /* * This mem info was allocated from sample__resolve_mem @@ -720,6 +725,7 @@ __hists__add_entry(struct hists *hists, .nr_events = 1, .period = sample->period, .weight = sample->weight, + .ins_lat = sample->ins_lat, }, .parent = sym_parent, .filtered = symbol__parent_filter(sym_parent) | al->filtered, diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index fe40c1a4ec81..4b2831b6acf7 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -71,6 +71,8 @@ enum hist_column { HISTC_DSO_SIZE, HISTC_SYMBOL_IPC, HISTC_MEM_BLOCKED, + HISTC_LOCAL_INS_LAT, + HISTC_GLOBAL_INS_LAT, HISTC_NR_COLS, /* Last entry */ }; diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index af7768fb8722..34485b980b1b 100644 --- 
a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -1889,9 +1889,10 @@ static int intel_pt_synth_pebs_sample(struct intel_pt_queue *ptq) * cycles. Use latency >> 32 to distinguish the * different format of the mem access latency field. */ - if (weight > 0) + if (weight > 0) { sample.weight = weight & 0xffff; - else + sample.ins_lat = items->mem_access_latency & 0xffff; + } else sample.weight = items->mem_access_latency; } if (!sample.weight && items->has_tsx_aux_info) { diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 344b311a67fb..894e1ebe830e 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1302,8 +1302,12 @@ static void dump_sample(struct evsel *evsel, union perf_event *event, if (sample_type & PERF_SAMPLE_STACK_USER) stack_user__printf(&sample->user_stack); - if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) - printf("... weight: %" PRIu64 "\n", sample->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_TYPE) { + printf("... weight: %" PRIu64 "", sample->weight); + if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT) + printf(",0x%"PRIx16"", sample->ins_lat); + printf("\n"); + } if (sample_type & PERF_SAMPLE_DATA_SRC) printf(" . data_src: 0x%"PRIx64"\n", sample->data_src); diff --git a/tools/perf/util/sort.c b/tools/perf/util/sort.c index 7a3323fad4c1..079ace9a781a 100644 --- a/tools/perf/util/sort.c +++ b/tools/perf/util/sort.c @@ -36,7 +36,7 @@ const char default_parent_pattern[] = "^sys_|^do_page_fault"; const char *parent_pattern = default_parent_pattern; const char *default_sort_order = "comm,dso,symbol"; const char default_branch_sort_order[] = "comm,dso_from,symbol_from,symbol_to,cycles"; -const char default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked"; +const char default_mem_sort_order[] = "local_weight,mem,sym,dso,symbol_daddr,dso_daddr,snoop,tlb,locked,blocked,local_ins_lat"; const char default_top_sort_order[] = "dso,symbol"; const char default_diff_sort_order[] = "dso,symbol"; const char default_tracepoint_sort_order[] = "trace"; @@ -1365,6 +1365,49 @@ struct sort_entry sort_global_weight = { .se_width_idx = HISTC_GLOBAL_WEIGHT, }; +static u64 he_ins_lat(struct hist_entry *he) +{ + return he->stat.nr_events ? 
he->stat.ins_lat / he->stat.nr_events : 0; +} + +static int64_t +sort__local_ins_lat_cmp(struct hist_entry *left, struct hist_entry *right) +{ + return he_ins_lat(left) - he_ins_lat(right); +} + +static int hist_entry__local_ins_lat_snprintf(struct hist_entry *he, char *bf, + size_t size, unsigned int width) +{ + return repsep_snprintf(bf, size, "%-*u", width, he_ins_lat(he)); +} + +struct sort_entry sort_local_ins_lat = { + .se_header = "Local INSTR Latency", + .se_cmp = sort__local_ins_lat_cmp, + .se_snprintf = hist_entry__local_ins_lat_snprintf, + .se_width_idx = HISTC_LOCAL_INS_LAT, +}; + +static int64_t +sort__global_ins_lat_cmp(struct hist_entry *left, struct hist_entry *right) +{ + return left->stat.ins_lat - right->stat.ins_lat; +} + +static int hist_entry__global_ins_lat_snprintf(struct hist_entry *he, char *bf, + size_t size, unsigned int width) +{ + return repsep_snprintf(bf, size, "%-*u", width, he->stat.ins_lat); +} + +struct sort_entry sort_global_ins_lat = { + .se_header = "INSTR Latency", + .se_cmp = sort__global_ins_lat_cmp, + .se_snprintf = hist_entry__global_ins_lat_snprintf, + .se_width_idx = HISTC_GLOBAL_INS_LAT, +}; + struct sort_entry sort_mem_daddr_sym = { .se_header = "Data Symbol", .se_cmp = sort__daddr_cmp, @@ -1741,6 +1784,8 @@ static struct sort_dimension common_sort_dimensions[] = { DIM(SORT_CGROUP_ID, "cgroup_id", sort_cgroup_id), DIM(SORT_SYM_IPC_NULL, "ipc_null", sort_sym_ipc_null), DIM(SORT_TIME, "time", sort_time), + DIM(SORT_LOCAL_INS_LAT, "local_ins_lat", sort_local_ins_lat), + DIM(SORT_GLOBAL_INS_LAT, "ins_lat", sort_global_ins_lat), }; #undef DIM diff --git a/tools/perf/util/sort.h b/tools/perf/util/sort.h index 983795ab1faf..971d6578c985 100644 --- a/tools/perf/util/sort.h +++ b/tools/perf/util/sort.h @@ -50,6 +50,7 @@ struct he_stat { u64 period_guest_sys; u64 period_guest_us; u64 weight; + u64 ins_lat; u32 nr_events; }; @@ -229,6 +230,8 @@ enum sort_type { SORT_CGROUP_ID, SORT_SYM_IPC_NULL, SORT_TIME, + SORT_LOCAL_INS_LAT, + SORT_GLOBAL_INS_LAT, /* branch stack specific sort keys */ __SORT_BRANCH_STACK, diff --git a/tools/perf/util/synthetic-events.c b/tools/perf/util/synthetic-events.c index af472a620759..868afeed158c 100644 --- a/tools/perf/util/synthetic-events.c +++ b/tools/perf/util/synthetic-events.c @@ -1555,8 +1555,10 @@ int perf_event__synthesize_sample(union perf_event *event, u64 type, u64 read_fo if (type & PERF_SAMPLE_WEIGHT_TYPE) { *array = sample->weight; - if (type & PERF_SAMPLE_WEIGHT_STRUCT) + if (type & PERF_SAMPLE_WEIGHT_STRUCT) { *array &= 0xffffffff; + *array |= ((u64)sample->ins_lat << 32); + } array++; } -- Gitee From 4060b0b354ba4c445e82e01321149862989d7a78 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Feb 2021 12:09:11 -0800 Subject: [PATCH 10/13] perf test: Support PERF_SAMPLE_WEIGHT_STRUCT ANBZ: #1117 commit c7444297fd3769d10c7ffb52c81d71503b3e268f upstream Support the new sample type for sample-parsing test case. 
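For context on what the test now has to round-trip: with PERF_SAMPLE_WEIGHT_STRUCT the 64-bit weight slot carries the weight in its low 32 bits (var1_dw) and the instruction latency in the next 16 bits (var2_w). A minimal sketch of that packing, assuming the little-endian field layout from the uapi header; pack_weight()/unpack_weight() are illustrative helpers, not perf code:

	#include <stdint.h>

	/* Pack the fields the way the synthesize side does:
	 * low 32 bits = weight (var1_dw), next 16 bits = ins_lat (var2_w).
	 */
	static uint64_t pack_weight(uint32_t weight, uint16_t ins_lat)
	{
		return ((uint64_t)ins_lat << 32) | weight;
	}

	/* Unpack on the parse side; the top 16 bits stay unused here. */
	static void unpack_weight(uint64_t raw, uint32_t *weight, uint16_t *ins_lat)
	{
		*weight = raw & 0xffffffff;
		*ins_lat = (raw >> 32) & 0xffff;
	}

With the reference sample's .ins_lat = 117, samples_same() can then confirm the value survives the synthesize/parse round trip.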
Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Jin Yao Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/1612296553-21962-8-git-send-email-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo [guanjun: fix conflicts due to unbackported perf feature of code page size] Signed-off-by: Guanjun Acked-by: Zelin Deng --- tools/perf/tests/sample-parsing.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/tools/perf/tests/sample-parsing.c b/tools/perf/tests/sample-parsing.c index 33a58976222d..7ad2256ef1cd 100644 --- a/tools/perf/tests/sample-parsing.c +++ b/tools/perf/tests/sample-parsing.c @@ -129,6 +129,9 @@ static bool samples_same(const struct perf_sample *s1, if (type & PERF_SAMPLE_WEIGHT) COMP(weight); + if (type & PERF_SAMPLE_WEIGHT_STRUCT) + COMP(ins_lat); + if (type & PERF_SAMPLE_DATA_SRC) COMP(data_src); @@ -234,6 +237,7 @@ static int do_test(u64 sample_type, u64 sample_regs, u64 read_format) }, .phys_addr = 113, .cgroup = 114, + .ins_lat = 117, .aux_sample = { .size = sizeof(aux_data), .data = (void *)aux_data, @@ -340,7 +344,7 @@ int test__sample_parsing(struct test *test __maybe_unused, int subtest __maybe_u * were added. Please actually update the test rather than just change * the condition below. */ - if (PERF_SAMPLE_MAX > PERF_SAMPLE_CGROUP << 1) { + if (PERF_SAMPLE_MAX > PERF_SAMPLE_WEIGHT_STRUCT << 1) { pr_debug("sample format has changed, some new PERF_SAMPLE_ bit was introduced - test needs updating\n"); return -1; } @@ -370,8 +374,12 @@ int test__sample_parsing(struct test *test __maybe_unused, int subtest __maybe_u return err; } - /* Test all sample format bits together */ - sample_type = PERF_SAMPLE_MAX - 1; + /* + * Test all sample format bits together + * Note: PERF_SAMPLE_WEIGHT and PERF_SAMPLE_WEIGHT_STRUCT cannot + * be set simultaneously. + */ + sample_type = (PERF_SAMPLE_MAX - 1) & ~PERF_SAMPLE_WEIGHT; sample_regs = 0x3fff; /* shared yb intr and user regs */ for (i = 0; i < ARRAY_SIZE(rf); i++) { err = do_test(sample_type, sample_regs, rf[i]); -- Gitee From 4060b0b354ba4c445e82e01321149862989d7a78 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Feb 2021 12:09:11 -0800 Subject: [PATCH 11/13] perf stat: Support L2 Topdown events ANBZ: #1117 commit 63e39aa6ae103451dfffe578c38e219d731e5cca upstream The TMA method level 2 metrics are supported from the Intel Sapphire Rapids server, which exposes four L2 Topdown metric events to user space. There are eight L2 events in total. The other four L2 Topdown metric events are calculated from the corresponding L1 and the exposed L2 events. Now, the --topdown option prints the complete top-down metrics that are supported by the CPU. For the Intel Sapphire Rapids server, there are 4 L1 events and 8 L2 events displayed in one line. Add a new option, --td-level, to display the top-down statistics that are equal to or lower than the input level. An L2 event is marked only when both its L1 parent event and the L2 event itself cross the threshold. Here is an example: $ perf stat --topdown --td-level=2 --no-metric-only sleep 1 Topdown accuracy may decrease when measuring long periods. Please print the result regularly, e.g.
-I1000 Performance counter stats for 'sleep 1': 16,734,390 slots 2,100,001 topdown-retiring # 12.6% retiring 2,034,376 topdown-bad-spec # 12.3% bad speculation 4,003,128 topdown-fe-bound # 24.1% frontend bound 328,125 topdown-heavy-ops # 2.0% heavy operations # 10.6% light operations 1,968,751 topdown-br-mispredict # 11.9% branch mispredict # 0.4% machine clears 2,953,127 topdown-fetch-lat # 17.8% fetch latency # 6.3% fetch bandwidth 5,906,255 topdown-mem-bound # 35.6% memory bound # 15.4% core bound Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Jin Yao Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/1612296553-21962-9-git-send-email-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo [guanjun: fix conflicts due to update_runtime_stat()\full_td()\td_metric_ratio() functions change] Signed-off-by: Guanjun Acked-by: Zelin Deng --- tools/perf/Documentation/perf-stat.txt | 14 ++++- tools/perf/builtin-stat.c | 34 ++++++++++- tools/perf/util/stat-shadow.c | 84 ++++++++++++++++++++++++++ tools/perf/util/stat.c | 4 ++ tools/perf/util/stat.h | 9 +++ 5 files changed, 141 insertions(+), 4 deletions(-) diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 9f9f29025e49..7e68dc894638 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -353,7 +353,7 @@ See perf list output for the possble metrics and metricgroups. Do not aggregate counts across all monitored CPUs. --topdown:: -Print top down level 1 metrics if supported by the CPU. This allows to +Print complete top-down metrics supported by the CPU. This allows to determine bottle necks in the CPU pipeline for CPU bound workloads, by breaking the cycles consumed down into frontend bound, backend bound, bad speculation and retiring. @@ -388,6 +388,18 @@ To interpret the results it is usually needed to know on which CPUs the workload runs on. If needed the CPUs can be forced using taskset. +--td-level:: +Print the top-down statistics that equal to or lower than the input level. +It allows users to print the interested top-down metrics level instead of +the complete top-down metrics. + +The availability of the top-down metrics level depends on the hardware. For +example, Ice Lake only supports L1 top-down metrics. The Sapphire Rapids +supports both L1 and L2 top-down metrics. + +Default: 0 means the max level that the current hardware support. +Error out if the input is higher than the supported max level. + --no-merge:: Do not merge results from same PMUs. 
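Before the implementation below: the --td-level handling added to builtin-stat.c boils down to one rule. The maximum level is 2 when the cpu PMU exposes the L2 events (probed via topdown-heavy-ops), otherwise 1; a request above that maximum errors out, and 0 resolves to the maximum. A standalone sketch of just that rule; resolve_td_level() is an illustrative name, the real code inlines the logic in add_default_attributes():

	/*
	 * Resolve a requested --td-level against hardware support.
	 * Returns the effective level, or -1 if the request is too high.
	 */
	static int resolve_td_level(unsigned int requested, int has_l2_events)
	{
		unsigned int max_level = has_l2_events ? 2 : 1;

		if (requested > max_level)
			return -1;	/* caller prints the error */
		return requested ? requested : max_level;	/* 0 means max */
	}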
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index b01af171d94f..c18d3c82ae3d 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -137,6 +137,19 @@ static const char *topdown_metric_attrs[] = { NULL, }; +static const char *topdown_metric_L2_attrs[] = { + "slots", + "topdown-retiring", + "topdown-bad-spec", + "topdown-fe-bound", + "topdown-be-bound", + "topdown-heavy-ops", + "topdown-br-mispredict", + "topdown-fetch-lat", + "topdown-mem-bound", + NULL, +}; + static const char *smi_cost_attrs = { "{" "msr/aperf/," @@ -1153,7 +1166,9 @@ static struct option stat_options[] = { OPT_BOOLEAN(0, "metric-no-merge", &stat_config.metric_no_merge, "don't try to share events between metrics in a group"), OPT_BOOLEAN(0, "topdown", &topdown_run, - "measure topdown level 1 statistics"), + "measure top-down statistics"), + OPT_UINTEGER(0, "td-level", &stat_config.topdown_level, + "Set the metrics level for the top-down statistics (0: max level)"), OPT_BOOLEAN(0, "smi-cost", &smi_cost, "measure SMI cost"), OPT_CALLBACK('M', "metrics", &evsel_list, "metric/metric group list", @@ -1696,17 +1711,30 @@ static int add_default_attributes(void) } if (topdown_run) { + const char **metric_attrs = topdown_metric_attrs; + unsigned int max_level = 1; char *str = NULL; bool warn = false; if (!force_metric_only) stat_config.metric_only = true; - if (topdown_filter_events(topdown_metric_attrs, &str, 1) < 0) { + if (pmu_have_event("cpu", topdown_metric_L2_attrs[5])) { + metric_attrs = topdown_metric_L2_attrs; + max_level = 2; + } + + if (stat_config.topdown_level > max_level) { + pr_err("Invalid top-down metrics level. The max level is %u.\n", max_level); + return -1; + } else if (!stat_config.topdown_level) + stat_config.topdown_level = max_level; + + if (topdown_filter_events(metric_attrs, &str, 1) < 0) { pr_err("Out of memory\n"); return -1; } - if (topdown_metric_attrs[0] && str) { + if (metric_attrs[0] && str) { if (!stat_config.interval && !stat_config.metric_only) { fprintf(stat_config.output, "Topdown accuracy may decrease when measuring long periods.\n" diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 901265127e36..70e902409252 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -253,6 +253,18 @@ void perf_stat__update_shadow_stats(struct evsel *counter, u64 count, else if (perf_stat_evsel__is(counter, TOPDOWN_BE_BOUND)) update_runtime_stat(st, STAT_TOPDOWN_BE_BOUND, ctx, cpu, count); + else if (perf_stat_evsel__is(counter, TOPDOWN_HEAVY_OPS)) + update_runtime_stat(st, STAT_TOPDOWN_HEAVY_OPS, + ctx, cpu, count); + else if (perf_stat_evsel__is(counter, TOPDOWN_BR_MISPREDICT)) + update_runtime_stat(st, STAT_TOPDOWN_BR_MISPREDICT, + ctx, cpu, count); + else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_LAT)) + update_runtime_stat(st, STAT_TOPDOWN_FETCH_LAT, + ctx, cpu, count); + else if (perf_stat_evsel__is(counter, TOPDOWN_MEM_BOUND)) + update_runtime_stat(st, STAT_TOPDOWN_MEM_BOUND, + ctx, cpu, count); else if (evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) update_runtime_stat(st, STAT_STALLED_CYCLES_FRONT, ctx, cpu, count); @@ -1162,6 +1174,78 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, color = PERF_COLOR_RED; print_metric(config, ctxp, color, "%8.1f%%", "bad speculation", bad_spec * 100.); + } else if (perf_stat_evsel__is(evsel, TOPDOWN_HEAVY_OPS) && + full_td(ctx, cpu, st) && (config->topdown_level > 1)) { + double retiring = td_metric_ratio(ctx, cpu, + 
STAT_TOPDOWN_RETIRING, st); + double heavy_ops = td_metric_ratio(ctx, cpu, + STAT_TOPDOWN_HEAVY_OPS, st); + double light_ops = retiring - heavy_ops; + + if (retiring > 0.7 && heavy_ops > 0.1) + color = PERF_COLOR_GREEN; + print_metric(config, ctxp, color, "%8.1f%%", "heavy operations", + heavy_ops * 100.); + if (retiring > 0.7 && light_ops > 0.6) + color = PERF_COLOR_GREEN; + else + color = NULL; + print_metric(config, ctxp, color, "%8.1f%%", "light operations", + light_ops * 100.); + } else if (perf_stat_evsel__is(evsel, TOPDOWN_BR_MISPREDICT) && + full_td(ctx, cpu, st) && (config->topdown_level > 1)) { + double bad_spec = td_metric_ratio(ctx, cpu, + STAT_TOPDOWN_BAD_SPEC, st); + double br_mis = td_metric_ratio(ctx, cpu, + STAT_TOPDOWN_BR_MISPREDICT, st); + double m_clears = bad_spec - br_mis; + + if (bad_spec > 0.1 && br_mis > 0.05) + color = PERF_COLOR_RED; + print_metric(config, ctxp, color, "%8.1f%%", "branch mispredict", + br_mis * 100.); + if (bad_spec > 0.1 && m_clears > 0.05) + color = PERF_COLOR_RED; + else + color = NULL; + print_metric(config, ctxp, color, "%8.1f%%", "machine clears", + m_clears * 100.); + } else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_LAT) && + full_td(ctx, cpu, st) && (config->topdown_level > 1)) { + double fe_bound = td_metric_ratio(ctx, cpu, + STAT_TOPDOWN_FE_BOUND, st); + double fetch_lat = td_metric_ratio(ctx, cpu, + STAT_TOPDOWN_FETCH_LAT, st); + double fetch_bw = fe_bound - fetch_lat; + + if (fe_bound > 0.2 && fetch_lat > 0.15) + color = PERF_COLOR_RED; + print_metric(config, ctxp, color, "%8.1f%%", "fetch latency", + fetch_lat * 100.); + if (fe_bound > 0.2 && fetch_bw > 0.1) + color = PERF_COLOR_RED; + else + color = NULL; + print_metric(config, ctxp, color, "%8.1f%%", "fetch bandwidth", + fetch_bw * 100.); + } else if (perf_stat_evsel__is(evsel, TOPDOWN_MEM_BOUND) && + full_td(ctx, cpu, st) && (config->topdown_level > 1)) { + double be_bound = td_metric_ratio(ctx, cpu, + STAT_TOPDOWN_BE_BOUND, st); + double mem_bound = td_metric_ratio(ctx, cpu, + STAT_TOPDOWN_MEM_BOUND, st); + double core_bound = be_bound - mem_bound; + + if (be_bound > 0.2 && mem_bound > 0.2) + color = PERF_COLOR_RED; + print_metric(config, ctxp, color, "%8.1f%%", "memory bound", + mem_bound * 100.); + if (be_bound > 0.2 && core_bound > 0.1) + color = PERF_COLOR_RED; + else + color = NULL; + print_metric(config, ctxp, color, "%8.1f%%", "Core bound", + core_bound * 100.); } else if (evsel->metric_expr) { generic_metric(config, evsel->metric_expr, evsel->metric_events, NULL, evsel->name, evsel->metric_name, NULL, 1, cpu, out, st); diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index bd0decd6d753..2c0d4042d036 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -99,6 +99,10 @@ static const char *id_str[PERF_STAT_EVSEL_ID__MAX] = { ID(TOPDOWN_BAD_SPEC, topdown-bad-spec), ID(TOPDOWN_FE_BOUND, topdown-fe-bound), ID(TOPDOWN_BE_BOUND, topdown-be-bound), + ID(TOPDOWN_HEAVY_OPS, topdown-heavy-ops), + ID(TOPDOWN_BR_MISPREDICT, topdown-br-mispredict), + ID(TOPDOWN_FETCH_LAT, topdown-fetch-lat), + ID(TOPDOWN_MEM_BOUND, topdown-mem-bound), ID(SMI_NUM, msr/smi/), ID(APERF, msr/aperf/), }; diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 487010c624be..c26da0bf00b5 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -32,6 +32,10 @@ enum perf_stat_evsel_id { PERF_STAT_EVSEL_ID__TOPDOWN_BAD_SPEC, PERF_STAT_EVSEL_ID__TOPDOWN_FE_BOUND, PERF_STAT_EVSEL_ID__TOPDOWN_BE_BOUND, + PERF_STAT_EVSEL_ID__TOPDOWN_HEAVY_OPS, + 
PERF_STAT_EVSEL_ID__TOPDOWN_BR_MISPREDICT, + PERF_STAT_EVSEL_ID__TOPDOWN_FETCH_LAT, + PERF_STAT_EVSEL_ID__TOPDOWN_MEM_BOUND, PERF_STAT_EVSEL_ID__SMI_NUM, PERF_STAT_EVSEL_ID__APERF, PERF_STAT_EVSEL_ID__MAX, @@ -90,6 +94,10 @@ enum stat_type { STAT_TOPDOWN_BAD_SPEC, STAT_TOPDOWN_FE_BOUND, STAT_TOPDOWN_BE_BOUND, + STAT_TOPDOWN_HEAVY_OPS, + STAT_TOPDOWN_BR_MISPREDICT, + STAT_TOPDOWN_FETCH_LAT, + STAT_TOPDOWN_MEM_BOUND, STAT_SMI_NUM, STAT_APERF, STAT_MAX @@ -146,6 +154,7 @@ struct perf_stat_config { int ctl_fd_ack; bool ctl_fd_close; const char *cgroup_list; + unsigned int topdown_level; }; void perf_stat__set_big_num(int set); -- Gitee From 261800f8894b262653488fa7a3e063ec78d722a2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Feb 2021 12:09:13 -0800 Subject: [PATCH 12/13] perf tools: Update topdown documentation for Sapphire Rapids ANBZ: #1117 commit 7d91e8181dc0ed8585e55234288d11bc5dc083b2 upstream Update Topdown extension on Sapphire Rapids and how to collect the L2 events. Signed-off-by: Kan Liang Cc: Andi Kleen Cc: Jin Yao Cc: Jiri Olsa Cc: Madhavan Srinivasan Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/1612296553-21962-10-git-send-email-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Guanjun Acked-by: Zelin Deng --- tools/perf/Documentation/topdown.txt | 78 ++++++++++++++++++++++++++-- 1 file changed, 74 insertions(+), 4 deletions(-) diff --git a/tools/perf/Documentation/topdown.txt b/tools/perf/Documentation/topdown.txt index 3c39bb3dc5fa..10f07f9455b8 100644 --- a/tools/perf/Documentation/topdown.txt +++ b/tools/perf/Documentation/topdown.txt @@ -121,7 +121,7 @@ to read slots and the topdown metrics at different points of the program: #define RDPMC_METRIC (1 << 29) /* return metric counters */ #define FIXED_COUNTER_SLOTS 3 -#define METRIC_COUNTER_TOPDOWN_L1 0 +#define METRIC_COUNTER_TOPDOWN_L1_L2 0 static inline uint64_t read_slots(void) { @@ -130,7 +130,7 @@ static inline uint64_t read_slots(void) static inline uint64_t read_metrics(void) { - return _rdpmc(RDPMC_METRIC | METRIC_COUNTER_TOPDOWN_L1); + return _rdpmc(RDPMC_METRIC | METRIC_COUNTER_TOPDOWN_L1_L2); } Then the program can be instrumented to read these metrics at different @@ -152,11 +152,21 @@ The binary ratios in the metric value can be converted to float ratios: #define GET_METRIC(m, i) (((m) >> (i*8)) & 0xff) +/* L1 Topdown metric events */ #define TOPDOWN_RETIRING(val) ((float)GET_METRIC(val, 0) / 0xff) #define TOPDOWN_BAD_SPEC(val) ((float)GET_METRIC(val, 1) / 0xff) #define TOPDOWN_FE_BOUND(val) ((float)GET_METRIC(val, 2) / 0xff) #define TOPDOWN_BE_BOUND(val) ((float)GET_METRIC(val, 3) / 0xff) +/* + * L2 Topdown metric events. + * Available on Sapphire Rapids and later platforms. + */ +#define TOPDOWN_HEAVY_OPS(val) ((float)GET_METRIC(val, 4) / 0xff) +#define TOPDOWN_BR_MISPREDICT(val) ((float)GET_METRIC(val, 5) / 0xff) +#define TOPDOWN_FETCH_LAT(val) ((float)GET_METRIC(val, 6) / 0xff) +#define TOPDOWN_MEM_BOUND(val) ((float)GET_METRIC(val, 7) / 0xff) + and then converted to percent for printing. The ratios in the metric accumulate for the time when the counter @@ -190,8 +200,8 @@ for that time period. fe_bound_slots = GET_METRIC(metric_b, 2) * slots_b - fe_bound_slots_a be_bound_slots = GET_METRIC(metric_b, 3) * slots_b - be_bound_slots_a -Later the individual ratios for the measurement period can be recreated -from these counts. 
+Later the individual ratios of L1 metric events for the measurement period can +be recreated from these counts. slots_delta = slots_b - slots_a retiring_ratio = (float)retiring_slots / slots_delta @@ -205,6 +215,48 @@ from these counts. fe_bound_ratio * 100., be_bound_ratio * 100.); +The individual ratios of L2 metric events for the measurement period can be +recreated from L1 and L2 metric counters. (Available on Sapphire Rapids and +later platforms) + + # compute scaled metrics for measurement a + heavy_ops_slots_a = GET_METRIC(metric_a, 4) * slots_a + br_mispredict_slots_a = GET_METRIC(metric_a, 5) * slots_a + fetch_lat_slots_a = GET_METRIC(metric_a, 6) * slots_a + mem_bound_slots_a = GET_METRIC(metric_a, 7) * slots_a + + # compute delta scaled metrics between b and a + heavy_ops_slots = GET_METRIC(metric_b, 4) * slots_b - heavy_ops_slots_a + br_mispredict_slots = GET_METRIC(metric_b, 5) * slots_b - br_mispredict_slots_a + fetch_lat_slots = GET_METRIC(metric_b, 6) * slots_b - fetch_lat_slots_a + mem_bound_slots = GET_METRIC(metric_b, 7) * slots_b - mem_bound_slots_a + + slots_delta = slots_b - slots_a + heavy_ops_ratio = (float)heavy_ops_slots / slots_delta + light_ops_ratio = retiring_ratio - heavy_ops_ratio; + + br_mispredict_ratio = (float)br_mispredict_slots / slots_delta + machine_clears_ratio = bad_spec_ratio - br_mispredict_ratio; + + fetch_lat_ratio = (float)fetch_lat_slots / slots_delta + fetch_bw_ratio = fe_bound_ratio - fetch_lat_ratio; + + mem_bound_ratio = (float)mem_bound_slots / slota_delta + core_bound_ratio = be_bound_ratio - mem_bound_ratio; + + printf("Heavy Operations %.2f%% Light Operations %.2f%% " + "Branch Mispredict %.2f%% Machine Clears %.2f%% " + "Fetch Latency %.2f%% Fetch Bandwidth %.2f%% " + "Mem Bound %.2f%% Core Bound %.2f%%\n", + heavy_ops_ratio * 100., + light_ops_ratio * 100., + br_mispredict_ratio * 100., + machine_clears_ratio * 100., + fetch_lat_ratio * 100., + fetch_bw_ratio * 100., + mem_bound_ratio * 100., + core_bound_ratio * 100.); + Resetting metrics counters ========================== @@ -248,6 +300,24 @@ a sampling read group. Since the SLOTS event must be the leader of a TopDown group, the second event of the group is the sampling event. For example, perf record -e '{slots, $sampling_event, topdown-retiring}:S' +Extension on Sapphire Rapids Server +=================================== +The metrics counter is extended to support TMA method level 2 metrics. +The lower half of the register is the TMA level 1 metrics (legacy). +The upper half is also divided into four 8-bit fields for the new level 2 +metrics. Four more TopDown metric events are exposed for the end-users, +topdown-heavy-ops, topdown-br-mispredict, topdown-fetch-lat and +topdown-mem-bound. + +Each of the new level 2 metrics in the upper half is a subset of the +corresponding level 1 metric in the lower half. Software can deduce the +other four level 2 metrics by subtracting corresponding metrics as below. 
+ + Light_Operations = Retiring - Heavy_Operations + Machine_Clears = Bad_Speculation - Branch_Mispredicts + Fetch_Bandwidth = Frontend_Bound - Fetch_Latency + Core_Bound = Backend_Bound - Memory_Bound + [1] https://software.intel.com/en-us/top-down-microarchitecture-analysis-method-win [2] https://github.com/andikleen/pmu-tools/wiki/toplev-manual -- Gitee From a6aa90d15130484e43d4c3ac6c876adfcd40bf69 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Wed, 29 Sep 2021 08:38:13 -0700 Subject: [PATCH 13/13] perf script: Fix PERF_SAMPLE_WEIGHT_STRUCT support ANBZ: #1117 commit 27730c8cd60d1574d8337276e7a9d7d2ca92e0d1 upstream -F weight in perf script is broken. # ./perf mem record # ./perf script -F weight Samples for 'dummy:HG' event do not have WEIGHT attribute set. Cannot print 'weight' field. The sample type PERF_SAMPLE_WEIGHT_STRUCT is an alternative to the PERF_SAMPLE_WEIGHT sample type. They share the same space, the weight field. The lower 32 bits are exactly the same for both sample types. The higher 32 bits may be different for different architectures. For a new kernel on x86, PERF_SAMPLE_WEIGHT_STRUCT is used. For an old kernel or other architectures, PERF_SAMPLE_WEIGHT is used. With -F weight, the current perf script only checks the input string "weight" against the PERF_SAMPLE_WEIGHT sample type, because commit 008364576030 ("perf tools: Support PERF_SAMPLE_WEIGHT_STRUCT") didn't update the PERF_SAMPLE_WEIGHT_STRUCT sample type for perf script. For a new kernel on x86, the check therefore fails. Use PERF_SAMPLE_WEIGHT_TYPE, which supports both sample types, to replace PERF_SAMPLE_WEIGHT. Fixes: 008364576030 ("perf tools: Support PERF_SAMPLE_WEIGHT_STRUCT") Reported-by: Joe Mario Reviewed-by: Kajol Jain Signed-off-by: Kan Liang Tested-by: Jiri Olsa Tested-by: Joe Mario Acked-by: Jiri Olsa Acked-by: Joe Mario Cc: Andi Kleen Link: https://lore.kernel.org/r/1632929894-102778-1-git-send-email-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Guanjun Acked-by: Zelin Deng --- tools/perf/builtin-script.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index 5109d01619ee..36bb52890afa 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -444,7 +444,7 @@ static int evsel__check_attr(struct evsel *evsel, struct perf_session *session) return -EINVAL; if (PRINT_FIELD(WEIGHT) && - evsel__check_stype(evsel, PERF_SAMPLE_WEIGHT, "WEIGHT", PERF_OUTPUT_WEIGHT)) + evsel__check_stype(evsel, PERF_SAMPLE_WEIGHT_TYPE, "WEIGHT", PERF_OUTPUT_WEIGHT)) return -EINVAL; if (PRINT_FIELD(SYM) && -- Gitee
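Background for the one-line fix above: PERF_SAMPLE_WEIGHT_TYPE is the uapi mask covering both encodings, (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT), so the check now accepts samples recorded either way. A minimal consumer-side sketch of the distinction, assuming a uapi header recent enough to define both bits; has_weight()/sample_weight() are illustrative helpers, not perf code:

	#include <stdbool.h>
	#include <stdint.h>
	#include <linux/perf_event.h>

	/* True if the event carries a weight in either encoding. */
	static bool has_weight(uint64_t sample_type)
	{
		return sample_type & PERF_SAMPLE_WEIGHT_TYPE;
	}

	/*
	 * The low 32 bits mean the same thing in both encodings; with the
	 * struct encoding the upper bits hold other fields (e.g. ins_lat),
	 * so only the low 32 bits are the weight.
	 */
	static uint64_t sample_weight(uint64_t sample_type, uint64_t raw)
	{
		if (sample_type & PERF_SAMPLE_WEIGHT_STRUCT)
			return raw & 0xffffffff;
		return raw;
	}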