diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 1233ce00c72cf8811c2d5930b6a514aab775b13f..e89a034988b4e022a42751c1ef10bada08a70538 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1173,6 +1173,8 @@ CONFIG_PID_RESERVE=y CONFIG_MEMORY_RELIABLE=y # CONFIG_CLEAR_FREELIST_PAGE is not set CONFIG_EXTEND_HUGEPAGE_MAPPING=y +CONFIG_MEM_SAMPLING=y +CONFIG_NUMABALANCING_MEM_SAMPLING=y # # Data Access Monitoring @@ -6293,6 +6295,7 @@ CONFIG_UB_UDMA_HNS3=m CONFIG_CPU_INSPECT=m CONFIG_CPU_INSPECTOR_ATF=m # end of CPU Inspect +CONFIG_ARM_SPE=y # end of Device Drivers # diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index dd8c474969b32b89b99141d79b293ae7131938db..a680be4c3ee4f8b3f50c288927be31ed9f8e7967 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -1104,6 +1104,7 @@ CONFIG_ARCH_HAS_PTE_SPECIAL=y CONFIG_MAPPING_DIRTY_HELPERS=y CONFIG_MEMORY_RELIABLE=y # CONFIG_CLEAR_FREELIST_PAGE is not set +# CONFIG_MEM_SAMPLING is not set # # Data Access Monitoring diff --git a/drivers/Kconfig b/drivers/Kconfig index f765c3a688b6b0a6396f7616eb14c89dfcc51c8f..840137d901670440500b8931fb2cd622d2d3f0d6 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -244,4 +244,6 @@ source "drivers/ub/Kconfig" source "drivers/cpuinspect/Kconfig" +source "drivers/arm/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 4e390005ded7ffb5c150a06e0f912d621303eeef..8264e814d3d67009550fc9206ae6ee095e565ebd 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -195,3 +195,4 @@ obj-$(CONFIG_COUNTER) += counter/ obj-$(CONFIG_MOST) += most/ obj-$(CONFIG_ROH) += roh/ obj-$(CONFIG_UB) += ub/ +obj-$(CONFIG_ARM_SPE) += arm/spe/ diff --git a/drivers/arm/Kconfig b/drivers/arm/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..a0c7a25220cc83a26e1beaf6c32d7a368fa2501d --- /dev/null +++ b/drivers/arm/Kconfig @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +source "drivers/arm/spe/Kconfig" diff --git a/drivers/arm/spe/Kconfig b/drivers/arm/spe/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..2d81364d0e0a201a9cd0941274d5c27a3b0aa90d --- /dev/null +++ b/drivers/arm/spe/Kconfig @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# arm spe driver +# +config ARM_SPE + bool "In-kernel SPE driver for page access profiling" + depends on ARM64 + default n + help + Enable support for the ARMv8.2 Statistical Profiling Extension, which + provides periodic sampling of operations in the CPU pipeline.
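For orientation, a minimal sketch (not part of the patch) of how an in-kernel mem_sampling-style consumer is expected to hook into the driver added below. arm_spe_record_capture_callback_register(), arm_spe_enabled() and arm_spe_start() come from drivers/arm/spe/spe.c in this series; the <linux/mem_sampling.h> header, the exact mem_sampling_cb_type prototype and the struct mem_sampling_record field names are assumptions (the diff only shows the callback being invoked with a record pointer and a count, with records copied from struct arm_spe_decoder output), and the demo_* names are hypothetical.

/* Illustrative sketch only; see the assumptions stated above. */
#include <linux/kernel.h>
#include <linux/mem_sampling.h>	/* assumed: mem_sampling_cb_type, struct mem_sampling_record */

/* Provided by drivers/arm/spe/spe.c in this patch. */
extern void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb);
extern int arm_spe_enabled(void);
extern int arm_spe_start(void);

/* Runs from the SPE IRQ path with one interrupt's worth of decoded records. */
static void demo_spe_record_cb(struct mem_sampling_record *records, int nr)
{
	int i;

	for (i = 0; i < nr; i++)
		pr_debug("spe sample: vaddr=0x%llx paddr=0x%llx lat=%u\n",
			 records[i].virt_addr, records[i].phys_addr,
			 records[i].latency);
}

static int __init demo_spe_consumer_init(void)
{
	if (!arm_spe_enabled())
		return -ENODEV;

	arm_spe_record_capture_callback_register(demo_spe_record_cb);
	/*
	 * Profiling is armed per CPU; a real consumer would call this from a
	 * context pinned to an SPE-capable CPU (arm_spe_start() returns
	 * -ENOENT on unsupported CPUs).
	 */
	return arm_spe_start();
}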
diff --git a/drivers/arm/spe/Makefile b/drivers/arm/spe/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..46c43e5974e1fd29afd2c2a695fce14977d78151 --- /dev/null +++ b/drivers/arm/spe/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_ARM_SPE) += spe.o spe-decoder/arm-spe-decoder.o spe-decoder/arm-spe-pkt-decoder.o diff --git a/drivers/arm/spe/spe-decoder/Makefile b/drivers/arm/spe/spe-decoder/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..4fdae5d381867542ad12a7a7d34aabfdd141e40b --- /dev/null +++ b/drivers/arm/spe/spe-decoder/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y := arm-spe-decoder.o arm-spe-pkt-decoder.o diff --git a/drivers/arm/spe/spe-decoder/arm-spe-decoder.c b/drivers/arm/spe/spe-decoder/arm-spe-decoder.c new file mode 100644 index 0000000000000000000000000000000000000000..1b6ddeaaabe903602dd91c37dae2fe432c786969 --- /dev/null +++ b/drivers/arm/spe/spe-decoder/arm-spe-decoder.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * arm_spe_decoder.c: ARM SPE support + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "arm-spe-decoder.h" + +static u64 arm_spe_calc_ip(int index, u64 payload) +{ + u64 ns, el, val; + u32 seen_idx; + + /* Instruction virtual address or Branch target address */ + if (index == SPE_ADDR_PKT_HDR_INDEX_INS || + index == SPE_ADDR_PKT_HDR_INDEX_BRANCH) { + ns = SPE_ADDR_PKT_GET_NS(payload); + el = SPE_ADDR_PKT_GET_EL(payload); + + /* Clean highest byte */ + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(payload); + + /* Fill highest byte for EL1 or EL2 (VHE) mode */ + if (ns && (el == SPE_ADDR_PKT_EL1 || el == SPE_ADDR_PKT_EL2)) + payload |= 0xffULL << SPE_ADDR_PKT_ADDR_BYTE7_SHIFT; + + /* Data access virtual address */ + } else if (index == SPE_ADDR_PKT_HDR_INDEX_DATA_VIRT) { + + /* Clean tags */ + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(payload); + + /* + * Armv8 ARM (ARM DDI 0487F.c), chapter "D10.2.1 Address packet" + * defines the data virtual address payload format, the top byte + * (bits [63:56]) is assigned as top-byte tag; so we only can + * retrieve address value from bits [55:0]. + * + * According to Documentation/arm64/memory.rst, if detects the + * specific pattern in bits [55:52] of payload which falls in + * the kernel space, should fixup the top byte and this allows + * perf tool to parse DSO symbol for data address correctly. + * + * For this reason, if detects the bits [55:52] is 0xf, will + * fill 0xff into the top byte. 
+ */ + val = SPE_ADDR_PKT_ADDR_GET_BYTE_6(payload); + if ((val & 0xf0ULL) == 0xf0ULL) + payload |= 0xffULL << SPE_ADDR_PKT_ADDR_BYTE7_SHIFT; + + /* Data access physical address */ + } else if (index == SPE_ADDR_PKT_HDR_INDEX_DATA_PHYS) { + /* Clean highest byte */ + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(payload); + } else { + seen_idx = 0; + if (!(seen_idx & BIT(index))) { + seen_idx |= BIT(index); + pr_warn("ignoring unsupported address packet index: 0x%x\n", index); + } + } + + return payload; +} + + +void arm_spe_decoder_free(struct arm_spe_decoder *decoder) +{ + kfree(decoder); +} + +static int arm_spe_get_next_packet(struct arm_spe_decoder *decoder) +{ + int ret; + + do { + if (!decoder->len) + return 0; + + ret = arm_spe_get_packet(decoder->buf, decoder->len, + &decoder->packet); + if (ret <= 0) { + /* Move forward for 1 byte */ + decoder->buf += 1; + decoder->len -= 1; + return -EBADMSG; + } + + decoder->buf += ret; + decoder->len -= ret; + } while (decoder->packet.type == ARM_SPE_PAD); + return 1; +} + +static int arm_spe_read_record(struct arm_spe_decoder *decoder) +{ + int err; + int idx; + u64 payload, ip; + + memset(&decoder->record, 0x0, sizeof(decoder->record)); + decoder->record.context_id = (u64)-1; + while (1) { + err = arm_spe_get_next_packet(decoder); + if (err <= 0) + return err; + + idx = decoder->packet.index; + payload = decoder->packet.payload; + + switch (decoder->packet.type) { + case ARM_SPE_TIMESTAMP: + decoder->record.timestamp = payload; + return 1; + case ARM_SPE_END: + return 1; + case ARM_SPE_ADDRESS: + ip = arm_spe_calc_ip(idx, payload); + if (idx == SPE_ADDR_PKT_HDR_INDEX_INS) + decoder->record.from_ip = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_BRANCH) + decoder->record.to_ip = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_DATA_VIRT) + decoder->record.virt_addr = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_DATA_PHYS) + decoder->record.phys_addr = ip; + break; + case ARM_SPE_COUNTER: + if (idx == SPE_CNT_PKT_HDR_INDEX_TOTAL_LAT) + decoder->record.latency = payload; + break; + case ARM_SPE_CONTEXT: + decoder->record.context_id = payload; + break; + case ARM_SPE_OP_TYPE: + if (idx == SPE_OP_PKT_HDR_CLASS_LD_ST_ATOMIC) { + if (payload & 0x1) + decoder->record.op = ARM_SPE_ST; + else + decoder->record.op = ARM_SPE_LD; + } + break; + case ARM_SPE_EVENTS: + if (payload & BIT(EV_L1D_REFILL)) + decoder->record.type |= ARM_SPE_L1D_MISS; + + if (payload & BIT(EV_L1D_ACCESS)) + decoder->record.type |= ARM_SPE_L1D_ACCESS; + + if (payload & BIT(EV_TLB_WALK)) + decoder->record.type |= ARM_SPE_TLB_MISS; + + if (payload & BIT(EV_TLB_ACCESS)) + decoder->record.type |= ARM_SPE_TLB_ACCESS; + + if (payload & BIT(EV_LLC_MISS)) + decoder->record.type |= ARM_SPE_LLC_MISS; + + if (payload & BIT(EV_LLC_ACCESS)) + decoder->record.type |= ARM_SPE_LLC_ACCESS; + + if (payload & BIT(EV_REMOTE_ACCESS)) + decoder->record.type |= ARM_SPE_REMOTE_ACCESS; + + if (payload & BIT(EV_MISPRED)) + decoder->record.type |= ARM_SPE_BRANCH_MISS; + + break; + case ARM_SPE_DATA_SOURCE: + decoder->record.source = payload; + break; + case ARM_SPE_BAD: + break; + case ARM_SPE_PAD: + break; + default: + pr_err("Get packet error!\n"); + return -1; + } + } + return 0; +} + +static bool arm_spe_decode(struct arm_spe_decoder *decoder) +{ + if (decoder->len) { + if (arm_spe_read_record(decoder) == 1) + return true; + } + return false; +} + +void arm_spe_decode_buf(const unsigned char *buf, size_t len) +{ + struct arm_spe_decoder decoder; + + decoder.buf = buf; + decoder.len = len; + + while 
(arm_spe_decode(&decoder)) + arm_spe_record_enqueue(&(decoder.record)); + +} +EXPORT_SYMBOL(arm_spe_decode_buf); diff --git a/drivers/arm/spe/spe-decoder/arm-spe-decoder.h b/drivers/arm/spe/spe-decoder/arm-spe-decoder.h new file mode 100644 index 0000000000000000000000000000000000000000..567a70307c5f1ef506cbdda74260ce71fd500753 --- /dev/null +++ b/drivers/arm/spe/spe-decoder/arm-spe-decoder.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * arm_spe_decoder.h: Arm Statistical Profiling Extensions support + * Copyright (c) 2019-2020, Arm Ltd. + */ + +#ifndef INCLUDE__ARM_SPE_DECODER_H__ +#define INCLUDE__ARM_SPE_DECODER_H__ + +#include + +#include "arm-spe-pkt-decoder.h" + +enum arm_spe_sample_type { + ARM_SPE_L1D_ACCESS = 1 << 0, + ARM_SPE_L1D_MISS = 1 << 1, + ARM_SPE_LLC_ACCESS = 1 << 2, + ARM_SPE_LLC_MISS = 1 << 3, + ARM_SPE_TLB_ACCESS = 1 << 4, + ARM_SPE_TLB_MISS = 1 << 5, + ARM_SPE_BRANCH_MISS = 1 << 6, + ARM_SPE_REMOTE_ACCESS = 1 << 7, +}; + +enum arm_spe_op_type { + ARM_SPE_LD = 1 << 0, + ARM_SPE_ST = 1 << 1, +}; + +enum arm_spe_neoverse_data_source { + ARM_SPE_NV_L1D = 0x0, + ARM_SPE_NV_L2 = 0x8, + ARM_SPE_NV_PEER_CORE = 0x9, + ARM_SPE_NV_LOCAL_CLUSTER = 0xa, + ARM_SPE_NV_SYS_CACHE = 0xb, + ARM_SPE_NV_PEER_CLUSTER = 0xc, + ARM_SPE_NV_REMOTE = 0xd, + ARM_SPE_NV_DRAM = 0xe, +}; + +struct arm_spe_record { + enum arm_spe_sample_type type; + int err; + u32 op; + u32 latency; + u64 from_ip; + u64 to_ip; + u64 timestamp; + u64 virt_addr; + u64 phys_addr; + u64 context_id; + u16 source; +}; + +struct arm_spe_buffer { + const unsigned char *buf; + size_t len; + u64 offset; + u64 trace_nr; +}; + +struct arm_spe_decoder { + struct arm_spe_record record; + const unsigned char *buf; + size_t len; + struct arm_spe_pkt packet; +}; + +void arm_spe_decoder_free(struct arm_spe_decoder *decoder); +void arm_spe_decode_buf(const unsigned char *buf, size_t len); +void arm_spe_record_enqueue(struct arm_spe_record *record); + +#endif diff --git a/drivers/arm/spe/spe-decoder/arm-spe-pkt-decoder.c b/drivers/arm/spe/spe-decoder/arm-spe-pkt-decoder.c new file mode 100644 index 0000000000000000000000000000000000000000..aeec434487798475c7899cce9f91e8fb0a6f272e --- /dev/null +++ b/drivers/arm/spe/spe-decoder/arm-spe-pkt-decoder.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Arm Statistical Profiling Extensions (SPE) support + * Copyright (c) 2017-2018, Arm Ltd. 
+ */ + +#include +#include +#include +#include +#include + +#include "arm-spe-pkt-decoder.h" + +/* + * Extracts the field "sz" from header bits and converts to bytes: + * 00 : byte (1) + * 01 : halfword (2) + * 10 : word (4) + * 11 : doubleword (8) + */ +static unsigned int arm_spe_payload_len(unsigned char hdr) +{ + return 1U << ((hdr & GENMASK_ULL(5, 4)) >> 4); +} + +static int arm_spe_get_payload(const unsigned char *buf, size_t len, + unsigned char ext_hdr, + struct arm_spe_pkt *packet) +{ + size_t payload_len = arm_spe_payload_len(buf[ext_hdr]); + + if (len < 1 + ext_hdr + payload_len) + return ARM_SPE_NEED_MORE_BYTES; + + buf += 1 + ext_hdr; + + switch (payload_len) { + case 1: + packet->payload = *(uint8_t *)buf; + break; + case 2: + packet->payload = le16_to_cpu(*(uint16_t *)buf); + break; + case 4: + packet->payload = le32_to_cpu(*(uint32_t *)buf); + break; + case 8: + packet->payload = le64_to_cpu(*(uint64_t *)buf); + break; + default: + return ARM_SPE_BAD_PACKET; + } + + return 1 + ext_hdr + payload_len; +} + +static int arm_spe_get_pad(struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_PAD; + return 1; +} + +static int arm_spe_get_alignment(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + unsigned int alignment = 1 << ((buf[0] & 0xf) + 1); + + if (len < alignment) + return ARM_SPE_NEED_MORE_BYTES; + + packet->type = ARM_SPE_PAD; + return alignment - (((uintptr_t)buf) & (alignment - 1)); +} + +static int arm_spe_get_end(struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_END; + return 1; +} + +static int arm_spe_get_timestamp(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_TIMESTAMP; + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_events(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_EVENTS; + + /* we use index to identify Events with a less number of + * comparisons in arm_spe_pkt_desc(): E.g., the LLC-ACCESS, + * LLC-REFILL, and REMOTE-ACCESS events are identified if + * index > 1. 
+ */ + packet->index = arm_spe_payload_len(buf[0]); + + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_data_source(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_DATA_SOURCE; + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_context(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_CONTEXT; + packet->index = SPE_CTX_PKT_HDR_INDEX(buf[0]); + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_op_type(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_OP_TYPE; + packet->index = SPE_OP_PKT_HDR_CLASS(buf[0]); + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_counter(const unsigned char *buf, size_t len, + const unsigned char ext_hdr, struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_COUNTER; + + if (ext_hdr) + packet->index = SPE_HDR_EXTENDED_INDEX(buf[0], buf[1]); + else + packet->index = SPE_HDR_SHORT_INDEX(buf[0]); + + return arm_spe_get_payload(buf, len, ext_hdr, packet); +} + +static int arm_spe_get_addr(const unsigned char *buf, size_t len, + const unsigned char ext_hdr, struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_ADDRESS; + + if (ext_hdr) + packet->index = SPE_HDR_EXTENDED_INDEX(buf[0], buf[1]); + else + packet->index = SPE_HDR_SHORT_INDEX(buf[0]); + + return arm_spe_get_payload(buf, len, ext_hdr, packet); +} + +static int arm_spe_do_get_packet(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + unsigned int hdr; + unsigned char ext_hdr = 0; + + memset(packet, 0, sizeof(struct arm_spe_pkt)); + + if (!len) + return ARM_SPE_NEED_MORE_BYTES; + + hdr = buf[0]; + + if (hdr == SPE_HEADER0_PAD) + return arm_spe_get_pad(packet); + + if (hdr == SPE_HEADER0_END) /* no timestamp at end of record */ + return arm_spe_get_end(packet); + + if (hdr == SPE_HEADER0_TIMESTAMP) + return arm_spe_get_timestamp(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK1) == SPE_HEADER0_EVENTS) + return arm_spe_get_events(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK1) == SPE_HEADER0_SOURCE) + return arm_spe_get_data_source(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK2) == SPE_HEADER0_CONTEXT) + return arm_spe_get_context(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK2) == SPE_HEADER0_OP_TYPE) + return arm_spe_get_op_type(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK2) == SPE_HEADER0_EXTENDED) { + /* 16-bit extended format header */ + if (len == 1) + return ARM_SPE_BAD_PACKET; + + ext_hdr = 1; + hdr = buf[1]; + if (hdr == SPE_HEADER1_ALIGNMENT) + return arm_spe_get_alignment(buf, len, packet); + } + + /* + * The short format header's byte 0 or the extended format header's + * byte 1 has been assigned to 'hdr', which uses the same encoding for + * address packet and counter packet, so don't need to distinguish if + * it's short format or extended format and handle in once. + */ + if ((hdr & SPE_HEADER0_MASK3) == SPE_HEADER0_ADDRESS) + return arm_spe_get_addr(buf, len, ext_hdr, packet); + + if ((hdr & SPE_HEADER0_MASK3) == SPE_HEADER0_COUNTER) + return arm_spe_get_counter(buf, len, ext_hdr, packet); + + return ARM_SPE_BAD_PACKET; +} + +int arm_spe_get_packet(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + int ret; + + ret = arm_spe_do_get_packet(buf, len, packet); + /* put multiple consecutive PADs on the same line, up to + * the fixed-width output format of 16 bytes per line. 
+ */ + if (ret > 0 && packet->type == ARM_SPE_PAD) { + while (ret < 16 && len > (size_t)ret && !buf[ret]) + ret += 1; + } + return ret; +} diff --git a/drivers/arm/spe/spe-decoder/arm-spe-pkt-decoder.h b/drivers/arm/spe/spe-decoder/arm-spe-pkt-decoder.h new file mode 100644 index 0000000000000000000000000000000000000000..1a67b580b47f4d480fcdbee4569b57f2960b279d --- /dev/null +++ b/drivers/arm/spe/spe-decoder/arm-spe-pkt-decoder.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Arm Statistical Profiling Extensions (SPE) support + * Copyright (c) 2017-2018, Arm Ltd. + */ + +#ifndef INCLUDE__ARM_SPE_PKT_DECODER_H__ +#define INCLUDE__ARM_SPE_PKT_DECODER_H__ + +#include + +#define ARM_SPE_PKT_DESC_MAX 256 +#define ARM_SPE_NEED_MORE_BYTES -1 +#define ARM_SPE_BAD_PACKET -2 +#define ARM_SPE_PKT_MAX_SZ 16 + +enum arm_spe_pkt_type { + ARM_SPE_BAD, + ARM_SPE_PAD, + ARM_SPE_END, + ARM_SPE_TIMESTAMP, + ARM_SPE_ADDRESS, + ARM_SPE_COUNTER, + ARM_SPE_CONTEXT, + ARM_SPE_OP_TYPE, + ARM_SPE_EVENTS, + ARM_SPE_DATA_SOURCE, +}; + +struct arm_spe_pkt { + enum arm_spe_pkt_type type; + unsigned char index; + uint64_t payload; +}; + +/* Short header (HEADER0) and extended header (HEADER1) */ +#define SPE_HEADER0_PAD 0x0 +#define SPE_HEADER0_END 0x1 +#define SPE_HEADER0_TIMESTAMP 0x71 +/* Mask for event & data source */ +#define SPE_HEADER0_MASK1 (GENMASK_ULL(7, 6) | GENMASK_ULL(3, 0)) +#define SPE_HEADER0_EVENTS 0x42 +#define SPE_HEADER0_SOURCE 0x43 +/* Mask for context & operation */ +#define SPE_HEADER0_MASK2 GENMASK_ULL(7, 2) +#define SPE_HEADER0_CONTEXT 0x64 +#define SPE_HEADER0_OP_TYPE 0x48 +/* Mask for extended format */ +#define SPE_HEADER0_EXTENDED 0x20 +/* Mask for address & counter */ +#define SPE_HEADER0_MASK3 GENMASK_ULL(7, 3) +#define SPE_HEADER0_ADDRESS 0xb0 +#define SPE_HEADER0_COUNTER 0x98 +#define SPE_HEADER1_ALIGNMENT 0x0 + +#define SPE_HDR_SHORT_INDEX(h) ((h) & GENMASK_ULL(2, 0)) +#define SPE_HDR_EXTENDED_INDEX(h0, h1) (((h0) & GENMASK_ULL(1, 0)) << 3 | \ + SPE_HDR_SHORT_INDEX(h1)) + +/* Address packet header */ +#define SPE_ADDR_PKT_HDR_INDEX_INS 0x0 +#define SPE_ADDR_PKT_HDR_INDEX_BRANCH 0x1 +#define SPE_ADDR_PKT_HDR_INDEX_DATA_VIRT 0x2 +#define SPE_ADDR_PKT_HDR_INDEX_DATA_PHYS 0x3 +#define SPE_ADDR_PKT_HDR_INDEX_PREV_BRANCH 0x4 + +/* Address packet payload */ +#define SPE_ADDR_PKT_ADDR_BYTE7_SHIFT 56 +#define SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(v) ((v) & GENMASK_ULL(55, 0)) +#define SPE_ADDR_PKT_ADDR_GET_BYTE_6(v) (((v) & GENMASK_ULL(55, 48)) >> 48) + +#define SPE_ADDR_PKT_GET_NS(v) (((v) & BIT_ULL(63)) >> 63) +#define SPE_ADDR_PKT_GET_EL(v) (((v) & GENMASK_ULL(62, 61)) >> 61) +#define SPE_ADDR_PKT_GET_CH(v) (((v) & BIT_ULL(62)) >> 62) +#define SPE_ADDR_PKT_GET_PAT(v) (((v) & GENMASK_ULL(59, 56)) >> 56) + +#define SPE_ADDR_PKT_EL0 0 +#define SPE_ADDR_PKT_EL1 1 +#define SPE_ADDR_PKT_EL2 2 +#define SPE_ADDR_PKT_EL3 3 + +/* Context packet header */ +#define SPE_CTX_PKT_HDR_INDEX(h) ((h) & GENMASK_ULL(1, 0)) + +/* Counter packet header */ +#define SPE_CNT_PKT_HDR_INDEX_TOTAL_LAT 0x0 +#define SPE_CNT_PKT_HDR_INDEX_ISSUE_LAT 0x1 +#define SPE_CNT_PKT_HDR_INDEX_TRANS_LAT 0x2 + +/* Event packet payload */ +enum arm_spe_events { + EV_EXCEPTION_GEN = 0, + EV_RETIRED = 1, + EV_L1D_ACCESS = 2, + EV_L1D_REFILL = 3, + EV_TLB_ACCESS = 4, + EV_TLB_WALK = 5, + EV_NOT_TAKEN = 6, + EV_MISPRED = 7, + EV_LLC_ACCESS = 8, + EV_LLC_MISS = 9, + EV_REMOTE_ACCESS = 10, + EV_ALIGNMENT = 11, + EV_PARTIAL_PREDICATE = 17, + EV_EMPTY_PREDICATE = 18, +}; + +/* Operation packet header */ +#define 
SPE_OP_PKT_HDR_CLASS(h) ((h) & GENMASK_ULL(1, 0)) +#define SPE_OP_PKT_HDR_CLASS_OTHER 0x0 +#define SPE_OP_PKT_HDR_CLASS_LD_ST_ATOMIC 0x1 +#define SPE_OP_PKT_HDR_CLASS_BR_ERET 0x2 + +#define SPE_OP_PKT_IS_OTHER_SVE_OP(v) (((v) & (BIT(7) | BIT(3) | BIT(0))) == 0x8) + +#define SPE_OP_PKT_COND BIT(0) + +#define SPE_OP_PKT_LDST_SUBCLASS_GET(v) ((v) & GENMASK_ULL(7, 1)) +#define SPE_OP_PKT_LDST_SUBCLASS_GP_REG 0x0 +#define SPE_OP_PKT_LDST_SUBCLASS_SIMD_FP 0x4 +#define SPE_OP_PKT_LDST_SUBCLASS_UNSPEC_REG 0x10 +#define SPE_OP_PKT_LDST_SUBCLASS_NV_SYSREG 0x30 + +#define SPE_OP_PKT_IS_LDST_ATOMIC(v) (((v) & (GENMASK_ULL(7, 5) | BIT(1))) == 0x2) + +#define SPE_OP_PKT_AR BIT(4) +#define SPE_OP_PKT_EXCL BIT(3) +#define SPE_OP_PKT_AT BIT(2) +#define SPE_OP_PKT_ST BIT(0) + +#define SPE_OP_PKT_IS_LDST_SVE(v) (((v) & (BIT(3) | BIT(1))) == 0x8) + +#define SPE_OP_PKT_SVE_SG BIT(7) +/* + * SVE effective vector length (EVL) is stored in byte 0 bits [6:4]; + * the length is rounded up to a power of two and uses 32 as one step, + * so the EVL calculation is: + * + * 32 * (2 ^ bits [6:4]) = 32 << (bits [6:4]) + */ +#define SPE_OP_PKG_SVE_EVL(v) (32 << (((v) & GENMASK_ULL(6, 4)) >> 4)) +#define SPE_OP_PKT_SVE_PRED BIT(2) +#define SPE_OP_PKT_SVE_FP BIT(1) + +#define SPE_OP_PKT_IS_INDIRECT_BRANCH(v) (((v) & GENMASK_ULL(7, 1)) == 0x2) + +const char *arm_spe_pkt_name(enum arm_spe_pkt_type); + +int arm_spe_get_packet(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet); + +int arm_spe_pkt_desc(const struct arm_spe_pkt *packet, char *buf, size_t len); +#endif diff --git a/drivers/arm/spe/spe.c b/drivers/arm/spe/spe.c new file mode 100644 index 0000000000000000000000000000000000000000..d22a6aff9a94d86f9d95dbc642dd9d12e81536a4 --- /dev/null +++ b/drivers/arm/spe/spe.c @@ -0,0 +1,859 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define PMUNAME "arm_spe" +#define DRVNAME PMUNAME "_driver" +#define pr_fmt(fmt) DRVNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include "spe-decoder/arm-spe-decoder.h" +#include "spe-decoder/arm-spe-pkt-decoder.h" +#include "spe.h" + +static long __percpu irq_dev_id; + +static struct arm_spe *spe; + +#define SPE_INIT_FAIL 0 +#define SPE_INIT_SUCC 1 +static int spe_probe_status = SPE_INIT_FAIL; + +/* Keep track of our dynamic hotplug state */ +static enum cpuhp_state arm_spe_online; + +/* Keep track of who uses the SPE */ +static enum arm_spe_user_e arm_spe_user = ARM_SPE_USER_MEM_SAMPLING; + +DEFINE_PER_CPU(struct arm_spe_buf, per_cpu_spe_buf); + +mem_sampling_cb_type arm_spe_sampling_cb; +void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb) +{ + arm_spe_sampling_cb = cb; +} +EXPORT_SYMBOL_GPL(arm_spe_record_capture_callback_register); + +/* SPE sampling callback for perf */ +perf_sampling_cb_type arm_spe_sampling_perf_cb; +void arm_spe_sampling_for_perf_callback_register(perf_sampling_cb_type cb) +{ + arm_spe_sampling_perf_cb = cb; +} +EXPORT_SYMBOL_GPL(arm_spe_sampling_for_perf_callback_register); + +/* + * SPE can be used by either mem_sampling or perf; perf takes precedence. + * When perf takes over, this callback is used to disable mem_sampling.
+ */ +mem_sampling_user_switch_cb_type arm_spe_user_switch_cb; +void arm_spe_user_switch_callback_register(mem_sampling_user_switch_cb_type cb) +{ + arm_spe_user_switch_cb = cb; +} + +struct arm_spe *arm_spe_get_desc(void) +{ + return spe; +} +EXPORT_SYMBOL_GPL(arm_spe_get_desc); + +static inline int arm_spe_per_buffer_alloc(int cpu) +{ + struct arm_spe_buf *spe_buf = &per_cpu(per_cpu_spe_buf, cpu); + void *alloc_base; + + if (spe_buf->base && spe_buf->record_base) + return 0; + + /* alloc spe raw data buffer */ + alloc_base = kzalloc_node(SPE_BUFFER_MAX_SIZE, GFP_KERNEL, cpu_to_node(cpu)); + if (unlikely(!alloc_base)) { + pr_err("alloc spe raw data buffer failed.\n"); + return -ENOMEM; + } + + spe_buf->base = alloc_base; + spe_buf->size = SPE_BUFFER_SIZE; + spe_buf->cur = alloc_base + SPE_BUFFER_MAX_SIZE - SPE_BUFFER_SIZE; + spe_buf->period = SPE_SAMPLE_PERIOD; + + /* alloc record buffer */ + spe_buf->record_size = SPE_RECORD_ENTRY_SIZE * SPE_RECORD_BUFFER_MAX_RECORDS; + spe_buf->record_base = kzalloc_node(spe_buf->record_size, GFP_KERNEL, cpu_to_node(cpu)); + if (unlikely(!spe_buf->record_base)) { + pr_err("alloc spe record buffer failed.\n"); + return -ENOMEM; + } + + return 0; +} + +static int arm_spe_buffer_alloc(void) +{ + int cpu, ret = 0; + cpumask_t *mask = &spe->supported_cpus; + + for_each_possible_cpu(cpu) { + if (!cpumask_test_cpu(cpu, mask)) + continue; + ret = arm_spe_per_buffer_alloc(cpu); + if (ret) + return ret; + } + return ret; +} + +static inline void arm_spe_per_buffer_free(int cpu) +{ + struct arm_spe_buf *spe_buf = &per_cpu(per_cpu_spe_buf, cpu); + + if (!spe_buf->base) + return; + + kfree(spe_buf->base); + spe_buf->cur = NULL; + spe_buf->base = NULL; + spe_buf->size = 0; + + kfree(spe_buf->record_base); + spe_buf->record_base = NULL; + spe_buf->record_size = 0; +} + +static inline void arm_spe_buffer_free(void) +{ + cpumask_t *mask = &spe->supported_cpus; + int cpu; + + for_each_possible_cpu(cpu) { + if (!cpumask_test_cpu(cpu, mask)) + continue; + arm_spe_per_buffer_free(cpu); + } +} + +static void arm_spe_buffer_init(void) +{ + u64 base, limit; + struct arm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + if (!spe_buf || !spe_buf->cur || !spe_buf->size) { + /* + * We still need to clear the limit pointer, since the + * profiler might only be disabled by virtue of a fault. + */ + limit = 0; + goto out_write_limit; + } + + base = (u64)spe_buf->cur; + limit = ((u64)spe_buf->cur + spe_buf->size) | PMBLIMITR_EL1_E; + write_sysreg_s(base, SYS_PMBPTR_EL1); + +out_write_limit: + write_sysreg_s(limit, SYS_PMBLIMITR_EL1); + +} + +static void arm_spe_disable_and_drain_local(void) +{ + /* Disable profiling at EL0 and EL1 */ + write_sysreg_s(0, SYS_PMSCR_EL1); + isb(); + + /* Drain any buffered data */ + psb_csync(); + dsb(nsh); + + /* Disable the profiling buffer */ + write_sysreg_s(0, SYS_PMBLIMITR_EL1); + isb(); +} + +/* IRQ handling */ +static enum arm_spe_buf_fault_action arm_spe_buf_get_fault_act(void) +{ + const char *err_str; + u64 pmbsr; + enum arm_spe_buf_fault_action ret; + + /* + * Ensure new profiling data is visible to the CPU and any external + * aborts have been resolved. + */ + psb_csync(); + dsb(nsh); + + /* Ensure hardware updates to PMBPTR_EL1 are visible */ + isb(); + + /* Service required? 
*/ + pmbsr = read_sysreg_s(SYS_PMBSR_EL1); + if (!FIELD_GET(PMBSR_EL1_S, pmbsr)) + return SPE_PMU_BUF_FAULT_ACT_SPURIOUS; + + /* We only expect buffer management events */ + switch (FIELD_GET(PMBSR_EL1_EC, pmbsr)) { + case PMBSR_EL1_EC_BUF: + /* Handled below */ + break; + case PMBSR_EL1_EC_FAULT_S1: + case PMBSR_EL1_EC_FAULT_S2: + err_str = "Unexpected buffer fault"; + goto out_err; + default: + err_str = "Unknown error code"; + goto out_err; + } + + /* Buffer management event */ + switch (FIELD_GET(PMBSR_EL1_BUF_BSC_MASK, pmbsr)) { + case PMBSR_EL1_BUF_BSC_FULL: + ret = SPE_PMU_BUF_FAULT_ACT_OK; + goto out_stop; + default: + err_str = "Unknown buffer status code"; + } + +out_err: + pr_err_ratelimited( + "%s on CPU %d [PMBSR=0x%016llx, PMBPTR=0x%016llx, PMBLIMITR=0x%016llx]\n", + err_str, smp_processor_id(), pmbsr, + read_sysreg_s(SYS_PMBPTR_EL1), + read_sysreg_s(SYS_PMBLIMITR_EL1)); + ret = SPE_PMU_BUF_FAULT_ACT_FATAL; + +out_stop: + return ret; +} + +void arm_spe_stop(void) +{ + arm_spe_disable_and_drain_local(); +} + +static u64 arm_spe_to_pmsfcr(void) +{ + u64 reg = 0; + + if (spe->load_filter) + reg |= PMSFCR_EL1_LD; + + if (spe->store_filter) + reg |= PMSFCR_EL1_ST; + + if (spe->branch_filter) + reg |= PMSFCR_EL1_B; + + if (reg) + reg |= PMSFCR_EL1_FT; + + if (spe->event_filter) + reg |= PMSFCR_EL1_FE; + + if (spe->inv_event_filter) + reg |= PMSFCR_EL1_FnE; + + if (spe->min_latency) + reg |= PMSFCR_EL1_FL; + + return reg; +} + +static u64 arm_spe_to_pmsevfr(void) +{ + return spe->event_filter; +} + +static u64 arm_spe_to_pmsnevfr(void) +{ + return spe->inv_event_filter; +} + +static u64 arm_spe_to_pmslatfr(void) +{ + return spe->min_latency; +} + +static void arm_spe_sanitise_period(struct arm_spe_buf *spe_buf) +{ + u64 period = spe_buf->period; + u64 max_period = PMSIRR_EL1_INTERVAL_MASK; + + if (period < spe->min_period) + period = spe->min_period; + else if (period > max_period) + period = max_period; + else + period &= max_period; + + spe_buf->period = period; +} + +static u64 arm_spe_to_pmsirr(void) +{ + u64 reg = 0; + struct arm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + arm_spe_sanitise_period(spe_buf); + + if (spe->jitter) + reg |= 0x1; + + reg |= spe_buf->period << 8; + + return reg; +} + +static u64 arm_spe_to_pmscr(void) +{ + u64 reg = 0; + + if (spe->ts_enable) + reg |= PMSCR_EL1_TS; + + if (spe->pa_enable) + reg |= PMSCR_EL1_PA; + + if (spe->pct_enable < 0x4) + reg |= spe->pct_enable << 6; + + if (spe->exclude_user) + reg |= PMSCR_EL1_E0SPE; + + if (spe->exclude_kernel) + reg |= PMSCR_EL1_E1SPE; + + if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR)) + reg |= PMSCR_EL1_CX; + + return reg; +} + +int arm_spe_start(void) +{ + u64 reg; + int cpu = smp_processor_id(); + + if (!cpumask_test_cpu(cpu, &spe->supported_cpus)) + return -ENOENT; + + arm_spe_buffer_init(); + + reg = arm_spe_to_pmsfcr(); + write_sysreg_s(reg, SYS_PMSFCR_EL1); + + reg = arm_spe_to_pmsevfr(); + write_sysreg_s(reg, SYS_PMSEVFR_EL1); + + if (spe->features & SPE_PMU_FEAT_INV_FILT_EVT) { + reg = arm_spe_to_pmsnevfr(); + write_sysreg_s(reg, SYS_PMSNEVFR_EL1); + } + + reg = arm_spe_to_pmslatfr(); + + write_sysreg_s(reg, SYS_PMSLATFR_EL1); + + reg = arm_spe_to_pmsirr(); + write_sysreg_s(reg, SYS_PMSIRR_EL1); + isb(); + + reg = arm_spe_to_pmscr(); + isb(); + write_sysreg_s(reg, SYS_PMSCR_EL1); + return 0; +} + +void arm_spe_continue(void) +{ + int reg; + + arm_spe_buffer_init(); + reg = arm_spe_to_pmscr(); + + isb(); + write_sysreg_s(reg, SYS_PMSCR_EL1); +} + +int arm_spe_enabled(void) +{ + return 
spe_probe_status == SPE_INIT_SUCC; +} + +static irqreturn_t arm_spe_irq_handler(int irq, void *dev) +{ + enum arm_spe_buf_fault_action act; + struct arm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + act = arm_spe_buf_get_fault_act(); + + switch (act) { + case SPE_PMU_BUF_FAULT_ACT_FATAL: + if (unlikely(arm_spe_user == ARM_SPE_USER_PERF)) { + if (arm_spe_sampling_perf_cb) + arm_spe_sampling_perf_cb(act); + } + /* + * If a fatal exception occurred then leaving the profiling + * buffer enabled is a recipe waiting to happen. Since + * fatal faults don't always imply truncation, make sure + * that the profiling buffer is disabled explicitly before + * clearing the syndrome register. + */ + arm_spe_disable_and_drain_local(); + break; + case SPE_PMU_BUF_FAULT_ACT_OK: + /* + * Callback function processing record data. + * ARM_SPE_USER_MEM_SAMPLING: arm_spe_record_captured_cb - mem_sampling layer. + * ARM_SPE_USER_PERF: arm_spe_sampling_perf_cb - perf. + * TODO: 1) use per CPU workqueue to process data and reduce + * interrupt processing time. 2) The "register" function can be + * registered in a callback structure. + */ + if (likely(arm_spe_user == ARM_SPE_USER_MEM_SAMPLING)) { + spe_buf->nr_records = 0; + arm_spe_decode_buf(spe_buf->cur, spe_buf->size); + + if (arm_spe_sampling_cb) + arm_spe_sampling_cb( + (struct mem_sampling_record *)spe_buf->record_base, + spe_buf->nr_records); + } else { + if (arm_spe_sampling_perf_cb) + arm_spe_sampling_perf_cb(act); + } + + break; + + case SPE_PMU_BUF_FAULT_ACT_SPURIOUS: + /* We've seen you before, but GCC has the memory of a sieve. */ + arm_spe_stop(); + break; + } + + /* The buffer pointers are now sane, so resume profiling. */ + write_sysreg_s(0, SYS_PMBSR_EL1); + return IRQ_HANDLED; +} + + +static void __arm_spe_dev_probe(void *data) +{ + int fld; + u64 reg; + + fld = cpuid_feature_extract_unsigned_field( + read_cpuid(ID_AA64DFR0_EL1), ID_AA64DFR0_EL1_PMSVer_SHIFT); + if (!fld) { + pr_err("unsupported ID_AA64DFR0_EL1.PMSVer [%d] on CPU %d\n", + fld, smp_processor_id()); + return; + } + spe->pmsver = (u16)fld; + + /* Read PMBIDR first to determine whether or not we have access */ + reg = read_sysreg_s(SYS_PMBIDR_EL1); + if (FIELD_GET(PMBIDR_EL1_P, reg)) { + pr_err("profiling buffer owned by higher exception level\n"); + return; + } + + /* Minimum alignment. 
If it's out-of-range, then fail the probe */ + fld = FIELD_GET(PMBIDR_EL1_ALIGN, reg); + spe->align = 1 << fld; + if (spe->align > SZ_2K) { + pr_err("unsupported PMBIDR.Align [%d] on CPU %d\n", fld, + smp_processor_id()); + return; + } + + /* It's now safe to read PMSIDR and figure out what we've got */ + reg = read_sysreg_s(SYS_PMSIDR_EL1); + if (FIELD_GET(PMSIDR_EL1_FE, reg)) + spe->features |= SPE_PMU_FEAT_FILT_EVT; + + if (FIELD_GET(PMSIDR_EL1_FnE, reg)) + spe->features |= SPE_PMU_FEAT_INV_FILT_EVT; + + if (FIELD_GET(PMSIDR_EL1_FT, reg)) + spe->features |= SPE_PMU_FEAT_FILT_TYP; + + if (FIELD_GET(PMSIDR_EL1_FL, reg)) + spe->features |= SPE_PMU_FEAT_FILT_LAT; + + if (FIELD_GET(PMSIDR_EL1_ARCHINST, reg)) + spe->features |= SPE_PMU_FEAT_ARCH_INST; + + if (FIELD_GET(PMSIDR_EL1_LDS, reg)) + spe->features |= SPE_PMU_FEAT_LDS; + + if (FIELD_GET(PMSIDR_EL1_ERND, reg)) + spe->features |= SPE_PMU_FEAT_ERND; + + /* This field has a spaced out encoding, so just use a look-up */ + fld = FIELD_GET(PMSIDR_EL1_INTERVAL, reg); + switch (fld) { + case PMSIDR_EL1_INTERVAL_256: + spe->min_period = 256; + break; + case PMSIDR_EL1_INTERVAL_512: + spe->min_period = 512; + break; + case PMSIDR_EL1_INTERVAL_768: + spe->min_period = 768; + break; + case PMSIDR_EL1_INTERVAL_1024: + spe->min_period = 1024; + break; + case PMSIDR_EL1_INTERVAL_1536: + spe->min_period = 1536; + break; + case PMSIDR_EL1_INTERVAL_2048: + spe->min_period = 2048; + break; + case PMSIDR_EL1_INTERVAL_3072: + spe->min_period = 3072; + break; + case PMSIDR_EL1_INTERVAL_4096: + spe->min_period = 4096; + break; + default: + pr_warn("unknown PMSIDR_EL1.Interval [%d]; assuming 8\n", fld); + /* Assume encoding 8, i.e. a minimum interval of 4096 */ + spe->min_period = 4096; + break; + } + + /* Maximum record size. If it's out-of-range, then fail the probe */ + fld = FIELD_GET(PMSIDR_EL1_MAXSIZE, reg); + spe->max_record_sz = 1 << fld; + if (spe->max_record_sz > SZ_2K || spe->max_record_sz < 16) { + pr_err("unsupported PMSIDR_EL1.MaxSize [%d] on CPU %d\n", fld, + smp_processor_id()); + return; + } + + fld = FIELD_GET(PMSIDR_EL1_COUNTSIZE, reg); + switch (fld) { + case PMSIDR_EL1_COUNTSIZE_12_BIT_SAT: + spe->counter_sz = 12; + break; + case PMSIDR_EL1_COUNTSIZE_16_BIT_SAT: + spe->counter_sz = 16; + break; + default: + pr_warn("unknown PMSIDR_EL1.CountSize [%d]; assuming 2\n", fld); + /* Assume encoding 2, i.e. 12-bit saturating counters */ + spe->counter_sz = 12; + break; + } + + pr_info("probed SPEv1.%d for CPUs %*pbl [max_record_sz %u, min_period %u, align %u, features 0x%llx]\n", + spe->pmsver - 1, cpumask_pr_args(&spe->supported_cpus), + spe->max_record_sz, spe->min_period, spe->align, spe->features); + + spe->features |= SPE_PMU_FEAT_DEV_PROBED; +} + +static void __arm_spe_reset_local(void) +{ + /* + * This is probably overkill, as we have no idea where we're + * draining any buffered data to...
+ */ + arm_spe_disable_and_drain_local(); + + /* Reset the buffer base pointer */ + write_sysreg_s(0, SYS_PMBPTR_EL1); + isb(); + + /* Clear any pending management interrupts */ + write_sysreg_s(0, SYS_PMBSR_EL1); + isb(); +} + +static void __arm_spe_setup_one(void) +{ + __arm_spe_reset_local(); + enable_percpu_irq(spe->irq, IRQ_TYPE_NONE); +} + +static void __arm_spe_stop_one(void) +{ + disable_percpu_irq(spe->irq); + __arm_spe_reset_local(); +} + +void arm_spe_set_user(enum arm_spe_user_e user) +{ + if (user == ARM_SPE_USER_PERF) + arm_spe_user_switch_cb(USER_SWITCH_AWAY_FROM_MEM_SAMPLING); + else + arm_spe_user_switch_cb(USER_SWITCH_BACK_TO_MEM_SAMPLING); + + __arm_spe_reset_local(); + + arm_spe_user = user; +} +EXPORT_SYMBOL_GPL(arm_spe_set_user); + +static int arm_spe_cpu_startup(unsigned int cpu, struct hlist_node *node) +{ + struct arm_spe *spe; + + spe = hlist_entry_safe(node, struct arm_spe, hotplug_node); + if (!cpumask_test_cpu(cpu, &spe->supported_cpus)) + return 0; + + /* Alloc per cpu spe buffer */ + arm_spe_per_buffer_alloc(cpu); + + /* Reset pmu and enable irq */ + __arm_spe_setup_one(); + + return 0; +} + +static int arm_spe_cpu_teardown(unsigned int cpu, struct hlist_node *node) +{ + struct arm_spe *spe; + + spe = hlist_entry_safe(node, struct arm_spe, hotplug_node); + if (!cpumask_test_cpu(cpu, &spe->supported_cpus)) + return 0; + + /* Disable irq and reset pmu */ + __arm_spe_stop_one(); + + /* Release per cpu spe buffer */ + arm_spe_per_buffer_free(cpu); + + return 0; +} + +static int arm_spe_dev_init(void) +{ + int ret; + cpumask_t *mask = &spe->supported_cpus; + + + /* Make sure we probe the hardware on a relevant CPU */ + ret = smp_call_function_any(mask, __arm_spe_dev_probe, NULL, 1); + if (ret || !(spe->features & SPE_PMU_FEAT_DEV_PROBED)) + return -ENXIO; + + /* Request our PPIs (note that the IRQ is still disabled) */ + ret = request_percpu_irq(spe->irq, arm_spe_irq_handler, DRVNAME, + &irq_dev_id); + if (ret) + return ret; + + /* + * Register our hotplug notifier now so we don't miss any events. + * This will enable the IRQ for any supported CPUs that are already + * up. 
+ */ + ret = cpuhp_state_add_instance(arm_spe_online, + &spe->hotplug_node); + if (ret) + free_percpu_irq(spe->irq, &irq_dev_id); + + return ret; +} + +static void arm_spe_dev_teardown(void) +{ + arm_spe_buffer_free(); + cpuhp_state_remove_instance(arm_spe_online, &spe->hotplug_node); + free_percpu_irq(spe->irq, &irq_dev_id); +} + +static const struct of_device_id arm_spe_of_match[] = { + { .compatible = "arm,statistical-profiling-extension-v1", + .data = (void *)1 }, + { /* Sentinel */ }, +}; +MODULE_DEVICE_TABLE(of, arm_spe_of_match); + +static const struct platform_device_id arm_spe_match[] = { + { ARMV8_SPE_PDEV_NAME, 0 }, + {} +}; +MODULE_DEVICE_TABLE(platform, arm_spe_match); + +/* Driver and device probing */ +static int arm_spe_irq_probe(void) +{ + struct platform_device *pdev = spe->pdev; + int irq = platform_get_irq(pdev, 0); + + if (irq < 0) + return -ENXIO; + + if (!irq_is_percpu(irq)) { + dev_err(&pdev->dev, "expected PPI but got SPI (%d)\n", irq); + return -EINVAL; + } + + if (irq_get_percpu_devid_partition(irq, &spe->supported_cpus)) { + dev_err(&pdev->dev, "failed to get PPI partition (%d)\n", irq); + return -EINVAL; + } + + spe->irq = irq; + return 0; +} + +static void arm_spe_sample_para_init(void) +{ + spe->sample_period = SPE_SAMPLE_PERIOD; + spe->jitter = 1; + spe->load_filter = 1; + spe->store_filter = 1; + spe->branch_filter = 0; + spe->inv_event_filter = 0; + spe->event_filter = 0x2; + + spe->ts_enable = 1; + spe->pa_enable = 1; + spe->pct_enable = 0; + + spe->exclude_user = 1; + spe->exclude_kernel = 0; + + spe->min_latency = 120; +} + +void arm_spe_record_enqueue(struct arm_spe_record *record) +{ + struct arm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + struct mem_sampling_record *record_tail; + + if (spe_buf->nr_records >= SPE_RECORD_BUFFER_MAX_RECORDS) { + pr_err("nr_records exceeded!\n"); + return; + } + + trace_spe_record((struct mem_sampling_record *)record, smp_processor_id()); + record_tail = spe_buf->record_base + + spe_buf->nr_records * SPE_RECORD_ENTRY_SIZE; + *record_tail = *(struct mem_sampling_record *)record; + spe_buf->nr_records++; + +} + +static int arm_spe_device_probe(struct platform_device *pdev) +{ + int ret; + struct device *dev; + + if (!pdev) { + pr_err("pdev is NULL!\n"); + return -ENODEV; + } + + dev = &pdev->dev; + + /* + * If kernelspace is unmapped when running at EL0, then the SPE + * buffer will fault and prematurely terminate the AUX session. + */ + if (arm64_kernel_unmapped_at_el0()) { + dev_warn_once(dev, "buffer inaccessible. Try passing \"kpti=off\" on the kernel command line\n"); + return -EPERM; + } + + spe = devm_kzalloc(dev, sizeof(*spe), GFP_KERNEL); + if (!spe) + return -ENOMEM; + + spe->pdev = pdev; + platform_set_drvdata(pdev, spe); + + ret = arm_spe_irq_probe(); + if (ret) + goto out_free; + + ret = arm_spe_dev_init(); + if (ret) + goto out_free; + + /* + * Ensure that every SPE-capable CPU can allocate its buffer area + * (4K * 2 per CPU by default); otherwise SPE data cannot be + * collected in kernel mode.
+ */ + ret = arm_spe_buffer_alloc(); + if (ret) + goto out_teardown; + + arm_spe_sample_para_init(); + + spe_probe_status = SPE_INIT_SUCC; + + return 0; + +out_teardown: + arm_spe_dev_teardown(); +out_free: + kfree(spe); + return ret; +} + +static int arm_spe_device_remove(struct platform_device *pdev) +{ + arm_spe_dev_teardown(); + return 0; +} + +static struct platform_driver arm_spe_driver = { + .id_table = arm_spe_match, + .driver = { + .name = DRVNAME, + .of_match_table = of_match_ptr(arm_spe_of_match), + .suppress_bind_attrs = true, + }, + .probe = arm_spe_device_probe, + .remove = arm_spe_device_remove, +}; + +static int __init arm_spe_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, DRVNAME, + arm_spe_cpu_startup, + arm_spe_cpu_teardown); + if (ret < 0) + return ret; + arm_spe_online = ret; + + ret = platform_driver_register(&arm_spe_driver); + + if (ret) + cpuhp_remove_multi_state(arm_spe_online); + + return ret; +} + +static void __exit arm_spe_exit(void) +{ + /* + * TODO: Find a clean way to disable SPE so that SPE + * can be used for perf. + */ + platform_driver_unregister(&arm_spe_driver); + cpuhp_remove_multi_state(arm_spe_online); + arm_spe_buffer_free(); +} + +module_init(arm_spe_init); +module_exit(arm_spe_exit); diff --git a/drivers/arm/spe/spe.h b/drivers/arm/spe/spe.h new file mode 100644 index 0000000000000000000000000000000000000000..6a51cdb401301bc22f44ac354494f365c02691fc --- /dev/null +++ b/drivers/arm/spe/spe.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SPE_H +#define __SPE_H + +#define SPE_BUFFER_MAX_SIZE (PAGE_SIZE) +#define SPE_BUFFER_SIZE (PAGE_SIZE / 32) + +#define SPE_SAMPLE_PERIOD 1024 + +#define SPE_RECORD_BUFFER_MAX_RECORDS (100) +#define SPE_RECORD_ENTRY_SIZE sizeof(struct mem_sampling_record) + +#define SPE_PMU_FEAT_FILT_EVT (1UL << 0) +#define SPE_PMU_FEAT_FILT_TYP (1UL << 1) +#define SPE_PMU_FEAT_FILT_LAT (1UL << 2) +#define SPE_PMU_FEAT_ARCH_INST (1UL << 3) +#define SPE_PMU_FEAT_LDS (1UL << 4) +#define SPE_PMU_FEAT_ERND (1UL << 5) +#define SPE_PMU_FEAT_INV_FILT_EVT (1UL << 6) +#define SPE_PMU_FEAT_DEV_PROBED (1UL << 63) +#define ARM_SPE_BUF_PAD_BYTE (0) +#define PMBLIMITR_EL1_E GENMASK(0, 0) +#define PMBSR_EL1_S GENMASK(17, 17) +#define PMBSR_EL1_EC GENMASK(31, 26) +#define PMBSR_EL1_EC_BUF UL(0b000000) +#define PMBSR_EL1_EC_FAULT_S1 UL(0b100100) +#define PMBSR_EL1_EC_FAULT_S2 UL(0b100101) +#define PMBSR_EL1_MSS_MASK GENMASK(15, 0) +#define PMBSR_EL1_BUF_BSC_MASK PMBSR_EL1_MSS_MASK +#define PMBSR_EL1_BUF_BSC_FULL 0x1UL +#define PMSFCR_EL1_LD GENMASK(17, 17) +#define PMSFCR_EL1_ST GENMASK(18, 18) +#define PMSFCR_EL1_B GENMASK(16, 16) +#define PMSFCR_EL1_FnE GENMASK(3, 3) +#define PMSFCR_EL1_FT GENMASK(1, 1) +#define PMSFCR_EL1_FE GENMASK(0, 0) +#define PMSFCR_EL1_FL GENMASK(2, 2) +#define PMSIRR_EL1_INTERVAL_MASK GENMASK(31, 8) +#define PMSCR_EL1_TS GENMASK(5, 5) +#define PMSCR_EL1_PA GENMASK(4, 4) +#define PMSCR_EL1_CX GENMASK(3, 3) +#define PMSCR_EL1_E1SPE GENMASK(1, 1) +#define PMSCR_EL1_E0SPE GENMASK(0, 0) +#define ID_AA64DFR0_EL1_PMSVer_SHIFT 32 +#define PMBIDR_EL1_P GENMASK(4, 4) +#define PMBIDR_EL1_ALIGN GENMASK(3, 0) +#define PMSIDR_EL1_FE GENMASK(0, 0) +#define PMSIDR_EL1_FnE GENMASK(6, 6) +#define PMSIDR_EL1_FT GENMASK(1, 1) +#define PMSIDR_EL1_ARCHINST GENMASK(3, 3) +#define PMSIDR_EL1_LDS GENMASK(4, 4) +#define PMSIDR_EL1_ERND GENMASK(5, 5) +#define PMSIDR_EL1_INTERVAL GENMASK(11, 8) +#define PMSIDR_EL1_INTERVAL_256 UL(0b0000) +#define PMSIDR_EL1_INTERVAL_512 UL(0b0010) +#define 
PMSIDR_EL1_INTERVAL_768 UL(0b0011) +#define PMSIDR_EL1_INTERVAL_1024 UL(0b0100) +#define PMSIDR_EL1_INTERVAL_1536 UL(0b0101) +#define PMSIDR_EL1_INTERVAL_2048 UL(0b0110) +#define PMSIDR_EL1_INTERVAL_3072 UL(0b0111) +#define PMSIDR_EL1_INTERVAL_4096 UL(0b1000) +#define PMSIDR_EL1_MAXSIZE GENMASK(15, 12) +#define PMSIDR_EL1_COUNTSIZE GENMASK(19, 16) +#define PMSIDR_EL1_COUNTSIZE_12_BIT_SAT UL(0b0010) +#define PMSIDR_EL1_COUNTSIZE_16_BIT_SAT UL(0b0011) +#define PMSIDR_EL1_FL GENMASK(2, 2) +#define SYS_PMSNEVFR_EL1 sys_reg(3, 0, 9, 9, 1) +#define SPE_PMU_FEAT_INV_FILT_EVT (1UL << 6) +#define PMBSR_EL1_COLL_MASK GENMASK(16, 16) +#define PMBSR_EL1_COLL PMBSR_EL1_COLL_MASK +#define PMBSR_EL1_DL_MASK GENMASK(19, 19) +#define PMBSR_EL1_DL PMBSR_EL1_DL_MASK + +enum arm_spe_buf_fault_action { + SPE_PMU_BUF_FAULT_ACT_SPURIOUS, + SPE_PMU_BUF_FAULT_ACT_FATAL, + SPE_PMU_BUF_FAULT_ACT_OK, +}; + +enum arm_spe_user_e { + ARM_SPE_USER_PERF, + ARM_SPE_USER_MEM_SAMPLING, +}; + +struct arm_spe { + struct pmu pmu; + struct platform_device *pdev; + cpumask_t supported_cpus; + struct hlist_node hotplug_node; + int irq; /* PPI */ + u16 pmsver; + u16 min_period; + u16 counter_sz; + u64 features; + u16 max_record_sz; + u16 align; + u64 sample_period; + local64_t period_left; + bool jitter; + bool load_filter; + bool store_filter; + bool branch_filter; + u64 inv_event_filter; + u16 min_latency; + u64 event_filter; + bool ts_enable; + bool pa_enable; + u8 pct_enable; + bool exclude_user; + bool exclude_kernel; +}; + +struct arm_spe_buf { + void *cur; /* for spe raw data buffer */ + int size; + int period; + void *base; + + void *record_base; /* for spe record buffer */ + int record_size; + int nr_records; +}; + +#endif /* __SPE_H */ diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 67ad53cde11fc23661070292517c5bf211a5b9f8..e6eee6f3d33c2a9d90bd8c7e851df90e12336569 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -124,7 +124,7 @@ config XGENE_PMU config ARM_SPE_PMU tristate "Enable support for the ARMv8.2 Statistical Profiling Extension" - depends on ARM64 + depends on ARM_SPE && MEM_SAMPLING help Enable perf support for the ARMv8.2 Statistical Profiling Extension, which provides periodic sampling of operations in diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c index 0d284fda87aca86263ee382ae903547b4320e837..4e716b700c0f2c449643b2f55987ae34ec07d1f2 100644 --- a/drivers/perf/arm_pmu_acpi.c +++ b/drivers/perf/arm_pmu_acpi.c @@ -125,6 +125,33 @@ arm_acpi_register_pmu_device(struct platform_device *pdev, u8 len, } #if IS_ENABLED(CONFIG_ARM_SPE_PMU) +static struct resource spe_pmu_resources[] = { + { + } +}; + +static struct platform_device spe_pmu_dev = { + .name = ARMV8_SPE_PMU_PDEV_NAME, + .id = -1, + .resource = spe_pmu_resources, + .num_resources = ARRAY_SIZE(spe_pmu_resources) +}; + +static void arm_spe_pmu_acpi_register_device(void) +{ + int ret; + + ret = platform_device_register(&spe_pmu_dev); + if (ret < 0) + pr_warn("ACPI: SPE_PMU: Unable to register device\n"); +} +#else +static inline void arm_spe_pmu_acpi_register_device(void) +{ +} +#endif + +#if IS_ENABLED(CONFIG_ARM_SPE) static struct resource spe_resources[] = { { /* irq */ @@ -160,7 +187,7 @@ static void arm_spe_acpi_register_device(void) static inline void arm_spe_acpi_register_device(void) { } -#endif /* CONFIG_ARM_SPE_PMU */ +#endif /* CONFIG_ARM_SPE */ #if IS_ENABLED(CONFIG_CORESIGHT_TRBE) static struct resource trbe_resources[] = { @@ -402,6 +429,7 @@ static int arm_pmu_acpi_init(void) return 0; 
arm_spe_acpi_register_device(); + arm_spe_pmu_acpi_register_device(); arm_trbe_acpi_register_device(); ret = arm_pmu_acpi_parse_irqs(); diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 2a4ebdd1ee78da280e416f9b775636061846376e..970bc2f3c4bf474ce3e5c75e71c93861f493bf34 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -39,6 +39,8 @@ #include #include +#include "../arm/spe/spe.h" + /* * Cache if the event is allowed to trace Context information. * This allows us to perform the check, i.e, perfmon_capable(), @@ -57,8 +59,6 @@ static bool get_spe_event_has_cx(struct perf_event *event) return !!(event->hw.flags & SPE_PMU_HW_FLAGS_CX); } -#define ARM_SPE_BUF_PAD_BYTE 0 - struct arm_spe_pmu_buf { int nr_pages; bool snapshot; @@ -76,13 +76,6 @@ struct arm_spe_pmu { u16 min_period; u16 counter_sz; -#define SPE_PMU_FEAT_FILT_EVT (1UL << 0) -#define SPE_PMU_FEAT_FILT_TYP (1UL << 1) -#define SPE_PMU_FEAT_FILT_LAT (1UL << 2) -#define SPE_PMU_FEAT_ARCH_INST (1UL << 3) -#define SPE_PMU_FEAT_LDS (1UL << 4) -#define SPE_PMU_FEAT_ERND (1UL << 5) -#define SPE_PMU_FEAT_DEV_PROBED (1UL << 63) u64 features; u16 max_record_sz; @@ -95,15 +88,6 @@ struct arm_spe_pmu { /* Convert a free-running index from perf into an SPE buffer offset */ #define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT)) -/* Keep track of our dynamic hotplug state */ -static enum cpuhp_state arm_spe_pmu_online; - -enum arm_spe_pmu_buf_fault_action { - SPE_PMU_BUF_FAULT_ACT_SPURIOUS, - SPE_PMU_BUF_FAULT_ACT_FATAL, - SPE_PMU_BUF_FAULT_ACT_OK, -}; - /* This sysfs gunk was really good fun to write. */ enum arm_spe_pmu_capabilities { SPE_PMU_CAP_ARCH_INST = 0, @@ -276,6 +260,8 @@ static const struct attribute_group *arm_spe_pmu_attr_groups[] = { NULL, }; +struct arm_spe_pmu *spe_pmu_local; + /* Convert between user ABI and register values */ static u64 arm_spe_event_to_pmscr(struct perf_event *event) { @@ -551,12 +537,12 @@ static void arm_spe_pmu_disable_and_drain_local(void) } /* IRQ handling */ -static enum arm_spe_pmu_buf_fault_action +static enum arm_spe_buf_fault_action arm_spe_pmu_buf_get_fault_act(struct perf_output_handle *handle) { const char *err_str; u64 pmbsr; - enum arm_spe_pmu_buf_fault_action ret; + enum arm_spe_buf_fault_action ret; /* * Ensure new profiling data is visible to the CPU and any external @@ -621,57 +607,6 @@ arm_spe_pmu_buf_get_fault_act(struct perf_output_handle *handle) return ret; } -static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev) -{ - struct perf_output_handle *handle = dev; - struct perf_event *event = handle->event; - enum arm_spe_pmu_buf_fault_action act; - - if (!perf_get_aux(handle)) - return IRQ_NONE; - - act = arm_spe_pmu_buf_get_fault_act(handle); - if (act == SPE_PMU_BUF_FAULT_ACT_SPURIOUS) - return IRQ_NONE; - - /* - * Ensure perf callbacks have completed, which may disable the - * profiling buffer in response to a TRUNCATION flag. - */ - irq_work_run(); - - switch (act) { - case SPE_PMU_BUF_FAULT_ACT_FATAL: - /* - * If a fatal exception occurred then leaving the profiling - * buffer enabled is a recipe waiting to happen. Since - * fatal faults don't always imply truncation, make sure - * that the profiling buffer is disabled explicitly before - * clearing the syndrome register. - */ - arm_spe_pmu_disable_and_drain_local(); - break; - case SPE_PMU_BUF_FAULT_ACT_OK: - /* - * We handled the fault (the buffer was full), so resume - * profiling as long as we didn't detect truncation. 
- * PMBPTR might be misaligned, but we'll burn that bridge - * when we get to it. - */ - if (!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)) { - arm_spe_perf_aux_output_begin(handle, event); - isb(); - } - break; - case SPE_PMU_BUF_FAULT_ACT_SPURIOUS: - /* We've seen you before, but GCC has the memory of a sieve. */ - break; - } - - /* The buffer pointers are now sane, so resume profiling. */ - write_sysreg_s(0, SYS_PMBSR_EL1); - return IRQ_HANDLED; -} static u64 arm_spe_pmsevfr_res0(u16 pmsver) { @@ -746,6 +681,8 @@ static void arm_spe_pmu_start(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle); + arm_spe_set_user(ARM_SPE_USER_PERF); + hwc->state = 0; arm_spe_perf_aux_output_begin(handle, event); if (hwc->state) @@ -780,8 +717,14 @@ static void arm_spe_pmu_stop(struct perf_event *event, int flags) struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle); /* If we're already stopped, then nothing to do */ - if (hwc->state & PERF_HES_STOPPED) + if (hwc->state & PERF_HES_STOPPED) { + /* + * PERF_HES_STOPPED maybe set in arm_spe_perf_aux_output_begin, + * we switch user here. + */ + arm_spe_set_user(ARM_SPE_USER_MEM_SAMPLING); return; + } /* Stop all trace generation */ arm_spe_pmu_disable_and_drain_local(); @@ -793,7 +736,7 @@ static void arm_spe_pmu_stop(struct perf_event *event, int flags) * path. */ if (perf_get_aux(handle)) { - enum arm_spe_pmu_buf_fault_action act; + enum arm_spe_buf_fault_action act; act = arm_spe_pmu_buf_get_fault_act(handle); if (act == SPE_PMU_BUF_FAULT_ACT_SPURIOUS) @@ -812,6 +755,7 @@ static void arm_spe_pmu_stop(struct perf_event *event, int flags) } hwc->state |= PERF_HES_STOPPED; + arm_spe_set_user(ARM_SPE_USER_MEM_SAMPLING); } static int arm_spe_pmu_add(struct perf_event *event, int flags) @@ -952,233 +896,58 @@ static void arm_spe_pmu_perf_destroy(struct arm_spe_pmu *spe_pmu) perf_pmu_unregister(&spe_pmu->pmu); } -static void __arm_spe_pmu_dev_probe(void *info) +void arm_spe_sampling_process(enum arm_spe_buf_fault_action act) { - int fld; - u64 reg; - struct arm_spe_pmu *spe_pmu = info; - struct device *dev = &spe_pmu->pdev->dev; - - fld = cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64DFR0_EL1), - ID_AA64DFR0_PMSVER_SHIFT); - if (!fld) { - dev_err(dev, - "unsupported ID_AA64DFR0_EL1.PMSVer [%d] on CPU %d\n", - fld, smp_processor_id()); - return; - } - spe_pmu->pmsver = (u16)fld; - - /* Read PMBIDR first to determine whether or not we have access */ - reg = read_sysreg_s(SYS_PMBIDR_EL1); - if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT)) { - dev_err(dev, - "profiling buffer owned by higher exception level\n"); - return; - } - - /* Minimum alignment. 
If it's out-of-range, then fail the probe */ - fld = reg >> SYS_PMBIDR_EL1_ALIGN_SHIFT & SYS_PMBIDR_EL1_ALIGN_MASK; - spe_pmu->align = 1 << fld; - if (spe_pmu->align > SZ_2K) { - dev_err(dev, "unsupported PMBIDR.Align [%d] on CPU %d\n", - fld, smp_processor_id()); - return; - } - - /* It's now safe to read PMSIDR and figure out what we've got */ - reg = read_sysreg_s(SYS_PMSIDR_EL1); - if (reg & BIT(SYS_PMSIDR_EL1_FE_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_FILT_EVT; - - if (reg & BIT(SYS_PMSIDR_EL1_FT_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_FILT_TYP; - - if (reg & BIT(SYS_PMSIDR_EL1_FL_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_FILT_LAT; - - if (reg & BIT(SYS_PMSIDR_EL1_ARCHINST_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_ARCH_INST; - - if (reg & BIT(SYS_PMSIDR_EL1_LDS_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_LDS; - - if (reg & BIT(SYS_PMSIDR_EL1_ERND_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_ERND; - - /* This field has a spaced out encoding, so just use a look-up */ - fld = reg >> SYS_PMSIDR_EL1_INTERVAL_SHIFT & SYS_PMSIDR_EL1_INTERVAL_MASK; - switch (fld) { - case 0: - spe_pmu->min_period = 256; - break; - case 2: - spe_pmu->min_period = 512; - break; - case 3: - spe_pmu->min_period = 768; - break; - case 4: - spe_pmu->min_period = 1024; - break; - case 5: - spe_pmu->min_period = 1536; - break; - case 6: - spe_pmu->min_period = 2048; - break; - case 7: - spe_pmu->min_period = 3072; - break; - default: - dev_warn(dev, "unknown PMSIDR_EL1.Interval [%d]; assuming 8\n", - fld); - fallthrough; - case 8: - spe_pmu->min_period = 4096; - } + struct perf_output_handle *handle = this_cpu_ptr(spe_pmu_local->handle); + struct perf_event *event = handle->event; + u64 pmbsr; - /* Maximum record size. If it's out-of-range, then fail the probe */ - fld = reg >> SYS_PMSIDR_EL1_MAXSIZE_SHIFT & SYS_PMSIDR_EL1_MAXSIZE_MASK; - spe_pmu->max_record_sz = 1 << fld; - if (spe_pmu->max_record_sz > SZ_2K || spe_pmu->max_record_sz < 16) { - dev_err(dev, "unsupported PMSIDR_EL1.MaxSize [%d] on CPU %d\n", - fld, smp_processor_id()); + if (!perf_get_aux(handle)) return; - } - - fld = reg >> SYS_PMSIDR_EL1_COUNTSIZE_SHIFT & SYS_PMSIDR_EL1_COUNTSIZE_MASK; - switch (fld) { - default: - dev_warn(dev, "unknown PMSIDR_EL1.CountSize [%d]; assuming 2\n", - fld); - fallthrough; - case 2: - spe_pmu->counter_sz = 12; - } - - dev_info(dev, - "probed for CPUs %*pbl [max_record_sz %u, align %u, features 0x%llx]\n", - cpumask_pr_args(&spe_pmu->supported_cpus), - spe_pmu->max_record_sz, spe_pmu->align, spe_pmu->features); - - spe_pmu->features |= SPE_PMU_FEAT_DEV_PROBED; - return; -} -static void __arm_spe_pmu_reset_local(void) -{ /* - * This is probably overkill, as we have no idea where we're - * draining any buffered data to... + * If we've lost data, disable profiling and also set the PARTIAL + * flag to indicate that the last record is corrupted. 
*/ - arm_spe_pmu_disable_and_drain_local(); - - /* Reset the buffer base pointer */ - write_sysreg_s(0, SYS_PMBPTR_EL1); - isb(); - - /* Clear any pending management interrupts */ - write_sysreg_s(0, SYS_PMBSR_EL1); - isb(); -} - -static void __arm_spe_pmu_setup_one(void *info) -{ - struct arm_spe_pmu *spe_pmu = info; - - __arm_spe_pmu_reset_local(); - enable_percpu_irq(spe_pmu->irq, IRQ_TYPE_NONE); -} - -static void __arm_spe_pmu_stop_one(void *info) -{ - struct arm_spe_pmu *spe_pmu = info; - - disable_percpu_irq(spe_pmu->irq); - __arm_spe_pmu_reset_local(); -} - -static int arm_spe_pmu_cpu_startup(unsigned int cpu, struct hlist_node *node) -{ - struct arm_spe_pmu *spe_pmu; - - spe_pmu = hlist_entry_safe(node, struct arm_spe_pmu, hotplug_node); - if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus)) - return 0; - - __arm_spe_pmu_setup_one(spe_pmu); - return 0; -} - -static int arm_spe_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) -{ - struct arm_spe_pmu *spe_pmu; - - spe_pmu = hlist_entry_safe(node, struct arm_spe_pmu, hotplug_node); - if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus)) - return 0; - - __arm_spe_pmu_stop_one(spe_pmu); - return 0; -} - -static int arm_spe_pmu_dev_init(struct arm_spe_pmu *spe_pmu) -{ - int ret; - cpumask_t *mask = &spe_pmu->supported_cpus; - - /* Make sure we probe the hardware on a relevant CPU */ - ret = smp_call_function_any(mask, __arm_spe_pmu_dev_probe, spe_pmu, 1); - if (ret || !(spe_pmu->features & SPE_PMU_FEAT_DEV_PROBED)) - return -ENXIO; - - /* Request our PPIs (note that the IRQ is still disabled) */ - ret = request_percpu_irq(spe_pmu->irq, arm_spe_pmu_irq_handler, DRVNAME, - spe_pmu->handle); - if (ret) - return ret; + if (FIELD_GET(PMBSR_EL1_DL, pmbsr)) + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED | + PERF_AUX_FLAG_PARTIAL); - /* - * Register our hotplug notifier now so we don't miss any events. - * This will enable the IRQ for any supported CPUs that are already - * up. 
- */ - ret = cpuhp_state_add_instance(arm_spe_pmu_online, - &spe_pmu->hotplug_node); - if (ret) - free_percpu_irq(spe_pmu->irq, spe_pmu->handle); + /* Report collisions to userspace so that it can up the period */ + if (FIELD_GET(PMBSR_EL1_COLL, pmbsr)) + perf_aux_output_flag(handle, PERF_AUX_FLAG_COLLISION); - return ret; -} + arm_spe_perf_aux_output_end(handle); -static void arm_spe_pmu_dev_teardown(struct arm_spe_pmu *spe_pmu) -{ - cpuhp_state_remove_instance(arm_spe_pmu_online, &spe_pmu->hotplug_node); - free_percpu_irq(spe_pmu->irq, spe_pmu->handle); + if (act == SPE_PMU_BUF_FAULT_ACT_OK) { + if (!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)) { + arm_spe_perf_aux_output_begin(handle, event); + isb(); + } + } } -/* Driver and device probing */ -static int arm_spe_pmu_irq_probe(struct arm_spe_pmu *spe_pmu) +static bool arm_spe_pmu_set_cap(struct arm_spe_pmu *spe_pmu) { - struct platform_device *pdev = spe_pmu->pdev; - int irq = platform_get_irq(pdev, 0); - - if (irq < 0) - return -ENXIO; + struct arm_spe *p; + struct device *dev = &spe_pmu->pdev->dev; - if (!irq_is_percpu(irq)) { - dev_err(&pdev->dev, "expected PPI but got SPI (%d)\n", irq); - return -EINVAL; + p = arm_spe_get_desc(); + if (!p) { + dev_err(dev, "get spe pmu cap from arm spe driver failed!"); + return false; } - if (irq_get_percpu_devid_partition(irq, &spe_pmu->supported_cpus)) { - dev_err(&pdev->dev, "failed to get PPI partition (%d)\n", irq); - return -EINVAL; - } + spe_pmu->supported_cpus = p->supported_cpus; + spe_pmu->irq = p->irq; + spe_pmu->pmsver = p->pmsver; + spe_pmu->align = p->align; + spe_pmu->features = p->features; + spe_pmu->min_period = p->min_period; + spe_pmu->max_record_sz = p->max_record_sz; + spe_pmu->counter_sz = p->counter_sz; - spe_pmu->irq = irq; - return 0; + return true; } static const struct of_device_id arm_spe_pmu_of_match[] = { @@ -1188,7 +957,7 @@ static const struct of_device_id arm_spe_pmu_of_match[] = { MODULE_DEVICE_TABLE(of, arm_spe_pmu_of_match); static const struct platform_device_id arm_spe_match[] = { - { ARMV8_SPE_PDEV_NAME, 0}, + { ARMV8_SPE_PMU_PDEV_NAME, 0}, { } }; MODULE_DEVICE_TABLE(platform, arm_spe_match); @@ -1221,22 +990,17 @@ static int arm_spe_pmu_device_probe(struct platform_device *pdev) spe_pmu->pdev = pdev; platform_set_drvdata(pdev, spe_pmu); - ret = arm_spe_pmu_irq_probe(spe_pmu); - if (ret) + if (!arm_spe_pmu_set_cap(spe_pmu)) goto out_free_handle; - ret = arm_spe_pmu_dev_init(spe_pmu); - if (ret) - goto out_free_handle; + spe_pmu_local = spe_pmu; ret = arm_spe_pmu_perf_init(spe_pmu); if (ret) - goto out_teardown_dev; + goto out_free_handle; return 0; -out_teardown_dev: - arm_spe_pmu_dev_teardown(spe_pmu); out_free_handle: free_percpu(spe_pmu->handle); return ret; @@ -1247,7 +1011,6 @@ static int arm_spe_pmu_device_remove(struct platform_device *pdev) struct arm_spe_pmu *spe_pmu = platform_get_drvdata(pdev); arm_spe_pmu_perf_destroy(spe_pmu); - arm_spe_pmu_dev_teardown(spe_pmu); free_percpu(spe_pmu->handle); return 0; } @@ -1265,29 +1028,17 @@ static struct platform_driver arm_spe_pmu_driver = { static int __init arm_spe_pmu_init(void) { - int ret; - - ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, DRVNAME, - arm_spe_pmu_cpu_startup, - arm_spe_pmu_cpu_teardown); - if (ret < 0) - return ret; - arm_spe_pmu_online = ret; - - ret = platform_driver_register(&arm_spe_pmu_driver); - if (ret) - cpuhp_remove_multi_state(arm_spe_pmu_online); - - return ret; + arm_spe_sampling_for_perf_callback_register(arm_spe_sampling_process); + return 
platform_driver_register(&arm_spe_pmu_driver);
 }
 
 static void __exit arm_spe_pmu_exit(void)
 {
+	arm_spe_sampling_for_perf_callback_register(NULL);
 	platform_driver_unregister(&arm_spe_pmu_driver);
-	cpuhp_remove_multi_state(arm_spe_pmu_online);
 }
 
-module_init(arm_spe_pmu_init);
+late_initcall(arm_spe_pmu_init);
 module_exit(arm_spe_pmu_exit);
 
 MODULE_DESCRIPTION("Perf driver for the ARMv8.2 Statistical Profiling Extension");
diff --git a/include/linux/mem_sampling.h b/include/linux/mem_sampling.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2721e6d88a6cb4cfeee3494be50c254fd8201bd
--- /dev/null
+++ b/include/linux/mem_sampling.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * mem_sampling.h: declare the mem_sampling abstract layer and provide
+ * unified pmu sampling for NUMA, DAMON, etc.
+ *
+ * Sample records are converted to struct mem_sampling_record, and the
+ * registered mem_sampling_record_cb_type callbacks are then invoked
+ * to pass each record on.
+ *
+ * Copyright (C) 2012 ARM Ltd.
+ */
+#ifndef __MEM_SAMPLING_H
+#define __MEM_SAMPLING_H
+
+extern struct static_key_false mem_sampling_access_hints;
+
+enum mem_sampling_sample_type {
+	MEM_SAMPLING_L1D_ACCESS = 1 << 0,
+	MEM_SAMPLING_L1D_MISS = 1 << 1,
+	MEM_SAMPLING_LLC_ACCESS = 1 << 2,
+	MEM_SAMPLING_LLC_MISS = 1 << 3,
+	MEM_SAMPLING_TLB_ACCESS = 1 << 4,
+	MEM_SAMPLING_TLB_MISS = 1 << 5,
+	MEM_SAMPLING_BRANCH_MISS = 1 << 6,
+	MEM_SAMPLING_REMOTE_ACCESS = 1 << 7,
+};
+
+enum mem_sampling_op_type {
+	MEM_SAMPLING_LD = 1 << 0,
+	MEM_SAMPLING_ST = 1 << 1,
+};
+
+struct mem_sampling_record {
+	enum mem_sampling_sample_type type;
+	int err;
+	u32 op;
+	u32 latency;
+	u64 from_ip;
+	u64 to_ip;
+	u64 timestamp;
+	u64 virt_addr;
+	u64 phys_addr;
+	u64 context_id;
+	u16 source;
+};
+
+/*
+ * Callbacks should be registered by NUMA, DAMON, etc. using
+ * mem_sampling_record_cb_register() during their initialisation.
+ * Callbacks are invoked whenever new hardware PMU records are captured. 
+ */ +typedef void (*mem_sampling_record_cb_type)(struct mem_sampling_record *record); +void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb); +void mem_sampling_record_cb_unregister(mem_sampling_record_cb_type cb); + +#ifdef CONFIG_MEM_SAMPLING +void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr); +#else +static inline void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr) { }; +#endif + +/* invoked by specific mem_sampling */ +typedef void (*mem_sampling_cb_type)(struct mem_sampling_record *record_base, + int n_records); + + +struct mem_sampling_ops_struct { + int (*sampling_start)(void); + void (*sampling_stop)(void); + void (*sampling_continue)(void); +}; +extern struct mem_sampling_ops_struct mem_sampling_ops; + +enum mem_sampling_type_enum { + MEM_SAMPLING_ARM_SPE, + MEM_SAMPLING_UNSUPPORTED +}; + +enum user_switch_type { + USER_SWITCH_AWAY_FROM_MEM_SAMPLING, + USER_SWITCH_BACK_TO_MEM_SAMPLING, +}; +typedef void (*mem_sampling_user_switch_cb_type)(enum user_switch_type type); + +enum mem_sampling_saved_state_e { + MEM_SAMPLING_STATE_ENABLE, + MEM_SAMPLING_STATE_DISABLE, + MEM_SAMPLING_STATE_EMPTY, +}; + +#ifdef CONFIG_ARM_SPE +int arm_spe_start(void); +void arm_spe_stop(void); +void arm_spe_continue(void); +int arm_spe_enabled(void); +void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb); +void arm_spe_user_switch_callback_register(mem_sampling_user_switch_cb_type cb); +#else +static inline void arm_spe_stop(void) { }; +static inline void arm_spe_continue(void) { }; +static inline void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb) { }; +static inline void arm_spe_user_switch_callback_register(mem_sampling_user_switch_cb_type cb) { }; + +static inline int arm_spe_start(void) +{ + return 0; +} + +static inline int arm_spe_enabled(void) +{ + return 0; +} +#endif /* CONFIG_ARM_SPE */ + +extern enum mem_sampling_saved_state_e mem_sampling_saved_state; + +extern struct static_key_false mem_sampling_access_hints; +#ifdef CONFIG_MEM_SAMPLING +extern void set_mem_sampling_state(bool enabled); +#else +static inline void set_mem_sampling_state(bool enabled) +{ +} +#endif /* CONFIG_MEM_SAMPLING */ + +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING +void numa_balancing_mem_sampling_cb_register(void); +void numa_balancing_mem_sampling_cb_unregister(void); +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ +#endif /* __MEM_SAMPLING_H */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index ade4993f5fab4d84facfa9876842cfc692dc1b7f..098e05338d25047d98be4d02b5418d16424123c9 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -209,4 +209,8 @@ void migrate_vma_finalize(struct migrate_vma *migrate); #endif /* CONFIG_MIGRATION */ +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING +void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr); +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ + #endif /* _LINUX_MIGRATE_H */ diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index c7a35d32127271d8f2e3a3b8f796f10466436e03..64cef5f97080903c7bfd2af1409d497bfa923286 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -14,6 +14,8 @@ #include #include +#include "../../../drivers/arm/spe/spe.h" + #ifdef CONFIG_ARM_PMU /* @@ -205,6 +207,12 @@ void armpmu_free_irq(int irq, int cpu); #endif /* CONFIG_ARM_PMU */ #define ARMV8_SPE_PDEV_NAME "arm,spe-v1" +#define ARMV8_SPE_PMU_PDEV_NAME "arm,pmu,spe-v1" + #define ARMV8_TRBE_PDEV_NAME "arm,trbe" +typedef 
void (*perf_sampling_cb_type)(enum arm_spe_buf_fault_action act); +void arm_spe_sampling_for_perf_callback_register(perf_sampling_cb_type cb); +struct arm_spe *arm_spe_get_desc(void); +void arm_spe_set_user(enum arm_spe_user_e user); #endif /* __ARM_PMU_H__ */ diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index 3988762efe15c0e5a80602e2c9acb6a5820a740e..c9262641b6a0d81a849b0afc572c645ea76f41cd 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -15,6 +15,15 @@ #define TNF_FAULT_LOCAL 0x08 #define TNF_MIGRATE_FAIL 0x10 +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING +struct mem_sampling_numa_access_work { + struct callback_head work; + u64 laddr, paddr; + /* Test for debug : decode buffer cpu not same with handle interrupt cpu*/ + int cpu; +}; +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ + #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); @@ -43,5 +52,4 @@ static inline bool should_numa_migrate_memory(struct task_struct *p, return true; } #endif - #endif /* _LINUX_SCHED_NUMA_BALANCING_H */ diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 5cd5b3c579d3735bfb8109f57bfb590dc59b3359..a97d73a6e426fe583d17858e142a328c96fb026c 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -126,4 +126,26 @@ int sched_cluster_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif +#define MEM_SAMPLING_DISABLED 0x0 +#define MEM_SAMPLING_NORMAL 0x1 + +#ifdef CONFIG_MEM_SAMPLING +extern int sysctl_mem_sampling_mode; +int sysctl_mem_sampling_enable(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +#else +#define sysctl_mem_sampling_mode 0 +#endif + +#define NUMA_BALANCING_HW_DISABLED 0x0 +#define NUMA_BALANCING_HW_NORMAL 0x1 + +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING +extern int sysctl_numa_balacing_hw_mode; +int sysctl_numabalancing_mem_sampling(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +#else +#define sysctl_numa_balacing_hw_mode 0 +#endif + #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index f65b1f6db22d868485a6cd43eb884935bda11d0f..345ba9350dd00aeaabb82824db09b529654ad91b 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -8,6 +8,7 @@ #include #include #include +#include DECLARE_EVENT_CLASS(kmem_alloc, @@ -363,6 +364,85 @@ TRACE_EVENT(rss_stat, __entry->member, __entry->size) ); + +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING +TRACE_EVENT(mm_numa_migrating, + + TP_PROTO(u64 laddr, int page_nid, int target_nid, + int migrate_success), + + TP_ARGS(laddr, page_nid, target_nid, migrate_success), + + TP_STRUCT__entry( + __field(u64, laddr) + __field(int, page_nid) + __field(int, target_nid) + __field(int, migrate_success) + ), + + TP_fast_assign( + __entry->laddr = laddr; + __entry->page_nid = page_nid; + __entry->target_nid = target_nid; + __entry->migrate_success = !!(migrate_success); + ), + + TP_printk("laddr=%llu page_nid=%d target_nid=%d migrate_success=%d", + __entry->laddr, __entry->page_nid, + __entry->target_nid, __entry->migrate_success) +); + +TRACE_EVENT(mm_mem_sampling_access_record, + + TP_PROTO(u64 laddr, u64 paddr, int cpuid, int pid), + + TP_ARGS(laddr, paddr, cpuid, pid), + + TP_STRUCT__entry( + __field(u64, laddr) + __field(u64, paddr) + __field(int, cpuid) + 
__field(int, pid) + ), + + TP_fast_assign( + __entry->laddr = laddr; + __entry->paddr = paddr; + __entry->cpuid = cpuid; + __entry->pid = pid; + ), + + TP_printk("laddr=%llu paddr=%llu cpuid=%d pid=%d", + __entry->laddr, __entry->paddr, + __entry->cpuid, __entry->pid) +); +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ +#ifdef CONFIG_ARM_SPE +TRACE_EVENT(spe_record, + TP_PROTO(struct mem_sampling_record *record, int cpuid), + + TP_ARGS(record, cpuid), + + TP_STRUCT__entry( + __field(u64, laddr) + __field(u64, paddr) + __field(int, cpuid) + __field(int, pid) + ), + + TP_fast_assign( + __entry->laddr = record->virt_addr; + __entry->paddr = record->phys_addr; + __entry->cpuid = cpuid; + __entry->pid = record->context_id; + + ), + + TP_printk("laddr=%llu paddr=%llu cpuid=%d pid=%d", + __entry->laddr, __entry->paddr, + __entry->cpuid, __entry->pid) +); +#endif /* CONFIG_ARM_SPE */ #endif /* _TRACE_KMEM_H */ /* This part must be outside protection */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4b9af3673285e63c87e159cd1f75e172f9fdf6e9..6d0e287f2bc82c525c10d4b93494ef05ff91b6f0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -19,6 +19,7 @@ #include #include +#include #include #include "../workqueue_internal.h" @@ -3540,6 +3541,116 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #endif #endif +DEFINE_STATIC_KEY_FALSE(sched_numabalancing_mem_sampling); + +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING + +int sysctl_numa_balacing_hw_mode; + +static void __set_numabalancing_mem_sampling_state(bool enabled) +{ + if (enabled) { + numa_balancing_mem_sampling_cb_register(); + static_branch_enable(&sched_numabalancing_mem_sampling); + } else { + numa_balancing_mem_sampling_cb_unregister(); + static_branch_disable(&sched_numabalancing_mem_sampling); + } +} + +void set_numabalancing_mem_sampling_state(bool enabled) +{ + if (enabled) + sysctl_numa_balacing_hw_mode = NUMA_BALANCING_HW_NORMAL; + else + sysctl_numa_balacing_hw_mode = NUMA_BALANCING_HW_DISABLED; + __set_numabalancing_mem_sampling_state(enabled); +} + +#ifdef CONFIG_PROC_SYSCTL + +int sysctl_numabalancing_mem_sampling(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_numabalancing_mem_sampling); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write && static_branch_likely(&mem_sampling_access_hints)) + set_numabalancing_mem_sampling_state(state); + + return err; +} +#endif +#endif + +DEFINE_STATIC_KEY_FALSE(mem_sampling_access_hints); + +#ifdef CONFIG_MEM_SAMPLING +int sysctl_mem_sampling_mode; + +static void __set_mem_sampling_state(bool enabled) +{ + if (enabled) + static_branch_enable(&mem_sampling_access_hints); + else + static_branch_disable(&mem_sampling_access_hints); +} + +void set_mem_sampling_state(bool enabled) +{ + if (!mem_sampling_ops.sampling_start) + return; + if (enabled) + sysctl_mem_sampling_mode = MEM_SAMPLING_NORMAL; + else + sysctl_mem_sampling_mode = MEM_SAMPLING_DISABLED; + __set_mem_sampling_state(enabled); + +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING + if (!enabled) + set_numabalancing_mem_sampling_state(enabled); +#endif +} + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_mem_sampling_enable(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = 
sysctl_mem_sampling_mode; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) { + if (mem_sampling_saved_state == MEM_SAMPLING_STATE_EMPTY) + set_mem_sampling_state(state); + else + mem_sampling_saved_state = state ? MEM_SAMPLING_STATE_ENABLE : + MEM_SAMPLING_STATE_DISABLE; + } + + return err; +} +#endif +#endif + #ifdef CONFIG_SCHEDSTATS DEFINE_STATIC_KEY_FALSE(sched_schedstats); @@ -4066,6 +4177,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) prev_state = prev->state; vtime_task_switch(prev); perf_event_task_sched_in(prev, current); + mem_sampling_sched_in(prev, current); finish_task(prev); tick_nohz_task_switch(); finish_lock_switch(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 654f6bc4b68247505dffc6afd9398f0bf7b9d609..5759a1aedec306a9a3624d26972b1d29408a3db9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -30,6 +30,7 @@ #endif #include #include +#include /* * Targeted preemption latency for CPU-bound tasks: @@ -2967,6 +2968,17 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) struct callback_head *work = &curr->numa_work; u64 period, now; +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING + /* + * If we are using access hints from hardware (like using + * SPE), don't scan the address space. + * Note that currently PMD-level page migration is not + * supported. + */ + if (static_branch_unlikely(&mem_sampling_access_hints) && + static_branch_unlikely(&sched_numabalancing_mem_sampling)) + return; +#endif /* * We don't care about NUMA placement if we don't have memory. */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14d48f6380fa533411c64753a177066f8fca1d8e..ac385325e4d0bf326fcba58b0c313ef5277574b1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2122,6 +2122,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; +extern struct static_key_false sched_numabalancing_mem_sampling; static inline u64 global_rt_period(void) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3941856c19d1d7ff3366bbdad43c058d90f0f11b..5804be697a2518d235b77ef49674ebb3c08b75c5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1868,6 +1868,28 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ +#ifdef CONFIG_MEM_SAMPLING + { + .procname = "mem_sampling_enable", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_mem_sampling_enable, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_MEM_SAMPLING */ +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING + { + .procname = "numa_balancing_mem_sampling", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_numabalancing_mem_sampling, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, diff --git a/mm/Kconfig b/mm/Kconfig index 0f9209cd969b61e81acea0a56a351f24487383b2..175ebd7c3afb58748c2e37e37453fa8986573bc0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1014,6 +1014,29 @@ config EXTEND_HUGEPAGE_MAPPING help Introduce vmalloc/vmap/remap interfaces that handle only hugepages. 
+config MEM_SAMPLING
+	bool "Use hardware memory sampling for kernel features (NUMA, DAMON, etc.)"
+	default n
+	select ARM_SPE if ARM64
+	help
+	  Memory sampling is primarily based on specific hardware capabilities,
+	  which enable hardware PMUs to sample memory accesses for use by
+	  kernel features. It requires at least one hardware PMU (e.g.
+	  ARM_SPE) to be enabled.
+
+config NUMABALANCING_MEM_SAMPLING
+	bool "Use hardware memory samples for numa balancing"
+	depends on MEM_SAMPLING && NUMA_BALANCING
+	default n
+	help
+	  This feature relies on hardware sampling, and will use memory access
+	  information obtained from hardware sampling in the NUMA balancing
+	  policy instead of the native software PROT_NONE scheme. Turning on
+	  this feature may have a performance impact on some workloads, for
+	  example, lightweight memory access programs.
+
+	  If unsure, say N.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index a014a5e08f7b6a011a27088ed197208f7d4ad442..112966190c1dbd5f638643e61daf76ccfe418f31 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -133,3 +133,4 @@ obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o
 obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o
 obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o
 obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o
+obj-$(CONFIG_MEM_SAMPLING) += mem_sampling.o
diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c
new file mode 100644
index 0000000000000000000000000000000000000000..480c467f7b04dc9f540483232238ea026c86f075
--- /dev/null
+++ b/mm/mem_sampling.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "mem_sampling: " fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct mem_sampling_ops_struct mem_sampling_ops;
+
+static int mem_sampling_override __initdata;
+
+enum mem_sampling_saved_state_e mem_sampling_saved_state = MEM_SAMPLING_STATE_EMPTY;
+
+struct mem_sampling_record_cb_list_entry {
+	struct list_head list;
+	mem_sampling_record_cb_type cb;
+};
+LIST_HEAD(mem_sampling_record_cb_list);
+
+void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb)
+{
+	struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
+
+	list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) {
+		if (cb_entry->cb == cb) {
+			pr_info("mem_sampling record cb already registered\n");
+			return;
+		}
+	}
+
+	cb_entry = NULL;
+	cb_entry = kmalloc(sizeof(struct mem_sampling_record_cb_list_entry), GFP_KERNEL);
+	if (!cb_entry) {
+		pr_info("failed to allocate mem_sampling record cb entry\n");
+		return;
+	}
+
+	cb_entry->cb = cb;
+	list_add(&(cb_entry->list), &mem_sampling_record_cb_list);
+}
+
+void mem_sampling_record_cb_unregister(mem_sampling_record_cb_type cb)
+{
+	struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
+
+	list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) {
+		if (cb_entry->cb == cb) {
+			list_del(&cb_entry->list);
+			kfree(cb_entry);
+			return;
+		}
+	}
+}
+
+void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr)
+{
+	if (!static_branch_unlikely(&mem_sampling_access_hints))
+		return;
+
+	if (!mem_sampling_ops.sampling_start)
+		return;
+
+	if (!curr->mm)
+		goto out;
+
+	mem_sampling_ops.sampling_start();
+
+	return;
+
+out:
+	mem_sampling_ops.sampling_stop();
+}
+
+void mem_sampling_process(struct mem_sampling_record *record_base, int nr_records)
+{
+	int i;
+	struct mem_sampling_record *record;
+	struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
+
+	if (list_empty(&mem_sampling_record_cb_list))
+		goto out;
+
+	for (i = 0; i < nr_records; i++) {
+		record = record_base + i;
+		list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) {
+			cb_entry->cb(record);
+		}
+	}
+out:
+	/* If mem_sampling_access_hints is set to false, stop sampling */
+	if (static_branch_unlikely(&mem_sampling_access_hints))
+		mem_sampling_ops.sampling_continue();
+	else
+		mem_sampling_ops.sampling_stop();
+}
+
+static inline enum mem_sampling_type_enum mem_sampling_get_type(void)
+{
+#ifdef CONFIG_ARM_SPE
+	return MEM_SAMPLING_ARM_SPE;
+#else
+	return MEM_SAMPLING_UNSUPPORTED;
+#endif
+}
+
+void mem_sampling_user_switch_process(enum user_switch_type type)
+{
+	bool state;
+
+	if (type > USER_SWITCH_BACK_TO_MEM_SAMPLING) {
+		pr_err("invalid user switch type %d\n", type);
+		return;
+	}
+
+	if (type == USER_SWITCH_AWAY_FROM_MEM_SAMPLING) {
+		/* Save the state only when leaving mem_sampling for the first time */
+		if (mem_sampling_saved_state != MEM_SAMPLING_STATE_EMPTY)
+			return;
+
+		if (static_branch_unlikely(&mem_sampling_access_hints))
+			mem_sampling_saved_state = MEM_SAMPLING_STATE_ENABLE;
+		else
+			mem_sampling_saved_state = MEM_SAMPLING_STATE_DISABLE;
+
+		pr_debug("user switch away from mem_sampling, saved state %s, set to disable.\n",
+			 mem_sampling_saved_state ? "disabled" : "enabled");
+
+		set_mem_sampling_state(false);
+	} else {
+		/* If the state is not backed up, do not restore it */
+		if (mem_sampling_saved_state == MEM_SAMPLING_STATE_EMPTY)
+			return;
+
+		state = (mem_sampling_saved_state == MEM_SAMPLING_STATE_ENABLE) ? true : false;
+		set_mem_sampling_state(state);
+		mem_sampling_saved_state = MEM_SAMPLING_STATE_EMPTY;
+
+		pr_debug("user switch back to mem_sampling, set to saved %s.\n",
+			 state ? "enable" : "disable");
+	}
+}
+
+static void __init check_mem_sampling_enable(void)
+{
+	bool mem_sampling_default = false;
+
+	/* Parsed by setup_mem_sampling. override == 1 enables, -1 disables */
+	if (mem_sampling_override)
+		set_mem_sampling_state(mem_sampling_override == 1);
+	else
+		set_mem_sampling_state(mem_sampling_default);
+}
+
+static int __init mem_sampling_init(void)
+{
+	enum mem_sampling_type_enum mem_sampling_type = mem_sampling_get_type();
+
+	switch (mem_sampling_type) {
+	case MEM_SAMPLING_ARM_SPE:
+		if (!arm_spe_enabled()) {
+			set_mem_sampling_state(false);
+			return -ENODEV;
+		}
+		mem_sampling_ops.sampling_start = arm_spe_start;
+		mem_sampling_ops.sampling_stop = arm_spe_stop;
+		mem_sampling_ops.sampling_continue = arm_spe_continue;
+
+		arm_spe_record_capture_callback_register(mem_sampling_process);
+		arm_spe_user_switch_callback_register(mem_sampling_user_switch_process);
+		break;
+
+	default:
+		pr_info("unsupported hardware pmu type (%d), disabling access hints\n",
+			mem_sampling_type);
+		set_mem_sampling_state(false);
+		return -ENODEV;
+	}
+	check_mem_sampling_enable();
+
+	pr_info("mem_sampling layer access profiling set up for NUMA balancing, DAMON, etc.\n");
+	return 0;
+}
+late_initcall(mem_sampling_init);
diff --git a/mm/memory.c b/mm/memory.c
index 494f40362174fe280261667ddec51cfa839624f9..eb7cb36abf4541e4253b1e0e761ea2e54022794e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4480,6 +4480,92 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	return 0;
 }
 
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+
+/*
+ * Called from task_work context to act upon the page access.
+ *
+ * Physical address (provided by SPE) is used directly instead
+ * of walking the page tables to get to the PTE/page. Hence we
+ * don't check if PTE is writable for the TNF_NO_GROUP
+ * optimization, which means RO pages are considered for grouping.
+ */
+void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr)
+{
+	struct mm_struct *mm = p->mm;
+	struct vm_area_struct *vma;
+	struct page *page = NULL;
+	int page_nid = NUMA_NO_NODE;
+	int last_cpupid;
+	int target_nid;
+	int flags = 0;
+
+	if (!mm)
+		return;
+
+	if (!mmap_read_trylock(mm))
+		return;
+
+	vma = find_vma(mm, laddr);
+	if (!vma)
+		goto out_unlock;
+
+	if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+		is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP))
+		goto out_unlock;
+
+	if (!vma->vm_mm ||
+		(vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+		goto out_unlock;
+
+	if (!vma_is_accessible(vma))
+		goto out_unlock;
+
+	page = pfn_to_online_page(PHYS_PFN(paddr));
+	if (!page || is_zone_device_page(page))
+		goto out_unlock;
+
+	if (unlikely(!PageLRU(page)))
+		goto out_unlock;
+
+	/* TODO: handle PTE-mapped THP or PMD-mapped THP */
+	if (PageCompound(page))
+		goto out_unlock;
+
+	/*
+	 * Flag if the page is shared between multiple address spaces. This
+	 * is later used when determining whether to group tasks together
+	 */
+	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+		flags |= TNF_SHARED;
+
+	last_cpupid = page_cpupid_last(page);
+	page_nid = page_to_nid(page);
+
+	target_nid = numa_migrate_prep(page, vma, laddr, page_nid, &flags);
+	if (target_nid == NUMA_NO_NODE) {
+		put_page(page);
+		goto out;
+	}
+
+	/* Migrate to the requested node */
+	if (migrate_misplaced_page(page, vma, target_nid)) {
+		page_nid = target_nid;
+		flags |= TNF_MIGRATED;
+	} else {
+		flags |= TNF_MIGRATE_FAIL;
+	}
+
+out:
+	trace_mm_numa_migrating(laddr, page_nid, target_nid, flags & TNF_MIGRATED);
+	if (page_nid != NUMA_NO_NODE)
+		task_numa_fault(last_cpupid, page_nid, 1, flags);
+
+out_unlock:
+	mmap_read_unlock(mm);
+}
+#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */
+
 static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
 {
 	if (vma_is_anonymous(vmf->vma))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 81bd26fb661f618b5cb23c3b2087bfe28d28b7bb..d81996ef0be09cd1a680c2797154a95dd483bc48 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -105,6 +105,10 @@
 #include
 #include
+#include
+#include
+
+#include
 #include
 #include
@@ -737,6 +741,77 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+static void task_mem_sampling_access_work(struct callback_head *work)
+{
+	struct mem_sampling_numa_access_work *iwork =
+		container_of(work, struct mem_sampling_numa_access_work, work);
+	struct task_struct *p = current;
+	int cpu = smp_processor_id();
+	int iwork_cpu = iwork->cpu;
+	u64 laddr = iwork->laddr;
+	u64 paddr = iwork->paddr;
+
+	kfree(iwork);
+	if (iwork_cpu != cpu)
+		return;
+
+	do_numa_access(p, laddr, paddr);
+}
+
+void numa_create_taskwork(u64 laddr, u64 paddr, int cpu)
+{
+	struct mem_sampling_numa_access_work *iwork = NULL;
+
+	iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC);
+	if (!iwork)
+		return;
+
+	iwork->laddr = laddr;
+	iwork->paddr = paddr;
+	iwork->cpu = cpu;
+
+	init_task_work(&iwork->work, task_mem_sampling_access_work);
+	task_work_add(current, &iwork->work, TWA_RESUME);
+}
+
+void numa_balancing_mem_sampling_cb(struct mem_sampling_record *record)
+{
+	struct task_struct *p = current;
+	u64 laddr = record->virt_addr;
+	u64 paddr = record->phys_addr;
+
+	/* Discard kernel address accesses */
+	if (laddr & (1UL << 63))
+
return; + + if (p->pid != record->context_id) + return; + + trace_mm_mem_sampling_access_record(laddr, paddr, smp_processor_id(), + current->pid); + numa_create_taskwork(laddr, paddr, smp_processor_id()); +} + +void numa_balancing_mem_sampling_cb_register(void) +{ + mem_sampling_record_cb_register(numa_balancing_mem_sampling_cb); +} + +void numa_balancing_mem_sampling_cb_unregister(void) +{ + mem_sampling_record_cb_unregister(numa_balancing_mem_sampling_cb); +} +#else +static inline void numa_balancing_mem_sampling_cb_register(void) +{ +} + +static inline void numa_balancing_mem_sampling_cb_unregister(void) +{ +} +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ + static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { diff --git a/samples/bpf/spe/Makefile b/samples/bpf/spe/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..946bfdba163e393f8ac6c60f98965955b8ac817b --- /dev/null +++ b/samples/bpf/spe/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: GPL-2.0 + +include Makefile.arch + +INSTALL ?= install +CLANG ?= clang +CC ?= gcc + +BPFTOOL ?= bpftool +KERNEL_DIR ?= ../../../ + +MKFLAGS = -I$(KERNEL_DIR)/tools/lib -I$(KERNEL_DIR)/tools/include/uapi/ \ + -D__BPF_TRACING__ -D__TARGET_ARCH_${SRCARCH} +LDLIBBPF = -L$(KERNEL_DIR)/tools/lib/bpf/ -l:libbpf.a + +all: + $(CLANG) -O2 -g -Wall -target bpf -I. ${MKFLAGS} -c spe-record.bpf.c -o spe-record.bpf.o + $(BPFTOOL) gen skeleton spe-record.bpf.o > spe-record.skel.h + $(CC) -O2 -g -Wall ${MKFLAGS} spe-record.user.c -o spe-record ${LDLIBBPF} -lelf -lz --static + +clean: + rm -f spe-record + rm -f vmlinux.h + rm -f *.o + rm -f *.skel.h diff --git a/samples/bpf/spe/Makefile.arch b/samples/bpf/spe/Makefile.arch new file mode 100644 index 0000000000000000000000000000000000000000..f6a50f06dfc4538181b80b90f47c9a8a54c4b790 --- /dev/null +++ b/samples/bpf/spe/Makefile.arch @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: GPL-2.0 +HOSTARCH := $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ \ + -e s/sun4u/sparc/ -e s/sparc64/sparc/ \ + -e /arm64/!s/arm.*/arm/ -e s/sa110/arm/ \ + -e s/s390x/s390/ -e s/parisc64/parisc/ \ + -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \ + -e s/sh[234].*/sh/ -e s/aarch64.*/arm64/ \ + -e s/riscv.*/riscv/ -e s/loongarch.*/loongarch/) + +ifndef ARCH +ARCH := $(HOSTARCH) +endif + +SRCARCH := $(ARCH) + +# Additional ARCH settings for x86 +ifeq ($(ARCH),i386) + SRCARCH := x86 +endif +ifeq ($(ARCH),x86_64) + SRCARCH := x86 +endif + +# Additional ARCH settings for sparc +ifeq ($(ARCH),sparc32) + SRCARCH := sparc +endif +ifeq ($(ARCH),sparc64) + SRCARCH := sparc +endif + +# Additional ARCH settings for loongarch +ifeq ($(ARCH),loongarch32) + SRCARCH := loongarch +endif + +ifeq ($(ARCH),loongarch64) + SRCARCH := loongarch +endif + +LP64 := $(shell echo __LP64__ | ${CC} ${CFLAGS} -E -x c - | tail -n 1) +ifeq ($(LP64), 1) + IS_64_BIT := 1 +else + IS_64_BIT := 0 +endif diff --git a/samples/bpf/spe/README.md b/samples/bpf/spe/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/samples/bpf/spe/spe-record.bpf.c b/samples/bpf/spe/spe-record.bpf.c new file mode 100644 index 0000000000000000000000000000000000000000..39d138a8e23156f532736395c7b966099d344233 --- /dev/null +++ b/samples/bpf/spe/spe-record.bpf.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* Copyright (c) 2020 Andrii Nakryiko */ +#include +#include +#include +#include "spe-record.h" + +char LICENSE[] 
SEC("license") = "Dual BSD/GPL"; + + +/* BPF ringbuf map */ +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 256 * 1024 /* 256 KB */); +} rb SEC(".maps"); + + +SEC("raw_tracepoint/spe_record") +int handle_exec(struct bpf_raw_tracepoint_args *ctx) +{ + + // TP_PROTO(struct mem_sampling_record *record) + struct mem_sampling_record *rd = (struct mem_sampling_record *)ctx->args[0]; + struct event *e; + + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) + return 0; + + if (bpf_get_current_comm(e->comm, sizeof(e->comm))) + e->comm[0] = 0; + + e->context_id = BPF_CORE_READ(rd, context_id); + e->virt_addr = BPF_CORE_READ(rd, virt_addr); + e->phys_addr = BPF_CORE_READ(rd, phys_addr); + e->latency = BPF_CORE_READ(rd, latency); + + bpf_ringbuf_submit(e, 0); + return 0; +} + diff --git a/samples/bpf/spe/spe-record.h b/samples/bpf/spe/spe-record.h new file mode 100644 index 0000000000000000000000000000000000000000..e9ec71bbb3a75c8f9664df2d568754daadba1237 --- /dev/null +++ b/samples/bpf/spe/spe-record.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2020 Andrii Nakryiko */ +#ifndef __SPE_RECORD_H +#define __SPE_RECORD_H + +enum mem_sampling_sample_type { + MEM_SAMPLING_L1D_ACCESS = 1 << 0, + MEM_SAMPLING_L1D_MISS = 1 << 1, + MEM_SAMPLING_LLC_ACCESS = 1 << 2, + MEM_SAMPLING_LLC_MISS = 1 << 3, + MEM_SAMPLING_TLB_ACCESS = 1 << 4, + MEM_SAMPLING_TLB_MISS = 1 << 5, + MEM_SAMPLING_BRANCH_MISS = 1 << 6, + MEM_SAMPLING_REMOTE_ACCESS = 1 << 7, +}; + +struct mem_sampling_record { + enum mem_sampling_sample_type type; + int err; + unsigned int op; + unsigned int latency; + unsigned long long from_ip; + unsigned long long to_ip; + unsigned long long timestamp; + unsigned long long virt_addr; + unsigned long long phys_addr; + unsigned long long context_id; + unsigned char source; +}; + +/* definition of a sample sent to user-space from BPF program */ +struct event { + enum mem_sampling_sample_type type; + int err; + unsigned int op; + unsigned int latency; + unsigned long long from_ip; + unsigned long long to_ip; + unsigned long long timestamp; + unsigned long long virt_addr; + unsigned long long phys_addr; + unsigned long long context_id; + unsigned char source; + char comm[16]; +}; + +#endif /* __SPE_RECORD_H */ diff --git a/samples/bpf/spe/spe-record.user.c b/samples/bpf/spe/spe-record.user.c new file mode 100644 index 0000000000000000000000000000000000000000..f81a59d65e2fc518ce72a338b830aaeeee187d13 --- /dev/null +++ b/samples/bpf/spe/spe-record.user.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +// Copyright (c) 2020 Andrii Nakryiko +#include +#include +#include +#include +#include +#include +#include "spe-record.h" +#include "spe-record.skel.h" + +int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + /* Ignore debug-level libbpf logs */ + if (level > LIBBPF_INFO) + return 0; + return vfprintf(stderr, format, args); +} + +void bump_memlock_rlimit(void) +{ + struct rlimit rlim_new = { + .rlim_cur = RLIM_INFINITY, + .rlim_max = RLIM_INFINITY, + }; + + if (setrlimit(RLIMIT_MEMLOCK, &rlim_new)) { + fprintf(stderr, "Failed to increase RLIMIT_MEMLOCK limit!\n"); + exit(1); + } +} + +static bool exiting; + +static void sig_handler(int sig) +{ + exiting = true; +} + +int handle_event(void *ctx, void *data, size_t data_sz) +{ + const struct event *e = data; + struct tm *tm; + char ts[32]; + time_t t; + + time(&t); + tm = localtime(&t); + strftime(ts, sizeof(ts), "%H:%M:%S", tm); + 
+ printf("%-20s %-8s %-10lld %-10d 0x%016llx 0x%016llx\n", e->comm, ts, e->context_id, + e->latency, e->virt_addr, e->phys_addr); + + return 0; +} + +int main(int argc, char **argv) +{ + struct ring_buffer *rb = NULL; + struct spe_record_bpf *skel; + int err; + + /* Set up libbpf logging callback */ + libbpf_set_print(libbpf_print_fn); + + /* Bump RLIMIT_MEMLOCK to create BPF maps */ + bump_memlock_rlimit(); + + /* Clean handling of Ctrl-C */ + signal(SIGINT, sig_handler); + signal(SIGTERM, sig_handler); + + /* Load and verify BPF application */ + skel = spe_record_bpf__open_and_load(); + if (!skel) { + fprintf(stderr, "Failed to open and load BPF skeleton\n"); + return 1; + } + + /* Attach tracepoint */ + err = spe_record_bpf__attach(skel); + if (err) { + fprintf(stderr, "Failed to attach BPF skeleton\n"); + goto cleanup; + } + + /* Set up ring buffer polling */ + rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL); + if (!rb) { + err = -1; + fprintf(stderr, "Failed to create ring buffer\n"); + goto cleanup; + } + + /* Process events */ + printf("%-20s %-8s %-10s %-10s %-18s %-18s\n", + "COMM", "TIME", "PID", "LATENCY", "LADDR", "PADDR"); + while (!exiting) { + err = ring_buffer__poll(rb, 100 /* timeout, ms */); + /* Ctrl-C will cause -EINTR */ + if (err == -EINTR) { + err = 0; + break; + } + if (err < 0) { + printf("Error polling ring buffer: %d\n", err); + break; + } + } + +cleanup: + ring_buffer__free(rb); + spe_record_bpf__destroy(skel); + + return err < 0 ? -err : 0; +}