diff --git a/arch/arm64/configs/openeuler_defconfig b/arch/arm64/configs/openeuler_defconfig index 1233ce00c72cf8811c2d5930b6a514aab775b13f..e89a034988b4e022a42751c1ef10bada08a70538 100644 --- a/arch/arm64/configs/openeuler_defconfig +++ b/arch/arm64/configs/openeuler_defconfig @@ -1173,6 +1173,8 @@ CONFIG_PID_RESERVE=y CONFIG_MEMORY_RELIABLE=y # CONFIG_CLEAR_FREELIST_PAGE is not set CONFIG_EXTEND_HUGEPAGE_MAPPING=y +CONFIG_MEM_SAMPLING=y +CONFIG_NUMABALANCING_MEM_SAMPLING=y # # Data Access Monitoring @@ -6293,6 +6295,7 @@ CONFIG_UB_UDMA_HNS3=m CONFIG_CPU_INSPECT=m CONFIG_CPU_INSPECTOR_ATF=m # end of CPU Inspect +CONFIG_ARM_SPE=y # end of Device Drivers # diff --git a/arch/x86/configs/openeuler_defconfig b/arch/x86/configs/openeuler_defconfig index dd8c474969b32b89b99141d79b293ae7131938db..a680be4c3ee4f8b3f50c288927be31ed9f8e7967 100644 --- a/arch/x86/configs/openeuler_defconfig +++ b/arch/x86/configs/openeuler_defconfig @@ -1104,6 +1104,7 @@ CONFIG_ARCH_HAS_PTE_SPECIAL=y CONFIG_MAPPING_DIRTY_HELPERS=y CONFIG_MEMORY_RELIABLE=y # CONFIG_CLEAR_FREELIST_PAGE is not set +# CONFIG_MEM_SAMPLING is not set # # Data Access Monitoring diff --git a/drivers/Kconfig b/drivers/Kconfig index f765c3a688b6b0a6396f7616eb14c89dfcc51c8f..840137d901670440500b8931fb2cd622d2d3f0d6 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -244,4 +244,6 @@ source "drivers/ub/Kconfig" source "drivers/cpuinspect/Kconfig" +source "drivers/arm/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 4e390005ded7ffb5c150a06e0f912d621303eeef..8264e814d3d67009550fc9206ae6ee095e565ebd 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -195,3 +195,4 @@ obj-$(CONFIG_COUNTER) += counter/ obj-$(CONFIG_MOST) += most/ obj-$(CONFIG_ROH) += roh/ obj-$(CONFIG_UB) += ub/ +obj-$(CONFIG_ARM_SPE) += arm/spe/ diff --git a/drivers/arm/Kconfig b/drivers/arm/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..a0c7a25220cc83a26e1beaf6c32d7a368fa2501d --- /dev/null +++ b/drivers/arm/Kconfig @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0-only +source "drivers/arm/spe/Kconfig" diff --git a/drivers/arm/spe/Kconfig b/drivers/arm/spe/Kconfig new file mode 100644 index 0000000000000000000000000000000000000000..2d81364d0e0a201a9cd0941274d5c27a3b0aa90d --- /dev/null +++ b/drivers/arm/spe/Kconfig @@ -0,0 +1,11 @@ +# SPDX-License-Identifier: GPL-2.0-only +# +# arm spe driver +# +config ARM_SPE + bool "In-kernel SPE driver for page access profiling" + depends on ARM64 + default n + help + Enable support for the ARMv8.2 Statistical Profiling Extension, which + provides periodic sampling of operations in the CPU pipeline.
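For orientation, a minimal sketch (not part of the patch) of how an in-kernel mem_sampling-style consumer is expected to hook into the driver added below. arm_spe_record_capture_callback_register(), arm_spe_enabled() and arm_spe_start() come from drivers/arm/spe/spe.c in this series; the <linux/mem_sampling.h> header, the exact mem_sampling_cb_type prototype and the struct mem_sampling_record field names are assumptions (the diff only shows the callback being invoked with a record pointer and a count, with records copied from struct arm_spe_decoder output), and the demo_* names are hypothetical.

/* Illustrative sketch only; see the assumptions stated above. */
#include <linux/kernel.h>
#include <linux/mem_sampling.h>	/* assumed: mem_sampling_cb_type, struct mem_sampling_record */

/* Provided by drivers/arm/spe/spe.c in this patch. */
extern void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb);
extern int arm_spe_enabled(void);
extern int arm_spe_start(void);

/* Runs from the SPE IRQ path with one interrupt's worth of decoded records. */
static void demo_spe_record_cb(struct mem_sampling_record *records, int nr)
{
	int i;

	for (i = 0; i < nr; i++)
		pr_debug("spe sample: vaddr=0x%llx paddr=0x%llx lat=%u\n",
			 records[i].virt_addr, records[i].phys_addr,
			 records[i].latency);
}

static int __init demo_spe_consumer_init(void)
{
	if (!arm_spe_enabled())
		return -ENODEV;

	arm_spe_record_capture_callback_register(demo_spe_record_cb);
	/*
	 * Profiling is armed per CPU; a real consumer would call this from a
	 * context pinned to an SPE-capable CPU (arm_spe_start() returns
	 * -ENOENT on unsupported CPUs).
	 */
	return arm_spe_start();
}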
diff --git a/drivers/arm/spe/Makefile b/drivers/arm/spe/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..46c43e5974e1fd29afd2c2a695fce14977d78151 --- /dev/null +++ b/drivers/arm/spe/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_ARM_SPE) += spe.o spe-decoder/arm-spe-decoder.o spe-decoder/arm-spe-pkt-decoder.o diff --git a/drivers/arm/spe/spe-decoder/Makefile b/drivers/arm/spe/spe-decoder/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..4fdae5d381867542ad12a7a7d34aabfdd141e40b --- /dev/null +++ b/drivers/arm/spe/spe-decoder/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y := arm-spe-decoder.o arm-spe-pkt-decoder.o diff --git a/drivers/arm/spe/spe-decoder/arm-spe-decoder.c b/drivers/arm/spe/spe-decoder/arm-spe-decoder.c new file mode 100644 index 0000000000000000000000000000000000000000..1b6ddeaaabe903602dd91c37dae2fe432c786969 --- /dev/null +++ b/drivers/arm/spe/spe-decoder/arm-spe-decoder.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * arm_spe_decoder.c: ARM SPE support + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "arm-spe-decoder.h" + +static u64 arm_spe_calc_ip(int index, u64 payload) +{ + u64 ns, el, val; + u32 seen_idx; + + /* Instruction virtual address or Branch target address */ + if (index == SPE_ADDR_PKT_HDR_INDEX_INS || + index == SPE_ADDR_PKT_HDR_INDEX_BRANCH) { + ns = SPE_ADDR_PKT_GET_NS(payload); + el = SPE_ADDR_PKT_GET_EL(payload); + + /* Clean highest byte */ + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(payload); + + /* Fill highest byte for EL1 or EL2 (VHE) mode */ + if (ns && (el == SPE_ADDR_PKT_EL1 || el == SPE_ADDR_PKT_EL2)) + payload |= 0xffULL << SPE_ADDR_PKT_ADDR_BYTE7_SHIFT; + + /* Data access virtual address */ + } else if (index == SPE_ADDR_PKT_HDR_INDEX_DATA_VIRT) { + + /* Clean tags */ + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(payload); + + /* + * Armv8 ARM (ARM DDI 0487F.c), chapter "D10.2.1 Address packet" + * defines the data virtual address payload format, the top byte + * (bits [63:56]) is assigned as top-byte tag; so we only can + * retrieve address value from bits [55:0]. + * + * According to Documentation/arm64/memory.rst, if detects the + * specific pattern in bits [55:52] of payload which falls in + * the kernel space, should fixup the top byte and this allows + * perf tool to parse DSO symbol for data address correctly. + * + * For this reason, if detects the bits [55:52] is 0xf, will + * fill 0xff into the top byte. 
+ */ + val = SPE_ADDR_PKT_ADDR_GET_BYTE_6(payload); + if ((val & 0xf0ULL) == 0xf0ULL) + payload |= 0xffULL << SPE_ADDR_PKT_ADDR_BYTE7_SHIFT; + + /* Data access physical address */ + } else if (index == SPE_ADDR_PKT_HDR_INDEX_DATA_PHYS) { + /* Clean highest byte */ + payload = SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(payload); + } else { + seen_idx = 0; + if (!(seen_idx & BIT(index))) { + seen_idx |= BIT(index); + pr_warn("ignoring unsupported address packet index: 0x%x\n", index); + } + } + + return payload; +} + + +void arm_spe_decoder_free(struct arm_spe_decoder *decoder) +{ + kfree(decoder); +} + +static int arm_spe_get_next_packet(struct arm_spe_decoder *decoder) +{ + int ret; + + do { + if (!decoder->len) + return 0; + + ret = arm_spe_get_packet(decoder->buf, decoder->len, + &decoder->packet); + if (ret <= 0) { + /* Move forward for 1 byte */ + decoder->buf += 1; + decoder->len -= 1; + return -EBADMSG; + } + + decoder->buf += ret; + decoder->len -= ret; + } while (decoder->packet.type == ARM_SPE_PAD); + return 1; +} + +static int arm_spe_read_record(struct arm_spe_decoder *decoder) +{ + int err; + int idx; + u64 payload, ip; + + memset(&decoder->record, 0x0, sizeof(decoder->record)); + decoder->record.context_id = (u64)-1; + while (1) { + err = arm_spe_get_next_packet(decoder); + if (err <= 0) + return err; + + idx = decoder->packet.index; + payload = decoder->packet.payload; + + switch (decoder->packet.type) { + case ARM_SPE_TIMESTAMP: + decoder->record.timestamp = payload; + return 1; + case ARM_SPE_END: + return 1; + case ARM_SPE_ADDRESS: + ip = arm_spe_calc_ip(idx, payload); + if (idx == SPE_ADDR_PKT_HDR_INDEX_INS) + decoder->record.from_ip = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_BRANCH) + decoder->record.to_ip = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_DATA_VIRT) + decoder->record.virt_addr = ip; + else if (idx == SPE_ADDR_PKT_HDR_INDEX_DATA_PHYS) + decoder->record.phys_addr = ip; + break; + case ARM_SPE_COUNTER: + if (idx == SPE_CNT_PKT_HDR_INDEX_TOTAL_LAT) + decoder->record.latency = payload; + break; + case ARM_SPE_CONTEXT: + decoder->record.context_id = payload; + break; + case ARM_SPE_OP_TYPE: + if (idx == SPE_OP_PKT_HDR_CLASS_LD_ST_ATOMIC) { + if (payload & 0x1) + decoder->record.op = ARM_SPE_ST; + else + decoder->record.op = ARM_SPE_LD; + } + break; + case ARM_SPE_EVENTS: + if (payload & BIT(EV_L1D_REFILL)) + decoder->record.type |= ARM_SPE_L1D_MISS; + + if (payload & BIT(EV_L1D_ACCESS)) + decoder->record.type |= ARM_SPE_L1D_ACCESS; + + if (payload & BIT(EV_TLB_WALK)) + decoder->record.type |= ARM_SPE_TLB_MISS; + + if (payload & BIT(EV_TLB_ACCESS)) + decoder->record.type |= ARM_SPE_TLB_ACCESS; + + if (payload & BIT(EV_LLC_MISS)) + decoder->record.type |= ARM_SPE_LLC_MISS; + + if (payload & BIT(EV_LLC_ACCESS)) + decoder->record.type |= ARM_SPE_LLC_ACCESS; + + if (payload & BIT(EV_REMOTE_ACCESS)) + decoder->record.type |= ARM_SPE_REMOTE_ACCESS; + + if (payload & BIT(EV_MISPRED)) + decoder->record.type |= ARM_SPE_BRANCH_MISS; + + break; + case ARM_SPE_DATA_SOURCE: + decoder->record.source = payload; + break; + case ARM_SPE_BAD: + break; + case ARM_SPE_PAD: + break; + default: + pr_err("Get packet error!\n"); + return -1; + } + } + return 0; +} + +static bool arm_spe_decode(struct arm_spe_decoder *decoder) +{ + if (decoder->len) { + if (arm_spe_read_record(decoder) == 1) + return true; + } + return false; +} + +void arm_spe_decode_buf(const unsigned char *buf, size_t len) +{ + struct arm_spe_decoder decoder; + + decoder.buf = buf; + decoder.len = len; + + while 
(arm_spe_decode(&decoder)) + arm_spe_record_enqueue(&(decoder.record)); + +} +EXPORT_SYMBOL(arm_spe_decode_buf); diff --git a/drivers/arm/spe/spe-decoder/arm-spe-decoder.h b/drivers/arm/spe/spe-decoder/arm-spe-decoder.h new file mode 100644 index 0000000000000000000000000000000000000000..567a70307c5f1ef506cbdda74260ce71fd500753 --- /dev/null +++ b/drivers/arm/spe/spe-decoder/arm-spe-decoder.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * arm_spe_decoder.h: Arm Statistical Profiling Extensions support + * Copyright (c) 2019-2020, Arm Ltd. + */ + +#ifndef INCLUDE__ARM_SPE_DECODER_H__ +#define INCLUDE__ARM_SPE_DECODER_H__ + +#include + +#include "arm-spe-pkt-decoder.h" + +enum arm_spe_sample_type { + ARM_SPE_L1D_ACCESS = 1 << 0, + ARM_SPE_L1D_MISS = 1 << 1, + ARM_SPE_LLC_ACCESS = 1 << 2, + ARM_SPE_LLC_MISS = 1 << 3, + ARM_SPE_TLB_ACCESS = 1 << 4, + ARM_SPE_TLB_MISS = 1 << 5, + ARM_SPE_BRANCH_MISS = 1 << 6, + ARM_SPE_REMOTE_ACCESS = 1 << 7, +}; + +enum arm_spe_op_type { + ARM_SPE_LD = 1 << 0, + ARM_SPE_ST = 1 << 1, +}; + +enum arm_spe_neoverse_data_source { + ARM_SPE_NV_L1D = 0x0, + ARM_SPE_NV_L2 = 0x8, + ARM_SPE_NV_PEER_CORE = 0x9, + ARM_SPE_NV_LOCAL_CLUSTER = 0xa, + ARM_SPE_NV_SYS_CACHE = 0xb, + ARM_SPE_NV_PEER_CLUSTER = 0xc, + ARM_SPE_NV_REMOTE = 0xd, + ARM_SPE_NV_DRAM = 0xe, +}; + +struct arm_spe_record { + enum arm_spe_sample_type type; + int err; + u32 op; + u32 latency; + u64 from_ip; + u64 to_ip; + u64 timestamp; + u64 virt_addr; + u64 phys_addr; + u64 context_id; + u16 source; +}; + +struct arm_spe_buffer { + const unsigned char *buf; + size_t len; + u64 offset; + u64 trace_nr; +}; + +struct arm_spe_decoder { + struct arm_spe_record record; + const unsigned char *buf; + size_t len; + struct arm_spe_pkt packet; +}; + +void arm_spe_decoder_free(struct arm_spe_decoder *decoder); +void arm_spe_decode_buf(const unsigned char *buf, size_t len); +void arm_spe_record_enqueue(struct arm_spe_record *record); + +#endif diff --git a/drivers/arm/spe/spe-decoder/arm-spe-pkt-decoder.c b/drivers/arm/spe/spe-decoder/arm-spe-pkt-decoder.c new file mode 100644 index 0000000000000000000000000000000000000000..aeec434487798475c7899cce9f91e8fb0a6f272e --- /dev/null +++ b/drivers/arm/spe/spe-decoder/arm-spe-pkt-decoder.c @@ -0,0 +1,227 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Arm Statistical Profiling Extensions (SPE) support + * Copyright (c) 2017-2018, Arm Ltd. 
+ */ + +#include +#include +#include +#include +#include + +#include "arm-spe-pkt-decoder.h" + +/* + * Extracts the field "sz" from header bits and converts to bytes: + * 00 : byte (1) + * 01 : halfword (2) + * 10 : word (4) + * 11 : doubleword (8) + */ +static unsigned int arm_spe_payload_len(unsigned char hdr) +{ + return 1U << ((hdr & GENMASK_ULL(5, 4)) >> 4); +} + +static int arm_spe_get_payload(const unsigned char *buf, size_t len, + unsigned char ext_hdr, + struct arm_spe_pkt *packet) +{ + size_t payload_len = arm_spe_payload_len(buf[ext_hdr]); + + if (len < 1 + ext_hdr + payload_len) + return ARM_SPE_NEED_MORE_BYTES; + + buf += 1 + ext_hdr; + + switch (payload_len) { + case 1: + packet->payload = *(uint8_t *)buf; + break; + case 2: + packet->payload = le16_to_cpu(*(uint16_t *)buf); + break; + case 4: + packet->payload = le32_to_cpu(*(uint32_t *)buf); + break; + case 8: + packet->payload = le64_to_cpu(*(uint64_t *)buf); + break; + default: + return ARM_SPE_BAD_PACKET; + } + + return 1 + ext_hdr + payload_len; +} + +static int arm_spe_get_pad(struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_PAD; + return 1; +} + +static int arm_spe_get_alignment(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + unsigned int alignment = 1 << ((buf[0] & 0xf) + 1); + + if (len < alignment) + return ARM_SPE_NEED_MORE_BYTES; + + packet->type = ARM_SPE_PAD; + return alignment - (((uintptr_t)buf) & (alignment - 1)); +} + +static int arm_spe_get_end(struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_END; + return 1; +} + +static int arm_spe_get_timestamp(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_TIMESTAMP; + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_events(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_EVENTS; + + /* we use index to identify Events with a less number of + * comparisons in arm_spe_pkt_desc(): E.g., the LLC-ACCESS, + * LLC-REFILL, and REMOTE-ACCESS events are identified if + * index > 1. 
+ */ + packet->index = arm_spe_payload_len(buf[0]); + + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_data_source(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_DATA_SOURCE; + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_context(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_CONTEXT; + packet->index = SPE_CTX_PKT_HDR_INDEX(buf[0]); + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_op_type(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_OP_TYPE; + packet->index = SPE_OP_PKT_HDR_CLASS(buf[0]); + return arm_spe_get_payload(buf, len, 0, packet); +} + +static int arm_spe_get_counter(const unsigned char *buf, size_t len, + const unsigned char ext_hdr, struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_COUNTER; + + if (ext_hdr) + packet->index = SPE_HDR_EXTENDED_INDEX(buf[0], buf[1]); + else + packet->index = SPE_HDR_SHORT_INDEX(buf[0]); + + return arm_spe_get_payload(buf, len, ext_hdr, packet); +} + +static int arm_spe_get_addr(const unsigned char *buf, size_t len, + const unsigned char ext_hdr, struct arm_spe_pkt *packet) +{ + packet->type = ARM_SPE_ADDRESS; + + if (ext_hdr) + packet->index = SPE_HDR_EXTENDED_INDEX(buf[0], buf[1]); + else + packet->index = SPE_HDR_SHORT_INDEX(buf[0]); + + return arm_spe_get_payload(buf, len, ext_hdr, packet); +} + +static int arm_spe_do_get_packet(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + unsigned int hdr; + unsigned char ext_hdr = 0; + + memset(packet, 0, sizeof(struct arm_spe_pkt)); + + if (!len) + return ARM_SPE_NEED_MORE_BYTES; + + hdr = buf[0]; + + if (hdr == SPE_HEADER0_PAD) + return arm_spe_get_pad(packet); + + if (hdr == SPE_HEADER0_END) /* no timestamp at end of record */ + return arm_spe_get_end(packet); + + if (hdr == SPE_HEADER0_TIMESTAMP) + return arm_spe_get_timestamp(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK1) == SPE_HEADER0_EVENTS) + return arm_spe_get_events(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK1) == SPE_HEADER0_SOURCE) + return arm_spe_get_data_source(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK2) == SPE_HEADER0_CONTEXT) + return arm_spe_get_context(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK2) == SPE_HEADER0_OP_TYPE) + return arm_spe_get_op_type(buf, len, packet); + + if ((hdr & SPE_HEADER0_MASK2) == SPE_HEADER0_EXTENDED) { + /* 16-bit extended format header */ + if (len == 1) + return ARM_SPE_BAD_PACKET; + + ext_hdr = 1; + hdr = buf[1]; + if (hdr == SPE_HEADER1_ALIGNMENT) + return arm_spe_get_alignment(buf, len, packet); + } + + /* + * The short format header's byte 0 or the extended format header's + * byte 1 has been assigned to 'hdr', which uses the same encoding for + * address packet and counter packet, so don't need to distinguish if + * it's short format or extended format and handle in once. + */ + if ((hdr & SPE_HEADER0_MASK3) == SPE_HEADER0_ADDRESS) + return arm_spe_get_addr(buf, len, ext_hdr, packet); + + if ((hdr & SPE_HEADER0_MASK3) == SPE_HEADER0_COUNTER) + return arm_spe_get_counter(buf, len, ext_hdr, packet); + + return ARM_SPE_BAD_PACKET; +} + +int arm_spe_get_packet(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet) +{ + int ret; + + ret = arm_spe_do_get_packet(buf, len, packet); + /* put multiple consecutive PADs on the same line, up to + * the fixed-width output format of 16 bytes per line. 
+ */ + if (ret > 0 && packet->type == ARM_SPE_PAD) { + while (ret < 16 && len > (size_t)ret && !buf[ret]) + ret += 1; + } + return ret; +} diff --git a/drivers/arm/spe/spe-decoder/arm-spe-pkt-decoder.h b/drivers/arm/spe/spe-decoder/arm-spe-pkt-decoder.h new file mode 100644 index 0000000000000000000000000000000000000000..1a67b580b47f4d480fcdbee4569b57f2960b279d --- /dev/null +++ b/drivers/arm/spe/spe-decoder/arm-spe-pkt-decoder.h @@ -0,0 +1,153 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Arm Statistical Profiling Extensions (SPE) support + * Copyright (c) 2017-2018, Arm Ltd. + */ + +#ifndef INCLUDE__ARM_SPE_PKT_DECODER_H__ +#define INCLUDE__ARM_SPE_PKT_DECODER_H__ + +#include + +#define ARM_SPE_PKT_DESC_MAX 256 +#define ARM_SPE_NEED_MORE_BYTES -1 +#define ARM_SPE_BAD_PACKET -2 +#define ARM_SPE_PKT_MAX_SZ 16 + +enum arm_spe_pkt_type { + ARM_SPE_BAD, + ARM_SPE_PAD, + ARM_SPE_END, + ARM_SPE_TIMESTAMP, + ARM_SPE_ADDRESS, + ARM_SPE_COUNTER, + ARM_SPE_CONTEXT, + ARM_SPE_OP_TYPE, + ARM_SPE_EVENTS, + ARM_SPE_DATA_SOURCE, +}; + +struct arm_spe_pkt { + enum arm_spe_pkt_type type; + unsigned char index; + uint64_t payload; +}; + +/* Short header (HEADER0) and extended header (HEADER1) */ +#define SPE_HEADER0_PAD 0x0 +#define SPE_HEADER0_END 0x1 +#define SPE_HEADER0_TIMESTAMP 0x71 +/* Mask for event & data source */ +#define SPE_HEADER0_MASK1 (GENMASK_ULL(7, 6) | GENMASK_ULL(3, 0)) +#define SPE_HEADER0_EVENTS 0x42 +#define SPE_HEADER0_SOURCE 0x43 +/* Mask for context & operation */ +#define SPE_HEADER0_MASK2 GENMASK_ULL(7, 2) +#define SPE_HEADER0_CONTEXT 0x64 +#define SPE_HEADER0_OP_TYPE 0x48 +/* Mask for extended format */ +#define SPE_HEADER0_EXTENDED 0x20 +/* Mask for address & counter */ +#define SPE_HEADER0_MASK3 GENMASK_ULL(7, 3) +#define SPE_HEADER0_ADDRESS 0xb0 +#define SPE_HEADER0_COUNTER 0x98 +#define SPE_HEADER1_ALIGNMENT 0x0 + +#define SPE_HDR_SHORT_INDEX(h) ((h) & GENMASK_ULL(2, 0)) +#define SPE_HDR_EXTENDED_INDEX(h0, h1) (((h0) & GENMASK_ULL(1, 0)) << 3 | \ + SPE_HDR_SHORT_INDEX(h1)) + +/* Address packet header */ +#define SPE_ADDR_PKT_HDR_INDEX_INS 0x0 +#define SPE_ADDR_PKT_HDR_INDEX_BRANCH 0x1 +#define SPE_ADDR_PKT_HDR_INDEX_DATA_VIRT 0x2 +#define SPE_ADDR_PKT_HDR_INDEX_DATA_PHYS 0x3 +#define SPE_ADDR_PKT_HDR_INDEX_PREV_BRANCH 0x4 + +/* Address packet payload */ +#define SPE_ADDR_PKT_ADDR_BYTE7_SHIFT 56 +#define SPE_ADDR_PKT_ADDR_GET_BYTES_0_6(v) ((v) & GENMASK_ULL(55, 0)) +#define SPE_ADDR_PKT_ADDR_GET_BYTE_6(v) (((v) & GENMASK_ULL(55, 48)) >> 48) + +#define SPE_ADDR_PKT_GET_NS(v) (((v) & BIT_ULL(63)) >> 63) +#define SPE_ADDR_PKT_GET_EL(v) (((v) & GENMASK_ULL(62, 61)) >> 61) +#define SPE_ADDR_PKT_GET_CH(v) (((v) & BIT_ULL(62)) >> 62) +#define SPE_ADDR_PKT_GET_PAT(v) (((v) & GENMASK_ULL(59, 56)) >> 56) + +#define SPE_ADDR_PKT_EL0 0 +#define SPE_ADDR_PKT_EL1 1 +#define SPE_ADDR_PKT_EL2 2 +#define SPE_ADDR_PKT_EL3 3 + +/* Context packet header */ +#define SPE_CTX_PKT_HDR_INDEX(h) ((h) & GENMASK_ULL(1, 0)) + +/* Counter packet header */ +#define SPE_CNT_PKT_HDR_INDEX_TOTAL_LAT 0x0 +#define SPE_CNT_PKT_HDR_INDEX_ISSUE_LAT 0x1 +#define SPE_CNT_PKT_HDR_INDEX_TRANS_LAT 0x2 + +/* Event packet payload */ +enum arm_spe_events { + EV_EXCEPTION_GEN = 0, + EV_RETIRED = 1, + EV_L1D_ACCESS = 2, + EV_L1D_REFILL = 3, + EV_TLB_ACCESS = 4, + EV_TLB_WALK = 5, + EV_NOT_TAKEN = 6, + EV_MISPRED = 7, + EV_LLC_ACCESS = 8, + EV_LLC_MISS = 9, + EV_REMOTE_ACCESS = 10, + EV_ALIGNMENT = 11, + EV_PARTIAL_PREDICATE = 17, + EV_EMPTY_PREDICATE = 18, +}; + +/* Operation packet header */ +#define 
SPE_OP_PKT_HDR_CLASS(h) ((h) & GENMASK_ULL(1, 0)) +#define SPE_OP_PKT_HDR_CLASS_OTHER 0x0 +#define SPE_OP_PKT_HDR_CLASS_LD_ST_ATOMIC 0x1 +#define SPE_OP_PKT_HDR_CLASS_BR_ERET 0x2 + +#define SPE_OP_PKT_IS_OTHER_SVE_OP(v) (((v) & (BIT(7) | BIT(3) | BIT(0))) == 0x8) + +#define SPE_OP_PKT_COND BIT(0) + +#define SPE_OP_PKT_LDST_SUBCLASS_GET(v) ((v) & GENMASK_ULL(7, 1)) +#define SPE_OP_PKT_LDST_SUBCLASS_GP_REG 0x0 +#define SPE_OP_PKT_LDST_SUBCLASS_SIMD_FP 0x4 +#define SPE_OP_PKT_LDST_SUBCLASS_UNSPEC_REG 0x10 +#define SPE_OP_PKT_LDST_SUBCLASS_NV_SYSREG 0x30 + +#define SPE_OP_PKT_IS_LDST_ATOMIC(v) (((v) & (GENMASK_ULL(7, 5) | BIT(1))) == 0x2) + +#define SPE_OP_PKT_AR BIT(4) +#define SPE_OP_PKT_EXCL BIT(3) +#define SPE_OP_PKT_AT BIT(2) +#define SPE_OP_PKT_ST BIT(0) + +#define SPE_OP_PKT_IS_LDST_SVE(v) (((v) & (BIT(3) | BIT(1))) == 0x8) + +#define SPE_OP_PKT_SVE_SG BIT(7) +/* + * SVE effective vector length (EVL) is stored in byte 0 bits [6:4]; + * the length is rounded up to a power of two and uses 32 as one step, + * so the EVL calculation is: + * + * 32 * (2 ^ bits [6:4]) = 32 << (bits [6:4]) + */ +#define SPE_OP_PKG_SVE_EVL(v) (32 << (((v) & GENMASK_ULL(6, 4)) >> 4)) +#define SPE_OP_PKT_SVE_PRED BIT(2) +#define SPE_OP_PKT_SVE_FP BIT(1) + +#define SPE_OP_PKT_IS_INDIRECT_BRANCH(v) (((v) & GENMASK_ULL(7, 1)) == 0x2) + +const char *arm_spe_pkt_name(enum arm_spe_pkt_type); + +int arm_spe_get_packet(const unsigned char *buf, size_t len, + struct arm_spe_pkt *packet); + +int arm_spe_pkt_desc(const struct arm_spe_pkt *packet, char *buf, size_t len); +#endif diff --git a/drivers/arm/spe/spe.c b/drivers/arm/spe/spe.c new file mode 100644 index 0000000000000000000000000000000000000000..d22a6aff9a94d86f9d95dbc642dd9d12e81536a4 --- /dev/null +++ b/drivers/arm/spe/spe.c @@ -0,0 +1,859 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#define PMUNAME "arm_spe" +#define DRVNAME PMUNAME "_driver" +#define pr_fmt(fmt) DRVNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include + +#include "spe-decoder/arm-spe-decoder.h" +#include "spe-decoder/arm-spe-pkt-decoder.h" +#include "spe.h" + +static long __percpu irq_dev_id; + +static struct arm_spe *spe; + +#define SPE_INIT_FAIL 0 +#define SPE_INIT_SUCC 1 +static int spe_probe_status = SPE_INIT_FAIL; + +/* Keep track of our dynamic hotplug state */ +static enum cpuhp_state arm_spe_online; + +/* Keep track of who uses the SPE */ +static enum arm_spe_user_e arm_spe_user = ARM_SPE_USER_MEM_SAMPLING; + +DEFINE_PER_CPU(struct arm_spe_buf, per_cpu_spe_buf); + +mem_sampling_cb_type arm_spe_sampling_cb; +void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb) +{ + arm_spe_sampling_cb = cb; +} +EXPORT_SYMBOL_GPL(arm_spe_record_capture_callback_register); + +/* SPE sampling callback for perf */ +perf_sampling_cb_type arm_spe_sampling_perf_cb; +void arm_spe_sampling_for_perf_callback_register(perf_sampling_cb_type cb) +{ + arm_spe_sampling_perf_cb = cb; +} +EXPORT_SYMBOL_GPL(arm_spe_sampling_for_perf_callback_register); + +/* + * SPE can be used by either mem_sampling or perf; perf takes precedence. + * When perf takes over, this callback is used to disable mem_sampling.
+ */ +mem_sampling_user_switch_cb_type arm_spe_user_switch_cb; +void arm_spe_user_switch_callback_register(mem_sampling_user_switch_cb_type cb) +{ + arm_spe_user_switch_cb = cb; +} + +struct arm_spe *arm_spe_get_desc(void) +{ + return spe; +} +EXPORT_SYMBOL_GPL(arm_spe_get_desc); + +static inline int arm_spe_per_buffer_alloc(int cpu) +{ + struct arm_spe_buf *spe_buf = &per_cpu(per_cpu_spe_buf, cpu); + void *alloc_base; + + if (spe_buf->base && spe_buf->record_base) + return 0; + + /* alloc spe raw data buffer */ + alloc_base = kzalloc_node(SPE_BUFFER_MAX_SIZE, GFP_KERNEL, cpu_to_node(cpu)); + if (unlikely(!alloc_base)) { + pr_err("alloc spe raw data buffer failed.\n"); + return -ENOMEM; + } + + spe_buf->base = alloc_base; + spe_buf->size = SPE_BUFFER_SIZE; + spe_buf->cur = alloc_base + SPE_BUFFER_MAX_SIZE - SPE_BUFFER_SIZE; + spe_buf->period = SPE_SAMPLE_PERIOD; + + /* alloc record buffer */ + spe_buf->record_size = SPE_RECORD_ENTRY_SIZE * SPE_RECORD_BUFFER_MAX_RECORDS; + spe_buf->record_base = kzalloc_node(spe_buf->record_size, GFP_KERNEL, cpu_to_node(cpu)); + if (unlikely(!spe_buf->record_base)) { + pr_err("alloc spe record buffer failed.\n"); + return -ENOMEM; + } + + return 0; +} + +static int arm_spe_buffer_alloc(void) +{ + int cpu, ret = 0; + cpumask_t *mask = &spe->supported_cpus; + + for_each_possible_cpu(cpu) { + if (!cpumask_test_cpu(cpu, mask)) + continue; + ret = arm_spe_per_buffer_alloc(cpu); + if (ret) + return ret; + } + return ret; +} + +static inline void arm_spe_per_buffer_free(int cpu) +{ + struct arm_spe_buf *spe_buf = &per_cpu(per_cpu_spe_buf, cpu); + + if (!spe_buf->base) + return; + + kfree(spe_buf->base); + spe_buf->cur = NULL; + spe_buf->base = NULL; + spe_buf->size = 0; + + kfree(spe_buf->record_base); + spe_buf->record_base = NULL; + spe_buf->record_size = 0; +} + +static inline void arm_spe_buffer_free(void) +{ + cpumask_t *mask = &spe->supported_cpus; + int cpu; + + for_each_possible_cpu(cpu) { + if (!cpumask_test_cpu(cpu, mask)) + continue; + arm_spe_per_buffer_free(cpu); + } +} + +static void arm_spe_buffer_init(void) +{ + u64 base, limit; + struct arm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + if (!spe_buf || !spe_buf->cur || !spe_buf->size) { + /* + * We still need to clear the limit pointer, since the + * profiler might only be disabled by virtue of a fault. + */ + limit = 0; + goto out_write_limit; + } + + base = (u64)spe_buf->cur; + limit = ((u64)spe_buf->cur + spe_buf->size) | PMBLIMITR_EL1_E; + write_sysreg_s(base, SYS_PMBPTR_EL1); + +out_write_limit: + write_sysreg_s(limit, SYS_PMBLIMITR_EL1); + +} + +static void arm_spe_disable_and_drain_local(void) +{ + /* Disable profiling at EL0 and EL1 */ + write_sysreg_s(0, SYS_PMSCR_EL1); + isb(); + + /* Drain any buffered data */ + psb_csync(); + dsb(nsh); + + /* Disable the profiling buffer */ + write_sysreg_s(0, SYS_PMBLIMITR_EL1); + isb(); +} + +/* IRQ handling */ +static enum arm_spe_buf_fault_action arm_spe_buf_get_fault_act(void) +{ + const char *err_str; + u64 pmbsr; + enum arm_spe_buf_fault_action ret; + + /* + * Ensure new profiling data is visible to the CPU and any external + * aborts have been resolved. + */ + psb_csync(); + dsb(nsh); + + /* Ensure hardware updates to PMBPTR_EL1 are visible */ + isb(); + + /* Service required? 
*/ + pmbsr = read_sysreg_s(SYS_PMBSR_EL1); + if (!FIELD_GET(PMBSR_EL1_S, pmbsr)) + return SPE_PMU_BUF_FAULT_ACT_SPURIOUS; + + /* We only expect buffer management events */ + switch (FIELD_GET(PMBSR_EL1_EC, pmbsr)) { + case PMBSR_EL1_EC_BUF: + /* Handled below */ + break; + case PMBSR_EL1_EC_FAULT_S1: + case PMBSR_EL1_EC_FAULT_S2: + err_str = "Unexpected buffer fault"; + goto out_err; + default: + err_str = "Unknown error code"; + goto out_err; + } + + /* Buffer management event */ + switch (FIELD_GET(PMBSR_EL1_BUF_BSC_MASK, pmbsr)) { + case PMBSR_EL1_BUF_BSC_FULL: + ret = SPE_PMU_BUF_FAULT_ACT_OK; + goto out_stop; + default: + err_str = "Unknown buffer status code"; + } + +out_err: + pr_err_ratelimited( + "%s on CPU %d [PMBSR=0x%016llx, PMBPTR=0x%016llx, PMBLIMITR=0x%016llx]\n", + err_str, smp_processor_id(), pmbsr, + read_sysreg_s(SYS_PMBPTR_EL1), + read_sysreg_s(SYS_PMBLIMITR_EL1)); + ret = SPE_PMU_BUF_FAULT_ACT_FATAL; + +out_stop: + return ret; +} + +void arm_spe_stop(void) +{ + arm_spe_disable_and_drain_local(); +} + +static u64 arm_spe_to_pmsfcr(void) +{ + u64 reg = 0; + + if (spe->load_filter) + reg |= PMSFCR_EL1_LD; + + if (spe->store_filter) + reg |= PMSFCR_EL1_ST; + + if (spe->branch_filter) + reg |= PMSFCR_EL1_B; + + if (reg) + reg |= PMSFCR_EL1_FT; + + if (spe->event_filter) + reg |= PMSFCR_EL1_FE; + + if (spe->inv_event_filter) + reg |= PMSFCR_EL1_FnE; + + if (spe->min_latency) + reg |= PMSFCR_EL1_FL; + + return reg; +} + +static u64 arm_spe_to_pmsevfr(void) +{ + return spe->event_filter; +} + +static u64 arm_spe_to_pmsnevfr(void) +{ + return spe->inv_event_filter; +} + +static u64 arm_spe_to_pmslatfr(void) +{ + return spe->min_latency; +} + +static void arm_spe_sanitise_period(struct arm_spe_buf *spe_buf) +{ + u64 period = spe_buf->period; + u64 max_period = PMSIRR_EL1_INTERVAL_MASK; + + if (period < spe->min_period) + period = spe->min_period; + else if (period > max_period) + period = max_period; + else + period &= max_period; + + spe_buf->period = period; +} + +static u64 arm_spe_to_pmsirr(void) +{ + u64 reg = 0; + struct arm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + arm_spe_sanitise_period(spe_buf); + + if (spe->jitter) + reg |= 0x1; + + reg |= spe_buf->period << 8; + + return reg; +} + +static u64 arm_spe_to_pmscr(void) +{ + u64 reg = 0; + + if (spe->ts_enable) + reg |= PMSCR_EL1_TS; + + if (spe->pa_enable) + reg |= PMSCR_EL1_PA; + + if (spe->pct_enable < 0x4) + reg |= spe->pct_enable << 6; + + if (spe->exclude_user) + reg |= PMSCR_EL1_E0SPE; + + if (spe->exclude_kernel) + reg |= PMSCR_EL1_E1SPE; + + if (IS_ENABLED(CONFIG_PID_IN_CONTEXTIDR)) + reg |= PMSCR_EL1_CX; + + return reg; +} + +int arm_spe_start(void) +{ + u64 reg; + int cpu = smp_processor_id(); + + if (!cpumask_test_cpu(cpu, &spe->supported_cpus)) + return -ENOENT; + + arm_spe_buffer_init(); + + reg = arm_spe_to_pmsfcr(); + write_sysreg_s(reg, SYS_PMSFCR_EL1); + + reg = arm_spe_to_pmsevfr(); + write_sysreg_s(reg, SYS_PMSEVFR_EL1); + + if (spe->features & SPE_PMU_FEAT_INV_FILT_EVT) { + reg = arm_spe_to_pmsnevfr(); + write_sysreg_s(reg, SYS_PMSNEVFR_EL1); + } + + reg = arm_spe_to_pmslatfr(); + + write_sysreg_s(reg, SYS_PMSLATFR_EL1); + + reg = arm_spe_to_pmsirr(); + write_sysreg_s(reg, SYS_PMSIRR_EL1); + isb(); + + reg = arm_spe_to_pmscr(); + isb(); + write_sysreg_s(reg, SYS_PMSCR_EL1); + return 0; +} + +void arm_spe_continue(void) +{ + int reg; + + arm_spe_buffer_init(); + reg = arm_spe_to_pmscr(); + + isb(); + write_sysreg_s(reg, SYS_PMSCR_EL1); +} + +int arm_spe_enabled(void) +{ + return 
spe_probe_status == SPE_INIT_SUCC; +} + +static irqreturn_t arm_spe_irq_handler(int irq, void *dev) +{ + enum arm_spe_buf_fault_action act; + struct arm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + + act = arm_spe_buf_get_fault_act(); + + switch (act) { + case SPE_PMU_BUF_FAULT_ACT_FATAL: + if (unlikely(arm_spe_user == ARM_SPE_USER_PERF)) { + if (arm_spe_sampling_perf_cb) + arm_spe_sampling_perf_cb(act); + } + /* + * If a fatal exception occurred then leaving the profiling + * buffer enabled is a recipe waiting to happen. Since + * fatal faults don't always imply truncation, make sure + * that the profiling buffer is disabled explicitly before + * clearing the syndrome register. + */ + arm_spe_disable_and_drain_local(); + break; + case SPE_PMU_BUF_FAULT_ACT_OK: + /* + * Callback function processing record data. + * ARM_SPE_USER_MEM_SAMPLING: arm_spe_record_captured_cb - mem_sampling layer. + * ARM_SPE_USER_PERF: arm_spe_sampling_perf_cb - perf. + * TODO: 1) use per CPU workqueue to process data and reduce + * interrupt processing time. 2) The "register" function can be + * registered in a callback structure. + */ + if (likely(arm_spe_user == ARM_SPE_USER_MEM_SAMPLING)) { + spe_buf->nr_records = 0; + arm_spe_decode_buf(spe_buf->cur, spe_buf->size); + + if (arm_spe_sampling_cb) + arm_spe_sampling_cb( + (struct mem_sampling_record *)spe_buf->record_base, + spe_buf->nr_records); + } else { + if (arm_spe_sampling_perf_cb) + arm_spe_sampling_perf_cb(act); + } + + break; + + case SPE_PMU_BUF_FAULT_ACT_SPURIOUS: + /* We've seen you before, but GCC has the memory of a sieve. */ + arm_spe_stop(); + break; + } + + /* The buffer pointers are now sane, so resume profiling. */ + write_sysreg_s(0, SYS_PMBSR_EL1); + return IRQ_HANDLED; +} + + +static void __arm_spe_dev_probe(void *data) +{ + int fld; + u64 reg; + + fld = cpuid_feature_extract_unsigned_field( + read_cpuid(ID_AA64DFR0_EL1), ID_AA64DFR0_EL1_PMSVer_SHIFT); + if (!fld) { + pr_err("unsupported ID_AA64DFR0_EL1.PMSVer [%d] on CPU %d\n", + fld, smp_processor_id()); + return; + } + spe->pmsver = (u16)fld; + + /* Read PMBIDR first to determine whether or not we have access */ + reg = read_sysreg_s(SYS_PMBIDR_EL1); + if (FIELD_GET(PMBIDR_EL1_P, reg)) { + pr_err("profiling buffer owned by higher exception level\n"); + return; + } + + /* Minimum alignment. 
If it's out-of-range, then fail the probe */ + fld = FIELD_GET(PMBIDR_EL1_ALIGN, reg); + spe->align = 1 << fld; + if (spe->align > SZ_2K) { + pr_err("unsupported PMBIDR.Align [%d] on CPU %d\n", fld, + smp_processor_id()); + return; + } + + /* It's now safe to read PMSIDR and figure out what we've got */ + reg = read_sysreg_s(SYS_PMSIDR_EL1); + if (FIELD_GET(PMSIDR_EL1_FE, reg)) + spe->features |= SPE_PMU_FEAT_FILT_EVT; + + if (FIELD_GET(PMSIDR_EL1_FnE, reg)) + spe->features |= SPE_PMU_FEAT_INV_FILT_EVT; + + if (FIELD_GET(PMSIDR_EL1_FT, reg)) + spe->features |= SPE_PMU_FEAT_FILT_TYP; + + if (FIELD_GET(PMSIDR_EL1_FL, reg)) + spe->features |= SPE_PMU_FEAT_FILT_LAT; + + if (FIELD_GET(PMSIDR_EL1_ARCHINST, reg)) + spe->features |= SPE_PMU_FEAT_ARCH_INST; + + if (FIELD_GET(PMSIDR_EL1_LDS, reg)) + spe->features |= SPE_PMU_FEAT_LDS; + + if (FIELD_GET(PMSIDR_EL1_ERND, reg)) + spe->features |= SPE_PMU_FEAT_ERND; + + /* This field has a spaced out encoding, so just use a look-up */ + fld = FIELD_GET(PMSIDR_EL1_INTERVAL, reg); + switch (fld) { + case PMSIDR_EL1_INTERVAL_256: + spe->min_period = 256; + break; + case PMSIDR_EL1_INTERVAL_512: + spe->min_period = 512; + break; + case PMSIDR_EL1_INTERVAL_768: + spe->min_period = 768; + break; + case PMSIDR_EL1_INTERVAL_1024: + spe->min_period = 1024; + break; + case PMSIDR_EL1_INTERVAL_1536: + spe->min_period = 1536; + break; + case PMSIDR_EL1_INTERVAL_2048: + spe->min_period = 2048; + break; + case PMSIDR_EL1_INTERVAL_3072: + spe->min_period = 3072; + break; + case PMSIDR_EL1_INTERVAL_4096: + spe->min_period = 4096; + break; + default: + pr_warn("unknown PMSIDR_EL1.Interval [%d]; assuming 8\n", fld); + /* Assume encoding 8, i.e. a minimum interval of 4096 */ + spe->min_period = 4096; + break; + } + + /* Maximum record size. If it's out-of-range, then fail the probe */ + fld = FIELD_GET(PMSIDR_EL1_MAXSIZE, reg); + spe->max_record_sz = 1 << fld; + if (spe->max_record_sz > SZ_2K || spe->max_record_sz < 16) { + pr_err("unsupported PMSIDR_EL1.MaxSize [%d] on CPU %d\n", fld, + smp_processor_id()); + return; + } + + fld = FIELD_GET(PMSIDR_EL1_COUNTSIZE, reg); + switch (fld) { + case PMSIDR_EL1_COUNTSIZE_12_BIT_SAT: + spe->counter_sz = 12; + break; + case PMSIDR_EL1_COUNTSIZE_16_BIT_SAT: + spe->counter_sz = 16; + break; + default: + pr_warn("unknown PMSIDR_EL1.CountSize [%d]; assuming 2\n", fld); + /* Assume encoding 2, i.e. 12-bit saturating counters */ + spe->counter_sz = 12; + break; + } + + pr_info("probed SPEv1.%d for CPUs %*pbl [max_record_sz %u, min_period %u, align %u, features 0x%llx]\n", + spe->pmsver - 1, cpumask_pr_args(&spe->supported_cpus), + spe->max_record_sz, spe->min_period, spe->align, spe->features); + + spe->features |= SPE_PMU_FEAT_DEV_PROBED; +} + +static void __arm_spe_reset_local(void) +{ + /* + * This is probably overkill, as we have no idea where we're + * draining any buffered data to...
+ */ + arm_spe_disable_and_drain_local(); + + /* Reset the buffer base pointer */ + write_sysreg_s(0, SYS_PMBPTR_EL1); + isb(); + + /* Clear any pending management interrupts */ + write_sysreg_s(0, SYS_PMBSR_EL1); + isb(); +} + +static void __arm_spe_setup_one(void) +{ + __arm_spe_reset_local(); + enable_percpu_irq(spe->irq, IRQ_TYPE_NONE); +} + +static void __arm_spe_stop_one(void) +{ + disable_percpu_irq(spe->irq); + __arm_spe_reset_local(); +} + +void arm_spe_set_user(enum arm_spe_user_e user) +{ + if (user == ARM_SPE_USER_PERF) + arm_spe_user_switch_cb(USER_SWITCH_AWAY_FROM_MEM_SAMPLING); + else + arm_spe_user_switch_cb(USER_SWITCH_BACK_TO_MEM_SAMPLING); + + __arm_spe_reset_local(); + + arm_spe_user = user; +} +EXPORT_SYMBOL_GPL(arm_spe_set_user); + +static int arm_spe_cpu_startup(unsigned int cpu, struct hlist_node *node) +{ + struct arm_spe *spe; + + spe = hlist_entry_safe(node, struct arm_spe, hotplug_node); + if (!cpumask_test_cpu(cpu, &spe->supported_cpus)) + return 0; + + /* Alloc per cpu spe buffer */ + arm_spe_per_buffer_alloc(cpu); + + /* Reset pmu and enable irq */ + __arm_spe_setup_one(); + + return 0; +} + +static int arm_spe_cpu_teardown(unsigned int cpu, struct hlist_node *node) +{ + struct arm_spe *spe; + + spe = hlist_entry_safe(node, struct arm_spe, hotplug_node); + if (!cpumask_test_cpu(cpu, &spe->supported_cpus)) + return 0; + + /* Disable irq and reset pmu */ + __arm_spe_stop_one(); + + /* Release per cpu spe buffer */ + arm_spe_per_buffer_free(cpu); + + return 0; +} + +static int arm_spe_dev_init(void) +{ + int ret; + cpumask_t *mask = &spe->supported_cpus; + + + /* Make sure we probe the hardware on a relevant CPU */ + ret = smp_call_function_any(mask, __arm_spe_dev_probe, NULL, 1); + if (ret || !(spe->features & SPE_PMU_FEAT_DEV_PROBED)) + return -ENXIO; + + /* Request our PPIs (note that the IRQ is still disabled) */ + ret = request_percpu_irq(spe->irq, arm_spe_irq_handler, DRVNAME, + &irq_dev_id); + if (ret) + return ret; + + /* + * Register our hotplug notifier now so we don't miss any events. + * This will enable the IRQ for any supported CPUs that are already + * up. 
+ */ + ret = cpuhp_state_add_instance(arm_spe_online, + &spe->hotplug_node); + if (ret) + free_percpu_irq(spe->irq, &irq_dev_id); + + return ret; +} + +static void arm_spe_dev_teardown(void) +{ + arm_spe_buffer_free(); + cpuhp_state_remove_instance(arm_spe_online, &spe->hotplug_node); + free_percpu_irq(spe->irq, &irq_dev_id); +} + +static const struct of_device_id arm_spe_of_match[] = { + { .compatible = "arm,statistical-profiling-extension-v1", + .data = (void *)1 }, + { /* Sentinel */ }, +}; +MODULE_DEVICE_TABLE(of, arm_spe_of_match); + +static const struct platform_device_id arm_spe_match[] = { + { ARMV8_SPE_PDEV_NAME, 0 }, + {} +}; +MODULE_DEVICE_TABLE(platform, arm_spe_match); + +/* Driver and device probing */ +static int arm_spe_irq_probe(void) +{ + struct platform_device *pdev = spe->pdev; + int irq = platform_get_irq(pdev, 0); + + if (irq < 0) + return -ENXIO; + + if (!irq_is_percpu(irq)) { + dev_err(&pdev->dev, "expected PPI but got SPI (%d)\n", irq); + return -EINVAL; + } + + if (irq_get_percpu_devid_partition(irq, &spe->supported_cpus)) { + dev_err(&pdev->dev, "failed to get PPI partition (%d)\n", irq); + return -EINVAL; + } + + spe->irq = irq; + return 0; +} + +static void arm_spe_sample_para_init(void) +{ + spe->sample_period = SPE_SAMPLE_PERIOD; + spe->jitter = 1; + spe->load_filter = 1; + spe->store_filter = 1; + spe->branch_filter = 0; + spe->inv_event_filter = 0; + spe->event_filter = 0x2; + + spe->ts_enable = 1; + spe->pa_enable = 1; + spe->pct_enable = 0; + + spe->exclude_user = 1; + spe->exclude_kernel = 0; + + spe->min_latency = 120; +} + +void arm_spe_record_enqueue(struct arm_spe_record *record) +{ + struct arm_spe_buf *spe_buf = this_cpu_ptr(&per_cpu_spe_buf); + struct mem_sampling_record *record_tail; + + if (spe_buf->nr_records >= SPE_RECORD_BUFFER_MAX_RECORDS) { + pr_err("nr_records exceeded!\n"); + return; + } + + trace_spe_record((struct mem_sampling_record *)record, smp_processor_id()); + record_tail = spe_buf->record_base + + spe_buf->nr_records * SPE_RECORD_ENTRY_SIZE; + *record_tail = *(struct mem_sampling_record *)record; + spe_buf->nr_records++; + +} + +static int arm_spe_device_probe(struct platform_device *pdev) +{ + int ret; + struct device *dev; + + if (!pdev) { + pr_err("pdev is NULL!\n"); + return -ENODEV; + } + + dev = &pdev->dev; + + /* + * If kernelspace is unmapped when running at EL0, then the SPE + * buffer will fault and prematurely terminate the AUX session. + */ + if (arm64_kernel_unmapped_at_el0()) { + dev_warn_once(dev, "buffer inaccessible. Try passing \"kpti=off\" on the kernel command line\n"); + return -EPERM; + } + + spe = devm_kzalloc(dev, sizeof(*spe), GFP_KERNEL); + if (!spe) + return -ENOMEM; + + spe->pdev = pdev; + platform_set_drvdata(pdev, spe); + + ret = arm_spe_irq_probe(); + if (ret) + goto out_free; + + ret = arm_spe_dev_init(); + if (ret) + goto out_free; + + /* + * Ensure that every SPE-capable CPU can allocate its buffer area + * (4K * 2 per CPU by default); otherwise SPE data cannot be + * collected in kernel mode.
+ */ + ret = arm_spe_buffer_alloc(); + if (ret) + goto out_teardown; + + arm_spe_sample_para_init(); + + spe_probe_status = SPE_INIT_SUCC; + + return 0; + +out_teardown: + arm_spe_dev_teardown(); +out_free: + kfree(spe); + return ret; +} + +static int arm_spe_device_remove(struct platform_device *pdev) +{ + arm_spe_dev_teardown(); + return 0; +} + +static struct platform_driver arm_spe_driver = { + .id_table = arm_spe_match, + .driver = { + .name = DRVNAME, + .of_match_table = of_match_ptr(arm_spe_of_match), + .suppress_bind_attrs = true, + }, + .probe = arm_spe_device_probe, + .remove = arm_spe_device_remove, +}; + +static int __init arm_spe_init(void) +{ + int ret; + + ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, DRVNAME, + arm_spe_cpu_startup, + arm_spe_cpu_teardown); + if (ret < 0) + return ret; + arm_spe_online = ret; + + ret = platform_driver_register(&arm_spe_driver); + + if (ret) + cpuhp_remove_multi_state(arm_spe_online); + + return ret; +} + +static void __exit arm_spe_exit(void) +{ + /* + * TODO: Find a clean way to disable SPE so that SPE + * can be used for perf. + */ + platform_driver_unregister(&arm_spe_driver); + cpuhp_remove_multi_state(arm_spe_online); + arm_spe_buffer_free(); +} + +module_init(arm_spe_init); +module_exit(arm_spe_exit); diff --git a/drivers/arm/spe/spe.h b/drivers/arm/spe/spe.h new file mode 100644 index 0000000000000000000000000000000000000000..6a51cdb401301bc22f44ac354494f365c02691fc --- /dev/null +++ b/drivers/arm/spe/spe.h @@ -0,0 +1,125 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __SPE_H +#define __SPE_H + +#define SPE_BUFFER_MAX_SIZE (PAGE_SIZE) +#define SPE_BUFFER_SIZE (PAGE_SIZE / 32) + +#define SPE_SAMPLE_PERIOD 1024 + +#define SPE_RECORD_BUFFER_MAX_RECORDS (100) +#define SPE_RECORD_ENTRY_SIZE sizeof(struct mem_sampling_record) + +#define SPE_PMU_FEAT_FILT_EVT (1UL << 0) +#define SPE_PMU_FEAT_FILT_TYP (1UL << 1) +#define SPE_PMU_FEAT_FILT_LAT (1UL << 2) +#define SPE_PMU_FEAT_ARCH_INST (1UL << 3) +#define SPE_PMU_FEAT_LDS (1UL << 4) +#define SPE_PMU_FEAT_ERND (1UL << 5) +#define SPE_PMU_FEAT_INV_FILT_EVT (1UL << 6) +#define SPE_PMU_FEAT_DEV_PROBED (1UL << 63) +#define ARM_SPE_BUF_PAD_BYTE (0) +#define PMBLIMITR_EL1_E GENMASK(0, 0) +#define PMBSR_EL1_S GENMASK(17, 17) +#define PMBSR_EL1_EC GENMASK(31, 26) +#define PMBSR_EL1_EC_BUF UL(0b000000) +#define PMBSR_EL1_EC_FAULT_S1 UL(0b100100) +#define PMBSR_EL1_EC_FAULT_S2 UL(0b100101) +#define PMBSR_EL1_MSS_MASK GENMASK(15, 0) +#define PMBSR_EL1_BUF_BSC_MASK PMBSR_EL1_MSS_MASK +#define PMBSR_EL1_BUF_BSC_FULL 0x1UL +#define PMSFCR_EL1_LD GENMASK(17, 17) +#define PMSFCR_EL1_ST GENMASK(18, 18) +#define PMSFCR_EL1_B GENMASK(16, 16) +#define PMSFCR_EL1_FnE GENMASK(3, 3) +#define PMSFCR_EL1_FT GENMASK(1, 1) +#define PMSFCR_EL1_FE GENMASK(0, 0) +#define PMSFCR_EL1_FL GENMASK(2, 2) +#define PMSIRR_EL1_INTERVAL_MASK GENMASK(31, 8) +#define PMSCR_EL1_TS GENMASK(5, 5) +#define PMSCR_EL1_PA GENMASK(4, 4) +#define PMSCR_EL1_CX GENMASK(3, 3) +#define PMSCR_EL1_E1SPE GENMASK(1, 1) +#define PMSCR_EL1_E0SPE GENMASK(0, 0) +#define ID_AA64DFR0_EL1_PMSVer_SHIFT 32 +#define PMBIDR_EL1_P GENMASK(4, 4) +#define PMBIDR_EL1_ALIGN GENMASK(3, 0) +#define PMSIDR_EL1_FE GENMASK(0, 0) +#define PMSIDR_EL1_FnE GENMASK(6, 6) +#define PMSIDR_EL1_FT GENMASK(1, 1) +#define PMSIDR_EL1_ARCHINST GENMASK(3, 3) +#define PMSIDR_EL1_LDS GENMASK(4, 4) +#define PMSIDR_EL1_ERND GENMASK(5, 5) +#define PMSIDR_EL1_INTERVAL GENMASK(11, 8) +#define PMSIDR_EL1_INTERVAL_256 UL(0b0000) +#define PMSIDR_EL1_INTERVAL_512 UL(0b0010) +#define 
PMSIDR_EL1_INTERVAL_768 UL(0b0011) +#define PMSIDR_EL1_INTERVAL_1024 UL(0b0100) +#define PMSIDR_EL1_INTERVAL_1536 UL(0b0101) +#define PMSIDR_EL1_INTERVAL_2048 UL(0b0110) +#define PMSIDR_EL1_INTERVAL_3072 UL(0b0111) +#define PMSIDR_EL1_INTERVAL_4096 UL(0b1000) +#define PMSIDR_EL1_MAXSIZE GENMASK(15, 12) +#define PMSIDR_EL1_COUNTSIZE GENMASK(19, 16) +#define PMSIDR_EL1_COUNTSIZE_12_BIT_SAT UL(0b0010) +#define PMSIDR_EL1_COUNTSIZE_16_BIT_SAT UL(0b0011) +#define PMSIDR_EL1_FL GENMASK(2, 2) +#define SYS_PMSNEVFR_EL1 sys_reg(3, 0, 9, 9, 1) +#define SPE_PMU_FEAT_INV_FILT_EVT (1UL << 6) +#define PMBSR_EL1_COLL_MASK GENMASK(16, 16) +#define PMBSR_EL1_COLL PMBSR_EL1_COLL_MASK +#define PMBSR_EL1_DL_MASK GENMASK(19, 19) +#define PMBSR_EL1_DL PMBSR_EL1_DL_MASK + +enum arm_spe_buf_fault_action { + SPE_PMU_BUF_FAULT_ACT_SPURIOUS, + SPE_PMU_BUF_FAULT_ACT_FATAL, + SPE_PMU_BUF_FAULT_ACT_OK, +}; + +enum arm_spe_user_e { + ARM_SPE_USER_PERF, + ARM_SPE_USER_MEM_SAMPLING, +}; + +struct arm_spe { + struct pmu pmu; + struct platform_device *pdev; + cpumask_t supported_cpus; + struct hlist_node hotplug_node; + int irq; /* PPI */ + u16 pmsver; + u16 min_period; + u16 counter_sz; + u64 features; + u16 max_record_sz; + u16 align; + u64 sample_period; + local64_t period_left; + bool jitter; + bool load_filter; + bool store_filter; + bool branch_filter; + u64 inv_event_filter; + u16 min_latency; + u64 event_filter; + bool ts_enable; + bool pa_enable; + u8 pct_enable; + bool exclude_user; + bool exclude_kernel; +}; + +struct arm_spe_buf { + void *cur; /* for spe raw data buffer */ + int size; + int period; + void *base; + + void *record_base; /* for spe record buffer */ + int record_size; + int nr_records; +}; + +#endif /* __SPE_H */ diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 67ad53cde11fc23661070292517c5bf211a5b9f8..e6eee6f3d33c2a9d90bd8c7e851df90e12336569 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -124,7 +124,7 @@ config XGENE_PMU config ARM_SPE_PMU tristate "Enable support for the ARMv8.2 Statistical Profiling Extension" - depends on ARM64 + depends on ARM_SPE && MEM_SAMPLING help Enable perf support for the ARMv8.2 Statistical Profiling Extension, which provides periodic sampling of operations in diff --git a/drivers/perf/arm_pmu_acpi.c b/drivers/perf/arm_pmu_acpi.c index 0d284fda87aca86263ee382ae903547b4320e837..4e716b700c0f2c449643b2f55987ae34ec07d1f2 100644 --- a/drivers/perf/arm_pmu_acpi.c +++ b/drivers/perf/arm_pmu_acpi.c @@ -125,6 +125,33 @@ arm_acpi_register_pmu_device(struct platform_device *pdev, u8 len, } #if IS_ENABLED(CONFIG_ARM_SPE_PMU) +static struct resource spe_pmu_resources[] = { + { + } +}; + +static struct platform_device spe_pmu_dev = { + .name = ARMV8_SPE_PMU_PDEV_NAME, + .id = -1, + .resource = spe_pmu_resources, + .num_resources = ARRAY_SIZE(spe_pmu_resources) +}; + +static void arm_spe_pmu_acpi_register_device(void) +{ + int ret; + + ret = platform_device_register(&spe_pmu_dev); + if (ret < 0) + pr_warn("ACPI: SPE_PMU: Unable to register device\n"); +} +#else +static inline void arm_spe_pmu_acpi_register_device(void) +{ +} +#endif + +#if IS_ENABLED(CONFIG_ARM_SPE) static struct resource spe_resources[] = { { /* irq */ @@ -160,7 +187,7 @@ static void arm_spe_acpi_register_device(void) static inline void arm_spe_acpi_register_device(void) { } -#endif /* CONFIG_ARM_SPE_PMU */ +#endif /* CONFIG_ARM_SPE */ #if IS_ENABLED(CONFIG_CORESIGHT_TRBE) static struct resource trbe_resources[] = { @@ -402,6 +429,7 @@ static int arm_pmu_acpi_init(void) return 0; 
arm_spe_acpi_register_device(); + arm_spe_pmu_acpi_register_device(); arm_trbe_acpi_register_device(); ret = arm_pmu_acpi_parse_irqs(); diff --git a/drivers/perf/arm_spe_pmu.c b/drivers/perf/arm_spe_pmu.c index 2a4ebdd1ee78da280e416f9b775636061846376e..970bc2f3c4bf474ce3e5c75e71c93861f493bf34 100644 --- a/drivers/perf/arm_spe_pmu.c +++ b/drivers/perf/arm_spe_pmu.c @@ -39,6 +39,8 @@ #include #include +#include "../arm/spe/spe.h" + /* * Cache if the event is allowed to trace Context information. * This allows us to perform the check, i.e, perfmon_capable(), @@ -57,8 +59,6 @@ static bool get_spe_event_has_cx(struct perf_event *event) return !!(event->hw.flags & SPE_PMU_HW_FLAGS_CX); } -#define ARM_SPE_BUF_PAD_BYTE 0 - struct arm_spe_pmu_buf { int nr_pages; bool snapshot; @@ -76,13 +76,6 @@ struct arm_spe_pmu { u16 min_period; u16 counter_sz; -#define SPE_PMU_FEAT_FILT_EVT (1UL << 0) -#define SPE_PMU_FEAT_FILT_TYP (1UL << 1) -#define SPE_PMU_FEAT_FILT_LAT (1UL << 2) -#define SPE_PMU_FEAT_ARCH_INST (1UL << 3) -#define SPE_PMU_FEAT_LDS (1UL << 4) -#define SPE_PMU_FEAT_ERND (1UL << 5) -#define SPE_PMU_FEAT_DEV_PROBED (1UL << 63) u64 features; u16 max_record_sz; @@ -95,15 +88,6 @@ struct arm_spe_pmu { /* Convert a free-running index from perf into an SPE buffer offset */ #define PERF_IDX2OFF(idx, buf) ((idx) % ((buf)->nr_pages << PAGE_SHIFT)) -/* Keep track of our dynamic hotplug state */ -static enum cpuhp_state arm_spe_pmu_online; - -enum arm_spe_pmu_buf_fault_action { - SPE_PMU_BUF_FAULT_ACT_SPURIOUS, - SPE_PMU_BUF_FAULT_ACT_FATAL, - SPE_PMU_BUF_FAULT_ACT_OK, -}; - /* This sysfs gunk was really good fun to write. */ enum arm_spe_pmu_capabilities { SPE_PMU_CAP_ARCH_INST = 0, @@ -276,6 +260,8 @@ static const struct attribute_group *arm_spe_pmu_attr_groups[] = { NULL, }; +struct arm_spe_pmu *spe_pmu_local; + /* Convert between user ABI and register values */ static u64 arm_spe_event_to_pmscr(struct perf_event *event) { @@ -551,12 +537,12 @@ static void arm_spe_pmu_disable_and_drain_local(void) } /* IRQ handling */ -static enum arm_spe_pmu_buf_fault_action +static enum arm_spe_buf_fault_action arm_spe_pmu_buf_get_fault_act(struct perf_output_handle *handle) { const char *err_str; u64 pmbsr; - enum arm_spe_pmu_buf_fault_action ret; + enum arm_spe_buf_fault_action ret; /* * Ensure new profiling data is visible to the CPU and any external @@ -621,57 +607,6 @@ arm_spe_pmu_buf_get_fault_act(struct perf_output_handle *handle) return ret; } -static irqreturn_t arm_spe_pmu_irq_handler(int irq, void *dev) -{ - struct perf_output_handle *handle = dev; - struct perf_event *event = handle->event; - enum arm_spe_pmu_buf_fault_action act; - - if (!perf_get_aux(handle)) - return IRQ_NONE; - - act = arm_spe_pmu_buf_get_fault_act(handle); - if (act == SPE_PMU_BUF_FAULT_ACT_SPURIOUS) - return IRQ_NONE; - - /* - * Ensure perf callbacks have completed, which may disable the - * profiling buffer in response to a TRUNCATION flag. - */ - irq_work_run(); - - switch (act) { - case SPE_PMU_BUF_FAULT_ACT_FATAL: - /* - * If a fatal exception occurred then leaving the profiling - * buffer enabled is a recipe waiting to happen. Since - * fatal faults don't always imply truncation, make sure - * that the profiling buffer is disabled explicitly before - * clearing the syndrome register. - */ - arm_spe_pmu_disable_and_drain_local(); - break; - case SPE_PMU_BUF_FAULT_ACT_OK: - /* - * We handled the fault (the buffer was full), so resume - * profiling as long as we didn't detect truncation. 
- * PMBPTR might be misaligned, but we'll burn that bridge - * when we get to it. - */ - if (!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)) { - arm_spe_perf_aux_output_begin(handle, event); - isb(); - } - break; - case SPE_PMU_BUF_FAULT_ACT_SPURIOUS: - /* We've seen you before, but GCC has the memory of a sieve. */ - break; - } - - /* The buffer pointers are now sane, so resume profiling. */ - write_sysreg_s(0, SYS_PMBSR_EL1); - return IRQ_HANDLED; -} static u64 arm_spe_pmsevfr_res0(u16 pmsver) { @@ -746,6 +681,8 @@ static void arm_spe_pmu_start(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle); + arm_spe_set_user(ARM_SPE_USER_PERF); + hwc->state = 0; arm_spe_perf_aux_output_begin(handle, event); if (hwc->state) @@ -780,8 +717,14 @@ static void arm_spe_pmu_stop(struct perf_event *event, int flags) struct perf_output_handle *handle = this_cpu_ptr(spe_pmu->handle); /* If we're already stopped, then nothing to do */ - if (hwc->state & PERF_HES_STOPPED) + if (hwc->state & PERF_HES_STOPPED) { + /* + * PERF_HES_STOPPED maybe set in arm_spe_perf_aux_output_begin, + * we switch user here. + */ + arm_spe_set_user(ARM_SPE_USER_MEM_SAMPLING); return; + } /* Stop all trace generation */ arm_spe_pmu_disable_and_drain_local(); @@ -793,7 +736,7 @@ static void arm_spe_pmu_stop(struct perf_event *event, int flags) * path. */ if (perf_get_aux(handle)) { - enum arm_spe_pmu_buf_fault_action act; + enum arm_spe_buf_fault_action act; act = arm_spe_pmu_buf_get_fault_act(handle); if (act == SPE_PMU_BUF_FAULT_ACT_SPURIOUS) @@ -812,6 +755,7 @@ static void arm_spe_pmu_stop(struct perf_event *event, int flags) } hwc->state |= PERF_HES_STOPPED; + arm_spe_set_user(ARM_SPE_USER_MEM_SAMPLING); } static int arm_spe_pmu_add(struct perf_event *event, int flags) @@ -952,233 +896,58 @@ static void arm_spe_pmu_perf_destroy(struct arm_spe_pmu *spe_pmu) perf_pmu_unregister(&spe_pmu->pmu); } -static void __arm_spe_pmu_dev_probe(void *info) +void arm_spe_sampling_process(enum arm_spe_buf_fault_action act) { - int fld; - u64 reg; - struct arm_spe_pmu *spe_pmu = info; - struct device *dev = &spe_pmu->pdev->dev; - - fld = cpuid_feature_extract_unsigned_field(read_cpuid(ID_AA64DFR0_EL1), - ID_AA64DFR0_PMSVER_SHIFT); - if (!fld) { - dev_err(dev, - "unsupported ID_AA64DFR0_EL1.PMSVer [%d] on CPU %d\n", - fld, smp_processor_id()); - return; - } - spe_pmu->pmsver = (u16)fld; - - /* Read PMBIDR first to determine whether or not we have access */ - reg = read_sysreg_s(SYS_PMBIDR_EL1); - if (reg & BIT(SYS_PMBIDR_EL1_P_SHIFT)) { - dev_err(dev, - "profiling buffer owned by higher exception level\n"); - return; - } - - /* Minimum alignment. 
If it's out-of-range, then fail the probe */ - fld = reg >> SYS_PMBIDR_EL1_ALIGN_SHIFT & SYS_PMBIDR_EL1_ALIGN_MASK; - spe_pmu->align = 1 << fld; - if (spe_pmu->align > SZ_2K) { - dev_err(dev, "unsupported PMBIDR.Align [%d] on CPU %d\n", - fld, smp_processor_id()); - return; - } - - /* It's now safe to read PMSIDR and figure out what we've got */ - reg = read_sysreg_s(SYS_PMSIDR_EL1); - if (reg & BIT(SYS_PMSIDR_EL1_FE_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_FILT_EVT; - - if (reg & BIT(SYS_PMSIDR_EL1_FT_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_FILT_TYP; - - if (reg & BIT(SYS_PMSIDR_EL1_FL_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_FILT_LAT; - - if (reg & BIT(SYS_PMSIDR_EL1_ARCHINST_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_ARCH_INST; - - if (reg & BIT(SYS_PMSIDR_EL1_LDS_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_LDS; - - if (reg & BIT(SYS_PMSIDR_EL1_ERND_SHIFT)) - spe_pmu->features |= SPE_PMU_FEAT_ERND; - - /* This field has a spaced out encoding, so just use a look-up */ - fld = reg >> SYS_PMSIDR_EL1_INTERVAL_SHIFT & SYS_PMSIDR_EL1_INTERVAL_MASK; - switch (fld) { - case 0: - spe_pmu->min_period = 256; - break; - case 2: - spe_pmu->min_period = 512; - break; - case 3: - spe_pmu->min_period = 768; - break; - case 4: - spe_pmu->min_period = 1024; - break; - case 5: - spe_pmu->min_period = 1536; - break; - case 6: - spe_pmu->min_period = 2048; - break; - case 7: - spe_pmu->min_period = 3072; - break; - default: - dev_warn(dev, "unknown PMSIDR_EL1.Interval [%d]; assuming 8\n", - fld); - fallthrough; - case 8: - spe_pmu->min_period = 4096; - } + struct perf_output_handle *handle = this_cpu_ptr(spe_pmu_local->handle); + struct perf_event *event = handle->event; + u64 pmbsr; - /* Maximum record size. If it's out-of-range, then fail the probe */ - fld = reg >> SYS_PMSIDR_EL1_MAXSIZE_SHIFT & SYS_PMSIDR_EL1_MAXSIZE_MASK; - spe_pmu->max_record_sz = 1 << fld; - if (spe_pmu->max_record_sz > SZ_2K || spe_pmu->max_record_sz < 16) { - dev_err(dev, "unsupported PMSIDR_EL1.MaxSize [%d] on CPU %d\n", - fld, smp_processor_id()); + if (!perf_get_aux(handle)) return; - } - - fld = reg >> SYS_PMSIDR_EL1_COUNTSIZE_SHIFT & SYS_PMSIDR_EL1_COUNTSIZE_MASK; - switch (fld) { - default: - dev_warn(dev, "unknown PMSIDR_EL1.CountSize [%d]; assuming 2\n", - fld); - fallthrough; - case 2: - spe_pmu->counter_sz = 12; - } - - dev_info(dev, - "probed for CPUs %*pbl [max_record_sz %u, align %u, features 0x%llx]\n", - cpumask_pr_args(&spe_pmu->supported_cpus), - spe_pmu->max_record_sz, spe_pmu->align, spe_pmu->features); - - spe_pmu->features |= SPE_PMU_FEAT_DEV_PROBED; - return; -} -static void __arm_spe_pmu_reset_local(void) -{ /* - * This is probably overkill, as we have no idea where we're - * draining any buffered data to... + * If we've lost data, disable profiling and also set the PARTIAL + * flag to indicate that the last record is corrupted. 
*/ - arm_spe_pmu_disable_and_drain_local(); - - /* Reset the buffer base pointer */ - write_sysreg_s(0, SYS_PMBPTR_EL1); - isb(); - - /* Clear any pending management interrupts */ - write_sysreg_s(0, SYS_PMBSR_EL1); - isb(); -} - -static void __arm_spe_pmu_setup_one(void *info) -{ - struct arm_spe_pmu *spe_pmu = info; - - __arm_spe_pmu_reset_local(); - enable_percpu_irq(spe_pmu->irq, IRQ_TYPE_NONE); -} - -static void __arm_spe_pmu_stop_one(void *info) -{ - struct arm_spe_pmu *spe_pmu = info; - - disable_percpu_irq(spe_pmu->irq); - __arm_spe_pmu_reset_local(); -} - -static int arm_spe_pmu_cpu_startup(unsigned int cpu, struct hlist_node *node) -{ - struct arm_spe_pmu *spe_pmu; - - spe_pmu = hlist_entry_safe(node, struct arm_spe_pmu, hotplug_node); - if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus)) - return 0; - - __arm_spe_pmu_setup_one(spe_pmu); - return 0; -} - -static int arm_spe_pmu_cpu_teardown(unsigned int cpu, struct hlist_node *node) -{ - struct arm_spe_pmu *spe_pmu; - - spe_pmu = hlist_entry_safe(node, struct arm_spe_pmu, hotplug_node); - if (!cpumask_test_cpu(cpu, &spe_pmu->supported_cpus)) - return 0; - - __arm_spe_pmu_stop_one(spe_pmu); - return 0; -} - -static int arm_spe_pmu_dev_init(struct arm_spe_pmu *spe_pmu) -{ - int ret; - cpumask_t *mask = &spe_pmu->supported_cpus; - - /* Make sure we probe the hardware on a relevant CPU */ - ret = smp_call_function_any(mask, __arm_spe_pmu_dev_probe, spe_pmu, 1); - if (ret || !(spe_pmu->features & SPE_PMU_FEAT_DEV_PROBED)) - return -ENXIO; - - /* Request our PPIs (note that the IRQ is still disabled) */ - ret = request_percpu_irq(spe_pmu->irq, arm_spe_pmu_irq_handler, DRVNAME, - spe_pmu->handle); - if (ret) - return ret; + if (FIELD_GET(PMBSR_EL1_DL, pmbsr)) + perf_aux_output_flag(handle, PERF_AUX_FLAG_TRUNCATED | + PERF_AUX_FLAG_PARTIAL); - /* - * Register our hotplug notifier now so we don't miss any events. - * This will enable the IRQ for any supported CPUs that are already - * up. 
- */ - ret = cpuhp_state_add_instance(arm_spe_pmu_online, - &spe_pmu->hotplug_node); - if (ret) - free_percpu_irq(spe_pmu->irq, spe_pmu->handle); + /* Report collisions to userspace so that it can up the period */ + if (FIELD_GET(PMBSR_EL1_COLL, pmbsr)) + perf_aux_output_flag(handle, PERF_AUX_FLAG_COLLISION); - return ret; -} + arm_spe_perf_aux_output_end(handle); -static void arm_spe_pmu_dev_teardown(struct arm_spe_pmu *spe_pmu) -{ - cpuhp_state_remove_instance(arm_spe_pmu_online, &spe_pmu->hotplug_node); - free_percpu_irq(spe_pmu->irq, spe_pmu->handle); + if (act == SPE_PMU_BUF_FAULT_ACT_OK) { + if (!(handle->aux_flags & PERF_AUX_FLAG_TRUNCATED)) { + arm_spe_perf_aux_output_begin(handle, event); + isb(); + } + } } -/* Driver and device probing */ -static int arm_spe_pmu_irq_probe(struct arm_spe_pmu *spe_pmu) +static bool arm_spe_pmu_set_cap(struct arm_spe_pmu *spe_pmu) { - struct platform_device *pdev = spe_pmu->pdev; - int irq = platform_get_irq(pdev, 0); - - if (irq < 0) - return -ENXIO; + struct arm_spe *p; + struct device *dev = &spe_pmu->pdev->dev; - if (!irq_is_percpu(irq)) { - dev_err(&pdev->dev, "expected PPI but got SPI (%d)\n", irq); - return -EINVAL; + p = arm_spe_get_desc(); + if (!p) { + dev_err(dev, "get spe pmu cap from arm spe driver failed!"); + return false; } - if (irq_get_percpu_devid_partition(irq, &spe_pmu->supported_cpus)) { - dev_err(&pdev->dev, "failed to get PPI partition (%d)\n", irq); - return -EINVAL; - } + spe_pmu->supported_cpus = p->supported_cpus; + spe_pmu->irq = p->irq; + spe_pmu->pmsver = p->pmsver; + spe_pmu->align = p->align; + spe_pmu->features = p->features; + spe_pmu->min_period = p->min_period; + spe_pmu->max_record_sz = p->max_record_sz; + spe_pmu->counter_sz = p->counter_sz; - spe_pmu->irq = irq; - return 0; + return true; } static const struct of_device_id arm_spe_pmu_of_match[] = { @@ -1188,7 +957,7 @@ static const struct of_device_id arm_spe_pmu_of_match[] = { MODULE_DEVICE_TABLE(of, arm_spe_pmu_of_match); static const struct platform_device_id arm_spe_match[] = { - { ARMV8_SPE_PDEV_NAME, 0}, + { ARMV8_SPE_PMU_PDEV_NAME, 0}, { } }; MODULE_DEVICE_TABLE(platform, arm_spe_match); @@ -1221,22 +990,17 @@ static int arm_spe_pmu_device_probe(struct platform_device *pdev) spe_pmu->pdev = pdev; platform_set_drvdata(pdev, spe_pmu); - ret = arm_spe_pmu_irq_probe(spe_pmu); - if (ret) + if (!arm_spe_pmu_set_cap(spe_pmu)) goto out_free_handle; - ret = arm_spe_pmu_dev_init(spe_pmu); - if (ret) - goto out_free_handle; + spe_pmu_local = spe_pmu; ret = arm_spe_pmu_perf_init(spe_pmu); if (ret) - goto out_teardown_dev; + goto out_free_handle; return 0; -out_teardown_dev: - arm_spe_pmu_dev_teardown(spe_pmu); out_free_handle: free_percpu(spe_pmu->handle); return ret; @@ -1247,7 +1011,6 @@ static int arm_spe_pmu_device_remove(struct platform_device *pdev) struct arm_spe_pmu *spe_pmu = platform_get_drvdata(pdev); arm_spe_pmu_perf_destroy(spe_pmu); - arm_spe_pmu_dev_teardown(spe_pmu); free_percpu(spe_pmu->handle); return 0; } @@ -1265,29 +1028,17 @@ static struct platform_driver arm_spe_pmu_driver = { static int __init arm_spe_pmu_init(void) { - int ret; - - ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, DRVNAME, - arm_spe_pmu_cpu_startup, - arm_spe_pmu_cpu_teardown); - if (ret < 0) - return ret; - arm_spe_pmu_online = ret; - - ret = platform_driver_register(&arm_spe_pmu_driver); - if (ret) - cpuhp_remove_multi_state(arm_spe_pmu_online); - - return ret; + arm_spe_sampling_for_perf_callback_register(arm_spe_sampling_process); + return 
platform_driver_register(&arm_spe_pmu_driver);
 }
 
 static void __exit arm_spe_pmu_exit(void)
 {
+	arm_spe_sampling_for_perf_callback_register(NULL);
 	platform_driver_unregister(&arm_spe_pmu_driver);
-	cpuhp_remove_multi_state(arm_spe_pmu_online);
 }
 
-module_init(arm_spe_pmu_init);
+late_initcall(arm_spe_pmu_init);
 module_exit(arm_spe_pmu_exit);
 
 MODULE_DESCRIPTION("Perf driver for the ARMv8.2 Statistical Profiling Extension");
diff --git a/include/linux/mem_sampling.h b/include/linux/mem_sampling.h
new file mode 100644
index 0000000000000000000000000000000000000000..f2721e6d88a6cb4cfeee3494be50c254fd8201bd
--- /dev/null
+++ b/include/linux/mem_sampling.h
@@ -0,0 +1,130 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * mem_sampling.h: declare the mem_sampling abstract layer and provide
+ * unified pmu sampling for NUMA, DAMON, etc.
+ *
+ * Sample records are converted to struct mem_sampling_record, and the
+ * registered mem_sampling_record_cb_type callbacks are then invoked
+ * to pass each record on.
+ *
+ * Copyright (C) 2012 ARM Ltd.
+ */
+#ifndef __MEM_SAMPLING_H
+#define __MEM_SAMPLING_H
+
+extern struct static_key_false mem_sampling_access_hints;
+
+enum mem_sampling_sample_type {
+	MEM_SAMPLING_L1D_ACCESS = 1 << 0,
+	MEM_SAMPLING_L1D_MISS = 1 << 1,
+	MEM_SAMPLING_LLC_ACCESS = 1 << 2,
+	MEM_SAMPLING_LLC_MISS = 1 << 3,
+	MEM_SAMPLING_TLB_ACCESS = 1 << 4,
+	MEM_SAMPLING_TLB_MISS = 1 << 5,
+	MEM_SAMPLING_BRANCH_MISS = 1 << 6,
+	MEM_SAMPLING_REMOTE_ACCESS = 1 << 7,
+};
+
+enum mem_sampling_op_type {
+	MEM_SAMPLING_LD = 1 << 0,
+	MEM_SAMPLING_ST = 1 << 1,
+};
+
+struct mem_sampling_record {
+	enum mem_sampling_sample_type type;
+	int err;
+	u32 op;
+	u32 latency;
+	u64 from_ip;
+	u64 to_ip;
+	u64 timestamp;
+	u64 virt_addr;
+	u64 phys_addr;
+	u64 context_id;
+	u16 source;
+};
+
+/*
+ * Callbacks should be registered by NUMA, DAMON, etc. using
+ * mem_sampling_record_cb_register() during their initialisation.
+ * Callbacks are invoked whenever new hardware PMU records are captured. 
+ */ +typedef void (*mem_sampling_record_cb_type)(struct mem_sampling_record *record); +void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb); +void mem_sampling_record_cb_unregister(mem_sampling_record_cb_type cb); + +#ifdef CONFIG_MEM_SAMPLING +void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr); +#else +static inline void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr) { }; +#endif + +/* invoked by specific mem_sampling */ +typedef void (*mem_sampling_cb_type)(struct mem_sampling_record *record_base, + int n_records); + + +struct mem_sampling_ops_struct { + int (*sampling_start)(void); + void (*sampling_stop)(void); + void (*sampling_continue)(void); +}; +extern struct mem_sampling_ops_struct mem_sampling_ops; + +enum mem_sampling_type_enum { + MEM_SAMPLING_ARM_SPE, + MEM_SAMPLING_UNSUPPORTED +}; + +enum user_switch_type { + USER_SWITCH_AWAY_FROM_MEM_SAMPLING, + USER_SWITCH_BACK_TO_MEM_SAMPLING, +}; +typedef void (*mem_sampling_user_switch_cb_type)(enum user_switch_type type); + +enum mem_sampling_saved_state_e { + MEM_SAMPLING_STATE_ENABLE, + MEM_SAMPLING_STATE_DISABLE, + MEM_SAMPLING_STATE_EMPTY, +}; + +#ifdef CONFIG_ARM_SPE +int arm_spe_start(void); +void arm_spe_stop(void); +void arm_spe_continue(void); +int arm_spe_enabled(void); +void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb); +void arm_spe_user_switch_callback_register(mem_sampling_user_switch_cb_type cb); +#else +static inline void arm_spe_stop(void) { }; +static inline void arm_spe_continue(void) { }; +static inline void arm_spe_record_capture_callback_register(mem_sampling_cb_type cb) { }; +static inline void arm_spe_user_switch_callback_register(mem_sampling_user_switch_cb_type cb) { }; + +static inline int arm_spe_start(void) +{ + return 0; +} + +static inline int arm_spe_enabled(void) +{ + return 0; +} +#endif /* CONFIG_ARM_SPE */ + +extern enum mem_sampling_saved_state_e mem_sampling_saved_state; + +extern struct static_key_false mem_sampling_access_hints; +#ifdef CONFIG_MEM_SAMPLING +extern void set_mem_sampling_state(bool enabled); +#else +static inline void set_mem_sampling_state(bool enabled) +{ +} +#endif /* CONFIG_MEM_SAMPLING */ + +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING +void numa_balancing_mem_sampling_cb_register(void); +void numa_balancing_mem_sampling_cb_unregister(void); +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ +#endif /* __MEM_SAMPLING_H */ diff --git a/include/linux/migrate.h b/include/linux/migrate.h index ade4993f5fab4d84facfa9876842cfc692dc1b7f..098e05338d25047d98be4d02b5418d16424123c9 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -209,4 +209,8 @@ void migrate_vma_finalize(struct migrate_vma *migrate); #endif /* CONFIG_MIGRATION */ +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING +void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr); +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ + #endif /* _LINUX_MIGRATE_H */ diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index c7a35d32127271d8f2e3a3b8f796f10466436e03..64cef5f97080903c7bfd2af1409d497bfa923286 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -14,6 +14,8 @@ #include #include +#include "../../../drivers/arm/spe/spe.h" + #ifdef CONFIG_ARM_PMU /* @@ -205,6 +207,12 @@ void armpmu_free_irq(int irq, int cpu); #endif /* CONFIG_ARM_PMU */ #define ARMV8_SPE_PDEV_NAME "arm,spe-v1" +#define ARMV8_SPE_PMU_PDEV_NAME "arm,pmu,spe-v1" + #define ARMV8_TRBE_PDEV_NAME "arm,trbe" +typedef 
void (*perf_sampling_cb_type)(enum arm_spe_buf_fault_action act); +void arm_spe_sampling_for_perf_callback_register(perf_sampling_cb_type cb); +struct arm_spe *arm_spe_get_desc(void); +void arm_spe_set_user(enum arm_spe_user_e user); #endif /* __ARM_PMU_H__ */ diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index 3988762efe15c0e5a80602e2c9acb6a5820a740e..c9262641b6a0d81a849b0afc572c645ea76f41cd 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -15,6 +15,15 @@ #define TNF_FAULT_LOCAL 0x08 #define TNF_MIGRATE_FAIL 0x10 +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING +struct mem_sampling_numa_access_work { + struct callback_head work; + u64 laddr, paddr; + /* Test for debug : decode buffer cpu not same with handle interrupt cpu*/ + int cpu; +}; +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ + #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); @@ -43,5 +52,4 @@ static inline bool should_numa_migrate_memory(struct task_struct *p, return true; } #endif - #endif /* _LINUX_SCHED_NUMA_BALANCING_H */ diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 5cd5b3c579d3735bfb8109f57bfb590dc59b3359..a97d73a6e426fe583d17858e142a328c96fb026c 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -126,4 +126,26 @@ int sched_cluster_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); #endif +#define MEM_SAMPLING_DISABLED 0x0 +#define MEM_SAMPLING_NORMAL 0x1 + +#ifdef CONFIG_MEM_SAMPLING +extern int sysctl_mem_sampling_mode; +int sysctl_mem_sampling_enable(struct ctl_table *table, int write, void *buffer, + size_t *lenp, loff_t *ppos); +#else +#define sysctl_mem_sampling_mode 0 +#endif + +#define NUMA_BALANCING_HW_DISABLED 0x0 +#define NUMA_BALANCING_HW_NORMAL 0x1 + +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING +extern int sysctl_numa_balacing_hw_mode; +int sysctl_numabalancing_mem_sampling(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos); +#else +#define sysctl_numa_balacing_hw_mode 0 +#endif + #endif /* _LINUX_SCHED_SYSCTL_H */ diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index f65b1f6db22d868485a6cd43eb884935bda11d0f..345ba9350dd00aeaabb82824db09b529654ad91b 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -8,6 +8,7 @@ #include #include #include +#include DECLARE_EVENT_CLASS(kmem_alloc, @@ -363,6 +364,85 @@ TRACE_EVENT(rss_stat, __entry->member, __entry->size) ); + +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING +TRACE_EVENT(mm_numa_migrating, + + TP_PROTO(u64 laddr, int page_nid, int target_nid, + int migrate_success), + + TP_ARGS(laddr, page_nid, target_nid, migrate_success), + + TP_STRUCT__entry( + __field(u64, laddr) + __field(int, page_nid) + __field(int, target_nid) + __field(int, migrate_success) + ), + + TP_fast_assign( + __entry->laddr = laddr; + __entry->page_nid = page_nid; + __entry->target_nid = target_nid; + __entry->migrate_success = !!(migrate_success); + ), + + TP_printk("laddr=%llu page_nid=%d target_nid=%d migrate_success=%d", + __entry->laddr, __entry->page_nid, + __entry->target_nid, __entry->migrate_success) +); + +TRACE_EVENT(mm_mem_sampling_access_record, + + TP_PROTO(u64 laddr, u64 paddr, int cpuid, int pid), + + TP_ARGS(laddr, paddr, cpuid, pid), + + TP_STRUCT__entry( + __field(u64, laddr) + __field(u64, paddr) + __field(int, cpuid) + 
__field(int, pid) + ), + + TP_fast_assign( + __entry->laddr = laddr; + __entry->paddr = paddr; + __entry->cpuid = cpuid; + __entry->pid = pid; + ), + + TP_printk("laddr=%llu paddr=%llu cpuid=%d pid=%d", + __entry->laddr, __entry->paddr, + __entry->cpuid, __entry->pid) +); +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ +#ifdef CONFIG_ARM_SPE +TRACE_EVENT(spe_record, + TP_PROTO(struct mem_sampling_record *record, int cpuid), + + TP_ARGS(record, cpuid), + + TP_STRUCT__entry( + __field(u64, laddr) + __field(u64, paddr) + __field(int, cpuid) + __field(int, pid) + ), + + TP_fast_assign( + __entry->laddr = record->virt_addr; + __entry->paddr = record->phys_addr; + __entry->cpuid = cpuid; + __entry->pid = record->context_id; + + ), + + TP_printk("laddr=%llu paddr=%llu cpuid=%d pid=%d", + __entry->laddr, __entry->paddr, + __entry->cpuid, __entry->pid) +); +#endif /* CONFIG_ARM_SPE */ #endif /* _TRACE_KMEM_H */ /* This part must be outside protection */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4b9af3673285e63c87e159cd1f75e172f9fdf6e9..6d0e287f2bc82c525c10d4b93494ef05ff91b6f0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -19,6 +19,7 @@ #include #include +#include #include #include "../workqueue_internal.h" @@ -3540,6 +3541,116 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #endif #endif +DEFINE_STATIC_KEY_FALSE(sched_numabalancing_mem_sampling); + +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING + +int sysctl_numa_balacing_hw_mode; + +static void __set_numabalancing_mem_sampling_state(bool enabled) +{ + if (enabled) { + numa_balancing_mem_sampling_cb_register(); + static_branch_enable(&sched_numabalancing_mem_sampling); + } else { + numa_balancing_mem_sampling_cb_unregister(); + static_branch_disable(&sched_numabalancing_mem_sampling); + } +} + +void set_numabalancing_mem_sampling_state(bool enabled) +{ + if (enabled) + sysctl_numa_balacing_hw_mode = NUMA_BALANCING_HW_NORMAL; + else + sysctl_numa_balacing_hw_mode = NUMA_BALANCING_HW_DISABLED; + __set_numabalancing_mem_sampling_state(enabled); +} + +#ifdef CONFIG_PROC_SYSCTL + +int sysctl_numabalancing_mem_sampling(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_numabalancing_mem_sampling); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + + if (write && static_branch_likely(&mem_sampling_access_hints)) + set_numabalancing_mem_sampling_state(state); + + return err; +} +#endif +#endif + +DEFINE_STATIC_KEY_FALSE(mem_sampling_access_hints); + +#ifdef CONFIG_MEM_SAMPLING +int sysctl_mem_sampling_mode; + +static void __set_mem_sampling_state(bool enabled) +{ + if (enabled) + static_branch_enable(&mem_sampling_access_hints); + else + static_branch_disable(&mem_sampling_access_hints); +} + +void set_mem_sampling_state(bool enabled) +{ + if (!mem_sampling_ops.sampling_start) + return; + if (enabled) + sysctl_mem_sampling_mode = MEM_SAMPLING_NORMAL; + else + sysctl_mem_sampling_mode = MEM_SAMPLING_DISABLED; + __set_mem_sampling_state(enabled); + +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING + if (!enabled) + set_numabalancing_mem_sampling_state(enabled); +#endif +} + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_mem_sampling_enable(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = 
sysctl_mem_sampling_mode; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) { + if (mem_sampling_saved_state == MEM_SAMPLING_STATE_EMPTY) + set_mem_sampling_state(state); + else + mem_sampling_saved_state = state ? MEM_SAMPLING_STATE_ENABLE : + MEM_SAMPLING_STATE_DISABLE; + } + + return err; +} +#endif +#endif + #ifdef CONFIG_SCHEDSTATS DEFINE_STATIC_KEY_FALSE(sched_schedstats); @@ -4066,6 +4177,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) prev_state = prev->state; vtime_task_switch(prev); perf_event_task_sched_in(prev, current); + mem_sampling_sched_in(prev, current); finish_task(prev); tick_nohz_task_switch(); finish_lock_switch(rq); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 654f6bc4b68247505dffc6afd9398f0bf7b9d609..5759a1aedec306a9a3624d26972b1d29408a3db9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -30,6 +30,7 @@ #endif #include #include +#include /* * Targeted preemption latency for CPU-bound tasks: @@ -2967,6 +2968,17 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr) struct callback_head *work = &curr->numa_work; u64 period, now; +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING + /* + * If we are using access hints from hardware (like using + * SPE), don't scan the address space. + * Note that currently PMD-level page migration is not + * supported. + */ + if (static_branch_unlikely(&mem_sampling_access_hints) && + static_branch_unlikely(&sched_numabalancing_mem_sampling)) + return; +#endif /* * We don't care about NUMA placement if we don't have memory. */ diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 14d48f6380fa533411c64753a177066f8fca1d8e..ac385325e4d0bf326fcba58b0c313ef5277574b1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2122,6 +2122,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features = extern struct static_key_false sched_numa_balancing; extern struct static_key_false sched_schedstats; +extern struct static_key_false sched_numabalancing_mem_sampling; static inline u64 global_rt_period(void) { diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3941856c19d1d7ff3366bbdad43c058d90f0f11b..5804be697a2518d235b77ef49674ebb3c08b75c5 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1868,6 +1868,28 @@ static struct ctl_table kern_table[] = { }, #endif /* CONFIG_NUMA_BALANCING */ #endif /* CONFIG_SCHED_DEBUG */ +#ifdef CONFIG_MEM_SAMPLING + { + .procname = "mem_sampling_enable", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_mem_sampling_enable, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_MEM_SAMPLING */ +#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING + { + .procname = "numa_balancing_mem_sampling", + .data = NULL, /* filled in by handler */ + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_numabalancing_mem_sampling, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, diff --git a/mm/Kconfig b/mm/Kconfig index 0f9209cd969b61e81acea0a56a351f24487383b2..175ebd7c3afb58748c2e37e37453fa8986573bc0 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -1014,6 +1014,29 @@ config EXTEND_HUGEPAGE_MAPPING help Introduce vmalloc/vmap/remap interfaces that handle only hugepages. 
+config MEM_SAMPLING
+	bool "Use hardware memory sampling for kernel features (NUMA, DAMON, etc.)"
+	default n
+	select ARM_SPE if ARM64
+	help
+	  Memory sampling is primarily based on specific hardware capabilities,
+	  which enable hardware PMUs to sample memory accesses for use by
+	  kernel features. It requires at least one hardware PMU (e.g.
+	  ARM_SPE) to be enabled.
+
+config NUMABALANCING_MEM_SAMPLING
+	bool "Use hardware memory samples for numa balancing"
+	depends on MEM_SAMPLING && NUMA_BALANCING
+	default n
+	help
+	  This feature relies on hardware sampling, and will use memory access
+	  information obtained from hardware sampling in the NUMA balancing
+	  policy instead of the native software PROT_NONE scheme. Turning on
+	  this feature may have a performance impact on some workloads, for
+	  example, lightweight memory access programs.
+
+	  If unsure, say N.
+
 source "mm/damon/Kconfig"
 
 endmenu
diff --git a/mm/Makefile b/mm/Makefile
index a014a5e08f7b6a011a27088ed197208f7d4ad442..112966190c1dbd5f638643e61daf76ccfe418f31 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -133,3 +133,4 @@ obj-$(CONFIG_MEMORY_RELIABLE) += mem_reliable.o
 obj-$(CONFIG_MEMCG_MEMFS_INFO) += memcg_memfs_info.o
 obj-$(CONFIG_PAGE_CACHE_LIMIT) += page_cache_limit.o
 obj-$(CONFIG_CLEAR_FREELIST_PAGE) += clear_freelist_page.o
+obj-$(CONFIG_MEM_SAMPLING) += mem_sampling.o
diff --git a/mm/mem_sampling.c b/mm/mem_sampling.c
new file mode 100644
index 0000000000000000000000000000000000000000..480c467f7b04dc9f540483232238ea026c86f075
--- /dev/null
+++ b/mm/mem_sampling.c
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: GPL-2.0-only
+
+#define pr_fmt(fmt) "mem_sampling: " fmt
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct mem_sampling_ops_struct mem_sampling_ops;
+
+static int mem_sampling_override __initdata;
+
+enum mem_sampling_saved_state_e mem_sampling_saved_state = MEM_SAMPLING_STATE_EMPTY;
+
+struct mem_sampling_record_cb_list_entry {
+	struct list_head list;
+	mem_sampling_record_cb_type cb;
+};
+LIST_HEAD(mem_sampling_record_cb_list);
+
+void mem_sampling_record_cb_register(mem_sampling_record_cb_type cb)
+{
+	struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
+
+	list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) {
+		if (cb_entry->cb == cb) {
+			pr_info("mem_sampling record cb already registered\n");
+			return;
+		}
+	}
+
+	cb_entry = NULL;
+	cb_entry = kmalloc(sizeof(struct mem_sampling_record_cb_list_entry), GFP_KERNEL);
+	if (!cb_entry) {
+		pr_info("failed to allocate mem_sampling record cb entry\n");
+		return;
+	}
+
+	cb_entry->cb = cb;
+	list_add(&(cb_entry->list), &mem_sampling_record_cb_list);
+}
+
+void mem_sampling_record_cb_unregister(mem_sampling_record_cb_type cb)
+{
+	struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
+
+	list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) {
+		if (cb_entry->cb == cb) {
+			list_del(&cb_entry->list);
+			kfree(cb_entry);
+			return;
+		}
+	}
+}
+
+void mem_sampling_sched_in(struct task_struct *prev, struct task_struct *curr)
+{
+	if (!static_branch_unlikely(&mem_sampling_access_hints))
+		return;
+
+	if (!mem_sampling_ops.sampling_start)
+		return;
+
+	if (!curr->mm)
+		goto out;
+
+	mem_sampling_ops.sampling_start();
+
+	return;
+
+out:
+	mem_sampling_ops.sampling_stop();
+}
+
+void mem_sampling_process(struct mem_sampling_record *record_base, int nr_records)
+{
+	int i;
+	struct mem_sampling_record *record;
+	struct mem_sampling_record_cb_list_entry *cb_entry, *tmp;
+
+	if (list_empty(&mem_sampling_record_cb_list))
+		goto out;
+
+	for (i = 0; i < nr_records; i++) {
+		record = record_base + i;
+		list_for_each_entry_safe(cb_entry, tmp, &mem_sampling_record_cb_list, list) {
+			cb_entry->cb(record);
+		}
+	}
+out:
+	/* If mem_sampling_access_hints is set to false, stop sampling */
+	if (static_branch_unlikely(&mem_sampling_access_hints))
+		mem_sampling_ops.sampling_continue();
+	else
+		mem_sampling_ops.sampling_stop();
+}
+
+static inline enum mem_sampling_type_enum mem_sampling_get_type(void)
+{
+#ifdef CONFIG_ARM_SPE
+	return MEM_SAMPLING_ARM_SPE;
+#else
+	return MEM_SAMPLING_UNSUPPORTED;
+#endif
+}
+
+void mem_sampling_user_switch_process(enum user_switch_type type)
+{
+	bool state;
+
+	if (type > USER_SWITCH_BACK_TO_MEM_SAMPLING) {
+		pr_err("invalid user switch type %d\n", type);
+		return;
+	}
+
+	if (type == USER_SWITCH_AWAY_FROM_MEM_SAMPLING) {
+		/* Save the state only when leaving mem_sampling for the first time */
+		if (mem_sampling_saved_state != MEM_SAMPLING_STATE_EMPTY)
+			return;
+
+		if (static_branch_unlikely(&mem_sampling_access_hints))
+			mem_sampling_saved_state = MEM_SAMPLING_STATE_ENABLE;
+		else
+			mem_sampling_saved_state = MEM_SAMPLING_STATE_DISABLE;
+
+		pr_debug("user switch away from mem_sampling, saved state %s, set to disable.\n",
+			 mem_sampling_saved_state ? "disabled" : "enabled");
+
+		set_mem_sampling_state(false);
+	} else {
+		/* If the state is not backed up, do not restore it */
+		if (mem_sampling_saved_state == MEM_SAMPLING_STATE_EMPTY)
+			return;
+
+		state = (mem_sampling_saved_state == MEM_SAMPLING_STATE_ENABLE) ? true : false;
+		set_mem_sampling_state(state);
+		mem_sampling_saved_state = MEM_SAMPLING_STATE_EMPTY;
+
+		pr_debug("user switch back to mem_sampling, set to saved %s.\n",
+			 state ? "enable" : "disable");
+	}
+}
+
+static void __init check_mem_sampling_enable(void)
+{
+	bool mem_sampling_default = false;
+
+	/* Parsed by setup_mem_sampling. override == 1 enables, -1 disables */
+	if (mem_sampling_override)
+		set_mem_sampling_state(mem_sampling_override == 1);
+	else
+		set_mem_sampling_state(mem_sampling_default);
+}
+
+static int __init mem_sampling_init(void)
+{
+	enum mem_sampling_type_enum mem_sampling_type = mem_sampling_get_type();
+
+	switch (mem_sampling_type) {
+	case MEM_SAMPLING_ARM_SPE:
+		if (!arm_spe_enabled()) {
+			set_mem_sampling_state(false);
+			return -ENODEV;
+		}
+		mem_sampling_ops.sampling_start = arm_spe_start;
+		mem_sampling_ops.sampling_stop = arm_spe_stop;
+		mem_sampling_ops.sampling_continue = arm_spe_continue;
+
+		arm_spe_record_capture_callback_register(mem_sampling_process);
+		arm_spe_user_switch_callback_register(mem_sampling_user_switch_process);
+		break;
+
+	default:
+		pr_info("unsupported hardware pmu type (%d), disabling access hints\n",
+			mem_sampling_type);
+		set_mem_sampling_state(false);
+		return -ENODEV;
+	}
+	check_mem_sampling_enable();
+
+	pr_info("mem_sampling layer access profiling set up for NUMA balancing, DAMON, etc.\n");
+	return 0;
+}
+late_initcall(mem_sampling_init);
diff --git a/mm/memory.c b/mm/memory.c
index 494f40362174fe280261667ddec51cfa839624f9..eb7cb36abf4541e4253b1e0e761ea2e54022794e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4480,6 +4480,92 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	return 0;
 }
 
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+
+/*
+ * Called from task_work context to act upon the page access.
+ *
+ * Physical address (provided by SPE) is used directly instead
+ * of walking the page tables to get to the PTE/page. Hence we
+ * don't check if PTE is writable for the TNF_NO_GROUP
+ * optimization, which means RO pages are considered for grouping.
+ */
+void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr)
+{
+	struct mm_struct *mm = p->mm;
+	struct vm_area_struct *vma;
+	struct page *page = NULL;
+	int page_nid = NUMA_NO_NODE;
+	int last_cpupid;
+	int target_nid;
+	int flags = 0;
+
+	if (!mm)
+		return;
+
+	if (!mmap_read_trylock(mm))
+		return;
+
+	vma = find_vma(mm, laddr);
+	if (!vma)
+		goto out_unlock;
+
+	if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+		is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP))
+		goto out_unlock;
+
+	if (!vma->vm_mm ||
+		(vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+		goto out_unlock;
+
+	if (!vma_is_accessible(vma))
+		goto out_unlock;
+
+	page = pfn_to_online_page(PHYS_PFN(paddr));
+	if (!page || is_zone_device_page(page))
+		goto out_unlock;
+
+	if (unlikely(!PageLRU(page)))
+		goto out_unlock;
+
+	/* TODO: handle PTE-mapped THP or PMD-mapped THP */
+	if (PageCompound(page))
+		goto out_unlock;
+
+	/*
+	 * Flag if the page is shared between multiple address spaces. This
+	 * is later used when determining whether to group tasks together
+	 */
+	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+		flags |= TNF_SHARED;
+
+	last_cpupid = page_cpupid_last(page);
+	page_nid = page_to_nid(page);
+
+	target_nid = numa_migrate_prep(page, vma, laddr, page_nid, &flags);
+	if (target_nid == NUMA_NO_NODE) {
+		put_page(page);
+		goto out;
+	}
+
+	/* Migrate to the requested node */
+	if (migrate_misplaced_page(page, vma, target_nid)) {
+		page_nid = target_nid;
+		flags |= TNF_MIGRATED;
+	} else {
+		flags |= TNF_MIGRATE_FAIL;
+	}
+
+out:
+	trace_mm_numa_migrating(laddr, page_nid, target_nid, flags & TNF_MIGRATED);
+	if (page_nid != NUMA_NO_NODE)
+		task_numa_fault(last_cpupid, page_nid, 1, flags);
+
+out_unlock:
+	mmap_read_unlock(mm);
+}
+#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */
+
 static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
 {
 	if (vma_is_anonymous(vmf->vma))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 81bd26fb661f618b5cb23c3b2087bfe28d28b7bb..d81996ef0be09cd1a680c2797154a95dd483bc48 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -105,6 +105,10 @@
 #include
 #include
+#include
+#include
+
+#include
 #include
 #include
@@ -737,6 +741,77 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_NUMABALANCING_MEM_SAMPLING
+static void task_mem_sampling_access_work(struct callback_head *work)
+{
+	struct mem_sampling_numa_access_work *iwork =
+		container_of(work, struct mem_sampling_numa_access_work, work);
+	struct task_struct *p = current;
+	int cpu = smp_processor_id();
+	int iwork_cpu = iwork->cpu;
+	u64 laddr = iwork->laddr;
+	u64 paddr = iwork->paddr;
+
+	kfree(iwork);
+	if (iwork_cpu != cpu)
+		return;
+
+	do_numa_access(p, laddr, paddr);
+}
+
+void numa_create_taskwork(u64 laddr, u64 paddr, int cpu)
+{
+	struct mem_sampling_numa_access_work *iwork = NULL;
+
+	iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC);
+	if (!iwork)
+		return;
+
+	iwork->laddr = laddr;
+	iwork->paddr = paddr;
+	iwork->cpu = cpu;
+
+	init_task_work(&iwork->work, task_mem_sampling_access_work);
+	task_work_add(current, &iwork->work, TWA_RESUME);
+}
+
+void numa_balancing_mem_sampling_cb(struct mem_sampling_record *record)
+{
+	struct task_struct *p = current;
+	u64 laddr = record->virt_addr;
+	u64 paddr = record->phys_addr;
+
+	/* Discard kernel address accesses */
+	if (laddr & (1UL << 63))
+
return; + + if (p->pid != record->context_id) + return; + + trace_mm_mem_sampling_access_record(laddr, paddr, smp_processor_id(), + current->pid); + numa_create_taskwork(laddr, paddr, smp_processor_id()); +} + +void numa_balancing_mem_sampling_cb_register(void) +{ + mem_sampling_record_cb_register(numa_balancing_mem_sampling_cb); +} + +void numa_balancing_mem_sampling_cb_unregister(void) +{ + mem_sampling_record_cb_unregister(numa_balancing_mem_sampling_cb); +} +#else +static inline void numa_balancing_mem_sampling_cb_register(void) +{ +} + +static inline void numa_balancing_mem_sampling_cb_unregister(void) +{ +} +#endif /* CONFIG_NUMABALANCING_MEM_SAMPLING */ + static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { diff --git a/samples/bpf/spe/Makefile b/samples/bpf/spe/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..946bfdba163e393f8ac6c60f98965955b8ac817b --- /dev/null +++ b/samples/bpf/spe/Makefile @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: GPL-2.0 + +include Makefile.arch + +INSTALL ?= install +CLANG ?= clang +CC ?= gcc + +BPFTOOL ?= bpftool +KERNEL_DIR ?= ../../../ + +MKFLAGS = -I$(KERNEL_DIR)/tools/lib -I$(KERNEL_DIR)/tools/include/uapi/ \ + -D__BPF_TRACING__ -D__TARGET_ARCH_${SRCARCH} +LDLIBBPF = -L$(KERNEL_DIR)/tools/lib/bpf/ -l:libbpf.a + +all: + $(CLANG) -O2 -g -Wall -target bpf -I. ${MKFLAGS} -c spe-record.bpf.c -o spe-record.bpf.o + $(BPFTOOL) gen skeleton spe-record.bpf.o > spe-record.skel.h + $(CC) -O2 -g -Wall ${MKFLAGS} spe-record.user.c -o spe-record ${LDLIBBPF} -lelf -lz --static + +clean: + rm -f spe-record + rm -f vmlinux.h + rm -f *.o + rm -f *.skel.h diff --git a/samples/bpf/spe/Makefile.arch b/samples/bpf/spe/Makefile.arch new file mode 100644 index 0000000000000000000000000000000000000000..f6a50f06dfc4538181b80b90f47c9a8a54c4b790 --- /dev/null +++ b/samples/bpf/spe/Makefile.arch @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: GPL-2.0 +HOSTARCH := $(shell uname -m | sed -e s/i.86/x86/ -e s/x86_64/x86/ \ + -e s/sun4u/sparc/ -e s/sparc64/sparc/ \ + -e /arm64/!s/arm.*/arm/ -e s/sa110/arm/ \ + -e s/s390x/s390/ -e s/parisc64/parisc/ \ + -e s/ppc.*/powerpc/ -e s/mips.*/mips/ \ + -e s/sh[234].*/sh/ -e s/aarch64.*/arm64/ \ + -e s/riscv.*/riscv/ -e s/loongarch.*/loongarch/) + +ifndef ARCH +ARCH := $(HOSTARCH) +endif + +SRCARCH := $(ARCH) + +# Additional ARCH settings for x86 +ifeq ($(ARCH),i386) + SRCARCH := x86 +endif +ifeq ($(ARCH),x86_64) + SRCARCH := x86 +endif + +# Additional ARCH settings for sparc +ifeq ($(ARCH),sparc32) + SRCARCH := sparc +endif +ifeq ($(ARCH),sparc64) + SRCARCH := sparc +endif + +# Additional ARCH settings for loongarch +ifeq ($(ARCH),loongarch32) + SRCARCH := loongarch +endif + +ifeq ($(ARCH),loongarch64) + SRCARCH := loongarch +endif + +LP64 := $(shell echo __LP64__ | ${CC} ${CFLAGS} -E -x c - | tail -n 1) +ifeq ($(LP64), 1) + IS_64_BIT := 1 +else + IS_64_BIT := 0 +endif diff --git a/samples/bpf/spe/README.md b/samples/bpf/spe/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/samples/bpf/spe/spe-record.bpf.c b/samples/bpf/spe/spe-record.bpf.c new file mode 100644 index 0000000000000000000000000000000000000000..39d138a8e23156f532736395c7b966099d344233 --- /dev/null +++ b/samples/bpf/spe/spe-record.bpf.c @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause +/* Copyright (c) 2020 Andrii Nakryiko */ +#include +#include +#include +#include "spe-record.h" + +char LICENSE[] 
SEC("license") = "Dual BSD/GPL"; + + +/* BPF ringbuf map */ +struct { + __uint(type, BPF_MAP_TYPE_RINGBUF); + __uint(max_entries, 256 * 1024 /* 256 KB */); +} rb SEC(".maps"); + + +SEC("raw_tracepoint/spe_record") +int handle_exec(struct bpf_raw_tracepoint_args *ctx) +{ + + // TP_PROTO(struct mem_sampling_record *record) + struct mem_sampling_record *rd = (struct mem_sampling_record *)ctx->args[0]; + struct event *e; + + e = bpf_ringbuf_reserve(&rb, sizeof(*e), 0); + if (!e) + return 0; + + if (bpf_get_current_comm(e->comm, sizeof(e->comm))) + e->comm[0] = 0; + + e->context_id = BPF_CORE_READ(rd, context_id); + e->virt_addr = BPF_CORE_READ(rd, virt_addr); + e->phys_addr = BPF_CORE_READ(rd, phys_addr); + e->latency = BPF_CORE_READ(rd, latency); + + bpf_ringbuf_submit(e, 0); + return 0; +} + diff --git a/samples/bpf/spe/spe-record.h b/samples/bpf/spe/spe-record.h new file mode 100644 index 0000000000000000000000000000000000000000..e9ec71bbb3a75c8f9664df2d568754daadba1237 --- /dev/null +++ b/samples/bpf/spe/spe-record.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +/* Copyright (c) 2020 Andrii Nakryiko */ +#ifndef __SPE_RECORD_H +#define __SPE_RECORD_H + +enum mem_sampling_sample_type { + MEM_SAMPLING_L1D_ACCESS = 1 << 0, + MEM_SAMPLING_L1D_MISS = 1 << 1, + MEM_SAMPLING_LLC_ACCESS = 1 << 2, + MEM_SAMPLING_LLC_MISS = 1 << 3, + MEM_SAMPLING_TLB_ACCESS = 1 << 4, + MEM_SAMPLING_TLB_MISS = 1 << 5, + MEM_SAMPLING_BRANCH_MISS = 1 << 6, + MEM_SAMPLING_REMOTE_ACCESS = 1 << 7, +}; + +struct mem_sampling_record { + enum mem_sampling_sample_type type; + int err; + unsigned int op; + unsigned int latency; + unsigned long long from_ip; + unsigned long long to_ip; + unsigned long long timestamp; + unsigned long long virt_addr; + unsigned long long phys_addr; + unsigned long long context_id; + unsigned char source; +}; + +/* definition of a sample sent to user-space from BPF program */ +struct event { + enum mem_sampling_sample_type type; + int err; + unsigned int op; + unsigned int latency; + unsigned long long from_ip; + unsigned long long to_ip; + unsigned long long timestamp; + unsigned long long virt_addr; + unsigned long long phys_addr; + unsigned long long context_id; + unsigned char source; + char comm[16]; +}; + +#endif /* __SPE_RECORD_H */ diff --git a/samples/bpf/spe/spe-record.user.c b/samples/bpf/spe/spe-record.user.c new file mode 100644 index 0000000000000000000000000000000000000000..f81a59d65e2fc518ce72a338b830aaeeee187d13 --- /dev/null +++ b/samples/bpf/spe/spe-record.user.c @@ -0,0 +1,116 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +// Copyright (c) 2020 Andrii Nakryiko +#include +#include +#include +#include +#include +#include +#include "spe-record.h" +#include "spe-record.skel.h" + +int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) +{ + /* Ignore debug-level libbpf logs */ + if (level > LIBBPF_INFO) + return 0; + return vfprintf(stderr, format, args); +} + +void bump_memlock_rlimit(void) +{ + struct rlimit rlim_new = { + .rlim_cur = RLIM_INFINITY, + .rlim_max = RLIM_INFINITY, + }; + + if (setrlimit(RLIMIT_MEMLOCK, &rlim_new)) { + fprintf(stderr, "Failed to increase RLIMIT_MEMLOCK limit!\n"); + exit(1); + } +} + +static bool exiting; + +static void sig_handler(int sig) +{ + exiting = true; +} + +int handle_event(void *ctx, void *data, size_t data_sz) +{ + const struct event *e = data; + struct tm *tm; + char ts[32]; + time_t t; + + time(&t); + tm = localtime(&t); + strftime(ts, sizeof(ts), "%H:%M:%S", tm); + 
+ printf("%-20s %-8s %-10lld %-10d 0x%016llx 0x%016llx\n", e->comm, ts, e->context_id, + e->latency, e->virt_addr, e->phys_addr); + + return 0; +} + +int main(int argc, char **argv) +{ + struct ring_buffer *rb = NULL; + struct spe_record_bpf *skel; + int err; + + /* Set up libbpf logging callback */ + libbpf_set_print(libbpf_print_fn); + + /* Bump RLIMIT_MEMLOCK to create BPF maps */ + bump_memlock_rlimit(); + + /* Clean handling of Ctrl-C */ + signal(SIGINT, sig_handler); + signal(SIGTERM, sig_handler); + + /* Load and verify BPF application */ + skel = spe_record_bpf__open_and_load(); + if (!skel) { + fprintf(stderr, "Failed to open and load BPF skeleton\n"); + return 1; + } + + /* Attach tracepoint */ + err = spe_record_bpf__attach(skel); + if (err) { + fprintf(stderr, "Failed to attach BPF skeleton\n"); + goto cleanup; + } + + /* Set up ring buffer polling */ + rb = ring_buffer__new(bpf_map__fd(skel->maps.rb), handle_event, NULL, NULL); + if (!rb) { + err = -1; + fprintf(stderr, "Failed to create ring buffer\n"); + goto cleanup; + } + + /* Process events */ + printf("%-20s %-8s %-10s %-10s %-18s %-18s\n", + "COMM", "TIME", "PID", "LATENCY", "LADDR", "PADDR"); + while (!exiting) { + err = ring_buffer__poll(rb, 100 /* timeout, ms */); + /* Ctrl-C will cause -EINTR */ + if (err == -EINTR) { + err = 0; + break; + } + if (err < 0) { + printf("Error polling ring buffer: %d\n", err); + break; + } + } + +cleanup: + ring_buffer__free(rb); + spe_record_bpf__destroy(skel); + + return err < 0 ? -err : 0; +}