From c237c1413a61b1c7ac1f34bb0f2c4f204e439874 Mon Sep 17 00:00:00 2001
From: Viresh Kumar
Date: Wed, 10 Mar 2021 08:21:04 +0530
Subject: [PATCH 01/21] arch_topology: Allow multiple entities to provide
 sched_freq_tick() callback

ANBZ: #26358

commit 01e055c120a46e78650b5f903088badbbdaae9ad upstream

This patch attempts to make it generic enough so other parts of the
kernel can also provide their own implementation of the
scale_freq_tick() callback, which is called by the scheduler
periodically to update the per-cpu arch_freq_scale variable.

The implementations now need to provide a 'struct scale_freq_data' for
the CPUs for which they have hardware counters available, and a
callback gets registered for each possible CPU in a per-cpu variable.

The arch specific (or ARM AMU) counters are updated to adapt to this
and they take the highest priority if they are available, i.e. they
will be used instead of CPPC based counters, for example.

The special code to rebuild the sched domains, in case the invariance
status changes for the system, is moved out of arm64 specific code and
is added to arch_topology.c.

Note that this also defines SCALE_FREQ_SOURCE_CPUFREQ but doesn't use
it yet; it is added to show that cpufreq also acts as a source of
information for FIE and will be used by default if no other counters
are supported for a platform.

Reviewed-by: Ionela Voinescu
Tested-by: Ionela Voinescu
Acked-by: Will Deacon # for arm64
Tested-by: Vincent Guittot
Signed-off-by: Viresh Kumar

commit eec73529a9321616ed13cf732cd21a17eb1a2836 upstream

Rename freq_scale to a less generic name, as it will get exported soon
for modules. Since x86 already names its own implementation of this as
arch_freq_scale, let's stick to that.

Suggested-by: Will Deacon
Signed-off-by: Viresh Kumar

The two commits above were combined in order to fix compile problems.
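As a rough illustration of the interface introduced here, the sketch below
shows how a counter-backed provider would plug in. Only struct
scale_freq_data, topology_set_scale_freq_source(), arch_freq_scale and the
SCALE_FREQ_SOURCE_* values come from this patch; every example_* name is
hypothetical, and the unconditional SCHED_CAPACITY_SCALE write merely stands
in for a real counter-derived ratio.

#include <linux/arch_topology.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/percpu.h>
#include <linux/sched.h>

/* Called from the scheduler tick on each CPU this source covers. */
static void example_set_freq_scale(void)
{
	/* A real provider derives cur_freq/max_freq from hardware counters. */
	this_cpu_write(arch_freq_scale, SCHED_CAPACITY_SCALE);
}

static struct scale_freq_data example_sfd = {
	/* A real provider would add its own SCALE_FREQ_SOURCE_* entry. */
	.source		= SCALE_FREQ_SOURCE_CPUFREQ,
	.set_freq_scale	= example_set_freq_scale,
};

static int __init example_init(void)
{
	/* Take over topology_scale_freq_tick() for the covered CPUs. */
	topology_set_scale_freq_source(&example_sfd, cpu_possible_mask);
	return 0;
}
core_initcall(example_init);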
--- arch/arm64/include/asm/topology.h | 1 + arch/arm64/kernel/topology.c | 47 ----------------- drivers/base/arch_topology.c | 85 ++++++++++++++++++++++++++++--- include/linux/arch_topology.h | 18 +++++-- 4 files changed, 95 insertions(+), 56 deletions(-) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index ecd02a7e1506..665ba58d3382 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -31,6 +31,7 @@ void topology_scale_freq_tick(void); #endif /* CONFIG_ARM64_AMU_EXTN */ /* Replace task scheduler's default frequency-invariant accounting */ +#define arch_scale_freq_tick topology_scale_freq_tick #define arch_set_freq_scale topology_set_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale #define arch_scale_freq_invariant topology_scale_freq_invariant diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index ba5482201fef..4d26d005cb0d 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -415,51 +415,4 @@ static int __init init_amu_fie(void) } late_initcall_sync(init_amu_fie); -bool arch_freq_counters_available(const struct cpumask *cpus) -{ - return amu_freq_invariant() && - cpumask_subset(cpus, amu_fie_cpus); -} - -void topology_scale_freq_tick(void) -{ - u64 prev_core_cnt, prev_const_cnt; - u64 core_cnt, const_cnt, scale; - int cpu = smp_processor_id(); - - if (!amu_freq_invariant()) - return; - - if (!cpumask_test_cpu(cpu, amu_fie_cpus)) - return; - - const_cnt = read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0); - core_cnt = read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0); - prev_const_cnt = this_cpu_read(arch_const_cycles_prev); - prev_core_cnt = this_cpu_read(arch_core_cycles_prev); - - if (unlikely(core_cnt <= prev_core_cnt || - const_cnt <= prev_const_cnt)) - goto store_and_exit; - - /* - * /\core arch_max_freq_scale - * scale = ------- * -------------------- - * /\const SCHED_CAPACITY_SCALE - * - * See validate_cpu_freq_invariance_counters() for details on - * arch_max_freq_scale and the use of SCHED_CAPACITY_SHIFT. - */ - scale = core_cnt - prev_core_cnt; - scale *= this_cpu_read(arch_max_freq_scale); - scale = div64_u64(scale >> SCHED_CAPACITY_SHIFT, - const_cnt - prev_const_cnt); - - scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); - this_cpu_write(freq_scale, (unsigned long)scale); - -store_and_exit: - this_cpu_write(arch_core_cycles_prev, core_cnt); - this_cpu_write(arch_const_cycles_prev, const_cnt); -} #endif /* CONFIG_ARM64_AMU_EXTN */ diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 9e5a33fa99fd..394888aabb5f 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -21,17 +21,90 @@ #include #include +static DEFINE_PER_CPU(struct scale_freq_data *, sft_data); +static struct cpumask scale_freq_counters_mask; +static bool scale_freq_invariant; + +static bool supports_scale_freq_counters(const struct cpumask *cpus) +{ + return cpumask_subset(cpus, &scale_freq_counters_mask); +} + bool topology_scale_freq_invariant(void) { return cpufreq_supports_freq_invariance() || - arch_freq_counters_available(cpu_online_mask); + supports_scale_freq_counters(cpu_online_mask); } -__weak bool arch_freq_counters_available(const struct cpumask *cpus) +static void update_scale_freq_invariant(bool status) { - return false; + if (scale_freq_invariant == status) + return; + + /* + * Task scheduler behavior depends on frequency invariance support, + * either cpufreq or counter driven. 
If the support status changes as
+	 * a result of counter initialisation and use, retrigger the build of
+	 * scheduling domains to ensure the information is propagated properly.
+	 */
+	if (topology_scale_freq_invariant() == status) {
+		scale_freq_invariant = status;
+		rebuild_sched_domains_energy();
+	}
 }
-DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
+
+void topology_set_scale_freq_source(struct scale_freq_data *data,
+				    const struct cpumask *cpus)
+{
+	struct scale_freq_data *sfd;
+	int cpu;
+
+	/*
+	 * Avoid calling rebuild_sched_domains() unnecessarily if FIE is
+	 * supported by cpufreq.
+	 */
+	if (cpumask_empty(&scale_freq_counters_mask))
+		scale_freq_invariant = topology_scale_freq_invariant();
+
+	for_each_cpu(cpu, cpus) {
+		sfd = per_cpu(sft_data, cpu);
+
+		/* Use ARCH provided counters whenever possible */
+		if (!sfd || sfd->source != SCALE_FREQ_SOURCE_ARCH) {
+			per_cpu(sft_data, cpu) = data;
+			cpumask_set_cpu(cpu, &scale_freq_counters_mask);
+		}
+	}
+
+	update_scale_freq_invariant(true);
+}
+
+void topology_clear_scale_freq_source(enum scale_freq_source source,
+				      const struct cpumask *cpus)
+{
+	struct scale_freq_data *sfd;
+	int cpu;
+
+	for_each_cpu(cpu, cpus) {
+		sfd = per_cpu(sft_data, cpu);
+
+		if (sfd && sfd->source == source) {
+			per_cpu(sft_data, cpu) = NULL;
+			cpumask_clear_cpu(cpu, &scale_freq_counters_mask);
+		}
+	}
+
+	update_scale_freq_invariant(false);
+}
+
+void topology_scale_freq_tick(void)
+{
+	struct scale_freq_data *sfd = *this_cpu_ptr(&sft_data);
+
+	if (sfd)
+		sfd->set_freq_scale();
+}
+
+DEFINE_PER_CPU(unsigned long, arch_freq_scale) = SCHED_CAPACITY_SCALE;
 
 void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq,
 			     unsigned long max_freq)
@@ -47,13 +120,13 @@ void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq,
 	 * want to update the scale factor with information from CPUFREQ.
 	 * Instead the scale factor will be updated from arch_scale_freq_tick.
*/ - if (arch_freq_counters_available(cpus)) + if (supports_scale_freq_counters(cpus)) return; scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq; for_each_cpu(i, cpus) - per_cpu(freq_scale, i) = scale; + per_cpu(arch_freq_scale, i) = scale; } DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 414fd7f21768..820549fdb167 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -23,18 +23,30 @@ static inline unsigned long topology_get_cpu_scale(int cpu) void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); -DECLARE_PER_CPU(unsigned long, freq_scale); +DECLARE_PER_CPU(unsigned long, arch_freq_scale); static inline unsigned long topology_get_freq_scale(int cpu) { - return per_cpu(freq_scale, cpu); + return per_cpu(arch_freq_scale, cpu); } void topology_set_freq_scale(const struct cpumask *cpus, unsigned long cur_freq, unsigned long max_freq); bool topology_scale_freq_invariant(void); -bool arch_freq_counters_available(const struct cpumask *cpus); +enum scale_freq_source { + SCALE_FREQ_SOURCE_CPUFREQ = 0, + SCALE_FREQ_SOURCE_ARCH, +}; + +struct scale_freq_data { + enum scale_freq_source source; + void (*set_freq_scale)(void); +}; + +void topology_scale_freq_tick(void); +void topology_set_scale_freq_source(struct scale_freq_data *data, const struct cpumask *cpus); +void topology_clear_scale_freq_source(enum scale_freq_source source, const struct cpumask *cpus); DECLARE_PER_CPU(unsigned long, thermal_pressure); -- Gitee From e599b101287a59e414c8d9dc2562b0bb0848ec38 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 18 Jun 2021 13:42:23 +0530 Subject: [PATCH 02/21] cpufreq: CPPC: Pass structure instance by reference and Add support for frequency invariance ANBZ: #26358 commit 4c38f2df71c8e33c0b64865992d693f5022eeaad upstream The Frequency Invariance Engine (FIE) is providing a frequency scaling correction factor that helps achieve more accurate load-tracking. Normally, this scaling factor can be obtained directly with the help of the cpufreq drivers as they know the exact frequency the hardware is running at. But that isn't the case for CPPC cpufreq driver. Another way of obtaining that is using the arch specific counter support, which is already present in kernel, but that hardware is optional for platforms. This patch updates the CPPC driver to register itself with the topology core to provide its own implementation (cppc_scale_freq_tick()) of topology_scale_freq_tick() which gets called by the scheduler on every tick. Note that the arch specific counters have higher priority than CPPC counters, if available, though the CPPC driver doesn't need to have any special handling for that. On an invocation of cppc_scale_freq_tick(), we schedule an irq work (since we reach here from hard-irq context), which then schedules a normal work item and cppc_scale_freq_workfn() updates the per_cpu arch_freq_scale variable based on the counter updates since the last tick. To allow platforms to disable this CPPC counter-based frequency invariance support, this is all done under CONFIG_ACPI_CPPC_CPUFREQ_FIE, which is enabled by default. This also exports sched_setattr_nocheck() as the CPPC driver can be built as a module. Cc: linux-acpi@vger.kernel.org Reviewed-by: Ionela Voinescu Tested-by: Ionela Voinescu Tested-by: Vincent Guittot Acked-by: Rafael J. 
Wysocki Signed-off-by: Viresh Kumar --- drivers/cpufreq/Kconfig.arm | 10 ++ drivers/cpufreq/cppc_cpufreq.c | 263 ++++++++++++++++++++++++++++++--- include/linux/arch_topology.h | 1 + kernel/sched/core.c | 1 + 4 files changed, 255 insertions(+), 20 deletions(-) diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 1f73fa75b1a0..4a95897f1d67 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -19,6 +19,16 @@ config ACPI_CPPC_CPUFREQ If in doubt, say N. +config ACPI_CPPC_CPUFREQ_FIE + bool "Frequency Invariance support for CPPC cpufreq driver" + depends on ACPI_CPPC_CPUFREQ && GENERIC_ARCH_TOPOLOGY + default y + help + This extends frequency invariance support in the CPPC cpufreq driver, + by using CPPC delivered and reference performance counters. + + If in doubt, say N. + config ARM_ALLWINNER_SUN50I_CPUFREQ_NVMEM tristate "Allwinner nvmem based SUN50I CPUFreq driver" depends on ARCH_SUNXI diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index 1c3f7a8d40eb..62dc5bb969a5 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -10,14 +10,18 @@ #define pr_fmt(fmt) "CPPC Cpufreq:" fmt +#include #include #include #include #include #include #include +#include +#include #include #include +#include #include @@ -57,6 +61,215 @@ static struct cppc_workaround_oem_info wa_info[] = { } }; +#ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE + +/* Frequency invariance support */ +struct cppc_freq_invariance { + int cpu; + struct irq_work irq_work; + struct kthread_work work; + struct cppc_perf_fb_ctrs prev_perf_fb_ctrs; + struct cppc_cpudata *cpu_data; +}; + +static DEFINE_PER_CPU(struct cppc_freq_invariance, cppc_freq_inv); +static struct kthread_worker *kworker_fie; +static bool fie_disabled; + +static struct cpufreq_driver cppc_cpufreq_driver; +static unsigned int hisi_cppc_cpufreq_get_rate(unsigned int cpu); +static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data, + struct cppc_perf_fb_ctrs *fb_ctrs_t0, + struct cppc_perf_fb_ctrs *fb_ctrs_t1); + +/** + * cppc_scale_freq_workfn - CPPC arch_freq_scale updater for frequency invariance + * @work: The work item. + * + * The CPPC driver register itself with the topology core to provide its own + * implementation (cppc_scale_freq_tick()) of topology_scale_freq_tick() which + * gets called by the scheduler on every tick. + * + * Note that the arch specific counters have higher priority than CPPC counters, + * if available, though the CPPC driver doesn't need to have any special + * handling for that. + * + * On an invocation of cppc_scale_freq_tick(), we schedule an irq work (since we + * reach here from hard-irq context), which then schedules a normal work item + * and cppc_scale_freq_workfn() updates the per_cpu arch_freq_scale variable + * based on the counter updates since the last tick. 
+ */
+static void cppc_scale_freq_workfn(struct kthread_work *work)
+{
+	struct cppc_freq_invariance *cppc_fi;
+	struct cppc_perf_fb_ctrs fb_ctrs = {0};
+	struct cppc_cpudata *cpu_data;
+	unsigned long local_freq_scale;
+	u64 perf;
+
+	cppc_fi = container_of(work, struct cppc_freq_invariance, work);
+	cpu_data = cppc_fi->cpu_data;
+
+	if (cppc_get_perf_ctrs(cppc_fi->cpu, &fb_ctrs)) {
+		pr_warn("%s: failed to read perf counters\n", __func__);
+		return;
+	}
+
+	perf = cppc_perf_from_fbctrs(cpu_data, &cppc_fi->prev_perf_fb_ctrs,
+				     &fb_ctrs);
+	cppc_fi->prev_perf_fb_ctrs = fb_ctrs;
+
+	perf <<= SCHED_CAPACITY_SHIFT;
+	local_freq_scale = div64_u64(perf, cpu_data->perf_caps.highest_perf);
+	if (WARN_ON(local_freq_scale > 1024))
+		local_freq_scale = 1024;
+
+	per_cpu(arch_freq_scale, cppc_fi->cpu) = local_freq_scale;
+}
+
+static void cppc_irq_work(struct irq_work *irq_work)
+{
+	struct cppc_freq_invariance *cppc_fi;
+
+	cppc_fi = container_of(irq_work, struct cppc_freq_invariance, irq_work);
+	kthread_queue_work(kworker_fie, &cppc_fi->work);
+}
+
+static void cppc_scale_freq_tick(void)
+{
+	struct cppc_freq_invariance *cppc_fi = &per_cpu(cppc_freq_inv, smp_processor_id());
+
+	/*
+	 * cppc_get_perf_ctrs() can potentially sleep, call that from the right
+	 * context.
+	 */
+	irq_work_queue(&cppc_fi->irq_work);
+}
+
+static struct scale_freq_data cppc_sftd = {
+	.source = SCALE_FREQ_SOURCE_CPPC,
+	.set_freq_scale = cppc_scale_freq_tick,
+};
+
+static void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy)
+{
+	struct cppc_freq_invariance *cppc_fi;
+	int cpu, ret;
+
+	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+		return;
+
+	for_each_cpu(cpu, policy->cpus) {
+		cppc_fi = &per_cpu(cppc_freq_inv, cpu);
+		cppc_fi->cpu = cpu;
+		cppc_fi->cpu_data = policy->driver_data;
+		kthread_init_work(&cppc_fi->work, cppc_scale_freq_workfn);
+		init_irq_work(&cppc_fi->irq_work, cppc_irq_work);
+
+		ret = cppc_get_perf_ctrs(cpu, &cppc_fi->prev_perf_fb_ctrs);
+		if (ret) {
+			pr_warn("%s: failed to read perf counters for cpu:%d: %d\n",
+				__func__, cpu, ret);
+
+			/*
+			 * Don't abort if the CPU was offline while the driver
+			 * was getting registered.
+			 */
+			if (cpu_online(cpu))
+				return;
+		}
+	}
+
+	/* Register for freq-invariance */
+	topology_set_scale_freq_source(&cppc_sftd, policy->cpus);
+}
+
+/*
+ * We free all the resources on policy's removal and not on CPU removal as the
+ * irq-work are per-cpu and the hotplug core takes care of flushing the pending
+ * irq-works (hint: smpcfd_dying_cpu()) on CPU hotplug. Even if the kthread-work
+ * fires on another CPU after the concerned CPU is removed, it won't harm.
+ *
+ * We just need to make sure to remove them all on policy->exit().
+ */
+static void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy)
+{
+	struct cppc_freq_invariance *cppc_fi;
+	int cpu;
+
+	if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate)
+		return;
+
+	/* policy->cpus will be empty here, use related_cpus instead */
+	topology_clear_scale_freq_source(SCALE_FREQ_SOURCE_CPPC, policy->related_cpus);
+
+	for_each_cpu(cpu, policy->related_cpus) {
+		cppc_fi = &per_cpu(cppc_freq_inv, cpu);
+		irq_work_sync(&cppc_fi->irq_work);
+		kthread_cancel_work_sync(&cppc_fi->work);
+	}
+}
+
+static void __init cppc_freq_invariance_init(void)
+{
+	struct sched_attr attr = {
+		.size		= sizeof(struct sched_attr),
+		.sched_policy	= SCHED_DEADLINE,
+		.sched_nice	= 0,
+		.sched_priority	= 0,
+		/*
+		 * Fake (unused) bandwidth; workaround to "fix"
+		 * priority inheritance.
+ */ + .sched_runtime = 1000000, + .sched_deadline = 10000000, + .sched_period = 10000000, + }; + int ret; + + if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + return; + + kworker_fie = kthread_create_worker(0, "cppc_fie"); + if (IS_ERR(kworker_fie)) + return; + + ret = sched_setattr_nocheck(kworker_fie->task, &attr); + if (ret) { + pr_warn("%s: failed to set SCHED_DEADLINE: %d\n", __func__, + ret); + kthread_destroy_worker(kworker_fie); + return; + } +} + +static void cppc_freq_invariance_exit(void) +{ + if (cppc_cpufreq_driver.get == hisi_cppc_cpufreq_get_rate) + return; + + kthread_destroy_worker(kworker_fie); + kworker_fie = NULL; +} + +#else +static inline void cppc_cpufreq_cpu_fie_init(struct cpufreq_policy *policy) +{ +} + +static inline void cppc_cpufreq_cpu_fie_exit(struct cpufreq_policy *policy) +{ +} + +static inline void cppc_freq_invariance_init(void) +{ +} + +static inline void cppc_freq_invariance_exit(void) +{ +} +#endif /* CONFIG_ACPI_CPPC_CPUFREQ_FIE */ + /* Callback function used to retrieve the max frequency from DMI */ static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private) { @@ -350,10 +563,10 @@ static int cppc_cpufreq_cpu_init(struct cpufreq_policy *policy) cpu_data->perf_ctrls.desired_perf = cpu_data->perf_caps.highest_perf; ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls); - if (ret) + if (ret) { pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n", cpu_data->perf_caps.highest_perf, cpu, ret); - + } return ret; } @@ -365,28 +578,25 @@ static inline u64 get_delta(u64 t1, u64 t0) return (u32)t1 - (u32)t0; } -static int cppc_get_rate_from_fbctrs(struct cppc_cpudata *cpu_data, - struct cppc_perf_fb_ctrs fb_ctrs_t0, - struct cppc_perf_fb_ctrs fb_ctrs_t1) +static int cppc_perf_from_fbctrs(struct cppc_cpudata *cpu_data, + struct cppc_perf_fb_ctrs *fb_ctrs_t0, + struct cppc_perf_fb_ctrs *fb_ctrs_t1) { u64 delta_reference, delta_delivered; - u64 reference_perf, delivered_perf; + u64 reference_perf; - reference_perf = fb_ctrs_t0.reference_perf; + reference_perf = fb_ctrs_t0->reference_perf; - delta_reference = get_delta(fb_ctrs_t1.reference, - fb_ctrs_t0.reference); - delta_delivered = get_delta(fb_ctrs_t1.delivered, - fb_ctrs_t0.delivered); + delta_reference = get_delta(fb_ctrs_t1->reference, + fb_ctrs_t0->reference); + delta_delivered = get_delta(fb_ctrs_t1->delivered, + fb_ctrs_t0->delivered); - /* Check to avoid divide-by zero */ - if (delta_reference || delta_delivered) - delivered_perf = (reference_perf * delta_delivered) / - delta_reference; - else - delivered_perf = cpu_data->perf_ctrls.desired_perf; + /* Check to avoid divide-by zero and invalid delivered_perf */ + if (!delta_reference || !delta_delivered) + return cpu_data->perf_ctrls.desired_perf; - return cppc_cpufreq_perf_to_khz(cpu_data, delivered_perf); + return (reference_perf * delta_delivered) / delta_reference; } static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) @@ -394,6 +604,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) struct cppc_perf_fb_ctrs fb_ctrs_t0 = {0}, fb_ctrs_t1 = {0}; struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); struct cppc_cpudata *cpu_data = policy->driver_data; + u64 delivered_perf; int ret; cpufreq_cpu_put(policy); @@ -408,7 +619,10 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) if (ret) return ret; - return cppc_get_rate_from_fbctrs(cpu_data, fb_ctrs_t0, fb_ctrs_t1); + delivered_perf = cppc_perf_from_fbctrs(cpu_data, &fb_ctrs_t0, + &fb_ctrs_t1); + + return cppc_cpufreq_perf_to_khz(cpu_data, 
delivered_perf); } static int cppc_cpufreq_set_boost(struct cpufreq_policy *policy, int state) @@ -494,14 +708,22 @@ static void cppc_check_hisi_workaround(void) static int __init cppc_cpufreq_init(void) { + int ret; + if ((acpi_disabled) || !acpi_cpc_valid()) return -ENODEV; INIT_LIST_HEAD(&cpu_data_list); cppc_check_hisi_workaround(); + cppc_freq_invariance_init(); + - return cpufreq_register_driver(&cppc_cpufreq_driver); + ret = cpufreq_register_driver(&cppc_cpufreq_driver); + if (ret) + cppc_freq_invariance_exit(); + + return ret; } static inline void free_cpu_data(void) @@ -519,6 +741,7 @@ static inline void free_cpu_data(void) static void __exit cppc_cpufreq_exit(void) { cpufreq_unregister_driver(&cppc_cpufreq_driver); + cppc_freq_invariance_exit(); free_cpu_data(); } diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 820549fdb167..c6abbfd928ab 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -37,6 +37,7 @@ bool topology_scale_freq_invariant(void); enum scale_freq_source { SCALE_FREQ_SOURCE_CPUFREQ = 0, SCALE_FREQ_SOURCE_ARCH, + SCALE_FREQ_SOURCE_CPPC, }; struct scale_freq_data { diff --git a/kernel/sched/core.c b/kernel/sched/core.c index cb4cb2bc16d7..7442b978997b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6770,6 +6770,7 @@ int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) { return __sched_setscheduler(p, attr, false, true); } +EXPORT_SYMBOL_GPL(sched_setattr_nocheck); /** * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. -- Gitee From a1d97b9fb3ce6d5b8ad50c0159b3c4730417a290 Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Fri, 6 Nov 2020 12:53:33 +0000 Subject: [PATCH 03/21] arm64: split counter validation function ANBZ: #26358 commit bc3b6562a1acc1f43ae84c7d69428cdd8ae1390e upstream In order for the counter validation function to be reused, split validate_cpu_freq_invariance_counters() into: - freq_counters_valid(cpu) - check cpu for valid cycle counters - freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) - generic function that sets the normalization ratio used by topology_scale_freq_tick() Signed-off-by: Ionela Voinescu Reviewed-by: Sudeep Holla Cc: Will Deacon Link: https://lore.kernel.org/r/20201106125334.21570-3-ionela.voinescu@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/topology.c | 44 +++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 18 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 4d26d005cb0d..688f5b155a2b 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -295,45 +295,49 @@ void init_cpu_freq_invariance_counters(void) read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0)); } -static int validate_cpu_freq_invariance_counters(int cpu) +static inline bool freq_counters_valid(int cpu) { - u64 max_freq_hz, ratio; - if (!cpu_has_amu_feat(cpu)) { pr_debug("CPU%d: counters are not supported.\n", cpu); - return -EINVAL; + return false; } if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) || !per_cpu(arch_core_cycles_prev, cpu))) { pr_debug("CPU%d: cycle counters are not enabled.\n", cpu); - return -EINVAL; + return false; } - /* Convert maximum frequency from KHz to Hz and validate */ - max_freq_hz = cpufreq_get_hw_max_freq(cpu) * 1000; - if (unlikely(!max_freq_hz)) { - pr_debug("CPU%d: invalid maximum frequency.\n", cpu); + return true; +} + +static int freq_inv_set_max_ratio(int cpu, u64 max_rate, 
u64 ref_rate) +{ + u64 ratio; + + if (unlikely(!max_rate || !ref_rate)) { + pr_debug("CPU%d: invalid maximum or reference frequency.\n", + cpu); return -EINVAL; } /* * Pre-compute the fixed ratio between the frequency of the constant - * counter and the maximum frequency of the CPU. + * reference counter and the maximum frequency of the CPU. * - * const_freq - * arch_max_freq_scale = ---------------- * SCHED_CAPACITY_SCALE² - * cpuinfo_max_freq + * ref_rate + * arch_max_freq_scale = ---------- * SCHED_CAPACITY_SCALE² + * max_rate * * We use a factor of 2 * SCHED_CAPACITY_SHIFT -> SCHED_CAPACITY_SCALE² * in order to ensure a good resolution for arch_max_freq_scale for - * very low arch timer frequencies (down to the KHz range which should + * very low reference frequencies (down to the KHz range which should * be unlikely). */ - ratio = (u64)arch_timer_get_rate() << (2 * SCHED_CAPACITY_SHIFT); - ratio = div64_u64(ratio, max_freq_hz); + ratio = ref_rate << (2 * SCHED_CAPACITY_SHIFT); + ratio = div64_u64(ratio, max_rate); if (!ratio) { - WARN_ONCE(1, "System timer frequency too low.\n"); + WARN_ONCE(1, "Reference frequency too low.\n"); return -EINVAL; } @@ -380,8 +384,12 @@ static int __init init_amu_fie(void) } for_each_present_cpu(cpu) { - if (validate_cpu_freq_invariance_counters(cpu)) + if (!freq_counters_valid(cpu) || + freq_inv_set_max_ratio(cpu, + cpufreq_get_hw_max_freq(cpu) * 1000, + arch_timer_get_rate())) continue; + cpumask_set_cpu(cpu, valid_cpus); have_policy |= enable_policy_freq_counters(cpu, valid_cpus); } -- Gitee From 0ed824eaa344751b40a38e9ee5ef8a65746c4231 Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Tue, 27 Oct 2020 18:07:12 +0000 Subject: [PATCH 04/21] arm64: Rebuild sched domains on invariance status changes ANBZ: #26358 commit ecec9e86d1a366f97c827ab4a8134ec06ccf031a upstream Task scheduler behavior depends on frequency invariance (FI) support and the resulting invariant load tracking signals. For example, in order to make accurate predictions across CPUs for all performance states, Energy Aware Scheduling (EAS) needs frequency-invariant load tracking signals and therefore it has a direct dependency on FI. This dependency is known, but EAS enablement is not yet conditioned on the presence of FI during the built of the scheduling domain hierarchy. Before this is done, the following must be considered: while arch_scale_freq_invariant() will see changes in FI support and could be used to condition the use of EAS, it could return different values during system initialisation. For arm64, such a scenario will happen for a system that does not support cpufreq driven FI, but does support counter-driven FI. For such a system, arch_scale_freq_invariant() will return false if called before counter based FI initialisation, but change its status to true after it. If EAS becomes explicitly dependent on FI this would affect the task scheduler behavior which builds its scheduling domain hierarchy well before the late counter-based FI init. During that process, EAS would be disabled due to its dependency on FI. Two points of future early calls to arch_scale_freq_invariant() which would determine EAS enablement are: - (1) drivers/base/arch_topology.c:126 <> rebuild_sched_domains(); This will happen after CPU capacity initialisation. - (2) kernel/sched/cpufreq_schedutil.c:917 <> rebuild_sched_domains_energy(); -->rebuild_sched_domains(); This will happen during sched_cpufreq_governor_change() for the schedutil cpufreq governor. 
Therefore, before enforcing the presence of FI support for the use of EAS, ensure the following: if there is a change in FI support status after counter init, use the existing rebuild_sched_domains_energy() function to trigger a rebuild of the scheduling and performance domains that in turn will determine the enablement of EAS. Signed-off-by: Ionela Voinescu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Catalin Marinas Link: https://lkml.kernel.org/r/20201027180713.7642-3-ionela.voinescu@arm.com --- arch/arm64/kernel/topology.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 688f5b155a2b..3dde1627f18c 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -370,6 +370,7 @@ static DEFINE_STATIC_KEY_FALSE(amu_fie_key); static int __init init_amu_fie(void) { + bool invariance_status = topology_scale_freq_invariant(); cpumask_var_t valid_cpus; bool have_policy = false; int ret = 0; @@ -416,6 +417,15 @@ static int __init init_amu_fie(void) if (!topology_scale_freq_invariant()) static_branch_disable(&amu_fie_key); + /* + * Task scheduler behavior depends on frequency invariance support, + * either cpufreq or counter driven. If the support status changes as + * a result of counter initialisation and use, retrigger the build of + * scheduling domains to ensure the information is propagated properly. + */ + if (invariance_status != topology_scale_freq_invariant()) + rebuild_sched_domains_energy(); + free_valid_mask: free_cpumask_var(valid_cpus); -- Gitee From 00742471a020b88067fb8127ab36f4496ea5b566 Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Tue, 27 Oct 2020 18:07:11 +0000 Subject: [PATCH 05/21] sched/topology,schedutil: Wrap sched domains rebuild ANBZ: #26358 commit 31f6a8c0a471be7d7d05c93eac50fcb729e79b9d upstream Add the rebuild_sched_domains_energy() function to wrap the functionality that rebuilds the scheduling domains if any of the Energy Aware Scheduling (EAS) initialisation conditions change. This functionality is used when schedutil is added or removed or when EAS is enabled or disabled through the sched_energy_aware sysctl. Therefore, create a single function that is used in both these cases and that can be later reused. Signed-off-by: Ionela Voinescu Signed-off-by: Peter Zijlstra (Intel) Acked-by: Quentin Perret Acked-by: Rafael J. Wysocki Link: https://lkml.kernel.org/r/20201027180713.7642-2-ionela.voinescu@arm.com --- include/linux/sched/topology.h | 8 ++++++++ kernel/sched/cpufreq_schedutil.c | 9 +-------- kernel/sched/topology.c | 18 +++++++++++------- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index e6c21dbe4796..33e6c9b550e0 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -259,6 +259,14 @@ static inline bool cpus_share_resources(int this_cpu, int that_cpu) #endif /* !CONFIG_SMP */ +#if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +extern void rebuild_sched_domains_energy(void); +#else +static inline void rebuild_sched_domains_energy(void) +{ +} +#endif + #ifndef arch_scale_cpu_capacity /** * arch_scale_cpu_capacity - get the capacity scale factor of a given CPU. 
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 23e86924ef11..a8a544108e5a 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -951,16 +951,9 @@ struct cpufreq_governor *cpufreq_default_governor(void) cpufreq_governor_init(schedutil_gov); #ifdef CONFIG_ENERGY_MODEL -extern bool sched_energy_update; -extern struct mutex sched_energy_mutex; - static void rebuild_sd_workfn(struct work_struct *work) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = true; - rebuild_sched_domains(); - sched_energy_update = false; - mutex_unlock(&sched_energy_mutex); + rebuild_sched_domains_energy(); } static DECLARE_WORK(rebuild_sd_work, rebuild_sd_workfn); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index f533c6a00f3e..7f87234ca9d8 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -211,6 +211,15 @@ unsigned int sysctl_sched_energy_aware = 1; DEFINE_MUTEX(sched_energy_mutex); bool sched_energy_update; +void rebuild_sched_domains_energy(void) +{ + mutex_lock(&sched_energy_mutex); + sched_energy_update = true; + rebuild_sched_domains(); + sched_energy_update = false; + mutex_unlock(&sched_energy_mutex); +} + #ifdef CONFIG_PROC_SYSCTL int sched_energy_aware_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) @@ -223,13 +232,8 @@ int sched_energy_aware_handler(struct ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { state = static_branch_unlikely(&sched_energy_present); - if (state != sysctl_sched_energy_aware) { - mutex_lock(&sched_energy_mutex); - sched_energy_update = 1; - rebuild_sched_domains(); - sched_energy_update = 0; - mutex_unlock(&sched_energy_mutex); - } + if (state != sysctl_sched_energy_aware) + rebuild_sched_domains_energy(); } return ret; -- Gitee From e4fafe149cd952b73bf9b8feac8bafeef942121d Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 8 Jan 2021 16:46:51 +0530 Subject: [PATCH 06/21] arm64: topology: Avoid the have_policy check ANBZ: #26358 commit 384e5699e101b1ee184590a39ee60f10ec0d36b6 upstream Every time I have stumbled upon this routine, I get confused with the way 'have_policy' is used and I have to dig in to understand why is it so. Here is an attempt to make it easier to understand, and hopefully it is an improvement. The 'have_policy' check was just an optimization to avoid writing to amu_fie_cpus in case we don't have to, but that optimization itself is creating more confusion than the real work. Lets just do that if all the CPUs support AMUs. It is much cleaner that way. 
Reviewed-by: Ionela Voinescu Signed-off-by: Viresh Kumar Tested-by: Ionela Voinescu Link: https://lore.kernel.org/r/c125766c4be93461772015ac7c9a6ae45d5756f6.1610104461.git.viresh.kumar@linaro.org Signed-off-by: Will Deacon --- arch/arm64/kernel/topology.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 3dde1627f18c..94669153d4e7 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -346,14 +346,14 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) return 0; } -static inline bool +static inline void enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus) { struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); if (!policy) { pr_debug("CPU%d: No cpufreq policy found.\n", cpu); - return false; + return; } if (cpumask_subset(policy->related_cpus, valid_cpus)) @@ -361,8 +361,6 @@ enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus) amu_fie_cpus); cpufreq_cpu_put(policy); - - return true; } static DEFINE_STATIC_KEY_FALSE(amu_fie_key); @@ -372,7 +370,6 @@ static int __init init_amu_fie(void) { bool invariance_status = topology_scale_freq_invariant(); cpumask_var_t valid_cpus; - bool have_policy = false; int ret = 0; int cpu; @@ -392,17 +389,12 @@ static int __init init_amu_fie(void) continue; cpumask_set_cpu(cpu, valid_cpus); - have_policy |= enable_policy_freq_counters(cpu, valid_cpus); + enable_policy_freq_counters(cpu, valid_cpus); } - /* - * If we are not restricted by cpufreq policies, we only enable - * the use of the AMU feature for FIE if all CPUs support AMU. - * Otherwise, enable_policy_freq_counters has already enabled - * policy cpus. - */ - if (!have_policy && cpumask_equal(valid_cpus, cpu_present_mask)) - cpumask_or(amu_fie_cpus, amu_fie_cpus, valid_cpus); + /* Overwrite amu_fie_cpus if all CPUs support AMU */ + if (cpumask_equal(valid_cpus, cpu_present_mask)) + cpumask_copy(amu_fie_cpus, cpu_present_mask); if (!cpumask_empty(amu_fie_cpus)) { pr_info("CPUs[%*pbl]: counters will be used for FIE.", -- Gitee From 625ca8d85d73cd88f5434574de38d9a3c9a50167 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 8 Jan 2021 16:46:52 +0530 Subject: [PATCH 07/21] arm64: topology: Reorder init_amu_fie() a bit ANBZ: #26358 commit 47b10b737c0794c2fa584d7c8103b485e274ed51 upstream This patch does a couple of optimizations in init_amu_fie(), like early exits from paths where we don't need to continue any further, avoid the enable/disable dance, moving the calls to topology_scale_freq_invariant() just when we need them, instead of at the top of the routine, and avoiding calling it for the third time. 
Reviewed-by: Ionela Voinescu Signed-off-by: Viresh Kumar Tested-by: Ionela Voinescu Link: https://lore.kernel.org/r/a732e71ab9ec28c354eb28dd898c9b47d490863f.1610104461.git.viresh.kumar@linaro.org Signed-off-by: Will Deacon --- arch/arm64/kernel/topology.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 94669153d4e7..15e4a72e9ccd 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -368,8 +368,8 @@ static DEFINE_STATIC_KEY_FALSE(amu_fie_key); static int __init init_amu_fie(void) { - bool invariance_status = topology_scale_freq_invariant(); cpumask_var_t valid_cpus; + bool invariant; int ret = 0; int cpu; @@ -396,18 +396,19 @@ static int __init init_amu_fie(void) if (cpumask_equal(valid_cpus, cpu_present_mask)) cpumask_copy(amu_fie_cpus, cpu_present_mask); - if (!cpumask_empty(amu_fie_cpus)) { - pr_info("CPUs[%*pbl]: counters will be used for FIE.", - cpumask_pr_args(amu_fie_cpus)); - static_branch_enable(&amu_fie_key); - } + if (cpumask_empty(amu_fie_cpus)) + goto free_valid_mask; - /* - * If the system is not fully invariant after AMU init, disable - * partial use of counters for frequency invariance. - */ - if (!topology_scale_freq_invariant()) - static_branch_disable(&amu_fie_key); + invariant = topology_scale_freq_invariant(); + + /* We aren't fully invariant yet */ + if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask)) + goto free_valid_mask; + + static_branch_enable(&amu_fie_key); + + pr_info("CPUs[%*pbl]: counters will be used for FIE.", + cpumask_pr_args(amu_fie_cpus)); /* * Task scheduler behavior depends on frequency invariance support, @@ -415,7 +416,7 @@ static int __init init_amu_fie(void) * a result of counter initialisation and use, retrigger the build of * scheduling domains to ensure the information is propagated properly. */ - if (invariance_status != topology_scale_freq_invariant()) + if (!invariant) rebuild_sched_domains_energy(); free_valid_mask: -- Gitee From bd8802543b239125a706eefe4608faf2421b9e81 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 8 Jan 2021 16:46:53 +0530 Subject: [PATCH 08/21] arm64: topology: Make AMUs work with modular cpufreq drivers ANBZ: #26358 commit a5f1b187cd24f7da35eb7a48fc50839c554d52a2 upstream The AMU counters won't get used today if the cpufreq driver is built as a module as the amu core requires everything to be ready by late init. Fix that properly by registering for cpufreq policy notifier. Note that the amu core don't have any cpufreq dependency after the first time CPUFREQ_CREATE_POLICY notifier is called for all the CPUs. And so we don't need to do anything on the CPUFREQ_REMOVE_POLICY notifier. And for the same reason we check if the CPUs are already parsed in the beginning of amu_fie_setup() and skip if that is true. Alternatively we can shoot a work from there to unregister the notifier instead, but that seemed too much instead of this simple check. While at it, convert the print message to pr_debug instead of pr_info. 
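Because the policy notifier is the heart of this change, here is a generic,
self-contained sketch of the pattern, separate from the diff below. The
example_* names are invented; CPUFREQ_POLICY_NOTIFIER, CPUFREQ_CREATE_POLICY
and cpufreq_register_notifier() are the real cpufreq interfaces this patch
uses.

#include <linux/cpufreq.h>
#include <linux/init.h>

static int example_policy_callback(struct notifier_block *nb,
				   unsigned long val, void *data)
{
	struct cpufreq_policy *policy = data;

	/* Fires whenever a policy is created, even by a late-loaded module. */
	if (val == CPUFREQ_CREATE_POLICY)
		pr_debug("policy created for CPUs %*pbl\n",
			 cpumask_pr_args(policy->related_cpus));

	return 0;
}

static struct notifier_block example_policy_nb = {
	.notifier_call = example_policy_callback,
};

static int __init example_init(void)
{
	return cpufreq_register_notifier(&example_policy_nb,
					 CPUFREQ_POLICY_NOTIFIER);
}
core_initcall(example_init);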
Signed-off-by: Viresh Kumar Reviewed-by: Ionela Voinescu Tested-by: Ionela Voinescu Link: https://lore.kernel.org/r/89c1921334443e133c9c8791b4693607d65ed9f5.1610104461.git.viresh.kumar@linaro.org Signed-off-by: Will Deacon --- arch/arm64/kernel/topology.c | 92 +++++++++++++++++++----------------- 1 file changed, 48 insertions(+), 44 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 15e4a72e9ccd..e4a24542de49 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -346,69 +346,38 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) return 0; } -static inline void -enable_policy_freq_counters(int cpu, cpumask_var_t valid_cpus) -{ - struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); - - if (!policy) { - pr_debug("CPU%d: No cpufreq policy found.\n", cpu); - return; - } - - if (cpumask_subset(policy->related_cpus, valid_cpus)) - cpumask_or(amu_fie_cpus, policy->related_cpus, - amu_fie_cpus); - - cpufreq_cpu_put(policy); -} - static DEFINE_STATIC_KEY_FALSE(amu_fie_key); #define amu_freq_invariant() static_branch_unlikely(&amu_fie_key) -static int __init init_amu_fie(void) +static void amu_fie_setup(const struct cpumask *cpus) { - cpumask_var_t valid_cpus; bool invariant; - int ret = 0; int cpu; - if (!zalloc_cpumask_var(&valid_cpus, GFP_KERNEL)) - return -ENOMEM; - - if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) { - ret = -ENOMEM; - goto free_valid_mask; - } + /* We are already set since the last insmod of cpufreq driver */ + if (unlikely(cpumask_subset(cpus, amu_fie_cpus))) + return; - for_each_present_cpu(cpu) { + for_each_cpu(cpu, cpus) { if (!freq_counters_valid(cpu) || freq_inv_set_max_ratio(cpu, cpufreq_get_hw_max_freq(cpu) * 1000, arch_timer_get_rate())) - continue; - - cpumask_set_cpu(cpu, valid_cpus); - enable_policy_freq_counters(cpu, valid_cpus); + return; } - /* Overwrite amu_fie_cpus if all CPUs support AMU */ - if (cpumask_equal(valid_cpus, cpu_present_mask)) - cpumask_copy(amu_fie_cpus, cpu_present_mask); - - if (cpumask_empty(amu_fie_cpus)) - goto free_valid_mask; + cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); invariant = topology_scale_freq_invariant(); /* We aren't fully invariant yet */ if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask)) - goto free_valid_mask; + return; static_branch_enable(&amu_fie_key); - pr_info("CPUs[%*pbl]: counters will be used for FIE.", - cpumask_pr_args(amu_fie_cpus)); + pr_debug("CPUs[%*pbl]: counters will be used for FIE.", + cpumask_pr_args(cpus)); /* * Task scheduler behavior depends on frequency invariance support, @@ -418,12 +387,47 @@ static int __init init_amu_fie(void) */ if (!invariant) rebuild_sched_domains_energy(); +} -free_valid_mask: - free_cpumask_var(valid_cpus); +static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct cpufreq_policy *policy = data; + + if (val == CPUFREQ_CREATE_POLICY) + amu_fie_setup(policy->related_cpus); + + /* + * We don't need to handle CPUFREQ_REMOVE_POLICY event as the AMU + * counters don't have any dependency on cpufreq driver once we have + * initialized AMU support and enabled invariance. The AMU counters will + * keep on working just fine in the absence of the cpufreq driver, and + * for the CPUs for which there are no counters available, the last set + * value of freq_scale will remain valid as that is the frequency those + * CPUs are running at. 
+ */ + + return 0; +} + +static struct notifier_block init_amu_fie_notifier = { + .notifier_call = init_amu_fie_callback, +}; + +static int __init init_amu_fie(void) +{ + int ret; + + if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) + return -ENOMEM; + + ret = cpufreq_register_notifier(&init_amu_fie_notifier, + CPUFREQ_POLICY_NOTIFIER); + if (ret) + free_cpumask_var(amu_fie_cpus); return ret; } -late_initcall_sync(init_amu_fie); +core_initcall(init_amu_fie); #endif /* CONFIG_ARM64_AMU_EXTN */ -- Gitee From 2f9e490fd50a77b2d6e8e65d696b10ab74459fdd Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 10 Mar 2021 08:21:04 +0530 Subject: [PATCH 09/21] arch_topology: Allow multiple entities to provide sched_freq_tick() callback ANBZ: #26358 commit 01e055c120a46e78650b5f903088badbbdaae9ad upstream This patch attempts to make it generic enough so other parts of the kernel can also provide their own implementation of scale_freq_tick() callback, which is called by the scheduler periodically to update the per-cpu arch_freq_scale variable. The implementations now need to provide 'struct scale_freq_data' for the CPUs for which they have hardware counters available, and a callback gets registered for each possible CPU in a per-cpu variable. The arch specific (or ARM AMU) counters are updated to adapt to this and they take the highest priority if they are available, i.e. they will be used instead of CPPC based counters for example. The special code to rebuild the sched domains, in case invariance status change for the system, is moved out of arm64 specific code and is added to arch_topology.c. Note that this also defines SCALE_FREQ_SOURCE_CPUFREQ but doesn't use it and it is added to show that cpufreq is also acts as source of information for FIE and will be used by default if no other counters are supported for a platform. Reviewed-by: Ionela Voinescu Tested-by: Ionela Voinescu Acked-by: Will Deacon # for arm64 Tested-by: Vincent Guittot Signed-off-by: Viresh Kumar --- arch/arm64/include/asm/topology.h | 9 +--- arch/arm64/kernel/topology.c | 71 ++++++++++++++++++++++--------- 2 files changed, 52 insertions(+), 28 deletions(-) diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 665ba58d3382..49e9addbc6ab 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -21,14 +21,7 @@ int pcibus_to_node(struct pci_bus *bus); #include -#ifdef CONFIG_ARM64_AMU_EXTN -/* - * Replace task scheduler's default counter-based - * frequency-invariance scale factor setting. 
- */ -void topology_scale_freq_tick(void); -#define arch_scale_freq_tick topology_scale_freq_tick -#endif /* CONFIG_ARM64_AMU_EXTN */ +void update_freq_counters_refs(void); /* Replace task scheduler's default frequency-invariant accounting */ #define arch_scale_freq_tick topology_scale_freq_tick diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index e4a24542de49..ca6ae77da2d3 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -211,6 +211,12 @@ unsigned int arch_cpufreq_get_khz(int cpu) } #ifdef CONFIG_ARM64_AMU_EXTN +#define read_corecnt() read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0) +#define read_constcnt() read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0) +#else +#define read_corecnt() (0UL) +#define read_constcnt() (0UL) +#endif #undef pr_fmt #define pr_fmt(fmt) "AMU: " fmt @@ -221,6 +227,12 @@ static DEFINE_PER_CPU(u64, arch_const_cycles_prev); static DEFINE_PER_CPU(u64, arch_core_cycles_prev); static cpumask_var_t amu_fie_cpus; +void update_freq_counters_refs(void) +{ + this_cpu_write(arch_core_cycles_prev, read_corecnt()); + this_cpu_write(arch_const_cycles_prev, read_constcnt()); +} + /* * Sample cpu freq. * @@ -346,12 +358,47 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) return 0; } -static DEFINE_STATIC_KEY_FALSE(amu_fie_key); -#define amu_freq_invariant() static_branch_unlikely(&amu_fie_key) +static void amu_scale_freq_tick(void) +{ + u64 prev_core_cnt, prev_const_cnt; + u64 core_cnt, const_cnt, scale; + + prev_const_cnt = this_cpu_read(arch_const_cycles_prev); + prev_core_cnt = this_cpu_read(arch_core_cycles_prev); + + update_freq_counters_refs(); + + const_cnt = this_cpu_read(arch_const_cycles_prev); + core_cnt = this_cpu_read(arch_core_cycles_prev); + + if (unlikely(core_cnt <= prev_core_cnt || + const_cnt <= prev_const_cnt)) + return; + + /* + * /\core arch_max_freq_scale + * scale = ------- * -------------------- + * /\const SCHED_CAPACITY_SCALE + * + * See validate_cpu_freq_invariance_counters() for details on + * arch_max_freq_scale and the use of SCHED_CAPACITY_SHIFT. + */ + scale = core_cnt - prev_core_cnt; + scale *= this_cpu_read(arch_max_freq_scale); + scale = div64_u64(scale >> SCHED_CAPACITY_SHIFT, + const_cnt - prev_const_cnt); + + scale = min_t(unsigned long, scale, SCHED_CAPACITY_SCALE); + this_cpu_write(arch_freq_scale, (unsigned long)scale); +} + +static struct scale_freq_data amu_sfd = { + .source = SCALE_FREQ_SOURCE_ARCH, + .set_freq_scale = amu_scale_freq_tick, +}; static void amu_fie_setup(const struct cpumask *cpus) { - bool invariant; int cpu; /* We are already set since the last insmod of cpufreq driver */ @@ -368,25 +415,10 @@ static void amu_fie_setup(const struct cpumask *cpus) cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); - invariant = topology_scale_freq_invariant(); - - /* We aren't fully invariant yet */ - if (!invariant && !cpumask_equal(amu_fie_cpus, cpu_present_mask)) - return; - - static_branch_enable(&amu_fie_key); + topology_set_scale_freq_source(&amu_sfd, amu_fie_cpus); pr_debug("CPUs[%*pbl]: counters will be used for FIE.", cpumask_pr_args(cpus)); - - /* - * Task scheduler behavior depends on frequency invariance support, - * either cpufreq or counter driven. If the support status changes as - * a result of counter initialisation and use, retrigger the build of - * scheduling domains to ensure the information is propagated properly. 
- */ - if (!invariant) - rebuild_sched_domains_energy(); } static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val, @@ -430,4 +462,3 @@ static int __init init_amu_fie(void) } core_initcall(init_amu_fie); -#endif /* CONFIG_ARM64_AMU_EXTN */ -- Gitee From fd0b0c8690fa483aa2a07632556bdf95e1432a64 Mon Sep 17 00:00:00 2001 From: Sergey Shtylyov Date: Fri, 16 Sep 2022 23:17:07 +0300 Subject: [PATCH 10/21] arm64: topology: fix possible overflow in amu_fie_setup() ANBZ: #26358 commit d4955c0ad77dbc684fc716387070ac24801b8bca upstream cpufreq_get_hw_max_freq() returns max frequency in kHz as *unsigned int*, while freq_inv_set_max_ratio() gets passed this frequency in Hz as 'u64'. Multiplying max frequency by 1000 can potentially result in overflow -- multiplying by 1000ULL instead should avoid that... Found by Linux Verification Center (linuxtesting.org) with the SVACE static analysis tool. Fixes: cd0ed03a8903 ("arm64: use activity monitors for frequency invariance") Signed-off-by: Sergey Shtylyov Link: https://lore.kernel.org/r/01493d64-2bce-d968-86dc-11a122a9c07d@omp.ru Signed-off-by: Will Deacon --- arch/arm64/kernel/topology.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index ca6ae77da2d3..26711544b2cf 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -408,7 +408,7 @@ static void amu_fie_setup(const struct cpumask *cpus) for_each_cpu(cpu, cpus) { if (!freq_counters_valid(cpu) || freq_inv_set_max_ratio(cpu, - cpufreq_get_hw_max_freq(cpu) * 1000, + cpufreq_get_hw_max_freq(cpu) * 1000ULL, arch_timer_get_rate())) return; } -- Gitee From 8d623a5f4f3e599c372188c2d81c1adc4a451481 Mon Sep 17 00:00:00 2001 From: Lukasz Luba Date: Tue, 9 Nov 2021 19:57:10 +0000 Subject: [PATCH 11/21] arch_topology: Introduce thermal pressure update function ANBZ: #26358 commit c214f124161d446b340597e7c968e0a2dc149142 upstream The thermal pressure is a mechanism which is used for providing information about reduced CPU performance to the scheduler. Usually code has to convert the value from frequency units into capacity units, which are understandable by the scheduler. Create a common conversion code which can be just used via a handy API. Internally, the topology_update_thermal_pressure() operates on frequency in MHz and max CPU frequency is taken from 'freq_factor' (per-cpu). 
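To make the conversion concrete before the diff, here is the arithmetic with
made-up numbers; the computation mirrors the mult_frac() call this patch adds.

/*
 * Worked example (illustrative numbers only):
 *   max_capacity = 1024  from arch_scale_cpu_capacity(cpu)
 *   max_freq     = 2000  MHz, from the per-cpu 'freq_factor'
 *   capped_freq  = 1500  MHz, i.e. 1500000 kHz / 1000
 *
 *   capacity = mult_frac(1024, 1500, 2000) = 768
 *   thermal pressure = 1024 - 768 = 256
 *
 * If capped_freq >= max_freq (e.g. a boost frequency), capacity stays at
 * max_capacity and the pressure written back is 0, i.e. no throttling is
 * reported to the scheduler.
 */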
Signed-off-by: Lukasz Luba Reviewed-by: Thara Gopinath Signed-off-by: Viresh Kumar --- arch/arm/include/asm/topology.h | 1 + arch/arm64/include/asm/topology.h | 1 + drivers/base/arch_topology.c | 43 ++++++++++++++++++++++++++++++- include/linux/arch_topology.h | 3 +++ include/linux/sched/topology.h | 7 +++++ 5 files changed, 54 insertions(+), 1 deletion(-) diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index 470299ee2fba..f1eafacc9a30 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -24,6 +24,7 @@ /* Replace task scheduler's default thermal pressure API */ #define arch_scale_thermal_pressure topology_get_thermal_pressure #define arch_set_thermal_pressure topology_set_thermal_pressure +#define arch_update_thermal_pressure topology_update_thermal_pressure #else diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 49e9addbc6ab..09e742a863e6 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -38,6 +38,7 @@ void update_freq_counters_refs(void); /* Replace task scheduler's default thermal pressure API */ #define arch_scale_thermal_pressure topology_get_thermal_pressure #define arch_set_thermal_pressure topology_set_thermal_pressure +#define arch_update_thermal_pressure topology_update_thermal_pressure #include diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 394888aabb5f..30348e1a1d69 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -24,6 +24,7 @@ static DEFINE_PER_CPU(struct scale_freq_data *, sft_data); static struct cpumask scale_freq_counters_mask; static bool scale_freq_invariant; +static DEFINE_PER_CPU(u32, freq_factor) = 1; static bool supports_scale_freq_counters(const struct cpumask *cpus) { @@ -147,6 +148,47 @@ void topology_set_thermal_pressure(const struct cpumask *cpus, WRITE_ONCE(per_cpu(thermal_pressure, cpu), th_pressure); } +/** + * topology_update_thermal_pressure() - Update thermal pressure for CPUs + * @cpus : The related CPUs for which capacity has been reduced + * @capped_freq : The maximum allowed frequency that CPUs can run at + * + * Update the value of thermal pressure for all @cpus in the mask. The + * cpumask should include all (online+offline) affected CPUs, to avoid + * operating on stale data when hot-plug is used for some CPUs. The + * @capped_freq reflects the currently allowed max CPUs frequency due to + * thermal capping. It might be also a boost frequency value, which is bigger + * than the internal 'freq_factor' max frequency. In such case the pressure + * value should simply be removed, since this is an indication that there is + * no thermal throttling. The @capped_freq must be provided in kHz. + */ +void topology_update_thermal_pressure(const struct cpumask *cpus, + unsigned long capped_freq) +{ + unsigned long max_capacity, capacity; + u32 max_freq; + int cpu; + + cpu = cpumask_first(cpus); + max_capacity = arch_scale_cpu_capacity(cpu); + max_freq = per_cpu(freq_factor, cpu); + + /* Convert to MHz scale which is used in 'freq_factor' */ + capped_freq /= 1000; + + /* + * Handle properly the boost frequencies, which should simply clean + * the thermal pressure value. 
+ */ + if (max_freq <= capped_freq) + capacity = max_capacity; + else + capacity = mult_frac(max_capacity, capped_freq, max_freq); + + arch_set_thermal_pressure(cpus, max_capacity - capacity); +} +EXPORT_SYMBOL_GPL(topology_update_thermal_pressure); + static ssize_t cpu_capacity_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -204,7 +246,6 @@ static void update_topology_flags_workfn(struct work_struct *work) arch_rebuild_cpu_topology(); } -static DEFINE_PER_CPU(u32, freq_factor) = 1; static u32 *raw_capacity; static int free_raw_capacity(void) diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index c6abbfd928ab..49cc3e9ae423 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -59,6 +59,9 @@ static inline unsigned long topology_get_thermal_pressure(int cpu) void topology_set_thermal_pressure(const struct cpumask *cpus, unsigned long th_pressure); +void topology_update_thermal_pressure(const struct cpumask *cpus, + unsigned long capped_freq); + struct cpu_topology { int thread_id; int core_id; diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 33e6c9b550e0..eaeb2f06b402 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -300,6 +300,13 @@ void arch_set_thermal_pressure(const struct cpumask *cpus, { } #endif +#ifndef arch_update_thermal_pressure +static __always_inline +void arch_update_thermal_pressure(const struct cpumask *cpus, + unsigned long capped_frequency) +{ } +#endif + static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p)); -- Gitee From ab376ead1a7b5b5f5297bb5a58e7c01c14d4c314 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Tue, 7 Sep 2021 19:57:48 -0700 Subject: [PATCH 12/21] units: add the HZ macros ANBZ: #26358 commit e2c77032fcbe515194107994d12cd72ddb77b022 upstream The macros for the unit conversion for frequency are duplicated in different places. Provide these macros in the 'units' header, so they can be reused. Link: https://lkml.kernel.org/r/20210816114732.1834145-3-daniel.lezcano@linaro.org Signed-off-by: Daniel Lezcano Reviewed-by: Christian Eggers Reviewed-by: Andy Shevchenko Cc: Chanwoo Choi Cc: Guenter Roeck Cc: Jonathan Cameron Cc: Jonathan Cameron Cc: Kyungmin Park Cc: Lars-Peter Clausen Cc: Lukasz Luba Cc: Maxime Coquelin Cc: Miquel Raynal Cc: MyungJoo Ham Cc: Peter Meerwald Cc: "Rafael J. Wysocki" Cc: Zhang Rui Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/units.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/units.h b/include/linux/units.h index aaf716364ec3..f7dd7e8084f6 100644 --- a/include/linux/units.h +++ b/include/linux/units.h @@ -3,6 +3,9 @@ #define _LINUX_UNITS_H #include +#define HZ_PER_KHZ 1000UL +#define KHZ_PER_MHZ 1000UL +#define HZ_PER_MHZ 1000000UL #define ABSOLUTE_ZERO_MILLICELSIUS -273150 -- Gitee From 9c80ef9b3ae498ef7455483ad63a71cd9acacab4 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Mon, 11 Dec 2023 11:48:49 +0100 Subject: [PATCH 13/21] sched/topology: Add a new arch_scale_freq_ref() method ANBZ: #26358 commit 1535ad17fabec2b8ad61b1c8f21650c90d8cbbbc upstream Create a new method to get a unique and fixed max frequency. Currently cpuinfo.max_freq or the highest (or last) state of performance domain are used as the max frequency when computing the frequency for a level of utilization, but: - cpuinfo_max_freq can change at runtime. boost is one example of such change. 
- cpuinfo.max_freq and last item of the PD can be different leading to different results between cpufreq and energy model. We need to save the reference frequency that has been used when computing the CPUs capacity and use this fixed and coherent value to convert between frequency and CPU's capacity. In fact, we already save the frequency that has been used when computing the capacity of each CPU. We extend the precision to save kHz instead of MHz currently and we modify the type to be aligned with other variables used when converting frequency to capacity and the other way. [ mingo: Minor edits. ] Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Acked-by: Sudeep Holla Link: https://lore.kernel.org/r/20231211104855.558096-2-vincent.guittot@linaro.org --- arch/arm/include/asm/topology.h | 1 + arch/arm64/include/asm/topology.h | 1 + arch/riscv/include/asm/topology.h | 22 ++++++++++++++++++++++ drivers/base/arch_topology.c | 29 ++++++++++++++--------------- include/linux/arch_topology.h | 7 +++++++ include/linux/sched/topology.h | 8 ++++++++ 6 files changed, 53 insertions(+), 15 deletions(-) create mode 100644 arch/riscv/include/asm/topology.h diff --git a/arch/arm/include/asm/topology.h b/arch/arm/include/asm/topology.h index f1eafacc9a30..cdc0f5e347b7 100644 --- a/arch/arm/include/asm/topology.h +++ b/arch/arm/include/asm/topology.h @@ -13,6 +13,7 @@ #define arch_set_freq_scale topology_set_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale #define arch_scale_freq_invariant topology_scale_freq_invariant +#define arch_scale_freq_ref topology_get_freq_ref #endif /* Replace task scheduler's default cpu-invariant accounting */ diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 09e742a863e6..2204f3aef997 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -28,6 +28,7 @@ void update_freq_counters_refs(void); #define arch_set_freq_scale topology_set_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale #define arch_scale_freq_invariant topology_scale_freq_invariant +#define arch_scale_freq_ref topology_get_freq_ref /* Replace task scheduler's default cpu-invariant accounting */ #define arch_scale_cpu_capacity topology_get_cpu_scale diff --git a/arch/riscv/include/asm/topology.h b/arch/riscv/include/asm/topology.h new file mode 100644 index 000000000000..61183688bdd5 --- /dev/null +++ b/arch/riscv/include/asm/topology.h @@ -0,0 +1,22 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_RISCV_TOPOLOGY_H +#define _ASM_RISCV_TOPOLOGY_H + +#include + +/* Replace task scheduler's default frequency-invariant accounting */ +#define arch_scale_freq_tick topology_scale_freq_tick +#define arch_set_freq_scale topology_set_freq_scale +#define arch_scale_freq_capacity topology_get_freq_scale +#define arch_scale_freq_invariant topology_scale_freq_invariant +#define arch_scale_freq_ref topology_get_freq_ref + +/* Replace task scheduler's default cpu-invariant accounting */ +#define arch_scale_cpu_capacity topology_get_cpu_scale + +/* Enable topology flag updates */ +#define arch_update_cpu_topology topology_update_cpu_topology + +#include + +#endif /* _ASM_RISCV_TOPOLOGY_H */ diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 30348e1a1d69..582fefb13198 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -20,11 +20,13 @@ #include #include #include +#include static DEFINE_PER_CPU(struct 
scale_freq_data *, sft_data); static struct cpumask scale_freq_counters_mask; static bool scale_freq_invariant; -static DEFINE_PER_CPU(u32, freq_factor) = 1; +DEFINE_PER_CPU(unsigned long, capacity_freq_ref) = 1; +EXPORT_PER_CPU_SYMBOL_GPL(capacity_freq_ref); static bool supports_scale_freq_counters(const struct cpumask *cpus) { @@ -158,9 +160,9 @@ void topology_set_thermal_pressure(const struct cpumask *cpus, * operating on stale data when hot-plug is used for some CPUs. The * @capped_freq reflects the currently allowed max CPUs frequency due to * thermal capping. It might be also a boost frequency value, which is bigger - * than the internal 'freq_factor' max frequency. In such case the pressure - * value should simply be removed, since this is an indication that there is - * no thermal throttling. The @capped_freq must be provided in kHz. + * than the internal 'capacity_freq_ref' max frequency. In such case the + * pressure value should simply be removed, since this is an indication that + * there is no thermal throttling. The @capped_freq must be provided in kHz. */ void topology_update_thermal_pressure(const struct cpumask *cpus, unsigned long capped_freq) @@ -171,10 +173,7 @@ void topology_update_thermal_pressure(const struct cpumask *cpus, cpu = cpumask_first(cpus); max_capacity = arch_scale_cpu_capacity(cpu); - max_freq = per_cpu(freq_factor, cpu); - - /* Convert to MHz scale which is used in 'freq_factor' */ - capped_freq /= 1000; + max_freq = arch_scale_freq_ref(cpu); /* * Handle properly the boost frequencies, which should simply clean @@ -267,13 +266,13 @@ void topology_normalize_cpu_scale(void) capacity_scale = 1; for_each_possible_cpu(cpu) { - capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu); + capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu); capacity_scale = max(capacity, capacity_scale); } pr_debug("cpu_capacity: capacity_scale=%llu\n", capacity_scale); for_each_possible_cpu(cpu) { - capacity = raw_capacity[cpu] * per_cpu(freq_factor, cpu); + capacity = raw_capacity[cpu] * per_cpu(capacity_freq_ref, cpu); capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT, capacity_scale); topology_set_cpu_scale(cpu, capacity); @@ -309,15 +308,15 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu) cpu_node, raw_capacity[cpu]); /* - * Update freq_factor for calculating early boot cpu capacities. + * Update capacity_freq_ref for calculating early boot CPU capacities. * For non-clk CPU DVFS mechanism, there's no way to get the * frequency value now, assuming they are running at the same - * frequency (by keeping the initial freq_factor value). + * frequency (by keeping the initial capacity_freq_ref value). 
*/ cpu_clk = of_clk_get(cpu_node, 0); if (!PTR_ERR_OR_ZERO(cpu_clk)) { - per_cpu(freq_factor, cpu) = - clk_get_rate(cpu_clk) / 1000; + per_cpu(capacity_freq_ref, cpu) = + clk_get_rate(cpu_clk) / HZ_PER_KHZ; clk_put(cpu_clk); } } else { @@ -359,7 +358,7 @@ init_cpu_capacity_callback(struct notifier_block *nb, cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus); for_each_cpu(cpu, policy->related_cpus) - per_cpu(freq_factor, cpu) = policy->cpuinfo.max_freq / 1000; + per_cpu(capacity_freq_ref, cpu) = policy->cpuinfo.max_freq; if (cpumask_empty(cpus_to_visit)) { topology_normalize_cpu_scale(); diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 49cc3e9ae423..7a7b83659f76 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -23,6 +23,13 @@ static inline unsigned long topology_get_cpu_scale(int cpu) void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); +DECLARE_PER_CPU(unsigned long, capacity_freq_ref); + +static inline unsigned long topology_get_freq_ref(int cpu) +{ + return per_cpu(capacity_freq_ref, cpu); +} + DECLARE_PER_CPU(unsigned long, arch_freq_scale); static inline unsigned long topology_get_freq_scale(int cpu) diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index eaeb2f06b402..6fee876323bd 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -307,6 +307,14 @@ void arch_update_thermal_pressure(const struct cpumask *cpus, { } #endif +#ifndef arch_scale_freq_ref +static __always_inline +unsigned int arch_scale_freq_ref(int cpu) +{ + return 0; +} +#endif + static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p)); -- Gitee From f044c2ff6bee7f4d5d3d618ba41b254c2ef8d65b Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 13 Jun 2025 15:27:48 +0800 Subject: [PATCH 14/21] arm64/amu: Use capacity_ref_freq() to set AMU ratio ANBZ: #26358 commit f938727d6c48e405ef6f2e3880e0e28f8a97e338 upstream mainline inclusion from mainline-v6.8-rc6 commit 1f023007f5e782bda19ad9104830c404fd622c5d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXYT CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=1f023007f5e782bda19ad9104830c404fd622c5d ---------------------------------------------------------------------- Use the new capacity_ref_freq() method to set the ratio that is used by AMU for computing the arch_scale_freq_capacity(). This helps to keep everything aligned using the same reference for computing CPUs capacity. The default value of the ratio (stored in per_cpu(arch_max_freq_scale)) ensures that arch_scale_freq_capacity() returns max capacity until it is set to its correct value with the cpu capacity and capacity_ref_freq(). 
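To make the fixed-point arithmetic concrete, the ratio setup and the per-tick scale computation can be replayed in a few lines of user-space C. This is an editor's sketch, not part of the patch: the 25 MHz timer rate, the 2 GHz reference frequency and the counter deltas are made-up values.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define SCHED_CAPACITY_SHIFT	10
#define SCHED_CAPACITY_SCALE	(1UL << SCHED_CAPACITY_SHIFT)

int main(void)
{
	/* freq_inv_set_max_ratio(): ref_rate is the arch timer rate feeding
	 * the constant counter, max_rate the reference frequency in Hz. */
	uint64_t ref_rate = 25000000;		/* 25 MHz arch timer */
	uint64_t max_rate = 2000000000;		/* 2 GHz capacity_freq_ref */
	uint64_t ratio = (ref_rate << (2 * SCHED_CAPACITY_SHIFT)) / max_rate;

	/* amu_scale_freq_tick(): counter deltas over one 10 ms tick while
	 * the CPU ran at about half of its reference frequency. */
	uint64_t d_const = 250000;		/* constant-cycle delta */
	uint64_t d_core = 10000000;		/* core-cycle delta */
	uint64_t scale = ((d_core * ratio) >> SCHED_CAPACITY_SHIFT) / d_const;

	if (scale > SCHED_CAPACITY_SCALE)
		scale = SCHED_CAPACITY_SCALE;
	printf("freq_scale = %" PRIu64 "\n", scale);	/* ~512 */
	return 0;
}

With the default ratio of 1UL << (2 * SCHED_CAPACITY_SHIFT) introduced above, the same computation overshoots and is clamped to SCHED_CAPACITY_SCALE, which is how arch_scale_freq_capacity() keeps returning max capacity until the real ratio is written.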
Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Acked-by: Sudeep Holla Acked-by: Will Deacon Link: https://lore.kernel.org/r/20231211104855.558096-8-vincent.guittot@linaro.org Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin --- arch/arm64/kernel/topology.c | 26 +++++++++++++------------- drivers/base/arch_topology.c | 9 ++++++++- include/linux/arch_topology.h | 1 + 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 26711544b2cf..6e1ec0906d3a 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -222,7 +222,12 @@ unsigned int arch_cpufreq_get_khz(int cpu) #define pr_fmt(fmt) "AMU: " fmt #define ARCH_FREQ_THRESHOLD_MS 10 -static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale); +/* + * Ensure that amu_scale_freq_tick() will return SCHED_CAPACITY_SCALE until + * the CPU capacity and its associated frequency have been correctly + * initialized. + */ +static DEFINE_PER_CPU_READ_MOSTLY(unsigned long, arch_max_freq_scale) = 1UL << (2 * SCHED_CAPACITY_SHIFT); static DEFINE_PER_CPU(u64, arch_const_cycles_prev); static DEFINE_PER_CPU(u64, arch_core_cycles_prev); static cpumask_var_t amu_fie_cpus; @@ -323,14 +328,14 @@ static inline bool freq_counters_valid(int cpu) return true; } -static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) +void freq_inv_set_max_ratio(int cpu, u64 max_rate) { - u64 ratio; + u64 ratio, ref_rate = arch_timer_get_rate(); if (unlikely(!max_rate || !ref_rate)) { - pr_debug("CPU%d: invalid maximum or reference frequency.\n", + WARN_ONCE(1, "CPU%d: invalid maximum or reference frequency.\n", cpu); - return -EINVAL; + return; } /* @@ -350,12 +355,10 @@ static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) ratio = div64_u64(ratio, max_rate); if (!ratio) { WARN_ONCE(1, "Reference frequency too low.\n"); - return -EINVAL; + return; } - per_cpu(arch_max_freq_scale, cpu) = (unsigned long)ratio; - - return 0; + WRITE_ONCE(per_cpu(arch_max_freq_scale, cpu), (unsigned long)ratio); } static void amu_scale_freq_tick(void) @@ -406,10 +409,7 @@ static void amu_fie_setup(const struct cpumask *cpus) return; for_each_cpu(cpu, cpus) { - if (!freq_counters_valid(cpu) || - freq_inv_set_max_ratio(cpu, - cpufreq_get_hw_max_freq(cpu) * 1000ULL, - arch_timer_get_rate())) + if (!freq_counters_valid(cpu)) return; } diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 582fefb13198..feae06b3f541 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -332,6 +332,10 @@ bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu) return !ret; } +void __weak freq_inv_set_max_ratio(int cpu, u64 max_rate) +{ +} + #ifdef CONFIG_CPU_FREQ static cpumask_var_t cpus_to_visit; static void parsing_done_workfn(struct work_struct *work); @@ -357,8 +361,11 @@ init_cpu_capacity_callback(struct notifier_block *nb, cpumask_andnot(cpus_to_visit, cpus_to_visit, policy->related_cpus); - for_each_cpu(cpu, policy->related_cpus) + for_each_cpu(cpu, policy->related_cpus) { per_cpu(capacity_freq_ref, cpu) = policy->cpuinfo.max_freq; + freq_inv_set_max_ratio(cpu, + per_cpu(capacity_freq_ref, cpu) * HZ_PER_KHZ); + } if (cpumask_empty(cpus_to_visit)) { topology_normalize_cpu_scale(); diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index 7a7b83659f76..d73f2889425e 100644 --- 
a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -103,6 +103,7 @@ void update_siblings_masks(unsigned int cpu); void remove_cpu_topology(unsigned int cpuid); void reset_cpu_topology(void); int parse_acpi_topology(void); +void freq_inv_set_max_ratio(int cpu, u64 max_rate); #endif #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ -- Gitee From 798e3116090b22cd30c8f92dab6e62b197b7af42 Mon Sep 17 00:00:00 2001 From: Lifeng Zheng Date: Fri, 13 Jun 2025 15:27:51 +0800 Subject: [PATCH 15/21] arm64: topology: Setup amu fie when cpu hotplugging ANBZ: #26358 commit 7b342952fd52859d1bd2ed4b97f7e479d7ec54bf upstream driver inclusion category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXYT ---------------------------------------------------------------------- AMU FIE used to be set up by a cpufreq policy notifier after the policy was created. This caused two problems: 1. All CPUs covered by the same policy would fail to set up AMU FIE if any one of them could not pass the freq_counters_valid() check. 2. CPUs that failed to set up AMU FIE never got another chance to do so. When booting with the maxcpus=1 restriction, the AMU support flags of the offline CPUs are never set up. When the cpufreq policy is later created, the online CPU can then fail to set up AMU FIE because the other CPUs covered by the same policy do not pass the freq_counters_valid() check. Hotplugging the offline CPUs does not help either: the policy already exists, so amu_fie_setup() is never called again. In this situation no CPU can set up AMU FIE at all. After commit 1f023007f5e7 ("arm64/amu: Use capacity_ref_freq() to set AMU ratio"), the max_freq stored in the policy data is no longer needed when setting up AMU FIE, which means the setup does not depend on the policy any more. Each CPU can therefore set up AMU FIE separately during hotplug, solving both problems above.
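For readers unfamiliar with the hotplug-state API this patch switches to, a minimal skeleton of the pattern is shown below (an editor's sketch with generic placeholder names, not the patch itself):

#include <linux/cpuhotplug.h>
#include <linux/init.h>

static int example_cpu_online(unsigned int cpu)
{
	/* Called for each CPU that comes online; also invoked for CPUs
	 * that are already online at registration time. */
	return 0;
}

static int __init example_init(void)
{
	/* CPUHP_AP_ONLINE_DYN reserves a dynamic state slot, so no fixed
	 * entry has to be added to the cpuhp_state enum. On success the
	 * call returns the allocated (positive) state number. */
	int ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "example:online",
				    example_cpu_online, NULL);
	return ret < 0 ? ret : 0;
}
core_initcall(example_init);

Because cpuhp_setup_state() runs the callback for all currently online CPUs and again for every CPU that comes online later, each CPU gets its own independent setup attempt, which is exactly what decouples AMU FIE setup from cpufreq policy creation.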
Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin --- arch/arm64/kernel/topology.c | 52 +++++++++++------------------------- 1 file changed, 16 insertions(+), 36 deletions(-) diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 6e1ec0906d3a..151baa8e5443 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -400,52 +400,28 @@ static struct scale_freq_data amu_sfd = { .set_freq_scale = amu_scale_freq_tick, }; -static void amu_fie_setup(const struct cpumask *cpus) +static void amu_fie_setup(unsigned int cpu) { - int cpu; - - /* We are already set since the last insmod of cpufreq driver */ - if (unlikely(cpumask_subset(cpus, amu_fie_cpus))) + if (cpumask_test_cpu(cpu, amu_fie_cpus)) return; - for_each_cpu(cpu, cpus) { - if (!freq_counters_valid(cpu)) - return; - } + if (!freq_counters_valid(cpu)) + return; - cpumask_or(amu_fie_cpus, amu_fie_cpus, cpus); + cpumask_set_cpu(cpu, amu_fie_cpus); topology_set_scale_freq_source(&amu_sfd, amu_fie_cpus); - pr_debug("CPUs[%*pbl]: counters will be used for FIE.", - cpumask_pr_args(cpus)); + pr_debug("CPUs[%u]: counters will be used for FIE.", cpu); } -static int init_amu_fie_callback(struct notifier_block *nb, unsigned long val, - void *data) +static int cpuhp_topology_online(unsigned int cpu) { - struct cpufreq_policy *policy = data; - - if (val == CPUFREQ_CREATE_POLICY) - amu_fie_setup(policy->related_cpus); - - /* - * We don't need to handle CPUFREQ_REMOVE_POLICY event as the AMU - * counters don't have any dependency on cpufreq driver once we have - * initialized AMU support and enabled invariance. The AMU counters will - * keep on working just fine in the absence of the cpufreq driver, and - * for the CPUs for which there are no counters available, the last set - * value of freq_scale will remain valid as that is the frequency those - * CPUs are running at. - */ + amu_fie_setup(cpu); return 0; } -static struct notifier_block init_amu_fie_notifier = { - .notifier_call = init_amu_fie_callback, -}; - static int __init init_amu_fie(void) { int ret; @@ -453,12 +429,16 @@ static int __init init_amu_fie(void) if (!zalloc_cpumask_var(&amu_fie_cpus, GFP_KERNEL)) return -ENOMEM; - ret = cpufreq_register_notifier(&init_amu_fie_notifier, - CPUFREQ_POLICY_NOTIFIER); - if (ret) + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, + "arm64/topology:online", + cpuhp_topology_online, + NULL); + if (ret < 0) { free_cpumask_var(amu_fie_cpus); + return ret; + } - return ret; + return 0; } core_initcall(init_amu_fie); -- Gitee From 871bc9df60083ff920f46b427f13de6e71436c22 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 13 Jun 2025 15:27:44 +0800 Subject: [PATCH 16/21] cpufreq: Use the fixed and coherent frequency for scaling capacity ANBZ: #26358 commit 599457ba15403037b489fe536266a3d5f9efaed7 upstream mainline inclusion from mainline-v6.8-rc6 commit 599457ba15403037b489fe536266a3d5f9efaed7 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXYT CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=599457ba15403037b489fe536266a3d5f9efaed7 ---------------------------------------------------------------------- cpuinfo.max_freq can change at runtime because of boost as an example. This implies that the value could be different from the frequency that has been used to compute the capacity of a CPU. 
The new arch_scale_freq_ref() returns a fixed and coherent frequency that can be used to compute the capacity for a given frequency. [ Also fix a arch_set_freq_scale() newline style wart in . ] Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Acked-by: Viresh Kumar Acked-by: Rafael J. Wysocki Link: https://lore.kernel.org/r/20231211104855.558096-3-vincent.guittot@linaro.org Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin --- drivers/cpufreq/cpufreq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 058609b233e0..45f448dde438 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -448,7 +448,7 @@ void cpufreq_freq_transition_end(struct cpufreq_policy *policy, arch_set_freq_scale(policy->related_cpus, policy->cur, - policy->cpuinfo.max_freq); + arch_scale_freq_ref(policy->cpu)); spin_lock(&policy->transition_lock); policy->transition_ongoing = false; @@ -2101,7 +2101,7 @@ unsigned int cpufreq_driver_fast_switch(struct cpufreq_policy *policy, policy->cur = freq; arch_set_freq_scale(policy->related_cpus, freq, - policy->cpuinfo.max_freq); + arch_scale_freq_ref(policy->cpu)); cpufreq_stats_record_transition(policy, freq); if (trace_cpu_frequency_enabled()) { -- Gitee From 9ee89b882dd06cb8aeea086df85ac0b3a71a8cc1 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 13 Jun 2025 15:27:45 +0800 Subject: [PATCH 17/21] cpufreq/schedutil: Use a fixed reference frequency ANBZ: #26358 commit b3edde44e5d4504c23a176819865cd603fd16d6c upstream mainline inclusion from mainline-v6.8-rc6 commit b3edde44e5d4504c23a176819865cd603fd16d6c category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXYT CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b3edde44e5d4504c23a176819865cd603fd16d6c ---------------------------------------------------------------------- cpuinfo.max_freq can change at runtime because of boost as an example. This implies that the value could be different than the one that has been used when computing the capacity of a CPU. The new arch_scale_freq_ref() returns a fixed and coherent reference frequency that can be used when computing a frequency based on utilization. Use this arch_scale_freq_ref() when available and fallback to policy otherwise. Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Reviewed-by: Dietmar Eggemann Acked-by: Rafael J. 
Wysocki Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20231211104855.558096-4-vincent.guittot@linaro.org Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin --- include/linux/energy_model.h | 2 +- kernel/sched/cpufreq_schedutil.c | 26 ++++++++++++++++++++++++-- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index bf6d4445dbcd..d6f06e272f7e 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -121,7 +121,7 @@ void em_dev_unregister_perf_domain(struct device *dev); static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, unsigned long max_util, unsigned long sum_util) { - unsigned long freq, scale_cpu; + unsigned long freq, ref_freq, scale_cpu; struct em_perf_state *ps; int i, cpu; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index a8a544108e5a..1737e6277e58 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -135,6 +135,28 @@ static void sugov_deferred_update(struct sugov_policy *sg_policy, u64 time, } } +/** + * get_capacity_ref_freq - get the reference frequency that has been used to + * correlate frequency and compute capacity for a given cpufreq policy. We use + * the CPU managing it for the arch_scale_freq_ref() call in the function. + * @policy: the cpufreq policy of the CPU in question. + * + * Return: the reference CPU frequency to compute a capacity. + */ +static __always_inline +unsigned long get_capacity_ref_freq(struct cpufreq_policy *policy) +{ + unsigned int freq = arch_scale_freq_ref(policy->cpu); + + if (freq) + return freq; + + if (arch_scale_freq_invariant()) + return policy->cpuinfo.max_freq; + + return policy->cur; +} + /** * get_next_freq - Compute a new frequency for a given cpufreq policy. * @sg_policy: schedutil policy object to compute the new frequency for. @@ -161,9 +183,9 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, unsigned long util, unsigned long max) { struct cpufreq_policy *policy = sg_policy->policy; - unsigned int freq = arch_scale_freq_invariant() ? - policy->cpuinfo.max_freq : policy->cur; + unsigned int freq; + freq = get_capacity_ref_freq(policy); freq = map_util_freq(util, freq, max); if (freq == sg_policy->cached_raw_freq && !sg_policy->need_freq_update) -- Gitee From 43af3077104acfefda8e3a7c5435faa791571694 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 17 Jan 2024 20:05:45 +0100 Subject: [PATCH 18/21] topology: Set capacity_freq_ref in all cases ANBZ: #26358 commit 98323e9d70172f1b46d1cadb20d6c54abf62870d upstream mainline inclusion from mainline-v6.9-rc6 commit 98323e9d70172f1b46d1cadb20d6c54abf62870d category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXYT CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=98323e9d70172f1b46d1cadb20d6c54abf62870d ---------------------------------------------------------------------- If "capacity-dmips-mhz" is not set, raw_capacity is null and we skip the normalization step which includes setting per_cpu capacity_freq_ref. Always register the notifier but skip the capacity normalization if raw_capacity is null. 
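A quick arithmetic check of the skew the preceding cpufreq patches remove, with map_util_freq() reduced to its core proportionality freq * util / max (editor's illustration; all frequencies are made up):

#include <stdio.h>

int main(void)
{
	unsigned long util = 512, max = 1024;	/* 50% utilization */
	unsigned long boost_khz = 2200000;	/* cpuinfo.max_freq w/ boost */
	unsigned long ref_khz = 2000000;	/* fixed capacity_freq_ref */

	/* Old: the reference drifts with boost, over-requesting by 10%. */
	printf("old: %lu kHz\n", boost_khz * util / max);	/* 1100000 */
	/* New: a fixed reference keeps the util -> freq mapping coherent
	 * with the reference used to compute CPU capacity. */
	printf("new: %lu kHz\n", ref_khz * util / max);		/* 1000000 */
	return 0;
}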
Fixes: 9942cb22ea45 ("sched/topology: Add a new arch_scale_freq_ref() method") Signed-off-by: Vincent Guittot Acked-by: Sudeep Holla Tested-by: Pierre Gondois Tested-by: Mark Brown Tested-by: Paul Barker Reviewed-by: Dietmar Eggemann Tested-by: Dietmar Eggemann Link: https://lore.kernel.org/r/20240117190545.596057-1-vincent.guittot@linaro.org Signed-off-by: Greg Kroah-Hartman Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin --- drivers/base/arch_topology.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index feae06b3f541..4f150e7b820f 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -349,9 +349,6 @@ init_cpu_capacity_callback(struct notifier_block *nb, struct cpufreq_policy *policy = data; int cpu; - if (!raw_capacity) - return 0; - if (val != CPUFREQ_CREATE_POLICY) return 0; @@ -368,9 +365,11 @@ init_cpu_capacity_callback(struct notifier_block *nb, } if (cpumask_empty(cpus_to_visit)) { - topology_normalize_cpu_scale(); - schedule_work(&update_topology_flags_work); - free_raw_capacity(); + if (raw_capacity) { + topology_normalize_cpu_scale(); + schedule_work(&update_topology_flags_work); + free_raw_capacity(); + } pr_debug("cpu_capacity: parsing done\n"); schedule_work(&parsing_done_work); } @@ -391,7 +390,7 @@ static int __init register_cpufreq_notifier(void) * until we have the necessary code to parse the cpu capacity, so * skip registering cpufreq notifier. */ - if (!acpi_disabled || !raw_capacity) + if (!acpi_disabled) return -EINVAL; if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL)) -- Gitee From 67618a0ea51991f723073720b00c49afb77667e3 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 13 Jun 2025 15:27:47 +0800 Subject: [PATCH 19/21] cpufreq/cppc: Set the frequency used for computing the capacity ANBZ: #26358 commit 5477fa249b56c59c3baa1b237bf083cffa64c84a upstream mainline inclusion from mainline-v6.8-rc6 commit 5477fa249b56c59c3baa1b237bf083cffa64c84a category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXYT CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=5477fa249b56c59c3baa1b237bf083cffa64c84a ---------------------------------------------------------------------- Save the frequency associated to the performance that has been used when initializing the capacity of CPUs. Also, cppc cpufreq driver can register an artificial energy model. In such case, it needs the frequency for this compute capacity. 
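The perf-to-frequency conversion added below is an affine interpolation through the points (lowest_perf, lowest_freq) and (nominal_perf, nominal_freq); a standalone sketch with hypothetical CPPC capability values:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
	/* Hypothetical _CPC values; frequencies in MHz as in the table. */
	uint64_t lowest_perf = 100, nominal_perf = 200;
	uint64_t lowest_freq = 1000, nominal_freq = 2000;

	uint64_t mul = (nominal_freq - lowest_freq) * 1000;	/* MHz -> kHz */
	uint64_t div = nominal_perf - lowest_perf;
	int64_t offset = nominal_freq * 1000 - (nominal_perf * mul) / div;

	uint64_t perf = 150;	/* query point halfway between the anchors */
	printf("%" PRIi64 " kHz\n", offset + (int64_t)((perf * mul) / div));
	/* -> 1500000 kHz, halfway between 1.0 GHz and 2.0 GHz */
	return 0;
}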
Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Pierre Gondois Acked-by: Sudeep Holla Acked-by: Viresh Kumar Link: https://lore.kernel.org/r/20231211104855.558096-7-vincent.guittot@linaro.org Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin --- drivers/acpi/cppc_acpi.c | 78 ++++++++++++++++++++++++++++++++++++ drivers/base/arch_topology.c | 55 +++++++++++++++++++++++++ include/acpi/cppc_acpi.h | 1 + 3 files changed, 134 insertions(+) diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c index 41aced350131..dc280eaf363e 100644 --- a/drivers/acpi/cppc_acpi.c +++ b/drivers/acpi/cppc_acpi.c @@ -40,6 +40,9 @@ #include #include #include +#include +#include +#include #include @@ -1475,3 +1478,78 @@ unsigned int cppc_get_transition_latency(int cpu_num) return latency_ns; } EXPORT_SYMBOL_GPL(cppc_get_transition_latency); + +/* Minimum struct length needed for the DMI processor entry we want */ +#define DMI_ENTRY_PROCESSOR_MIN_LENGTH 48 + +/* Offset in the DMI processor structure for the max frequency */ +#define DMI_PROCESSOR_MAX_SPEED 0x14 + +/* Callback function used to retrieve the max frequency from DMI */ +static void cppc_find_dmi_mhz(const struct dmi_header *dm, void *private) +{ + const u8 *dmi_data = (const u8 *)dm; + u16 *mhz = (u16 *)private; + + if (dm->type == DMI_ENTRY_PROCESSOR && + dm->length >= DMI_ENTRY_PROCESSOR_MIN_LENGTH) { + u16 val = (u16)get_unaligned((const u16 *) + (dmi_data + DMI_PROCESSOR_MAX_SPEED)); + *mhz = val > *mhz ? val : *mhz; + } +} + +/* Look up the max frequency in DMI */ +static u64 cppc_get_dmi_max_khz(void) +{ + u16 mhz = 0; + + dmi_walk(cppc_find_dmi_mhz, &mhz); + + /* + * Real stupid fallback value, just in case there is no + * actual value set. + */ + mhz = mhz ? mhz : 1; + + return KHZ_PER_MHZ * mhz; +} + +/* + * If CPPC lowest_freq and nominal_freq registers are exposed then we can + * use them to convert perf to freq and vice versa. 
The conversion is + * extrapolated as an affine function passing by the 2 points: + * - (Low perf, Low freq) + * - (Nominal perf, Nominal freq) + */ +unsigned int cppc_perf_to_khz(struct cppc_perf_caps *caps, unsigned int perf) +{ + s64 retval, offset = 0; + static u64 max_khz; + u64 mul, div; + + if (caps->lowest_freq && caps->nominal_freq) { + /* Avoid special case when nominal_freq is equal to lowest_freq */ + if (caps->lowest_freq == caps->nominal_freq) { + mul = caps->nominal_freq; + div = caps->nominal_perf; + } else { + mul = caps->nominal_freq - caps->lowest_freq; + div = caps->nominal_perf - caps->lowest_perf; + } + mul *= KHZ_PER_MHZ; + offset = caps->nominal_freq * KHZ_PER_MHZ - + div64_u64(caps->nominal_perf * mul, div); + } else { + if (!max_khz) + max_khz = cppc_get_dmi_max_khz(); + mul = max_khz; + div = caps->highest_perf; + } + + retval = offset + div64_u64(perf * mul, div); + if (retval >= 0) + return retval; + return 0; +} +EXPORT_SYMBOL_GPL(cppc_perf_to_khz); diff --git a/drivers/base/arch_topology.c b/drivers/base/arch_topology.c index 4f150e7b820f..a0d0796c754d 100644 --- a/drivers/base/arch_topology.c +++ b/drivers/base/arch_topology.c @@ -336,6 +336,61 @@ void __weak freq_inv_set_max_ratio(int cpu, u64 max_rate) { } +#ifdef CONFIG_ACPI_CPPC_LIB +#include + +void topology_init_cpu_capacity_cppc(void) +{ + u64 capacity, capacity_scale = 0; + struct cppc_perf_caps perf_caps; + int cpu; + + if (likely(acpi_disabled || !acpi_cpc_valid())) + return; + + raw_capacity = kcalloc(num_possible_cpus(), sizeof(*raw_capacity), + GFP_KERNEL); + if (!raw_capacity) + return; + + for_each_possible_cpu(cpu) { + if (!cppc_get_perf_caps(cpu, &perf_caps) && + (perf_caps.highest_perf >= perf_caps.nominal_perf) && + (perf_caps.highest_perf >= perf_caps.lowest_perf)) { + raw_capacity[cpu] = perf_caps.highest_perf; + capacity_scale = max_t(u64, capacity_scale, raw_capacity[cpu]); + + per_cpu(capacity_freq_ref, cpu) = cppc_perf_to_khz(&perf_caps, raw_capacity[cpu]); + + pr_debug("cpu_capacity: CPU%d cpu_capacity=%u (raw).\n", + cpu, raw_capacity[cpu]); + continue; + } + + pr_err("cpu_capacity: CPU%d missing/invalid highest performance.\n", cpu); + pr_err("cpu_capacity: partial information: fallback to 1024 for all CPUs\n"); + goto exit; + } + + for_each_possible_cpu(cpu) { + freq_inv_set_max_ratio(cpu, per_cpu(capacity_freq_ref, cpu) * HZ_PER_KHZ); + + capacity = raw_capacity[cpu]; + capacity = div64_u64(capacity << SCHED_CAPACITY_SHIFT, + capacity_scale); + topology_set_cpu_scale(cpu, capacity); + pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n", + cpu, topology_get_cpu_scale(cpu)); + } + + schedule_work(&update_topology_flags_work); + pr_debug("cpu_capacity: cpu_capacity initialization done\n"); + +exit: + free_raw_capacity(); +} +#endif + #ifdef CONFIG_CPU_FREQ static cpumask_var_t cpus_to_visit; static void parsing_done_workfn(struct work_struct *work); diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h index 787dd001285b..f5ef50db5ace 100644 --- a/include/acpi/cppc_acpi.h +++ b/include/acpi/cppc_acpi.h @@ -143,5 +143,6 @@ extern unsigned int cppc_get_transition_latency(int cpu); extern bool cpc_ffh_supported(void); extern int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val); extern int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val); +extern unsigned int cppc_perf_to_khz(struct cppc_perf_caps *caps, unsigned int perf); #endif /* _CPPC_ACPI_H*/ -- Gitee From 7968a5758e68299a482598f01c4af43326df2f6c Mon Sep 17 00:00:00 2001 From: Ionela Voinescu Date: Thu, 5 
Mar 2020 09:06:21 +0000 Subject: [PATCH 20/21] arm64: add support for the AMU extension v1 ANBZ: #26358 commit 2c9d45b43c39e26fd2a73f2203321cdaee98b58b upstream The activity monitors extension is an optional extension introduced by the ARMv8.4 CPU architecture. This implements basic support for version 1 of the activity monitors architecture, AMUv1. This support includes: - Extension detection on each CPU (boot, secondary, hotplugged) - Register interface for AMU aarch64 registers Signed-off-by: Ionela Voinescu Reviewed-by: Valentin Schneider Cc: Mark Rutland Cc: Marc Zyngier Cc: Suzuki K Poulose Cc: Will Deacon Cc: Catalin Marinas Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 27 +++++++++++++++++++++++++++ arch/arm64/include/asm/cpucaps.h | 1 + arch/arm64/include/asm/cpufeature.h | 5 +++++ arch/arm64/include/asm/sysreg.h | 2 ++ 4 files changed, 35 insertions(+) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 43021d272253..2e48d7a2449a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1737,6 +1737,33 @@ config ARM64_MPAM endmenu +menu "ARMv8.4 architectural features" + +config ARM64_AMU_EXTN + bool "Enable support for the Activity Monitors Unit CPU extension" + default y + help + The activity monitors extension is an optional extension introduced + by the ARMv8.4 CPU architecture. This enables support for version 1 + of the activity monitors architecture, AMUv1. + + To enable the use of this extension on CPUs that implement it, say Y. + + Note that for architectural reasons, firmware _must_ implement AMU + support when running on CPUs that present the activity monitors + extension. The required support is present in: + * Version 1.5 and later of the ARM Trusted Firmware + + For kernels that have this configuration enabled but boot with broken + firmware, you may need to say N here until the firmware is fixed. + Otherwise you may experience firmware panics or lockups when + accessing the counter registers. Even if you are not observing these + symptoms, the values returned by the register reads might not + correctly reflect reality. Most commonly, the value read will be 0, + indicating that the counter is not enabled. 
+ +endmenu + menu "ARMv8.5 architectural features" config ARM64_BTI diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 7a1e4462359f..e0c6d75069ea 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -72,5 +72,6 @@ #define ARM64_HAS_ECV 64 #define ARM64_HAS_WFXT 65 -#define ARM64_NCAPS 66 +#define ARM64_HAS_AMU_EXTN 66 +#define ARM64_NCAPS 67 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index b6de5b0a3439..c3c6eb51406a 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -841,6 +841,11 @@ static inline bool cpu_has_pan(void) #ifdef CONFIG_ARM64_AMU_EXTN /* Check whether the cpu supports the Activity Monitors Unit (AMU) */ extern bool cpu_has_amu_feat(int cpu); +#else +static inline bool cpu_has_amu_feat(int cpu) +{ + return false; +} #endif static inline unsigned int get_vmid_bits(u64 mmfr1) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index ff0145b085d1..5323a7705eb5 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -500,6 +500,8 @@ #define SYS_AMEVTYPER0_EL0(n) SYS_AM_EL0(6 + ((n) >> 3), (n) & 7) #define SYS_AMEVCNTR1_EL0(n) SYS_AM_EL0(12 + ((n) >> 3), (n) & 7) #define SYS_AMEVTYPER1_EL0(n) SYS_AM_EL0(14 + ((n) >> 3), (n) & 7) +#define SYS_AMEVTYPE0_EL0(n) SYS_AM_EL0(6 + ((n) >> 3), (n) & 7) +#define SYS_AMEVTYPE1_EL0(n) SYS_AM_EL0(14 + ((n) >> 3), (n) & 7) /* AMU v1: Fixed (architecturally defined) activity monitors */ #define SYS_AMEVCNTR0_CORE_EL0 SYS_AMEVCNTR0_EL0(0) -- Gitee From 089c6c425e2c9885310c40aa108f097e84fcc40a Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Fri, 13 Jun 2025 15:27:46 +0800 Subject: [PATCH 21/21] energy_model: Use a fixed reference frequency ANBZ: #26358 commit 15cbbd1d317e07b4e5c6aca5d4c5579539a82784 upstream mainline inclusion from mainline-v6.8-rc6 commit 15cbbd1d317e07b4e5c6aca5d4c5579539a82784 category: bugfix bugzilla: https://gitee.com/openeuler/kernel/issues/ICEXYT CVE: NA Reference: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=15cbbd1d317e07b4e5c6aca5d4c5579539a82784 ---------------------------------------------------------------------- The last item of a performance domain is not always the performance point that has been used to compute the CPU's capacity. This can lead to a different target frequency than in other parts of the system, such as schedutil, and would result in a wrong energy estimation. A new arch_scale_freq_ref() is available to return a fixed and coherent frequency reference that can be used when computing the CPU's frequency for a given level of utilization. Use this function to get this reference frequency. The energy model is never used without arch_scale_freq_ref() being defined, but it can still be compiled in that configuration, so define a default arch_scale_freq_ref() returning 0 for such a case.
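To illustrate what the change below buys, here is an editor's sketch of the state selection in em_cpu_energy() with a hypothetical three-state domain whose last entry is a boost state (the kernel walks pd->table in essentially this way):

#include <stdio.h>

struct em_state { unsigned long frequency, cost; };	/* simplified */

int main(void)
{
	/* Hypothetical perf domain; the 2.2 GHz entry is a boost state. */
	struct em_state table[] = {
		{ 1000000, 120 }, { 1800000, 300 }, { 2200000, 520 },
	};
	unsigned int i, n = sizeof(table) / sizeof(table[0]);
	unsigned long scale_cpu = 1024, max_util = 900;
	unsigned long ref_freq = 1800000;	/* arch_scale_freq_ref(), kHz */
	unsigned long freq = ref_freq * max_util / scale_cpu;	/* 1582031 */

	/* Pick the lowest state able to serve the request. */
	for (i = 0; i < n - 1; i++)
		if (table[i].frequency >= freq)
			break;
	printf("state %u (%lu kHz)\n", i, table[i].frequency);	/* 1800000 */
	return 0;
}

Had the old code used the 2.2 GHz boost entry as the reference, the same utilization would map to roughly 1933593 kHz and select the boost state, so the energy estimate would be computed from cost 520 instead of 300.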
Fixes: 1eb5dde674f5 ("cpufreq: CPPC: Add support for frequency invariance") Signed-off-by: Vincent Guittot Signed-off-by: Ingo Molnar Tested-by: Lukasz Luba Reviewed-by: Lukasz Luba Link: https://lore.kernel.org/r/20231211104855.558096-5-vincent.guittot@linaro.org Signed-off-by: Lifeng Zheng Signed-off-by: Hongye Lin --- include/linux/energy_model.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index d6f06e272f7e..b9562d0854d2 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -132,8 +132,9 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, */ cpu = cpumask_first(to_cpumask(pd->cpus)); scale_cpu = arch_scale_cpu_capacity(cpu); - ps = &pd->table[pd->nr_perf_states - 1]; - freq = map_util_freq(max_util, ps->frequency, scale_cpu); + ref_freq = arch_scale_freq_ref(cpu); + + freq = map_util_freq(max_util, ref_freq, scale_cpu); /* * Find the lowest performance state of the Energy Model above the -- Gitee