diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 9e5bab29685ff0534fe82714dda5e20804c38ead..705ff4b379c8d1a794eafbfa6f762c1685c73964 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3951,6 +3951,16 @@
 			NUMA balancing.
 			Allowed values are enable and disable
 
+	numa_spinlock=	[NUMA, PV_OPS] Select the NUMA-aware variant
+			of spinlock. The options are:
+			auto - Enable this variant if running on a multi-node
+			machine in a native environment.
+			on   - Unconditionally enable this variant.
+			off  - Unconditionally disable this variant.
+
+			Not specifying this option is equivalent to
+			numa_spinlock=auto.
+
 	numa_zonelist_order= [KNL, BOOT] Select zonelist order for NUMA.
 			'node', 'default' can be specified
 			This can be set from sysctl after boot.
@@ -4631,6 +4641,14 @@
 			[KNL] Number of legacy pty's. Overwrites compiled-in
 			default number.
 
+	qspinlock.numa_spinlock_threshold_ns=	[NUMA, PV_OPS]
+			Set the time threshold in nanoseconds for intra-node
+			lock hand-offs, after which the NUMA-aware spinlock
+			is forced to be passed to a thread on another NUMA
+			node. Smaller values result in a fairer but less
+			performant spinlock, and vice versa. The default
+			value is 1000000 (=1ms).
+
 	quiet		[KNL] Disable most log messages
 
 	r128=		[HW,DRM]
diff --git a/arch/arm/include/asm/mcs_spinlock.h b/arch/arm/include/asm/mcs_spinlock.h
index 529d2cf4d06f4adf88170ca6c84f3e39d3305188..1eb4d733459c5050b7813fbab929b8c11a257dde 100644
--- a/arch/arm/include/asm/mcs_spinlock.h
+++ b/arch/arm/include/asm/mcs_spinlock.h
@@ -6,7 +6,7 @@
 #include <asm/spinlock.h>
 
 /* MCS spin-locking. */
-#define arch_mcs_spin_lock_contended(lock)				\
+#define arch_mcs_spin_wait(lock)					\
 do {									\
 	/* Ensure prior stores are observed before we enter wfe. */	\
 	smp_mb();							\
@@ -14,9 +14,9 @@ do {									\
 		wfe();							\
 } while (0)								\
 
-#define arch_mcs_spin_unlock_contended(lock)				\
+#define arch_mcs_lock_handoff(lock, val)				\
 do {									\
-	smp_store_release(lock, 1);					\
+	smp_store_release((lock), (val));				\
 	dsb_sev();							\
 } while (0)
 
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 343e1e1cae10a3d51098b2ad7b9f1d457fcca2a5..a302c82c182b215e0e9b9b9dcfd05939de666b72 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1400,6 +1400,22 @@ config NODES_SHIFT
 	  Specify the maximum number of NUMA Nodes available on the target
 	  system.  Increases memory reserved to accommodate various tables.
 
+config NUMA_AWARE_SPINLOCKS
+	bool "NUMA-aware spinlocks"
+	depends on NUMA
+	depends on QUEUED_SPINLOCKS
+	depends on PARAVIRT_SPINLOCKS
+	default y
+	help
+	  Introduce NUMA (Non Uniform Memory Access) awareness into
+	  the slow path of spinlocks.
+
+	  In this variant of qspinlock, the kernel will try to keep the lock
+	  on the same node, thus reducing the number of remote cache misses,
+	  while trading some of the short-term fairness for better performance.
+
+	  Say N if you want absolute first come, first served fairness.
+
 source "kernel/Kconfig.hz"
 
 config ARCH_SPARSEMEM_ENABLE
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 53bab123a8ee40a2484ae63b29cac4e8f545ee32..0ccde77b685b0061949a73d7b21024efde0668a6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1586,6 +1586,26 @@ config NUMA
 
 	  Otherwise, you should say N.
 
+config NUMA_AWARE_SPINLOCKS
+	bool "NUMA-aware spinlocks"
+	depends on NUMA
+	depends on QUEUED_SPINLOCKS
+	depends on 64BIT
+	# For now, we depend on PARAVIRT_SPINLOCKS to make the patching work.
+	# This is awkward, but hopefully will be resolved once static_call()
+	# is available.
+	depends on PARAVIRT_SPINLOCKS
+	default y
+	help
+	  Introduce NUMA (Non Uniform Memory Access) awareness into
+	  the slow path of spinlocks.
+
+	  In this variant of qspinlock, the kernel will try to keep the lock
+	  on the same node, thus reducing the number of remote cache misses,
+	  while trading some of the short-term fairness for better performance.
+
+	  Say N if you want absolute first come, first served fairness.
+
 config AMD_NUMA
 	def_bool y
 	prompt "Old style AMD Opteron NUMA detection"
diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index d87451df480bd783b3fc352c57cd1e64a9de291e..f48a2a250e57172a522a4482712d139a414d29e5 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -27,6 +27,10 @@ static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock)
 	return val;
 }
 
+#ifdef CONFIG_NUMA_AWARE_SPINLOCKS
+extern void cna_configure_spin_lock_slowpath(void);
+#endif
+
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 extern void __pv_init_lock_hash(void);
diff --git a/include/asm-generic/mcs_spinlock.h b/include/asm-generic/mcs_spinlock.h
index 10cd4ffc6ba29563c1f5813c587db69164b23189..f933d99c63e0c73768e3efa82e79e398919203d9 100644
--- a/include/asm-generic/mcs_spinlock.h
+++ b/include/asm-generic/mcs_spinlock.h
@@ -4,8 +4,8 @@
 /*
  * Architectures can define their own:
  *
- *   arch_mcs_spin_lock_contended(l)
- *   arch_mcs_spin_unlock_contended(l)
+ *   arch_mcs_spin_wait(l)
+ *   arch_mcs_lock_handoff(l, val)
  *
  * See kernel/locking/mcs_spinlock.c.
  */
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
index 85251d8771d91904446d085cf6992da4b33c3e0c..3926aad129eda3f49d8454547f03a0c694280367 100644
--- a/kernel/locking/mcs_spinlock.h
+++ b/kernel/locking/mcs_spinlock.h
@@ -17,11 +17,11 @@
 
 struct mcs_spinlock {
 	struct mcs_spinlock *next;
-	int locked; /* 1 if lock acquired */
+	unsigned int locked; /* 1 if lock acquired */
 	int count;  /* nesting count, see qspinlock.c */
 };
 
-#ifndef arch_mcs_spin_lock_contended
+#ifndef arch_mcs_spin_wait
 /*
  * Using smp_cond_load_acquire() provides the acquire semantics
  * required so that subsequent operations happen after the
@@ -29,20 +29,20 @@ struct mcs_spinlock {
  * ARM64 would like to do spin-waiting instead of purely
  * spinning, and smp_cond_load_acquire() provides that behavior.
  */
-#define arch_mcs_spin_lock_contended(l)					\
-do {									\
-	smp_cond_load_acquire(l, VAL);					\
+#define arch_mcs_spin_wait(l)						\
+do {									\
+	smp_cond_load_acquire(l, VAL);					\
 } while (0)
 #endif
 
-#ifndef arch_mcs_spin_unlock_contended
+#ifndef arch_mcs_lock_handoff
 /*
  * smp_store_release() provides a memory barrier to ensure all
  * operations in the critical section has been completed before
  * unlocking.
  */
-#define arch_mcs_spin_unlock_contended(l)				\
-	smp_store_release((l), 1)
+#define arch_mcs_lock_handoff(l, val)					\
+	smp_store_release((l), (val))
 #endif
 
 /*
@@ -91,7 +91,7 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 	WRITE_ONCE(prev->next, node);
 
 	/* Wait until the lock holder passes the lock down. */
-	arch_mcs_spin_lock_contended(&node->locked);
+	arch_mcs_spin_wait(&node->locked);
 }
 
 /*
@@ -115,7 +115,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node)
 	}
 
 	/* Pass lock to next waiter. */
-	arch_mcs_spin_unlock_contended(&next->locked);
+	arch_mcs_lock_handoff(&next->locked, 1);
 }
 
 #endif /* __LINUX_MCS_SPINLOCK_H */
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ebe6b8ec7cb380da9d62e09f1b7ab98f339d35a8..d3f99060b60f1cab0961c94e154153e7604b2594 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -11,7 +11,7 @@
  *          Peter Zijlstra <peterz@infradead.org>
  */
 
-#ifndef _GEN_PV_LOCK_SLOWPATH
+#if !defined(_GEN_PV_LOCK_SLOWPATH) && !defined(_GEN_CNA_LOCK_SLOWPATH)
 
 #include <linux/smp.h>
 #include <linux/bug.h>
@@ -72,7 +72,8 @@
 /*
  * On 64-bit architectures, the mcs_spinlock structure will be 16 bytes in
  * size and four of them will fit nicely in one 64-byte cacheline. For
- * pvqspinlock, however, we need more space for extra data. To accommodate
+ * pvqspinlock, however, we need more space for extra data. The same also
+ * applies for the NUMA-aware variant of spinlocks (CNA). To accommodate
  * that, we insert two more long words to pad it up to 32 bytes. IOW, only
  * two of them can fit in a cacheline in this case. That is OK as it is rare
  * to have more than 2 levels of slowpath nesting in actual use. We don't
@@ -81,7 +82,7 @@
  */
 struct qnode {
 	struct mcs_spinlock mcs;
-#ifdef CONFIG_PARAVIRT_SPINLOCKS
+#if defined(CONFIG_PARAVIRT_SPINLOCKS) || defined(CONFIG_NUMA_AWARE_SPINLOCKS)
 	long reserved[2];
 #endif
 };
@@ -105,6 +106,8 @@ struct qnode {
  * Exactly fits one 64-byte cacheline on a 64-bit architecture.
  *
  * PV doubles the storage and uses the second cacheline for PV state.
+ * CNA also doubles the storage and uses the second cacheline for
+ * CNA-specific state.
  */
 static DEFINE_PER_CPU_ALIGNED(struct qnode, qnodes[MAX_NODES]);
 
@@ -290,7 +293,35 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
 #define queued_spin_lock_slowpath	native_queued_spin_lock_slowpath
 #endif
 
-#endif /* _GEN_PV_LOCK_SLOWPATH */
+/*
+ * __try_clear_tail - try to clear tail by setting the lock value to
+ * _Q_LOCKED_VAL.
+ * @lock: Pointer to the queued spinlock structure
+ * @val: Current value of the lock
+ * @node: Pointer to the MCS node of the lock holder
+ */
+static __always_inline bool __try_clear_tail(struct qspinlock *lock,
+					     u32 val,
+					     struct mcs_spinlock *node)
+{
+	return atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL);
+}
+
+/*
+ * __mcs_lock_handoff - pass the MCS lock to the next waiter
+ * @node: Pointer to the MCS node of the lock holder
+ * @next: Pointer to the MCS node of the first waiter in the MCS queue
+ */
+static __always_inline void __mcs_lock_handoff(struct mcs_spinlock *node,
+					       struct mcs_spinlock *next)
+{
+	arch_mcs_lock_handoff(&next->locked, 1);
+}
+
+#define try_clear_tail		__try_clear_tail
+#define mcs_lock_handoff	__mcs_lock_handoff
+
+#endif /* _GEN_PV_LOCK_SLOWPATH && _GEN_CNA_LOCK_SLOWPATH */
 
 /**
  * queued_spin_lock_slowpath - acquire the queued spinlock
@@ -474,7 +505,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 		WRITE_ONCE(prev->next, node);
 
 		pv_wait_node(node, prev);
-		arch_mcs_spin_lock_contended(&node->locked);
+		arch_mcs_spin_wait(&node->locked);
 
 		/*
 		 * While waiting for the MCS lock, the next pointer may have
@@ -536,7 +567,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 	 *       PENDING will make the uncontended transition fail.
 	 */
 	if ((val & _Q_TAIL_MASK) == tail) {
-		if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+		if (try_clear_tail(lock, val, node))
 			goto release; /* No contention */
 	}
 
@@ -553,7 +584,7 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 	if (!next)
 		next = smp_cond_load_relaxed(&node->next, (VAL));
 
-	arch_mcs_spin_unlock_contended(&next->locked);
+	mcs_lock_handoff(node, next);
 	pv_kick_node(lock, next);
 
 release:
@@ -566,6 +597,37 @@ void __lockfunc queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
 }
 EXPORT_SYMBOL(queued_spin_lock_slowpath);
 
+/*
+ * Generate the code for NUMA-aware spinlocks
+ */
+#if !defined(_GEN_CNA_LOCK_SLOWPATH) && defined(CONFIG_NUMA_AWARE_SPINLOCKS)
+#define _GEN_CNA_LOCK_SLOWPATH
+
+#undef pv_init_node
+#define pv_init_node			cna_init_node
+
+#undef pv_wait_head_or_lock
+#define pv_wait_head_or_lock		cna_wait_head_or_lock
+
+#undef try_clear_tail
+#define try_clear_tail			cna_try_clear_tail
+
+#undef mcs_lock_handoff
+#define mcs_lock_handoff		cna_lock_handoff
+
+#undef queued_spin_lock_slowpath
+/*
+ * defer defining queued_spin_lock_slowpath until after the include to
+ * avoid a name clash with the identically named field in pv_ops.lock
+ * (see cna_configure_spin_lock_slowpath())
+ */
+#include "qspinlock_cna.h"
+#define queued_spin_lock_slowpath	__cna_queued_spin_lock_slowpath
+
+#include "qspinlock.c"
+
+#endif
+
 /*
  * Generate the paravirt code for queued_spin_unlock_slowpath().
  */
@@ -580,6 +642,12 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
 #undef pv_kick_node
 #undef pv_wait_head_or_lock
 
+#undef try_clear_tail
+#define try_clear_tail		__try_clear_tail
+
+#undef mcs_lock_handoff
+#define mcs_lock_handoff	__mcs_lock_handoff
+
 #undef queued_spin_lock_slowpath
 #define queued_spin_lock_slowpath	__pv_queued_spin_lock_slowpath
 
diff --git a/kernel/locking/qspinlock_cna.h b/kernel/locking/qspinlock_cna.h
new file mode 100644
index 0000000000000000000000000000000000000000..3983505c1118fa956458d160080d65721232d594
--- /dev/null
+++ b/kernel/locking/qspinlock_cna.h
@@ -0,0 +1,425 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _GEN_CNA_LOCK_SLOWPATH
+#error "do not include this file"
+#endif
+
+#include <linux/topology.h>
+#include <linux/sched/clock.h>
+#include <linux/sched/rt.h>
+#include <linux/random.h>
+#include <linux/moduleparam.h>
+
+/*
+ * Implement a NUMA-aware version of MCS (aka CNA, or compact NUMA-aware lock).
+ *
+ * In CNA, spinning threads are organized in two queues, a primary queue for
+ * threads running on the same NUMA node as the current lock holder, and a
+ * secondary queue for threads running on other nodes. Schematically, it
+ * looks like this:
+ *
+ *    cna_node
+ *   +----------+     +--------+         +--------+
+ *   |mcs:next  | --> |mcs:next| --> ... |mcs:next| --> NULL  [Primary queue]
+ *   |mcs:locked| -.  +--------+         +--------+
+ *   +----------+  |
+ *                 `----------------------.
+ *                                        v
+ *                 +--------+         +--------+
+ *                 |mcs:next| --> ... |mcs:next|  [Secondary queue]
+ *                 +--------+         +--------+
+ *                     ^                    |
+ *                     `--------------------'
+ *
+ * N.B. locked := 1 if secondary queue is absent. Otherwise, it contains the
+ * encoded pointer to the tail of the secondary queue, which is organized as a
+ * circular list.
+ *
+ * After acquiring the MCS lock and before acquiring the spinlock, the MCS lock
+ * holder checks whether the next waiter in the primary queue (if one exists) is
+ * running on the same NUMA node. If it is not, that waiter is detached from the
+ * main queue and moved into the tail of the secondary queue. This way, we
+ * gradually filter the primary queue, leaving only waiters running on the same
+ * preferred NUMA node. Note that certain prioritized waiters (e.g., in
+ * irq and nmi contexts) are excluded from being moved to the secondary queue.
+ *
+ * We change the NUMA node preference after a waiter at the head of the
+ * secondary queue spins for a certain amount of time (1ms, by default).
+ * We do that by flushing the secondary queue into the head of the primary queue,
+ * effectively changing the preference to the NUMA node of the waiter at the head
+ * of the secondary queue at the time of the flush.
+ *
+ * For more details, see https://arxiv.org/abs/1810.05600.
+ *
+ * Authors: Alex Kogan <alex.kogan@oracle.com>
+ *          Dave Dice <dave.dice@oracle.com>
+ */
+
+#define FLUSH_SECONDARY_QUEUE	1
+
+#define CNA_PRIORITY_NODE	0xffff
+
+struct cna_node {
+	struct mcs_spinlock	mcs;
+	u16			numa_node;
+	u16			real_numa_node;
+	u32			encoded_tail;	/* self */
+	u64			start_time;
+};
+
+static ulong numa_spinlock_threshold_ns = 1000000;	/* 1ms, by default */
+module_param(numa_spinlock_threshold_ns, ulong, 0644);
+
+static inline bool intra_node_threshold_reached(struct cna_node *cn)
+{
+	u64 current_time = local_clock();
+	u64 threshold = cn->start_time + numa_spinlock_threshold_ns;
+
+	return current_time > threshold;
+}
+
+/*
+ * Controls the probability for enabling the ordering of the main queue
+ * when the secondary queue is empty. The chosen value reduces the amount
+ * of unnecessary shuffling of threads between the two waiting queues
+ * when the contention is low, while responding fast enough and enabling
+ * the shuffling when the contention is high.
+ */
+#define SHUFFLE_REDUCTION_PROB_ARG	(7)
+
+/* Per-CPU pseudo-random number seed */
+static DEFINE_PER_CPU(u32, seed);
+
+/*
+ * Return false with probability 1 / 2^@num_bits.
+ * Intuitively, the larger @num_bits the less likely false is to be returned.
+ * @num_bits must be a number between 0 and 31.
+ */
+static bool probably(unsigned int num_bits)
+{
+	u32 s;
+
+	s = this_cpu_read(seed);
+	s = next_pseudo_random32(s);
+	this_cpu_write(seed, s);
+
+	return s & ((1 << num_bits) - 1);
+}
+
+static void __init cna_init_nodes_per_cpu(unsigned int cpu)
+{
+	struct mcs_spinlock *base = per_cpu_ptr(&qnodes[0].mcs, cpu);
+	int numa_node = cpu_to_node(cpu);
+	int i;
+
+	for (i = 0; i < MAX_NODES; i++) {
+		struct cna_node *cn = (struct cna_node *)grab_mcs_node(base, i);
+
+		cn->real_numa_node = numa_node;
+		cn->encoded_tail = encode_tail(cpu, i);
+		/*
+		 * make sure @encoded_tail is not confused with other valid
+		 * values for @locked (0 or 1)
+		 */
+		WARN_ON(cn->encoded_tail <= 1);
+	}
+}
+
+static int __init cna_init_nodes(void)
+{
+	unsigned int cpu;
+
+	/*
+	 * this will break on 32bit architectures, so we restrict
+	 * the use of CNA to 64bit only (see arch/x86/Kconfig)
+	 */
+	BUILD_BUG_ON(sizeof(struct cna_node) > sizeof(struct qnode));
+	/* we store an encoded tail word in the node's @locked field */
+	BUILD_BUG_ON(sizeof(u32) > sizeof(unsigned int));
+
+	for_each_possible_cpu(cpu)
+		cna_init_nodes_per_cpu(cpu);
+
+	return 0;
+}
+
+static __always_inline void cna_init_node(struct mcs_spinlock *node)
+{
+	bool priority = !in_task() || irqs_disabled() || rt_task(current);
+	struct cna_node *cn = (struct cna_node *)node;
+
+	cn->numa_node = priority ? CNA_PRIORITY_NODE : cn->real_numa_node;
+	cn->start_time = 0;
+}
+
+/*
+ * cna_splice_head -- splice the entire secondary queue onto the head of the
+ * primary queue.
+ *
+ * Returns the new primary head node or NULL on failure.
+ */
+static struct mcs_spinlock *
+cna_splice_head(struct qspinlock *lock, u32 val,
+		struct mcs_spinlock *node, struct mcs_spinlock *next)
+{
+	struct mcs_spinlock *head_2nd, *tail_2nd;
+	u32 new;
+
+	tail_2nd = decode_tail(node->locked);
+	head_2nd = tail_2nd->next;
+
+	if (next) {
+		/*
+		 * If the primary queue is not empty, the primary tail doesn't
+		 * need to change and we can simply link the secondary tail to
+		 * the old primary head.
+		 */
+		tail_2nd->next = next;
+	} else {
+		/*
+		 * When the primary queue is empty, the secondary tail becomes
+		 * the primary tail.
+		 */
+
+		/*
+		 * Speculatively break the secondary queue's circular link such
+		 * that when the secondary tail becomes the primary tail it all
+		 * works out.
+		 */
+		tail_2nd->next = NULL;
+
+		/*
+		 * tail_2nd->next = NULL;	old = xchg_tail(lock, tail);
+		 *				prev = decode_tail(old);
+		 * try_cmpxchg_release(...);	WRITE_ONCE(prev->next, node);
+		 *
+		 * If the following cmpxchg() succeeds, our stores will not
+		 * collide.
+		 */
+		new = ((struct cna_node *)tail_2nd)->encoded_tail |
+			_Q_LOCKED_VAL;
+		if (!atomic_try_cmpxchg_release(&lock->val, &val, new)) {
+			/* Restore the secondary queue's circular link. */
+			tail_2nd->next = head_2nd;
+			return NULL;
+		}
+	}
+
+	/* The primary queue head now is what was the secondary queue head. */
+	return head_2nd;
+}
+
+static inline bool cna_try_clear_tail(struct qspinlock *lock, u32 val,
+				      struct mcs_spinlock *node)
+{
+	/*
+	 * We're here because the primary queue is empty; check the secondary
+	 * queue for remote waiters.
+	 */
+	if (node->locked > 1) {
+		struct mcs_spinlock *next;
+
+		/*
+		 * When there are waiters on the secondary queue, try to move
+		 * them back onto the primary queue and let them rip.
+		 */
+		next = cna_splice_head(lock, val, node, NULL);
+		if (next) {
+			arch_mcs_lock_handoff(&next->locked, 1);
+			return true;
+		}
+
+		return false;
+	}
+
+	/* Both queues are empty. Do what MCS does. */
+	return __try_clear_tail(lock, val, node);
+}
+
+/*
+ * cna_splice_next -- splice the next node from the primary queue onto
+ * the secondary queue.
+ */
+static void cna_splice_next(struct mcs_spinlock *node,
+			    struct mcs_spinlock *next,
+			    struct mcs_spinlock *nnext)
+{
+	/* remove 'next' from the main queue */
+	node->next = nnext;
+
+	/* stick `next` on the secondary queue tail */
+	if (node->locked <= 1) { /* if secondary queue is empty */
+		struct cna_node *cn = (struct cna_node *)node;
+
+		/* create secondary queue */
+		next->next = next;
+
+		cn->start_time = local_clock();
+		/* secondary queue is not empty iff start_time != 0 */
+		WARN_ON(!cn->start_time);
+	} else {
+		/* add to the tail of the secondary queue */
+		struct mcs_spinlock *tail_2nd = decode_tail(node->locked);
+		struct mcs_spinlock *head_2nd = tail_2nd->next;
+
+		tail_2nd->next = next;
+		next->next = head_2nd;
+	}
+
+	node->locked = ((struct cna_node *)next)->encoded_tail;
+}
+
+/*
+ * cna_order_queue - check whether the next waiter in the main queue is on
+ * the same NUMA node as the lock holder; if not, and it has a waiter behind
+ * it in the main queue, move the former onto the secondary queue.
+ * Returns 1 if the next waiter runs on the same NUMA node; 0 otherwise.
+ */
+static int cna_order_queue(struct mcs_spinlock *node)
+{
+	struct mcs_spinlock *next = READ_ONCE(node->next);
+	struct cna_node *cn = (struct cna_node *)node;
+	int numa_node, next_numa_node;
+
+	if (!next)
+		return 0;
+
+	numa_node = cn->numa_node;
+	next_numa_node = ((struct cna_node *)next)->numa_node;
+
+	if (next_numa_node != numa_node && next_numa_node != CNA_PRIORITY_NODE) {
+		struct mcs_spinlock *nnext = READ_ONCE(next->next);
+
+		if (nnext)
+			cna_splice_next(node, next, nnext);
+
+		return 0;
+	}
+	return 1;
+}
+
+#define LOCK_IS_BUSY(lock) (atomic_read(&(lock)->val) & _Q_LOCKED_PENDING_MASK)
+
+/* Abuse the pv_wait_head_or_lock() hook to get some work done */
+static __always_inline u32 cna_wait_head_or_lock(struct qspinlock *lock,
+						 struct mcs_spinlock *node)
+{
+	struct cna_node *cn = (struct cna_node *)node;
+
+	if (node->locked <= 1 && probably(SHUFFLE_REDUCTION_PROB_ARG)) {
+		/*
+		 * When the secondary queue is empty, skip the calls to
+		 * cna_order_queue() below with high probability. This optimization
+		 * reduces the overhead of unnecessary shuffling of threads
+		 * between waiting queues when the lock is only lightly contended.
+		 */
+		return 0;
+	}
+
+	if (!cn->start_time || !intra_node_threshold_reached(cn)) {
+		/*
+		 * We are at the head of the wait queue; no need to use
+		 * the fake NUMA node ID.
+		 */
+		if (cn->numa_node == CNA_PRIORITY_NODE)
+			cn->numa_node = cn->real_numa_node;
+
+		/*
+		 * Try and put the time otherwise spent spin waiting on
+		 * _Q_LOCKED_PENDING_MASK to use by sorting our lists.
+		 */
+		while (LOCK_IS_BUSY(lock) && !cna_order_queue(node))
+			cpu_relax();
+	} else {
+		cn->start_time = FLUSH_SECONDARY_QUEUE;
+	}
+
+	return 0; /* we lied; we didn't wait, go do so now */
+}
+
+static inline void cna_lock_handoff(struct mcs_spinlock *node,
+				    struct mcs_spinlock *next)
+{
+	struct cna_node *cn = (struct cna_node *)node;
+	u32 val = 1;
+
+	if (cn->start_time != FLUSH_SECONDARY_QUEUE) {
+		if (node->locked > 1) {
+			val = node->locked;	/* preserve secondary queue */
+
+			/*
+			 * We have a local waiter, either a real or a fake one;
+			 * reload @next in case it was changed by cna_order_queue().
+			 */
+			next = node->next;
+
+			/*
+			 * Pass over NUMA node id of primary queue, to maintain the
+			 * preference even if the next waiter is on a different node.
+			 */
+			((struct cna_node *)next)->numa_node = cn->numa_node;
+
+			((struct cna_node *)next)->start_time = cn->start_time;
+		}
+	} else {
+		/*
+		 * We decided to flush the secondary queue;
+		 * this can only happen if that queue is not empty.
+		 */
+		WARN_ON(node->locked <= 1);
+		/*
+		 * Splice the secondary queue onto the primary queue and pass the lock
+		 * to the longest waiting remote waiter.
+		 */
+		next = cna_splice_head(NULL, 0, node, next);
+	}
+
+	arch_mcs_lock_handoff(&next->locked, val);
+}
+
+/*
+ * Constant (boot-param configurable) flag selecting the NUMA-aware variant
+ * of spinlock. Possible values: -1 (off) / 0 (auto, default) / 1 (on).
+ */
+static int numa_spinlock_flag;
+
+static int __init numa_spinlock_setup(char *str)
+{
+	if (!strcmp(str, "auto")) {
+		numa_spinlock_flag = 0;
+		return 1;
+	} else if (!strcmp(str, "on")) {
+		numa_spinlock_flag = 1;
+		return 1;
+	} else if (!strcmp(str, "off")) {
+		numa_spinlock_flag = -1;
+		return 1;
+	}
+
+	return 0;
+}
+__setup("numa_spinlock=", numa_spinlock_setup);
+
+void __cna_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
+
+/*
+ * Switch to the NUMA-friendly slow path for spinlocks when we have
+ * multiple NUMA nodes in a native environment, unless the user has
+ * overridden this default behavior by setting the numa_spinlock flag.
+ */
+void __init cna_configure_spin_lock_slowpath(void)
+{
+	if (numa_spinlock_flag < 0)
+		return;
+
+	if (numa_spinlock_flag == 0 && (nr_node_ids < 2 ||
+		    pv_ops.lock.queued_spin_lock_slowpath !=
+			native_queued_spin_lock_slowpath))
+		return;
+
+	cna_init_nodes();
+
+	pv_ops.lock.queued_spin_lock_slowpath = __cna_queued_spin_lock_slowpath;
+
+	pr_info("Enabling CNA spinlock\n");
+}
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index 6afc249ce697dba8d05374cf8f3637d08a62b6fa..2b35460d5658e98f1d54d62f255c1f6408c47f89 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -368,7 +368,7 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
 	 *
 	 * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
 	 *
-	 * The write to next->locked in arch_mcs_spin_unlock_contended()
+	 * The write to next->locked in arch_mcs_lock_handoff()
 	 * must be ordered before the read of pn->state in the cmpxchg()
 	 * below for the code to work correctly. To guarantee full ordering
 	 * irrespective of the success or failure of the cmpxchg(),
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 1756389a060942d0ad1e9bcc8952c90a03651821..40985c9d92d05217f42715ce4209ceb01c6b6bd5 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -2944,6 +2944,10 @@ void __init numa_policy_init(void)
 		pr_err("%s: interleaving failed\n", __func__);
 
 	check_numabalancing_enable();
+
+#if defined(CONFIG_NUMA_AWARE_SPINLOCKS)
+	cna_configure_spin_lock_slowpath();
+#endif
 }
 
 /* Reset policy of current process to default */
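
Note for reviewers: the following stand-alone user-space sketch is not part of the patch. It models the CNA hand-off policy implemented in qspinlock_cna.h above (primary queue for same-node waiters, circular secondary queue for remote waiters, periodic flush) under simplified assumptions: no atomics, no encoded tails, and a caller-supplied "flush" flag in place of the time threshold. All identifiers here (waiter, cna_model, splice_to_secondary, pick_next) are invented for the example and do not exist in the kernel. Like cna_order_queue(), the sketch never parks the last waiter in the queue, so a lone remote waiter still gets the lock.

/* cna_handoff_sketch.c -- illustrative only; build with: cc -o sketch cna_handoff_sketch.c */
#include <stdbool.h>
#include <stdio.h>

struct waiter {
	struct waiter *next;
	int numa_node;
	int id;
};

struct cna_model {
	struct waiter *primary;		/* head of the primary queue */
	struct waiter *tail_2nd;	/* tail of the circular secondary queue, or NULL */
	int holder_node;		/* preferred NUMA node of the lock holder */
};

/* Park @w at the tail of the circular secondary queue. */
static void splice_to_secondary(struct cna_model *m, struct waiter *w)
{
	if (!m->tail_2nd) {
		w->next = w;			/* create the circular list */
	} else {
		w->next = m->tail_2nd->next;	/* link behind the current head */
		m->tail_2nd->next = w;
	}
	m->tail_2nd = w;
}

/* Choose the waiter that receives the lock next. */
static struct waiter *pick_next(struct cna_model *m, bool flush)
{
	struct waiter *w;

	if (flush && m->tail_2nd) {
		/* Splice the whole secondary queue ahead of the primary queue. */
		struct waiter *head_2nd = m->tail_2nd->next;

		m->tail_2nd->next = m->primary;
		m->primary = head_2nd;
		m->tail_2nd = NULL;
	}

	/* Park remote waiters, but never the last waiter in the queue. */
	while ((w = m->primary) && w->numa_node != m->holder_node && w->next) {
		m->primary = w->next;
		splice_to_secondary(m, w);
	}

	if (!w)
		return NULL;
	m->primary = w->next;
	return w;
}

int main(void)
{
	struct waiter w[4] = {
		{ &w[1], 0, 0 }, { &w[2], 1, 1 }, { &w[3], 0, 2 }, { NULL, 1, 3 },
	};
	struct cna_model m = { .primary = &w[0], .holder_node = 0 };
	struct waiter *next;

	while ((next = pick_next(&m, false)))	/* same-node waiters are served first */
		printf("hand off to waiter %d (node %d)\n", next->id, next->numa_node);

	while ((next = pick_next(&m, true)))	/* a flush releases the parked remote waiters */
		printf("after flush: waiter %d (node %d)\n", next->id, next->numa_node);

	return 0;
}

Running the sketch hands the lock to waiters 0 and 2 (node 0) first, then to waiter 3 (remote, but last in the queue), and only after the flush to the parked waiter 1, which mirrors the fairness/locality trade-off the patch documents.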