From d458b55ad1b114dc1046def3e71d8f5024772c4c Mon Sep 17 00:00:00 2001
From: Di Zhu
Date: Wed, 3 Dec 2025 15:11:26 +0800
Subject: [PATCH] anolis: net: page_frag_refill from numa node of dev in TX
 path

ANBZ: #27529

Note that the NETIF_F_NOCACHE_COPY flag indicates that the data will
not be accessed again on the sending path and that nontemporal copies
will be used. When this flag is set, the memory holding the user's
data to be sent can be allocated from the NUMA node where the network
card is located, which helps improve the DMA processing performance of
the network card and mitigates the impact of PCIe ordering issues on
Hygon CPUs and early AMD CPUs with the Zen architecture.

This solution has two performance implications:

1) It copies data from user memory to kernel memory across NUMA nodes
   (the cross-NUMA access is shifted from the network card to the
   CPU), but sacrificing a little CPU performance can yield higher
   network performance when the network card bandwidth becomes the
   bottleneck.

2) It replaces the original mempolicy allocation strategy, which may
   cause memory pressure on the NUMA node where the network card
   resides under high concurrency. However, since commit 44042b44987
   added support for storing high-order pages on the per-cpu (PCP)
   lists, the actual contention overhead of the memory operations is
   very small.

The iperf benchmark below shows that with this patch enabled, the
maximum network bandwidth for bidirectional traffic increased on the
Hygon CPU platform, while the maximum bandwidth in a purely sending
scenario decreased slightly:

                      origin(%ifutil)   patched(%ifutil)
  send+recv(1:16):     92Gb/s           160Gb/s (+73%)
  send+recv(16:16):    46Gb/s           102Gb/s (+121%)
  send+recv(16:1):     82Gb/s           166Gb/s (+102%)
  send only:          176Gb/s           174Gb/s (-1.1%)
  recv only:          176Gb/s           178Gb/s (+1.1%)

Currently, only connection-oriented sockets (TCP, connected UDP, etc.)
are supported.
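For illustration only (not part of the kernel change): the allocation
policy can be mimicked from userspace with a minimal sketch that
follows the same fallback order as the alloc_frag_pages() helper added
below. It assumes libnuma (link with -lnuma) and the numa_node sysfs
attribute of PCI devices; "eth0" is a placeholder interface name.

  /*
   * Userspace sketch of the fallback in alloc_frag_pages(): prefer
   * the NIC's NUMA node, else fall back to the default memory policy.
   */
  #include <numa.h>
  #include <stdio.h>
  #include <stdlib.h>

  #define NO_NODE (-1)  /* userspace stand-in for NUMA_NO_NODE */

  /* Virtual netdevs have no "device" symlink in sysfs, so they yield
   * NO_NODE here, mirroring their NUMA_NO_NODE default in the kernel.
   */
  static int netdev_numa_node(const char *ifname)
  {
      char path[128];
      int node = NO_NODE;
      FILE *f;

      snprintf(path, sizeof(path),
               "/sys/class/net/%s/device/numa_node", ifname);
      f = fopen(path, "r");
      if (f) {
          if (fscanf(f, "%d", &node) != 1)
              node = NO_NODE;
          fclose(f);
      }
      return node;
  }

  int main(void)
  {
      /* 32KB: PAGE_SIZE << SKB_FRAG_PAGE_ORDER on 4K-page systems */
      size_t sz = 32768;
      int node = netdev_numa_node("eth0");
      void *buf = NULL;
      int on_node = 0;

      /* Node-local attempt first, like alloc_pages_node()... */
      if (numa_available() != -1 && node != NO_NODE) {
          buf = numa_alloc_onnode(sz, node);
          on_node = (buf != NULL);
      }
      /* ...then the default policy, like alloc_pages(). */
      if (!buf)
          buf = malloc(sz);

      printf("node=%d on_node=%d\n", node, on_node);

      if (on_node)
          numa_free(buf, sz);
      else
          free(buf);
      return 0;
  }

For experiments, NETIF_F_NOCACHE_COPY itself can be toggled per device
with ethtool, where it is exposed as tx-nocache-copy, e.g.
"ethtool -K eth0 tx-nocache-copy on".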
Because the NUMA node field of virtual network devices defaults to
NUMA_NO_NODE, only physical network cards can benefit from this;
virtual network devices keep their previous memory allocation
behavior.

Hygon-SIG: commit none hygon net: page_frag_refill from numa node of dev in TX path
Signed-off-by: Di Zhu
Cc: hygon-arch@list.openanolis.cn
---
 net/core/sock.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 79 insertions(+), 4 deletions(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index fd5f227d4eaf..1aad23b6382a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -140,6 +140,8 @@
 #include <net/tcp.h>
 #include <net/busy_poll.h>
 
+#include <net/bonding.h>
+
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
 
@@ -2030,8 +2032,25 @@ void sk_free_unlock_clone(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(sk_free_unlock_clone);
 
+static inline struct net_device *netdev_first_slave_rcu(struct net_device *master)
+{
+	struct bonding *bond;
+	struct slave *first_slave;
+
+	if (netif_is_bond_master(master)) {
+		bond = netdev_priv(master);
+		first_slave = bond_first_slave_rcu(bond);
+
+		if (likely(first_slave))
+			return first_slave->dev;
+	}
+
+	return NULL;
+}
+
 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 {
+	struct net_device *slave_netdev;
 	u32 max_segs = 1;
 
 	sk_dst_set(sk, dst);
@@ -2048,6 +2067,13 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 			max_segs = max_t(u32, dst->dev->gso_max_segs, 1);
 		}
 	}
+
+	rcu_read_lock();
+	slave_netdev = netdev_first_slave_rcu(dst->dev);
+	if (slave_netdev && slave_netdev->features & NETIF_F_NOCACHE_COPY)
+		sk->sk_route_caps |= NETIF_F_NOCACHE_COPY;
+	rcu_read_unlock();
+
 	sk->sk_gso_max_segs = max_segs;
 }
 EXPORT_SYMBOL_GPL(sk_setup_caps);
@@ -2476,6 +2502,48 @@ static void sk_leave_memory_pressure(struct sock *sk)
 
 DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
 
+static inline struct page *alloc_frag_pages(int numa_node, gfp_t gfp,
+					    unsigned int order)
+{
+	struct page *page = NULL;
+
+	if (numa_node != NUMA_NO_NODE)
+		page = alloc_pages_node(numa_node, gfp, order);
+
+	if (!page)
+		page = alloc_pages(gfp, order);
+
+	return page;
+}
+
+static int get_numa_node_for_sock(struct sock *sk)
+{
+	struct net_device *slave_netdev;
+	struct net_device *netdev;
+	struct dst_entry *dst;
+	int numa_node = NUMA_NO_NODE;
+
+	if (!(sk->sk_route_caps & NETIF_F_NOCACHE_COPY))
+		return NUMA_NO_NODE;
+
+	dst = sk_dst_get(sk);
+	if (!dst)
+		return NUMA_NO_NODE;
+
+	netdev = READ_ONCE(dst->dev);
+
+	rcu_read_lock();
+	slave_netdev = netdev_first_slave_rcu(netdev);
+	if (slave_netdev)
+		netdev = slave_netdev;
+
+	numa_node = dev_to_node(&netdev->dev);
+	rcu_read_unlock();
+
+	dst_release(dst);
+	return numa_node;
+}
+
 /**
  * skb_page_frag_refill - check that a page_frag contains enough room
  * @sz: minimum size of the fragment we want to get
@@ -2486,7 +2554,8 @@ DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
  * no guarantee that allocations succeed. Therefore, @sz MUST be
  * less or equal than PAGE_SIZE.
  */
-bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
+static bool skb_page_frag_refill_numa(unsigned int sz, struct page_frag *pfrag,
+				      gfp_t gfp, int numa_node)
 {
 	if (pfrag->page) {
 		if (page_ref_count(pfrag->page) == 1) {
@@ -2502,7 +2571,7 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
 	if (SKB_FRAG_PAGE_ORDER &&
 	    !static_branch_unlikely(&net_high_order_alloc_disable_key)) {
 		/* Avoid direct reclaim but allow kswapd to wake */
-		pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
+		pfrag->page = alloc_frag_pages(numa_node, (gfp & ~__GFP_DIRECT_RECLAIM) |
					  __GFP_COMP | __GFP_NOWARN |
					  __GFP_NORETRY,
					  SKB_FRAG_PAGE_ORDER);
@@ -2511,18 +2580,24 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
 			return true;
 		}
 	}
-	pfrag->page = alloc_page(gfp);
+	pfrag->page = alloc_frag_pages(numa_node, gfp, 0);
 	if (likely(pfrag->page)) {
 		pfrag->size = PAGE_SIZE;
 		return true;
 	}
 	return false;
 }
+
+bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
+{
+	return skb_page_frag_refill_numa(sz, pfrag, gfp, NUMA_NO_NODE);
+}
 EXPORT_SYMBOL(skb_page_frag_refill);
 
 bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag)
 {
-	if (likely(skb_page_frag_refill(32U, pfrag, sk->sk_allocation)))
+	if (likely(skb_page_frag_refill_numa(32U, pfrag, sk->sk_allocation,
+					     get_numa_node_for_sock(sk))))
 		return true;
 
 	sk_enter_memory_pressure(sk);
-- 
Gitee