From 83cf4c46b8f18ea9321e761c1294811efacbe85b Mon Sep 17 00:00:00 2001 From: Tony Lu Date: Thu, 25 May 2023 17:05:59 +0800 Subject: [PATCH] anolis: net/smc: Introduce smc_mem to limit memory usage ANBZ: #5791 This introduces a new sysctl knob smc_mem to limit the page usage of send and receive buffers. This knob is global, not per net namespace, for the whole SMC stack. If pages are used up, new connections will fall back to TCP. The pages will be freed when the link group is freed. It's common to trade more memory for better throughput, and SMC allocates a full-size buffer for every buffer. In the case of a large number of connections, it is more likely to run out of memory. Signed-off-by: Tony Lu --- Documentation/networking/smc-sysctl.rst | 5 +++++ net/smc/af_smc.c | 13 ++++++++++++ net/smc/smc_core.c | 27 ++++++++++++++++++++++--- net/smc/smc_core.h | 20 ++++++++++++++++++ net/smc/smc_sysctl.c | 16 +++++++++++++++ net/smc/smc_sysctl.h | 2 ++ 6 files changed, 80 insertions(+), 3 deletions(-) diff --git a/Documentation/networking/smc-sysctl.rst b/Documentation/networking/smc-sysctl.rst index 497af518816b..c2f7ab4829bb 100644 --- a/Documentation/networking/smc-sysctl.rst +++ b/Documentation/networking/smc-sysctl.rst @@ -42,6 +42,11 @@ smcr_testlink_time - INTEGER Default: 30 seconds. +mem - INTEGER + Number of pages allowed to be allocated by all SMC sockets. + + Default value is calculated at module initialization from available memory. + wmem - INTEGER Initial size of send buffer used by SMC sockets. 
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index b8d57c7ed895..f15a069c2744 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -1861,6 +1861,11 @@ static int smc_connect(struct socket *sock, struct sockaddr *addr, goto out; } + /* fast path for checking min buffer */ + if (!smc->use_fallback && smc_memory_allocated() + + SMC_BUF_MIN_PAGES * 2 > READ_ONCE(sysctl_smc_mem)) + smc_switch_to_fallback(smc, SMC_CLC_DECL_MEM); + if (smc->connect_nonblock) { rc = -EALREADY; goto out; @@ -2709,6 +2714,12 @@ static void smc_listen_work(struct work_struct *work) return; } + if (smc_memory_allocated() + SMC_BUF_MIN_PAGES * 2 > + READ_ONCE(sysctl_smc_mem)) { + rc = SMC_CLC_DECL_MEM; + goto out_decl; + } + /* do inband token exchange - * wait for and receive SMC Proposal CLC message */ @@ -4680,6 +4691,8 @@ static int __init smc_init(void) { int rc, i; + smc_sysctl_init(); + if (reserve_mode) { pr_info_ratelimited("smc: load SMC module with reserve_mode\n"); if (rsvd_ports_base > diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index 7ee67e3c7137..dc7288d4e689 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -40,6 +40,9 @@ #define SMC_LGR_FREE_DELAY_SERV (600 * HZ) #define SMC_LGR_FREE_DELAY_CLNT (SMC_LGR_FREE_DELAY_SERV + 10 * HZ) +unsigned long sysctl_smc_mem __read_mostly; +atomic_long_t __smc_memory_allocated ____cacheline_aligned_in_smp; + struct smc_lgr_list smc_lgr_list = { /* established link groups */ .lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock), .list = LIST_HEAD_INIT(smc_lgr_list.list), @@ -1463,6 +1466,8 @@ static void smcr_buf_free(struct smc_link_group *lgr, bool is_rmb, __free_pages(buf_desc->pages, buf_desc->order); else if (buf_desc->is_vm && buf_desc->cpu_addr) vfree(buf_desc->cpu_addr); + smc_memory_allocated_sub(2 << buf_desc->order); + kfree(buf_desc); } @@ -1475,6 +1480,7 @@ static void smcd_buf_free(struct smc_link_group *lgr, bool is_dmb, smc_ism_unregister_dmb(lgr->smcd, buf_desc); } else { kfree(buf_desc->cpu_addr); + 
smc_memory_allocated_sub(buf_desc->len / PAGE_SIZE); } kfree(buf_desc); } @@ -2469,11 +2475,11 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) struct smc_buf_desc *buf_desc = ERR_PTR(-ENOMEM); struct smc_connection *conn = &smc->conn; struct smc_link_group *lgr = conn->lgr; + int sk_buf_size, left_buf_size; struct list_head *buf_list; int bufsize, bufsize_short; struct rw_semaphore *lock; /* lock buffer list */ bool is_dgraded = false; - int sk_buf_size; if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ @@ -2482,6 +2488,18 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) /* use socket send buffer size (w/o overhead) as start value */ sk_buf_size = smc->sk.sk_sndbuf; + left_buf_size = (READ_ONCE(sysctl_smc_mem) - + smc_memory_allocated()) * PAGE_SIZE; + if (left_buf_size < sk_buf_size) { + /* no space for min buf, return */ + if (left_buf_size < SMC_BUF_MIN_SIZE) + return -ENOMEM; + + is_dgraded = true; + SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rmb); + sk_buf_size = left_buf_size; + } + for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); bufsize_short >= 0; bufsize_short--) { if (is_rmb) { @@ -2502,14 +2520,17 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) break; /* found reusable slot */ } + smc_memory_allocated_add(PAGE_ALIGN(bufsize) / PAGE_SIZE); if (is_smcd) buf_desc = smcd_new_buf_create(lgr, is_rmb, bufsize); else buf_desc = smcr_new_buf_create(lgr, is_rmb, bufsize); - if (PTR_ERR(buf_desc) == -ENOMEM) - break; if (IS_ERR(buf_desc)) { + smc_memory_allocated_sub(PAGE_ALIGN(bufsize) / PAGE_SIZE); + + if (PTR_ERR(buf_desc) == -ENOMEM) + break; if (!is_dgraded) { is_dgraded = true; SMC_STAT_RMB_DOWNGRADED(smc, is_smcd, is_rmb); diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index 6338d762c7d4..2733a312c3c9 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -22,6 +22,9 @@ #include "smc_ib.h" #include 
"smc_stats.h" +extern unsigned long sysctl_smc_mem; +extern atomic_long_t __smc_memory_allocated; + #define SMC_RMBS_PER_LGR_MAX 255 /* max. # of RMBs per link group */ #define SMC_CONN_PER_LGR_MAX 32 /* max. # of connections per link group. * Correspondingly, SMC_WR_BUF_CNT should not be less than @@ -242,6 +245,7 @@ struct smc_rtoken { /* address/key of remote RMB */ }; #define SMC_BUF_MIN_SIZE 16384 /* minimum size of an RMB */ +#define SMC_BUF_MIN_PAGES (PAGE_ALIGN(SMC_BUF_MIN_SIZE) / PAGE_SIZE) #define SMC_RMBE_SIZES 16 /* number of distinct RMBE sizes */ /* theoretically, the RFC states that largest size would be 512K, * i.e. compressed 5 and thus 6 sizes (0..5), despite @@ -676,4 +680,20 @@ static inline void smcr_link_stats_clear(struct smc_link *link) sizeof(struct smc_link_ib_stats)); } } + +static inline long smc_memory_allocated(void) +{ + return atomic_long_read(&__smc_memory_allocated); +} + +static inline long smc_memory_allocated_add(int pages) +{ + return atomic_long_add_return(pages, &__smc_memory_allocated); +} + +static inline void smc_memory_allocated_sub(int pages) +{ + atomic_long_sub(pages, &__smc_memory_allocated); +} + #endif diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 3f53694d5549..a5b3d0575794 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -12,6 +12,7 @@ #include #include +#include #include #include "smc.h" @@ -22,6 +23,7 @@ static int two = 2; static int min_sndbuf = SMC_BUF_MIN_SIZE; static int min_rcvbuf = SMC_BUF_MIN_SIZE; +static unsigned long min_mem = SMC_BUF_MIN_PAGES * 2; static struct ctl_table smc_table[] = { { @@ -47,6 +49,14 @@ static struct ctl_table smc_table[] = { .mode = 0644, .proc_handler = proc_dointvec_jiffies, }, + { + .procname = "mem", + .data = &sysctl_smc_mem, + .maxlen = sizeof(unsigned long), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + .extra1 = &min_mem, + }, { .procname = "wmem", .data = &init_net.smc.sysctl_wmem, @@ -89,6 +99,12 @@ static struct ctl_table 
smc_table[] = { { } }; +void smc_sysctl_init(void) +{ + sysctl_smc_mem = max_t(unsigned long, nr_free_buffer_pages() >> 4, + SMC_BUF_MIN_PAGES * 2); +} + int __net_init smc_sysctl_net_init(struct net *net) { struct ctl_table *table; diff --git a/net/smc/smc_sysctl.h b/net/smc/smc_sysctl.h index 0becc11bd2f4..8ae587040b65 100644 --- a/net/smc/smc_sysctl.h +++ b/net/smc/smc_sysctl.h @@ -13,6 +13,8 @@ #ifndef _SMC_SYSCTL_H #define _SMC_SYSCTL_H +void smc_sysctl_init(void); + #ifdef CONFIG_SYSCTL int __net_init smc_sysctl_net_init(struct net *net); -- Gitee