diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 0386b7da02aa3ba46d187358d5fe3a0302b97a8d..6326e4a1462efc2338a24f45f962dd7e1f68f95e 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -87,3 +87,5 @@ config ZRAM_MULTI_COMP re-compress pages using a potentially slower but more effective compression algorithm. Note, that IDLE page recompression requires ZRAM_MEMORY_TRACKING. + +source "drivers/block/zram/zram_group/Kconfig" diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index de9e457907b1e9834937df323413bd11d18f5d5c..a8947f7faa980f96ce88ee9ae1d8278761175435 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,4 +1,9 @@ # SPDX-License-Identifier: GPL-2.0-only zram-y := zcomp.o zram_drv.o +zram-$(CONFIG_ZRAM_GROUP) += zram_group/zram_group.o zram_group/zlist.o zram_group/group_writeback.o + obj-$(CONFIG_ZRAM) += zram.o + +ccflags-$(CONFIG_ZRAM_GROUP) += -I$(srctree)/drivers/block/zram/zram_group/ +ccflags-$(CONFIG_HYPERHOLD) += -I$(srctree)/drivers/hyperhold/ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 06673c6ca255555dfa6c870f4c06dcb2cad6d8fc..e3e0da787deac8a084018c8b39bfbfd22cab4958 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -34,6 +34,10 @@ #include #include +#ifdef CONFIG_ZRAM_GROUP +#include +#endif + #include "zram_drv.h" static DEFINE_IDR(zram_index_idr); @@ -57,21 +61,6 @@ static void zram_free_page(struct zram *zram, size_t index); static int zram_read_page(struct zram *zram, struct page *page, u32 index, struct bio *parent); -static int zram_slot_trylock(struct zram *zram, u32 index) -{ - return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); -} - -static void zram_slot_lock(struct zram *zram, u32 index) -{ - bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); -} - -static void zram_slot_unlock(struct zram *zram, u32 index) -{ - bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); -} - static inline bool init_done(struct zram *zram) { return zram->disksize; @@ -82,35 +71,6 @@ static inline struct zram *dev_to_zram(struct device *dev) return (struct zram *)dev_to_disk(dev)->private_data; } -static unsigned long zram_get_handle(struct zram *zram, u32 index) -{ - return zram->table[index].handle; -} - -static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) -{ - zram->table[index].handle = handle; -} - -/* flag operations require table entry bit_spin_lock() being held */ -static bool zram_test_flag(struct zram *zram, u32 index, - enum zram_pageflags flag) -{ - return zram->table[index].flags & BIT(flag); -} - -static void zram_set_flag(struct zram *zram, u32 index, - enum zram_pageflags flag) -{ - zram->table[index].flags |= BIT(flag); -} - -static void zram_clear_flag(struct zram *zram, u32 index, - enum zram_pageflags flag) -{ - zram->table[index].flags &= ~BIT(flag); -} - static inline void zram_set_element(struct zram *zram, u32 index, unsigned long element) { @@ -122,19 +82,6 @@ static unsigned long zram_get_element(struct zram *zram, u32 index) return zram->table[index].element; } -static size_t zram_get_obj_size(struct zram *zram, u32 index) -{ - return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1); -} - -static void zram_set_obj_size(struct zram *zram, - u32 index, size_t size) -{ - unsigned long flags = zram->table[index].flags >> ZRAM_FLAG_SHIFT; - - zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size; -} - static inline bool zram_allocated(struct zram 
*zram, u32 index) { return zram_get_obj_size(zram, index) || @@ -588,9 +535,6 @@ static void read_from_bdev_async(struct zram *zram, struct page *page, submit_bio(bio); } -#define PAGE_WB_SIG "page_index=" - -#define PAGE_WRITEBACK 0 #define HUGE_WRITEBACK (1<<0) #define IDLE_WRITEBACK (1<<1) #define INCOMPRESSIBLE_WRITEBACK (1<<2) @@ -616,17 +560,8 @@ static ssize_t writeback_store(struct device *dev, mode = IDLE_WRITEBACK | HUGE_WRITEBACK; else if (sysfs_streq(buf, "incompressible")) mode = INCOMPRESSIBLE_WRITEBACK; - else { - if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1)) - return -EINVAL; - - if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) || - index >= nr_pages) - return -EINVAL; - - nr_pages = 1; - mode = PAGE_WRITEBACK; - } + else + return -EINVAL; down_read(&zram->init_lock); if (!init_done(zram)) { @@ -645,7 +580,7 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - for (; nr_pages != 0; index++, nr_pages--) { + for (index = 0; index < nr_pages; index++) { spin_lock(&zram->wb_limit_lock); if (zram->wb_limit_enable && !zram->bd_wb_limit) { spin_unlock(&zram->wb_limit_lock); @@ -1215,6 +1150,66 @@ static DEVICE_ATTR_RO(bd_stat); #endif static DEVICE_ATTR_RO(debug_stat); +#ifdef CONFIG_ZRAM_GROUP +static ssize_t group_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct zram *zram = dev_to_zram(dev); + int ret = 0; + + down_read(&zram->init_lock); + if (zram->zgrp_ctrl == ZGRP_NONE) + ret = snprintf(buf, PAGE_SIZE - 1, "disable\n"); + else if (zram->zgrp_ctrl == ZGRP_TRACK) + ret = snprintf(buf, PAGE_SIZE - 1, "readonly\n"); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + else if (zram->zgrp_ctrl == ZGRP_WRITE) + ret = snprintf(buf, PAGE_SIZE - 1, "readwrite\n"); +#endif + up_read(&zram->init_lock); + + return ret; +} + +static ssize_t group_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t len) +{ + struct zram *zram = dev_to_zram(dev); + int ret; +#ifdef CONFIG_ZRAM_GROUP_DEBUG + u32 op, gid, index; + + ret = sscanf(buf, "%u %u %u", &op, &index, &gid); + if (ret == 3) { + pr_info("op[%u] index[%u] gid[%u].\n", op, index, gid); + group_debug(zram, op, index, gid); + return len; + } +#endif + + ret = len; + down_write(&zram->init_lock); + if (init_done(zram)) { + pr_info("Can't setup group ctrl for initialized device!\n"); + ret = -EBUSY; + goto out; + } + if (!strcmp(buf, "disable\n")) + zram->zgrp_ctrl = ZGRP_NONE; + else if (!strcmp(buf, "readonly\n")) + zram->zgrp_ctrl = ZGRP_TRACK; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + else if (!strcmp(buf, "readwrite\n")) + zram->zgrp_ctrl = ZGRP_WRITE; +#endif + else + ret = -EINVAL; +out: + up_write(&zram->init_lock); + + return ret; +} +#endif + static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; @@ -1226,6 +1221,9 @@ static void zram_meta_free(struct zram *zram, u64 disksize) zs_destroy_pool(zram->mem_pool); vfree(zram->table); +#ifdef CONFIG_ZRAM_GROUP + zram_group_deinit(zram); +#endif } static bool zram_meta_alloc(struct zram *zram, u64 disksize) @@ -1245,6 +1243,10 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); +#ifdef CONFIG_ZRAM_GROUP + zram_group_init(zram, num_pages); +#endif + return true; } @@ -1257,6 +1259,10 @@ static void zram_free_page(struct zram *zram, size_t index) { unsigned long handle; +#ifdef CONFIG_ZRAM_GROUP + zram_group_untrack_obj(zram, index); +#endif + #ifdef 
CONFIG_ZRAM_MEMORY_TRACKING zram->table[index].ac_time = 0; #endif @@ -1360,6 +1366,20 @@ static int zram_read_page(struct zram *zram, struct page *page, u32 index, int ret; zram_slot_lock(zram, index); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + if (!parent) { + ret = zram_group_fault_obj(zram, index); + if (ret) { + zram_slot_unlock(zram, index); + return ret; + } + } + + if (zram_test_flag(zram, index, ZRAM_GWB)) { + zram_slot_unlock(zram, index); + return -EIO; + } +#endif if (!zram_test_flag(zram, index, ZRAM_WB)) { /* Slot should be locked through out the function call */ ret = zram_read_from_zspool(zram, page, index); @@ -1527,6 +1547,9 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index) zram_set_handle(zram, index, handle); zram_set_obj_size(zram, index, comp_len); } +#ifdef CONFIG_ZRAM_GROUP + zram_group_track_obj(zram, index, page_memcg(page)); +#endif zram_slot_unlock(zram, index); /* Update stats */ @@ -2143,6 +2166,9 @@ static DEVICE_ATTR_RW(writeback_limit_enable); static DEVICE_ATTR_RW(recomp_algorithm); static DEVICE_ATTR_WO(recompress); #endif +#ifdef CONFIG_ZRAM_GROUP +static DEVICE_ATTR_RW(group); +#endif static struct attribute *zram_disk_attrs[] = { &dev_attr_disksize.attr, @@ -2169,6 +2195,9 @@ static struct attribute *zram_disk_attrs[] = { #ifdef CONFIG_ZRAM_MULTI_COMP &dev_attr_recomp_algorithm.attr, &dev_attr_recompress.attr, +#endif +#ifdef CONFIG_ZRAM_GROUP + &dev_attr_group.attr, #endif NULL, }; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index ca7a15bd48456afbcc6f23394ea9c18040905108..f61352fa7c3e29ddd29851eba5c9741bc101219e 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -21,6 +21,10 @@ #include "zcomp.h" +#ifdef CONFIG_ZRAM_GROUP +#include "zram_group.h" +#endif + #define SECTORS_PER_PAGE_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define SECTORS_PER_PAGE (1 << SECTORS_PER_PAGE_SHIFT) #define ZRAM_LOGICAL_BLOCK_SHIFT 12 @@ -38,7 +42,15 @@ * * We use BUILD_BUG_ON() to make sure that zram pageflags don't overflow. 
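+ *
+ * With CONFIG_ZRAM_GROUP (see the defines below), each table entry's flags
+ * word packs: bits [0, ZRAM_SIZE_SHIFT) for the object size, the following
+ * ZRAM_GRPID_SHIFT bits for the owning mem_cgroup id, and the zram
+ * pageflags starting at ZRAM_FLAG_SHIFT.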
*/ +#ifdef CONFIG_ZRAM_GROUP +/* reserve 16 bits for group id */ +#define ZRAM_SIZE_SHIFT 24 +#define ZRAM_GRPID_SHIFT 16 +#define ZRAM_GRPID_MASK (((1UL << ZRAM_GRPID_SHIFT) - 1) << ZRAM_SIZE_SHIFT) +#define ZRAM_FLAG_SHIFT (ZRAM_SIZE_SHIFT + ZRAM_GRPID_SHIFT) +#else #define ZRAM_FLAG_SHIFT (PAGE_SHIFT + 1) +#endif /* Only 2 bits are allowed for comp priority index */ #define ZRAM_COMP_PRIORITY_MASK 0x3 @@ -52,6 +64,10 @@ enum zram_pageflags { ZRAM_UNDER_WB, /* page is under writeback */ ZRAM_HUGE, /* Incompressible page */ ZRAM_IDLE, /* not accessed page since last idle marking */ +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + ZRAM_GWB, /* obj is group writeback*/ + ZRAM_FAULT, /* obj is needed by a pagefault req */ +#endif ZRAM_INCOMPRESSIBLE, /* none of the algorithms could compress it */ ZRAM_COMP_PRIORITY_BIT1, /* First bit of comp priority index */ @@ -105,6 +121,10 @@ struct zram_stats { struct zram { struct zram_table_entry *table; +#ifdef CONFIG_ZRAM_GROUP + struct zram_group *zgrp; + unsigned int zgrp_ctrl; +#endif struct zs_pool *mem_pool; struct zcomp *comps[ZRAM_MAX_COMPS]; struct gendisk *disk; @@ -140,4 +160,86 @@ struct zram { struct dentry *debugfs_dir; #endif }; + +static inline int zram_slot_trylock(struct zram *zram, u32 index) +{ + return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags); +} + +static inline void zram_slot_lock(struct zram *zram, u32 index) +{ + bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags); +} + +static inline void zram_slot_unlock(struct zram *zram, u32 index) +{ + bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); +} + +static inline unsigned long zram_get_handle(struct zram *zram, u32 index) +{ + return zram->table[index].handle; +} + +static inline void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) +{ + zram->table[index].handle = handle; +} + +/* flag operations require table entry bit_spin_lock() being held */ +static inline bool zram_test_flag(struct zram *zram, u32 index, + enum zram_pageflags flag) +{ + return zram->table[index].flags & BIT(flag); +} + +static inline void zram_set_flag(struct zram *zram, u32 index, + enum zram_pageflags flag) +{ + zram->table[index].flags |= BIT(flag); +} + +static inline void zram_clear_flag(struct zram *zram, u32 index, + enum zram_pageflags flag) +{ + zram->table[index].flags &= ~BIT(flag); +} +#ifdef CONFIG_ZRAM_GROUP +static inline size_t zram_get_obj_size(struct zram *zram, u32 index) +{ + return zram->table[index].flags & (BIT(ZRAM_SIZE_SHIFT) - 1); +} + +static inline void zram_set_obj_size(struct zram *zram, u32 index, size_t size) +{ + unsigned long flags = zram->table[index].flags >> ZRAM_SIZE_SHIFT; + + zram->table[index].flags = (flags << ZRAM_SIZE_SHIFT) | size; +} + +void zram_group_init(struct zram *zram, u32 nr_obj); +void zram_group_deinit(struct zram *zram); +void zram_group_track_obj(struct zram *zram, u32 index, struct mem_cgroup *memcg); +void zram_group_untrack_obj(struct zram *zram, u32 index); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +int zram_group_fault_obj(struct zram *zram, u32 index); +#endif + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +void group_debug(struct zram *zram, u32 op, u32 index, u32 gid); +#endif + +#else +static inline size_t zram_get_obj_size(struct zram *zram, u32 index) +{ + return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1); +} + +static inline void zram_set_obj_size(struct zram *zram, u32 index, size_t size) +{ + unsigned long flags = zram->table[index].flags >> ZRAM_FLAG_SHIFT; + + zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | 
size;
+}
+#endif
#endif
diff --git a/drivers/block/zram/zram_group/Kconfig b/drivers/block/zram/zram_group/Kconfig
new file mode 100644
index 0000000000000000000000000000000000000000..0eacf79fb2594db32641d6997e463061c8da7880
--- /dev/null
+++ b/drivers/block/zram/zram_group/Kconfig
@@ -0,0 +1,24 @@
+# SPDX-License-Identifier: GPL-2.0
+config ZRAM_GROUP
+	bool "Manage Zram objs with mem_cgroup"
+	depends on ZRAM && MEMCG
+	help
+	  Track zram compressed objects by the mem_cgroup they belong to.
+
+config ZRAM_GROUP_DEBUG
+	bool "Debug info for zram group"
+	depends on ZRAM_GROUP
+	help
+	  Debug info for ZRAM_GROUP.
+
+config ZLIST_DEBUG
+	bool "Debug info for zram group list"
+	depends on ZRAM_GROUP
+	help
+	  Enable sanity checks on zram group list operations.
+
+config ZRAM_GROUP_WRITEBACK
+	bool "Write back grouped zram objs to Hyperhold driver"
+	depends on ZRAM_GROUP && HYPERHOLD
+	help
+	  Write grouped zram objects back to the hyperhold backing device.
diff --git a/drivers/block/zram/zram_group/group_writeback.c b/drivers/block/zram/zram_group/group_writeback.c
new file mode 100644
index 0000000000000000000000000000000000000000..0956a2eb939a2b312fcc00a7a8325e99eedb24c7
--- /dev/null
+++ b/drivers/block/zram/zram_group/group_writeback.c
@@ -0,0 +1,735 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * drivers/block/zram/zram_group/group_writeback.c
+ *
+ * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd.
+ */
+
+#include
+#include
+#include
+#include
+
+#include "../zram_drv.h"
+#include "zram_group.h"
+
+#ifdef CONFIG_HYPERHOLD
+#include "hyperhold.h"
+#endif
+
+#define CHECK(cond, ...) ((cond) || (pr_err(__VA_ARGS__), false))
+#define CHECK_BOUND(var, min, max) \
+	CHECK((var) >= (min) && (var) <= (max), \
+		"%s %u out of bounds %u ~ %u!\n", \
+		#var, (var), (min), (max))
+
+static u16 zram_get_memcg_id(struct zram *zram, u32 index)
+{
+	return (zram->table[index].flags & ZRAM_GRPID_MASK) >> ZRAM_SIZE_SHIFT;
+}
+
+static void zram_set_memcg_id(struct zram *zram, u32 index, u16 gid)
+{
+	unsigned long old = zram->table[index].flags & (~ZRAM_GRPID_MASK);
+
+	zram->table[index].flags = old | ((u64)gid << ZRAM_SIZE_SHIFT);
+}
+
+#ifdef CONFIG_ZRAM_GROUP_WRITEBACK
+static bool obj_can_wb(struct zram *zram, u32 index, u16 gid)
+{
+	/* overwritten obj, just skip */
+	if (zram_get_memcg_id(zram, index) != gid) {
+		pr_debug("obj %u is from group %u instead of group %u.\n",
+				index, zram_get_memcg_id(zram, index), gid);
+		return false;
+	}
+	if (!zgrp_obj_is_isolated(zram->zgrp, index)) {
+		pr_debug("obj %u is not isolated.\n", index);
+		return false;
+	}
+	/* no need to write back, put the obj back as HOTTEST */
+	if (zram_test_flag(zram, index, ZRAM_SAME)) {
+		pr_debug("obj %u is filled with same element.\n", index);
+		goto insert;
+	}
+	if (zram_test_flag(zram, index, ZRAM_WB)) {
+		pr_debug("obj %u is writeback.\n", index);
+		goto insert;
+	}
+	/* obj is needed by a pagefault req, do not write it back.
*/ + if (zram_test_flag(zram, index, ZRAM_FAULT)) { + pr_debug("obj %u is needed by a pagefault request.\n", index); + goto insert; + } + /* should never happen */ + if (zram_test_flag(zram, index, ZRAM_GWB)) { + pr_debug("obj %u is group writeback.\n", index); + BUG(); + return false; + } + + return true; +insert: + zgrp_obj_insert(zram->zgrp, index, gid); + + return false; +} + +static void copy_obj(struct hpio *hpio, u32 offset, char *obj, u32 size, bool to) +{ + u32 page_id, start; + char *buf = NULL; + + page_id = offset / PAGE_SIZE; + start = offset % PAGE_SIZE; + if (size + start <= PAGE_SIZE) { + buf = page_to_virt(hyperhold_io_page(hpio, page_id)); + if (to) + memcpy(buf + start, obj, size); + else + memcpy(obj, buf + start, size); + + return; + } + buf = page_to_virt(hyperhold_io_page(hpio, page_id)); + if (to) + memcpy(buf + start, obj, PAGE_SIZE - start); + else + memcpy(obj, buf + start, PAGE_SIZE - start); + buf = page_to_virt(hyperhold_io_page(hpio, page_id + 1)); + if (to) + memcpy(buf, obj + PAGE_SIZE - start, size + start - PAGE_SIZE); + else + memcpy(obj + PAGE_SIZE - start, buf, size + start - PAGE_SIZE); +} + +static u32 move_obj_to_hpio(struct zram *zram, u32 index, u16 gid, + struct hpio *hpio, u32 offset) +{ + u32 size = 0; + unsigned long handle; + char *src = NULL; + u32 ext_size; + u32 eid; + + eid = hyperhold_io_extent(hpio); + ext_size = hyperhold_extent_size(eid); + + zram_slot_lock(zram, index); + if (!obj_can_wb(zram, index, gid)) + goto unlock; + size = zram_get_obj_size(zram, index); + /* no space, put back the obj as COLDEST */ + if (size + offset > ext_size) { + pr_debug("obj %u size is %u, but ext %u only %u space left.\n", + index, size, eid, ext_size - offset); + zgrp_obj_putback(zram->zgrp, index, gid); + size = 0; + goto unlock; + } + handle = zram_get_handle(zram, index); + src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO); + copy_obj(hpio, offset, src, size, true); + zs_unmap_object(zram->mem_pool, handle); + zs_free(zram->mem_pool, handle); + zram_set_handle(zram, index, hyperhold_address(eid, offset)); + zram_set_flag(zram, index, ZRAM_GWB); + wbgrp_obj_insert(zram->zgrp, index, eid); + wbgrp_obj_stats_inc(zram->zgrp, gid, eid, size); + zgrp_obj_stats_dec(zram->zgrp, gid, size); + pr_debug("move obj %u of group %u to hpio %p of eid %u, size = %u, offset = %u\n", + index, gid, hpio, eid, size, offset); +unlock: + zram_slot_unlock(zram, index); + + return size; +} + +static void move_obj_from_hpio(struct zram *zram, int index, struct hpio *hpio) +{ + u32 size = 0; + unsigned long handle = 0; + u32 eid, offset; + u64 addr; + char *dst = NULL; + u16 gid; + + eid = hyperhold_io_extent(hpio); +retry: + zram_slot_lock(zram, index); + if (!zram_test_flag(zram, index, ZRAM_GWB)) + goto unlock; + addr = zram_get_handle(zram, index); + if (hyperhold_addr_extent(addr) != eid) + goto unlock; + size = zram_get_obj_size(zram, index); + if (handle) + goto move; + handle = zs_malloc(zram->mem_pool, size, GFP_NOWAIT); + if (handle) + goto move; + zram_slot_unlock(zram, index); + handle = zs_malloc(zram->mem_pool, size, GFP_NOIO | __GFP_NOFAIL); + if (handle) + goto retry; + BUG(); + + return; +move: + offset = hyperhold_addr_offset(addr); + dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO); + copy_obj(hpio, offset, dst, size, false); + zs_unmap_object(zram->mem_pool, handle); + zram_set_handle(zram, index, handle); + zram_clear_flag(zram, index, ZRAM_GWB); + gid = zram_get_memcg_id(zram, index); + zgrp_obj_insert(zram->zgrp, index, gid); + 
wbgrp_obj_stats_dec(zram->zgrp, gid, eid, size); + zgrp_obj_stats_inc(zram->zgrp, gid, size); + pr_debug("move obj %u of group %u from hpio %p of eid %u, size = %u, offset = %u\n", + index, gid, hpio, eid, size, offset); +unlock: + zram_slot_unlock(zram, index); +} + + +#define NR_ISOLATE 32 +static bool move_extent_from_hpio(struct zram *zram, struct hpio *hpio) +{ + u32 idxs[NR_ISOLATE]; + u32 eid; + u32 nr; + int i; + bool last = false; + + eid = hyperhold_io_extent(hpio); +repeat: + nr = wbgrp_isolate_objs(zram->zgrp, eid, idxs, NR_ISOLATE, &last); + for (i = 0; i < nr; i++) + move_obj_from_hpio(zram, idxs[i], hpio); + if (last) + return true; + if (nr) + goto repeat; + + return false; +} + +struct hpio_priv { + struct zram *zram; + u16 gid; +}; + +static void write_endio(struct hpio *hpio) +{ + struct hpio_priv *priv = hyperhold_io_private(hpio); + struct zram *zram = priv->zram; + u16 gid = priv->gid; + u32 eid = hyperhold_io_extent(hpio); + + if (hyperhold_io_success(hpio)) + goto out; + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } +out: + hyperhold_io_complete(hpio); + hyperhold_io_put(hpio); + kfree(priv); +} + +static u32 collect_objs(struct zram *zram, u16 gid, struct hpio *hpio, u32 ext_size) +{ + u32 offset = 0; + u32 last_offset; + u32 nr; + u32 idxs[NR_ISOLATE]; + int i; + +more: + last_offset = offset; + nr = zgrp_isolate_objs(zram->zgrp, gid, idxs, NR_ISOLATE, NULL); + for (i = 0; i < nr; i++) + offset += move_obj_to_hpio(zram, idxs[i], gid, hpio, offset); + pr_debug("%u data attached, offset = %u.\n", offset - last_offset, offset); + if (offset < ext_size && offset != last_offset) + goto more; + + return offset; +} + +static u64 write_one_extent(struct zram *zram, u16 gid) +{ + int eid; + struct hpio *hpio = NULL; + struct hpio_priv *priv = NULL; + u32 size = 0; + int ret; + + priv = kmalloc(sizeof(struct hpio_priv), GFP_NOIO); + if (!priv) + return 0; + priv->gid = gid; + priv->zram = zram; + eid = hyperhold_alloc_extent(); + if (eid < 0) + goto err; + hpio = hyperhold_io_get(eid, GFP_NOIO, REQ_OP_WRITE); + if (!hpio) + goto free_extent; + + zgrp_get_ext(zram->zgrp, eid); + size = collect_objs(zram, gid, hpio, hyperhold_extent_size(eid)); + if (size == 0) { + pr_err("group %u has no data in zram.\n", gid); + zgrp_put_ext(zram->zgrp, eid); + goto put_hpio; + } + zgrp_ext_insert(zram->zgrp, eid, gid); + if (zgrp_put_ext(zram->zgrp, eid)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } + + ret = hyperhold_write_async(hpio, write_endio, priv); + if (ret) + goto move_back; + + return size; +move_back: + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } + eid = -EINVAL; +put_hpio: + hyperhold_io_put(hpio); +free_extent: + if (eid >= 0) + hyperhold_free_extent(eid); +err: + kfree(priv); + + return 0; +} + +static void read_endio(struct hpio *hpio) +{ + struct hpio_priv *priv = hyperhold_io_private(hpio); + struct zram *zram = priv->zram; + u16 gid = priv->gid; + u32 eid = hyperhold_io_extent(hpio); + + if (!hyperhold_io_success(hpio)) { + BUG(); + goto out; + } + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } +out: + hyperhold_io_complete(hpio); + hyperhold_io_put(hpio); + kfree(priv); +} + +static u64 read_one_extent(struct zram *zram, u32 eid, u16 gid) +{ + struct hpio *hpio = NULL; + u32 ext_size = 0; + int ret; + 
struct hpio_priv *priv = NULL; + + priv = kmalloc(sizeof(struct hpio_priv), GFP_NOIO); + if (!priv) + goto err; + priv->gid = gid; + priv->zram = zram; + hpio = hyperhold_io_get(eid, GFP_NOIO, REQ_OP_READ); + if (!hpio) + goto err; + ext_size = hyperhold_extent_size(eid); + ret = hyperhold_read_async(hpio, read_endio, priv); + if (ret) + goto err; + + return ext_size; +err: + hyperhold_io_put(hpio); + kfree(priv); + + return 0; +} + +static void sync_read_endio(struct hpio *hpio) +{ + hyperhold_io_complete(hpio); +} + +static int read_one_obj_sync(struct zram *zram, u32 index) +{ + struct hpio *hpio = NULL; + int ret; + u32 eid; + u16 gid; + u32 size; + + if (!zram_test_flag(zram, index, ZRAM_GWB)) + return 0; + + pr_debug("read obj %u.\n", index); + + gid = zram_get_memcg_id(zram, index); + eid = hyperhold_addr_extent(zram_get_handle(zram, index)); + size = zram_get_obj_size(zram, index); + wbgrp_fault_stats_inc(zram->zgrp, gid, eid, size); +check: + if (!zram_test_flag(zram, index, ZRAM_GWB)) + return 0; + if (!zram_test_flag(zram, index, ZRAM_FAULT)) + goto read; + zram_slot_unlock(zram, index); + wait_event(zram->zgrp->wbgrp.fault_wq, !zram_test_flag(zram, index, ZRAM_FAULT)); + zram_slot_lock(zram, index); + goto check; +read: + zram_set_flag(zram, index, ZRAM_FAULT); + zram_slot_unlock(zram, index); + + hpio = hyperhold_io_get(eid, GFP_NOIO, REQ_OP_READ); + if (!hpio) { + ret = -ENOMEM; + goto out; + } + ret = hyperhold_read_async(hpio, sync_read_endio, NULL); + /* io submit error */ + if (ret && ret != -EAGAIN) + goto out; + + hyperhold_io_wait(hpio); + + /* if not reset to zero, will return err sometimes and cause SIG_BUS error */ + ret = 0; + + /* get a write io, data is ready, copy the pages even write failed */ + if (op_is_write(hyperhold_io_operate(hpio))) + goto move; + /* read io failed, return -EIO */ + if (!hyperhold_io_success(hpio)) { + ret = -EIO; + goto out; + } + /* success, copy the data and free extent */ +move: + if (move_extent_from_hpio(zram, hpio)) { + zgrp_ext_delete(zram->zgrp, eid, gid); + hyperhold_should_free_extent(eid); + } + move_obj_from_hpio(zram, index, hpio); +out: + hyperhold_io_put(hpio); + zram_slot_lock(zram, index); + zram_clear_flag(zram, index, ZRAM_FAULT); + wake_up(&zram->zgrp->wbgrp.fault_wq); + + return ret; +} + +u64 read_group_objs(struct zram *zram, u16 gid, u64 req_size) +{ + u32 eid; + u64 read_size = 0; + u32 nr; + + if (!(zram->zgrp)) { + pr_debug("zram group is not enable!\n"); + return 0; + } + if (!CHECK_BOUND(gid, 1, zram->zgrp->nr_grp - 1)) + return 0; + + pr_debug("read %llu data of group %u.\n", req_size, gid); + + while (!req_size || req_size > read_size) { + nr = zgrp_isolate_exts(zram->zgrp, gid, &eid, 1, NULL); + if (!nr) + break; + read_size += read_one_extent(zram, eid, gid); + } + + return read_size; +} + +u64 write_group_objs(struct zram *zram, u16 gid, u64 req_size) +{ + u64 write_size = 0; + u64 size = 0; + + if (!(zram->zgrp)) { + pr_debug("zram group is not enable!\n"); + return 0; + } + if (!CHECK(zram->zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return 0; + if (!CHECK_BOUND(gid, 1, zram->zgrp->nr_grp - 1)) + return 0; + + pr_debug("write %llu data of group %u.\n", req_size, gid); + + while (!req_size || req_size > write_size) { + size = write_one_extent(zram, gid); + if (!size) + break; + write_size += size; + } + + atomic64_add(write_size, &zram->zgrp->stats[0].write_size); + atomic64_add(write_size, &zram->zgrp->stats[gid].write_size); + return write_size; +} +#endif + +#ifdef 
CONFIG_ZRAM_GROUP_DEBUG
+#include
+#define ZGRP_TEST_MAX_GRP 101
+#endif
+
+int zram_group_fault_obj(struct zram *zram, u32 index)
+{
+	u16 gid;
+	u32 size;
+
+	if (!(zram->zgrp)) {
+		pr_debug("zram group is not enable!\n");
+		return 0;
+	}
+	if (!CHECK_BOUND(index, 0, zram->zgrp->nr_obj - 1))
+		return 0;
+
+	gid = zram_get_memcg_id(zram, index);
+	size = zram_get_obj_size(zram, index);
+	zgrp_fault_stats_inc(zram->zgrp, gid, size);
+#ifdef CONFIG_ZRAM_GROUP_WRITEBACK
+	return read_one_obj_sync(zram, index);
+#else
+	return 0;
+#endif
+}
+
+void zram_group_track_obj(struct zram *zram, u32 index, struct mem_cgroup *memcg)
+{
+	u16 gid;
+
+	if (!(zram->zgrp)) {
+		pr_debug("zram group is not enable!\n");
+		return;
+	}
+	if (!CHECK_BOUND(index, 0, zram->zgrp->nr_obj - 1))
+		return;
+	if (!CHECK(memcg && memcg->id.id, "obj %u has no memcg!\n", index))
+		return;
+	gid = zram_get_memcg_id(zram, index);
+	if (!CHECK(!gid, "obj %u has gid %u.\n", index, gid))
+		BUG();
+
+	gid = memcg->id.id;
+	zram_set_memcg_id(zram, index, gid);
+	zgrp_obj_insert(zram->zgrp, index, gid);
+	zgrp_obj_stats_inc(zram->zgrp, gid, zram_get_obj_size(zram, index));
+}
+
+void zram_group_untrack_obj(struct zram *zram, u32 index)
+{
+	u16 gid;
+	u32 size;
+
+	if (!(zram->zgrp)) {
+		pr_debug("zram group is not enable!\n");
+		return;
+	}
+	if (!CHECK_BOUND(index, 0, zram->zgrp->nr_obj - 1))
+		return;
+
+#ifdef CONFIG_ZRAM_GROUP_WRITEBACK
+check:
+	if (!zram_test_flag(zram, index, ZRAM_FAULT))
+		goto clear;
+	zram_slot_unlock(zram, index);
+	wait_event(zram->zgrp->wbgrp.fault_wq, !zram_test_flag(zram, index, ZRAM_FAULT));
+	zram_slot_lock(zram, index);
+	goto check;
+clear:
+#endif
+	gid = zram_get_memcg_id(zram, index);
+	size = zram_get_obj_size(zram, index);
+	if (!gid)
+		return;
+#ifdef CONFIG_ZRAM_GROUP_WRITEBACK
+	if (zram_test_flag(zram, index, ZRAM_GWB)) {
+		u32 eid = hyperhold_addr_extent(zram_get_handle(zram, index));
+
+		if (wbgrp_obj_delete(zram->zgrp, index, eid)) {
+			zgrp_ext_delete(zram->zgrp, eid, gid);
+			hyperhold_should_free_extent(eid);
+		}
+		zram_clear_flag(zram, index, ZRAM_GWB);
+		zram_set_memcg_id(zram, index, 0);
+		wbgrp_obj_stats_dec(zram->zgrp, gid, eid, size);
+		zram_set_handle(zram, index, 0);
+		return;
+	}
+#endif
+	zgrp_obj_delete(zram->zgrp, index, gid);
+	zram_set_memcg_id(zram, index, 0);
+	zgrp_obj_stats_dec(zram->zgrp, gid, size);
+}
+
+#ifdef CONFIG_ZRAM_GROUP_DEBUG
+void group_debug(struct zram *zram, u32 op, u32 index, u32 gid)
+{
+	if (op == 0)
+		zram_group_dump(zram->zgrp, gid, index);
+
+#ifdef CONFIG_ZRAM_GROUP_WRITEBACK
+	if (op == 22)
+		read_group_objs(zram, gid, index);
+	if (op == 23)
+		write_group_objs(zram, gid, index);
+	if (op == 20) {
+		if (index)
+			zram_group_apply_writeback(zram->zgrp, hyperhold_nr_extent());
+		else
+			zram_group_remove_writeback(zram->zgrp);
+	}
+#endif
+}
+#endif
+
+static u64 group_obj_stats(struct zram *zram, u16 gid, int type)
+{
+	if (!(zram->zgrp)) {
+		pr_debug("zram group is not enable!\n");
+		return 0;
+	}
+	if (!CHECK_BOUND(gid, 0, zram->zgrp->nr_grp - 1))
+		return 0;
+
+	if (type == CACHE_SIZE)
+		return atomic64_read(&zram->zgrp->stats[gid].zram_size);
+	else if (type == CACHE_PAGE)
+		return atomic_read(&zram->zgrp->stats[gid].zram_pages);
+	else if (type == CACHE_FAULT)
+		return atomic64_read(&zram->zgrp->stats[gid].zram_fault);
+#ifdef CONFIG_ZRAM_GROUP_WRITEBACK
+	else if (type == SWAP_SIZE)
+		return atomic64_read(&zram->zgrp->stats[gid].wb_size);
+	else if (type == SWAP_PAGE)
+		return atomic_read(&zram->zgrp->stats[gid].wb_pages);
+	else if
(type == READ_SIZE) + return atomic64_read(&zram->zgrp->stats[gid].read_size); + else if (type == WRITE_SIZE) + return atomic64_read(&zram->zgrp->stats[gid].write_size); + else if (type == SWAP_FAULT) + return atomic64_read(&zram->zgrp->stats[gid].wb_fault); + BUG(); +#endif + + return 0; +} + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +static u64 zram_group_read(u16 gid, u64 req_size, void *priv) +{ + if (!CHECK(priv, "priv is NULL!\n")) + return 0; + + return read_group_objs((struct zram *)priv, gid, req_size); +} + +static u64 zram_group_write(u16 gid, u64 req_size, void *priv) +{ + if (!CHECK(priv, "priv is NULL!\n")) + return 0; + + return write_group_objs((struct zram *)priv, gid, req_size); +} +#else +static u64 zram_group_read(u16 gid, u64 req_size, void *priv) +{ + return 0; +} +static u64 zram_group_write(u16 gid, u64 req_size, void *priv) +{ + return 0; +} +#endif + + +static u64 zram_group_data_size(u16 gid, int type, void *priv) +{ + if (!CHECK(priv, "priv is NULL!\n")) + return 0; + + return group_obj_stats((struct zram *)priv, gid, type); +} + +struct group_swap_ops zram_group_ops = { + .group_read = zram_group_read, + .group_write = zram_group_write, + .group_data_size = zram_group_data_size, +}; + +static int register_zram_group(struct zram *zram) +{ + if (!CHECK(zram, "zram is NULL!\n")) + return -EINVAL; + if (!(zram->zgrp)) { + pr_debug("zram group is not enable!\n"); + return -EINVAL; + } + + zram->zgrp->gsdev = register_group_swap(&zram_group_ops, zram); + if (!zram->zgrp->gsdev) { + pr_err("register zram group failed!\n"); + return -ENOMEM; + } + + return 0; +} + +static void unregister_zram_group(struct zram *zram) +{ + if (!CHECK(zram, "zram is NULL!\n")) + return; + if (!(zram->zgrp)) { + pr_debug("zram group is not enable!\n"); + return; + } + + unregister_group_swap(zram->zgrp->gsdev); + zram->zgrp->gsdev = NULL; +} + +void zram_group_init(struct zram *zram, u32 nr_obj) +{ + unsigned int ctrl = zram->zgrp_ctrl; + + if (ctrl == ZGRP_NONE) + return; + zram->zgrp = zram_group_meta_alloc(nr_obj, ZGRP_MAX_GRP - 1); +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + if (ctrl == ZGRP_WRITE) + zram_group_apply_writeback(zram->zgrp, hyperhold_nr_extent()); +#endif + register_zram_group(zram); +} + +void zram_group_deinit(struct zram *zram) +{ + unregister_zram_group(zram); + zram_group_meta_free(zram->zgrp); + zram->zgrp = NULL; +} diff --git a/drivers/block/zram/zram_group/zlist.c b/drivers/block/zram/zram_group/zlist.c new file mode 100644 index 0000000000000000000000000000000000000000..fd8295ecadaacb27312f7bde75cc48dd9940f54e --- /dev/null +++ b/drivers/block/zram/zram_group/zlist.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/block/zram/zram_group/zlist.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#define pr_fmt(fmt) "[ZLIST]" fmt + +#include +#include +#include + +#include "zlist.h" + +#define assert(expr) \ + do { \ + if (expr) \ + break; \ + pr_err("assertion [%s] failed: in func<%s> at %s:%d\n", \ + #expr, __func__, __FILE__, __LINE__); \ + BUG(); \ + } while (0) + +static inline void zlist_node_lock(struct zlist_node *node) +{ + bit_spin_lock(ZLIST_LOCK_BIT, (unsigned long *)node); +} + +static inline void zlist_node_unlock(struct zlist_node *node) +{ + bit_spin_unlock(ZLIST_LOCK_BIT, (unsigned long *)node); +} + +#ifdef CONFIG_ZLIST_DEBUG +static inline void zlist_before_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == next); + assert(idx2node(next->prev, tab) == prev); + assert(idx2node(node->prev, tab) == node); + assert(idx2node(node->next, tab) == node); +} + +static inline void zlist_after_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == node); + assert(idx2node(next->prev, tab) == node); + assert(idx2node(node->prev, tab) == prev); + assert(idx2node(node->next, tab) == next); +} + +static inline void zlist_before_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == node); + assert(idx2node(next->prev, tab) == node); + assert(idx2node(node->prev, tab) == prev); + assert(idx2node(node->next, tab) == next); +} + +static inline void zlist_after_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) +{ + assert(idx2node(prev->next, tab) == next); + assert(idx2node(next->prev, tab) == prev); + assert(idx2node(node->prev, tab) == node); + assert(idx2node(node->next, tab) == node); +} +#else +static inline void zlist_before_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +static inline void zlist_after_add_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +static inline void zlist_before_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +static inline void zlist_after_del_check(struct zlist_table *tab, + struct zlist_node *prev, struct zlist_node *node, + struct zlist_node *next) {}; +#endif + +struct zlist_table *zlist_table_alloc(struct zlist_node *(*i2n)(u32, void*), + void *private, gfp_t gfp) +{ + struct zlist_table *tab = kmalloc(sizeof(struct zlist_table), gfp); + + if (!tab) + return NULL; + tab->idx2node = i2n; + tab->private = private; + + return tab; +} + +void zlist_lock(u32 idx, struct zlist_table *tab) +{ + zlist_node_lock(idx2node(idx, tab)); +} + +void zlist_unlock(u32 idx, struct zlist_table *tab) +{ + zlist_node_unlock(idx2node(idx, tab)); +} + +void zlist_add_nolock(u32 hid, u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + struct zlist_node *head = idx2node(hid, tab); + u32 nid = head->next; + struct zlist_node *next = idx2node(nid, tab); + + zlist_before_add_check(tab, head, node, next); + if (idx != hid) + zlist_node_lock(node); + node->prev = hid; + node->next = nid; + if (idx != hid) + zlist_node_unlock(node); + head->next = idx; + if (nid != hid) + zlist_node_lock(next); + next->prev = idx; + if (nid != hid) + zlist_node_unlock(next); + 
zlist_after_add_check(tab, head, node, next); +} + +void zlist_add_tail_nolock(u32 hid, u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + struct zlist_node *head = idx2node(hid, tab); + u32 tid = head->prev; + struct zlist_node *tail = idx2node(tid, tab); + + zlist_before_add_check(tab, tail, node, head); + if (idx != hid) + zlist_node_lock(node); + node->prev = tid; + node->next = hid; + if (idx != hid) + zlist_node_unlock(node); + head->prev = idx; + if (tid != hid) + zlist_node_lock(tail); + tail->next = idx; + if (tid != hid) + zlist_node_unlock(tail); + zlist_after_add_check(tab, tail, node, head); +} + +bool zlist_del_nolock(u32 hid, u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + u32 pid = node->prev; + u32 nid = node->next; + struct zlist_node *prev = idx2node(pid, tab); + struct zlist_node *next = idx2node(nid, tab); + + zlist_before_del_check(tab, prev, node, next); + if (idx != hid) + zlist_node_lock(node); + node->prev = idx; + node->next = idx; + if (idx != hid) + zlist_node_unlock(node); + if (pid != hid) + zlist_node_lock(prev); + prev->next = nid; + if (pid != hid) + zlist_node_unlock(prev); + if (nid != hid) + zlist_node_lock(next); + next->prev = pid; + if (nid != hid) + zlist_node_unlock(next); + zlist_after_del_check(tab, prev, node, next); + + return zlist_is_isolated_nolock(hid, tab); +} + +bool zlist_is_isolated_nolock(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + + return (node->prev == idx) && (node->next == idx); +} + +bool zlist_set_priv(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + bool ret = false; + + zlist_node_lock(node); + ret = !test_and_set_bit(ZLIST_PRIV_BIT, (unsigned long *)node); + zlist_node_unlock(node); + + return ret; +} + +bool zlist_clr_priv_nolock(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + bool ret = false; + + ret = !test_and_clear_bit(ZLIST_PRIV_BIT, (unsigned long *)node); + + return ret; +} + +bool zlist_test_priv_nolock(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + bool ret = false; + + ret = test_bit(ZLIST_PRIV_BIT, (unsigned long *)node); + + return ret; +} + +void zlist_node_init(u32 idx, struct zlist_table *tab) +{ + struct zlist_node *node = idx2node(idx, tab); + + memset(node, 0, sizeof(struct zlist_node)); + node->prev = idx; + node->next = idx; +} diff --git a/drivers/block/zram/zram_group/zlist.h b/drivers/block/zram/zram_group/zlist.h new file mode 100644 index 0000000000000000000000000000000000000000..a7cbf37509e9291a1feee9dbc9ac78f79a924f42 --- /dev/null +++ b/drivers/block/zram/zram_group/zlist.h @@ -0,0 +1,97 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/block/zram/zram_group/zlist.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _ZLIST_H_ +#define _ZLIST_H_ + +#define ZLIST_IDX_SHIFT 30 +#define ZLIST_LOCK_BIT ZLIST_IDX_SHIFT +#define ZLIST_PRIV_BIT ((ZLIST_IDX_SHIFT << 1) + 1) + +#define ZLIST_IDX_MAX (1 << ZLIST_IDX_SHIFT) + +struct zlist_node { + u64 prev : ZLIST_IDX_SHIFT; + u64 lock : 1; + u64 next : ZLIST_IDX_SHIFT; + u64 priv : 1; +}; + +struct zlist_table { + struct zlist_node *(*idx2node)(u32 idx, void *priv); + void *private; +}; + +static inline struct zlist_node *idx2node(u32 idx, struct zlist_table *tab) +{ + return tab->idx2node(idx, tab->private); +} + +static inline u32 next_idx(u32 idx, struct zlist_table *tab) +{ + return idx2node(idx, tab)->next; +} + +static inline u32 prev_idx(u32 idx, struct zlist_table *tab) +{ + return idx2node(idx, tab)->prev; +} + +static inline void zlist_table_free(struct zlist_table *tab) +{ + kfree(tab); +} + +struct zlist_table *zlist_table_alloc(struct zlist_node *(*i2n)(u32, void*), + void *private, gfp_t gfp); + +void zlist_lock(u32 idx, struct zlist_table *tab); +void zlist_unlock(u32 idx, struct zlist_table *tab); + +void zlist_add_nolock(u32 hid, u32 idx, struct zlist_table *tab); +void zlist_add_tail_nolock(u32 hid, u32 idx, struct zlist_table *tab); +bool zlist_del_nolock(u32 hid, u32 idx, struct zlist_table *tab); +bool zlist_is_isolated_nolock(u32 idx, struct zlist_table *tab); + +static inline void zlist_add(u32 hid, u32 idx, struct zlist_table *tab) +{ + zlist_lock(hid, tab); + zlist_add_nolock(hid, idx, tab); + zlist_unlock(hid, tab); +} + +static inline void zlist_add_tail(u32 hid, u32 idx, struct zlist_table *tab) +{ + zlist_lock(hid, tab); + zlist_add_tail_nolock(hid, idx, tab); + zlist_unlock(hid, tab); +} + +static inline bool zlist_del(u32 hid, u32 idx, struct zlist_table *tab) +{ + bool ret = false; + + zlist_lock(hid, tab); + ret = zlist_del_nolock(hid, idx, tab); + zlist_unlock(hid, tab); + + return ret; +} + +bool zlist_set_priv(u32 idx, struct zlist_table *tab); +bool zlist_clr_priv_nolock(u32 idx, struct zlist_table *tab); +bool zlist_test_priv_nolock(u32 idx, struct zlist_table *tab); + +void zlist_node_init(u32 idx, struct zlist_table *tab); + +#define zlist_for_each_entry(idx, hid, tab) \ + for ((idx) = next_idx(hid, tab); (idx) != (hid); \ + (idx) = next_idx(idx, tab)) +#define zlist_for_each_entry_reverse(idx, hid, tab) \ + for ((idx) = prev_idx(hid, tab); (idx) != (hid); \ + (idx) = prev_idx(idx, tab)) +#endif diff --git a/drivers/block/zram/zram_group/zram_group.c b/drivers/block/zram/zram_group/zram_group.c new file mode 100644 index 0000000000000000000000000000000000000000..9a023e77d5cdb9c90f2b5c2682d1373135e7d86f --- /dev/null +++ b/drivers/block/zram/zram_group/zram_group.c @@ -0,0 +1,672 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * drivers/block/zram/zram_group/zram_group.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#define pr_fmt(fmt) "[ZRAM_GROUP]" fmt + +#include +#include +#include "zram_group.h" + +#define CHECK(cond, ...) 
((cond) || (pr_err(__VA_ARGS__), false)) +#define CHECK_BOUND(var, min, max) \ + CHECK((var) >= (min) && (var) <= (max), \ + "%s %u out of bounds %u ~ %u!\n", \ + #var, (var), (min), (max)) + +/* + * idx2node for obj table + */ +static struct zlist_node *get_obj(u32 index, void *private) +{ + struct zram_group *zgrp = private; + + if (index < zgrp->nr_obj) + return &zgrp->obj[index]; + + index -= zgrp->nr_obj; + BUG_ON(!index); + if (index < zgrp->nr_grp) + return &zgrp->grp_obj_head[index]; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + index -= zgrp->nr_grp; + BUG_ON(index >= zgrp->wbgrp.nr_ext); + return &zgrp->wbgrp.ext_obj_head[index]; +#endif + BUG(); +} + +void zram_group_meta_free(struct zram_group *zgrp) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + zram_group_remove_writeback(zgrp); +#endif + vfree(zgrp->grp_obj_head); + vfree(zgrp->obj); + zlist_table_free(zgrp->obj_tab); + vfree(zgrp->stats); + kfree(zgrp); + + pr_info("zram group freed.\n"); +} + +struct zram_group *zram_group_meta_alloc(u32 nr_obj, u32 nr_grp) +{ + struct zram_group *zgrp = NULL; + u32 i; + + if (!CHECK_BOUND(nr_grp, 1, ZGRP_MAX_GRP - 1)) + return NULL; + + /* reserve gid 0 */ + nr_grp++; + if (!CHECK_BOUND(nr_obj, 1, ZGRP_MAX_OBJ)) + return NULL; + zgrp = kzalloc(sizeof(struct zram_group), GFP_KERNEL); + if (!zgrp) + goto err; + zgrp->nr_obj = nr_obj; + zgrp->nr_grp = nr_grp; + zgrp->grp_obj_head = vmalloc(sizeof(struct zlist_node) * zgrp->nr_grp); + if (!zgrp->grp_obj_head) + goto err; + zgrp->obj = vmalloc(sizeof(struct zlist_node) * zgrp->nr_obj); + if (!zgrp->obj) + goto err; + zgrp->obj_tab = zlist_table_alloc(get_obj, zgrp, GFP_KERNEL); + if (!zgrp->obj_tab) + goto err; + zgrp->stats = vzalloc(sizeof(struct zram_group_stats) * zgrp->nr_grp); + if (!zgrp->stats) + goto err; + zgrp->gsdev = NULL; + + for (i = 0; i < zgrp->nr_obj; i++) + zlist_node_init(i, zgrp->obj_tab); + for (i = 1; i < zgrp->nr_grp; i++) + zlist_node_init(i + zgrp->nr_obj, zgrp->obj_tab); + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + zgrp->wbgrp.enable = false; + mutex_init(&zgrp->wbgrp.init_lock); +#endif + pr_info("zram_group alloc succ.\n"); + return zgrp; +err: + pr_err("zram_group alloc failed!\n"); + zram_group_meta_free(zgrp); + + return NULL; +} + +/* + * insert obj at @index into group @gid as the HOTTEST obj + */ +void zgrp_obj_insert(struct zram_group *zgrp, u32 index, u16 gid) +{ + u32 hid; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + hid = gid + zgrp->nr_obj; + zlist_add(hid, index, zgrp->obj_tab); + pr_debug("insert obj %u to group %u\n", index, gid); +} + +/* + * remove obj at @index from group @gid + */ +bool zgrp_obj_delete(struct zram_group *zgrp, u32 index, u16 gid) +{ + u32 hid; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return false; + } + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return false; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return false; + pr_debug("delete obj %u from group %u\n", index, gid); + hid = gid + zgrp->nr_obj; + + return zlist_del(hid, index, zgrp->obj_tab); +} + +/* + * try to isolate the last @nr objs of @gid, store their indexes in array @idxs + * and @return the obj cnt actually isolated. isolate all objs if nr is 0. 
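+ * Note: isolated objs are left on no list; the caller is expected to either
+ * move them to an extent list after writeback or put them back with
+ * zgrp_obj_insert()/zgrp_obj_putback().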
+ */ +u32 zgrp_isolate_objs(struct zram_group *zgrp, u16 gid, u32 *idxs, u32 nr, bool *last) +{ + u32 hid, idx; + u32 cnt = 0; + u32 i; + + if (last) + *last = false; + if (!zgrp) { + pr_debug("zram group is not enable!"); + return 0; + } + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return 0; + if (!CHECK(idxs, "return array idxs is null!\n")) + return 0; + hid = gid + zgrp->nr_obj; + zlist_lock(hid, zgrp->obj_tab); + zlist_for_each_entry_reverse(idx, hid, zgrp->obj_tab) { + idxs[cnt++] = idx; + if (nr && cnt == nr) + break; + } + for (i = 0; i < cnt; i++) + zlist_del_nolock(hid, idxs[i], zgrp->obj_tab); + if (last) + *last = cnt && zlist_is_isolated_nolock(hid, zgrp->obj_tab); + zlist_unlock(hid, zgrp->obj_tab); + + pr_debug("isolated %u objs from group %u.\n", cnt, gid); + + return cnt; +} + +/* + * check if the obj at @index is isolate from zram groups + */ +bool zgrp_obj_is_isolated(struct zram_group *zgrp, u32 index) +{ + bool ret = false; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return false; + } + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return false; + + zlist_lock(index, zgrp->obj_tab); + ret = zlist_is_isolated_nolock(index, zgrp->obj_tab); + zlist_unlock(index, zgrp->obj_tab); + + return ret; +} +/* + * insert obj at @index into group @gid as the COLDEST obj + */ +void zgrp_obj_putback(struct zram_group *zgrp, u32 index, u16 gid) +{ + u32 hid; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + hid = gid + zgrp->nr_obj; + zlist_add_tail(hid, index, zgrp->obj_tab); + pr_debug("putback obj %u to group %u\n", index, gid); +} + +void zgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 size) +{ + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + + atomic_inc(&zgrp->stats[gid].zram_pages); + atomic64_add(size, &zgrp->stats[gid].zram_size); + atomic_inc(&zgrp->stats[0].zram_pages); + atomic64_add(size, &zgrp->stats[0].zram_size); +} + +void zgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 size) +{ + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + + atomic_dec(&zgrp->stats[gid].zram_pages); + atomic64_sub(size, &zgrp->stats[gid].zram_size); + atomic_dec(&zgrp->stats[0].zram_pages); + atomic64_sub(size, &zgrp->stats[0].zram_size); +} + +void zgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 size) +{ + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + + atomic64_inc(&zgrp->stats[gid].zram_fault); + atomic64_inc(&zgrp->stats[0].zram_fault); +} + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +void zram_group_dump(struct zram_group *zgrp, u16 gid, u32 index) +{ + u32 hid, idx; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + hid = gid + zgrp->nr_obj; + if (gid == 0) { + struct zlist_node *node = NULL; + + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + node = idx2node(index, zgrp->obj_tab); + pr_err("dump index %u = %u %u %u %u\n", index, + node->prev, node->next, + node->lock, node->priv); + } else { + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + pr_err("dump index of group %u\n", gid); + zlist_for_each_entry(idx, hid, zgrp->obj_tab) + pr_err("%u\n", idx); + } +} +#endif + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +/* + * idx2node for ext table + */ 
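+/*
+ * Index layout of the ext table: [0, nr_ext) maps to the per-extent nodes,
+ * [nr_ext, nr_ext + nr_grp) maps to the per-group extent list heads
+ * (gid 0 is reserved, so slot nr_ext itself is never used).
+ */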
+static struct zlist_node *get_ext(u32 index, void *private) +{ + struct zram_group *zgrp = private; + + if (index < zgrp->wbgrp.nr_ext) + return &zgrp->wbgrp.ext[index]; + + index -= zgrp->wbgrp.nr_ext; + BUG_ON(!index); + return &zgrp->wbgrp.grp_ext_head[index]; +} + +/* + * disable writeback for zram group @zgrp + */ +void zram_group_remove_writeback(struct zram_group *zgrp) +{ + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return; + zgrp->wbgrp.enable = false; + vfree(zgrp->wbgrp.grp_ext_head); + vfree(zgrp->wbgrp.ext); + zlist_table_free(zgrp->wbgrp.ext_tab); + vfree(zgrp->wbgrp.ext_obj_head); + pr_info("zram group writeback is removed.\n"); +} + +/* + * init & enable writeback on exist zram group @zgrp with a backing device of + * @nr_ext extents. + */ +int zram_group_apply_writeback(struct zram_group *zgrp, u32 nr_ext) +{ + struct writeback_group *wbgrp = NULL; + u32 i; + int ret = 0; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return -EINVAL; + + mutex_lock(&zgrp->wbgrp.init_lock); + if (!CHECK(!zgrp->wbgrp.enable, "zram group writeback is already enable!\n")) + goto out; + if (!CHECK_BOUND(nr_ext, 1, ZGRP_MAX_EXT)) { + ret = -EINVAL; + goto out; + } + wbgrp = &zgrp->wbgrp; + wbgrp->nr_ext = nr_ext; + wbgrp->grp_ext_head = vmalloc(sizeof(struct zlist_node) * zgrp->nr_grp); + if (!wbgrp->grp_ext_head) { + ret = -ENOMEM; + goto out; + } + wbgrp->ext = vmalloc(sizeof(struct zlist_node) * wbgrp->nr_ext); + if (!wbgrp->ext) { + ret = -ENOMEM; + goto out; + } + wbgrp->ext_obj_head = vmalloc(sizeof(struct zlist_node) * wbgrp->nr_ext); + if (!wbgrp->ext_obj_head) { + ret = -ENOMEM; + goto out; + } + + wbgrp->ext_tab = zlist_table_alloc(get_ext, zgrp, GFP_KERNEL); + if (!wbgrp->ext_tab) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < wbgrp->nr_ext; i++) + zlist_node_init(i, wbgrp->ext_tab); + for (i = 1; i < zgrp->nr_grp; i++) + zlist_node_init(i + wbgrp->nr_ext, wbgrp->ext_tab); + + for (i = 0; i < wbgrp->nr_ext; i++) + zlist_node_init(i + zgrp->nr_obj + zgrp->nr_grp, zgrp->obj_tab); + + init_waitqueue_head(&wbgrp->fault_wq); + wbgrp->enable = true; + pr_info("zram group writeback is enabled.\n"); +out: + mutex_unlock(&zgrp->wbgrp.init_lock); + + if (ret) { + zram_group_remove_writeback(zgrp); + pr_err("zram group writeback enable failed!\n"); + } + + return ret; +} + +/* + * attach extent at @eid to group @gid as the HOTTEST extent + */ +void zgrp_ext_insert(struct zram_group *zgrp, u32 eid, u16 gid) +{ + u32 hid; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + hid = gid + zgrp->wbgrp.nr_ext; + zlist_add(hid, eid, zgrp->wbgrp.ext_tab); + pr_debug("insert extent %u to group %u\n", eid, gid); +} + +/* + * remove extent at @eid from group @gid + */ +bool zgrp_ext_delete(struct zram_group *zgrp, u32 eid, u16 gid) +{ + u32 hid; + bool isolated = false; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return false; + } + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return false; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return false; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return false; + + zlist_lock(eid, zgrp->wbgrp.ext_tab); + isolated = zlist_is_isolated_nolock(eid, zgrp->wbgrp.ext_tab); + 
zlist_unlock(eid, zgrp->wbgrp.ext_tab); + if (isolated) { + pr_debug("extent %u is already isolated, skip delete.\n", eid); + return false; + } + + pr_debug("delete extent %u from group %u\n", eid, gid); + hid = gid + zgrp->wbgrp.nr_ext; + return zlist_del(hid, eid, zgrp->wbgrp.ext_tab); +} + +/* + * try to isolate the first @nr exts of @gid, store their eids in array @eids + * and @return the cnt actually isolated. isolate all exts if nr is 0. + */ +u32 zgrp_isolate_exts(struct zram_group *zgrp, u16 gid, u32 *eids, u32 nr, bool *last) +{ + u32 hid, idx; + u32 cnt = 0; + u32 i; + + if (last) + *last = false; + if (!zgrp) { + pr_debug("zram group is not enable!"); + return 0; + } + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return 0; + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return 0; + if (!CHECK(eids, "return array eids is null!\n")) + return 0; + hid = gid + zgrp->wbgrp.nr_ext; + zlist_lock(hid, zgrp->wbgrp.ext_tab); + zlist_for_each_entry_reverse(idx, hid, zgrp->wbgrp.ext_tab) { + eids[cnt++] = idx; + if (nr && cnt == nr) + break; + } + for (i = 0; i < cnt; i++) + zlist_del_nolock(hid, eids[i], zgrp->wbgrp.ext_tab); + if (last) + *last = cnt && zlist_is_isolated_nolock(hid, zgrp->wbgrp.ext_tab); + zlist_unlock(hid, zgrp->wbgrp.ext_tab); + + pr_debug("isolated %u exts from group %u.\n", cnt, gid); + + return cnt; +} + +void zgrp_get_ext(struct zram_group *zgrp, u32 eid) +{ + u32 hid; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + zlist_set_priv(hid, zgrp->obj_tab); + pr_info("get extent %u\n", eid); +} + +bool zgrp_put_ext(struct zram_group *zgrp, u32 eid) +{ + u32 hid; + bool ret = false; + + if (!CHECK(zgrp, "zram group is not enable!\n")) + return false; + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return false; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return false; + + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + zlist_lock(hid, zgrp->obj_tab); + zlist_clr_priv_nolock(hid, zgrp->obj_tab); + ret = zlist_is_isolated_nolock(hid, zgrp->obj_tab); + zlist_unlock(hid, zgrp->obj_tab); + + pr_info("put extent %u, ret = %d\n", eid, ret); + + return ret; +} + +/* + * insert obj at @index into extent @eid + */ +void wbgrp_obj_insert(struct zram_group *zgrp, u32 index, u32 eid) +{ + u32 hid; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + zlist_add_tail(hid, index, zgrp->obj_tab); + pr_debug("insert obj %u to extent %u\n", index, eid); +} + +/* + * remove obj at @index from extent @eid + */ +bool wbgrp_obj_delete(struct zram_group *zgrp, u32 index, u32 eid) +{ + u32 hid; + bool ret = false; + + if (!zgrp) { + pr_debug("zram group is not enable!"); + return false; + } + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return false; + if (!CHECK_BOUND(index, 0, zgrp->nr_obj - 1)) + return false; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return false; + pr_debug("delete obj %u from extent %u\n", index, eid); + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + + zlist_lock(hid, zgrp->obj_tab); + ret = 
zlist_del_nolock(hid, index, zgrp->obj_tab) + && !zlist_test_priv_nolock(hid, zgrp->obj_tab); + zlist_unlock(hid, zgrp->obj_tab); + + return ret; +} + +/* + * try to isolate the first @nr writeback objs of @eid, store their indexes in + * array @idxs and @return the obj cnt actually isolated. isolate all objs if + * @nr is 0. + */ +u32 wbgrp_isolate_objs(struct zram_group *zgrp, u32 eid, u32 *idxs, u32 nr, bool *last) +{ + u32 hid, idx; + u32 cnt = 0; + u32 i; + + if (last) + *last = false; + if (!zgrp) { + pr_debug("zram group is not enable!"); + return 0; + } + if (!CHECK(zgrp->wbgrp.enable, "zram group writeback is not enable!\n")) + return 0; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return 0; + if (!CHECK(idxs, "return array idxs is null!\n")) + return 0; + hid = eid + zgrp->nr_obj + zgrp->nr_grp; + zlist_lock(hid, zgrp->obj_tab); + zlist_for_each_entry(idx, hid, zgrp->obj_tab) { + idxs[cnt++] = idx; + if (nr && cnt == nr) + break; + } + for (i = 0; i < cnt; i++) + zlist_del_nolock(hid, idxs[i], zgrp->obj_tab); + if (last) + *last = cnt && zlist_is_isolated_nolock(hid, zgrp->obj_tab) + && !zlist_test_priv_nolock(hid, zgrp->obj_tab); + zlist_unlock(hid, zgrp->obj_tab); + + pr_debug("isolated %u objs from extent %u.\n", cnt, eid); + + return cnt; +} + +void wbgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size) +{ + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + + atomic_inc(&zgrp->stats[gid].wb_pages); + atomic64_add(size, &zgrp->stats[gid].wb_size); + atomic_inc(&zgrp->stats[0].wb_pages); + atomic64_add(size, &zgrp->stats[0].wb_size); +} + +void wbgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 eid, u32 size) +{ + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + + atomic_dec(&zgrp->stats[gid].wb_pages); + atomic64_sub(size, &zgrp->stats[gid].wb_size); + atomic_dec(&zgrp->stats[0].wb_pages); + atomic64_sub(size, &zgrp->stats[0].wb_size); +} + +void wbgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size) +{ + if (!zgrp) { + pr_debug("zram group is not enable!"); + return; + } + if (!CHECK_BOUND(gid, 1, zgrp->nr_grp - 1)) + return; + if (!CHECK_BOUND(eid, 0, zgrp->wbgrp.nr_ext - 1)) + return; + + atomic64_inc(&zgrp->stats[gid].wb_fault); + atomic64_inc(&zgrp->stats[0].wb_fault); +} +#endif diff --git a/drivers/block/zram/zram_group/zram_group.h b/drivers/block/zram/zram_group/zram_group.h new file mode 100644 index 0000000000000000000000000000000000000000..9b184b7bda77b55ddb78b37a0d0af06c2e04caa3 --- /dev/null +++ b/drivers/block/zram/zram_group/zram_group.h @@ -0,0 +1,98 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * drivers/block/zram/zram_group/zram_group.h + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#ifndef _ZRAM_GROUP_H_ +#define _ZRAM_GROUP_H_ + +#include +#include + +#include "zlist.h" + +#define ZGRP_MAX_GRP USHRT_MAX +#define ZGRP_MAX_OBJ (1 << 30) + +enum { + ZGRP_NONE = 0, + ZGRP_TRACK, +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + ZGRP_WRITE, +#endif +}; + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +#define ZGRP_MAX_EXT (ZLIST_IDX_MAX - ZGRP_MAX_GRP - ZGRP_MAX_OBJ) +struct writeback_group { + bool enable; + u32 nr_ext; + struct zlist_node *grp_ext_head; + struct zlist_node *ext; + struct zlist_table *ext_tab; + struct zlist_node *ext_obj_head; + struct mutex init_lock; + wait_queue_head_t fault_wq; +}; +#endif + +struct zram_group_stats { + atomic64_t zram_size; + atomic_t zram_pages; + atomic64_t zram_fault; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + atomic64_t wb_size; + atomic_t wb_pages; + atomic64_t wb_fault; + atomic_t wb_exts; + atomic64_t write_size; + atomic64_t read_size; +#endif +}; + +struct zram_group { + u32 nr_obj; + u32 nr_grp; + struct zlist_node *grp_obj_head; + struct zlist_node *obj; + struct zlist_table *obj_tab; +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK + struct writeback_group wbgrp; +#endif + struct group_swap_device *gsdev; + struct zram_group_stats *stats; +}; + +void zram_group_meta_free(struct zram_group *zgrp); +struct zram_group *zram_group_meta_alloc(u32 nr_obj, u32 nr_grp); +void zgrp_obj_insert(struct zram_group *zgrp, u32 index, u16 gid); +bool zgrp_obj_delete(struct zram_group *zgrp, u32 index, u16 gid); +u32 zgrp_isolate_objs(struct zram_group *zgrp, u16 gid, u32 *idxs, u32 nr, bool *last); +bool zgrp_obj_is_isolated(struct zram_group *zgrp, u32 index); +void zgrp_obj_putback(struct zram_group *zgrp, u32 index, u16 gid); +void zgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 size); +void zgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 size); +void zgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 size); + +#ifdef CONFIG_ZRAM_GROUP_DEBUG +void zram_group_dump(struct zram_group *zgrp, u16 gid, u32 index); +#endif + +#ifdef CONFIG_ZRAM_GROUP_WRITEBACK +void zram_group_remove_writeback(struct zram_group *zgrp); +int zram_group_apply_writeback(struct zram_group *zgrp, u32 nr_ext); +void zgrp_ext_insert(struct zram_group *zgrp, u32 eid, u16 gid); +bool zgrp_ext_delete(struct zram_group *zgrp, u32 eid, u16 gid); +u32 zgrp_isolate_exts(struct zram_group *zgrp, u16 gid, u32 *eids, u32 nr, bool *last); +void zgrp_get_ext(struct zram_group *zgrp, u32 eid); +bool zgrp_put_ext(struct zram_group *zgrp, u32 eid); +void wbgrp_obj_insert(struct zram_group *zgrp, u32 index, u32 eid); +bool wbgrp_obj_delete(struct zram_group *zgrp, u32 index, u32 eid); +u32 wbgrp_isolate_objs(struct zram_group *zgrp, u32 eid, u32 *idxs, u32 nr, bool *last); +void wbgrp_obj_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size); +void wbgrp_obj_stats_dec(struct zram_group *zgrp, u16 gid, u32 eid, u32 size); +void wbgrp_fault_stats_inc(struct zram_group *zgrp, u16 gid, u32 eid, u32 size); +#endif +#endif diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index e4e24da16d2c398493e197cfb97f25be9940e3d0..688ecee2f5655d029453b49e7f3bbdf394091c78 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -21,6 +21,8 @@ #include #include #include +#include +#include struct mem_cgroup; struct obj_cgroup; @@ -58,6 +60,11 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; +static inline bool is_prot_page(struct page *page) +{ + return false; +} + #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 @@ -297,6 +304,13 @@ 
struct mem_cgroup { bool tcpmem_active; int tcpmem_pressure; +#ifdef CONFIG_HYPERHOLD_MEMCG + struct list_head score_node; +#define MEM_CGROUP_NAME_MAX_LEN 100 + char name[MEM_CGROUP_NAME_MAX_LEN]; + struct memcg_reclaim memcg_reclaimed; +#endif + #ifdef CONFIG_MEMCG_KMEM int kmemcg_id; struct obj_cgroup __rcu *objcg; @@ -715,6 +729,12 @@ static inline void mem_cgroup_uncharge_list(struct list_head *page_list) void mem_cgroup_migrate(struct folio *old, struct folio *new); +static inline struct mem_cgroup_per_node *mem_cgroup_nodeinfo(struct mem_cgroup *memcg, + int nid) +{ + return memcg->nodeinfo[nid]; +} + /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec @@ -834,6 +854,10 @@ static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return 0; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!memcg) + return -1; +#endif return memcg->id.id; } @@ -860,6 +884,11 @@ static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) if (mem_cgroup_disabled()) return NULL; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return NULL; +#endif + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return mz->memcg; } @@ -1012,6 +1041,10 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return node_page_state(lruvec_pgdat(lruvec), idx); +#endif pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); x = READ_ONCE(pn->lruvec_stats.state[idx]); #ifdef CONFIG_SMP @@ -1030,6 +1063,11 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return node_page_state(lruvec_pgdat(lruvec), idx); +#endif + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); x = READ_ONCE(pn->lruvec_stats.state_local[idx]); #ifdef CONFIG_SMP @@ -1066,6 +1104,17 @@ static inline void mod_memcg_lruvec_state(struct lruvec *lruvec, local_irq_restore(flags); } +#ifdef CONFIG_HYPERHOLD_FILE_LRU +static __always_inline bool is_file_page(struct page *page) +{ + if (!PageUnevictable(page) && !PageSwapBacked(page) && page_mapping(page)) + return true; + + return false; + +} +#endif + void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0f62786269d0c1e789d4266ecf6d7d866fd546db..f9b71ccdd9b94a8618f623c21d32d19ed2d84f7c 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1322,6 +1322,12 @@ typedef struct pglist_data { int kswapd_failures; /* Number of 'reclaimed == 0' runs */ +#ifdef CONFIG_HYPERHOLD_ZSWAPD + wait_queue_head_t zswapd_wait; + atomic_t zswapd_wait_flag; + struct task_struct *zswapd; +#endif + #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; @@ -1410,6 +1416,11 @@ typedef struct pglist_data { #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) +static inline struct lruvec *node_lruvec(struct pglist_data *pgdat) +{ + return &pgdat->__lruvec; +} + static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; @@ -1451,6 +1462,15 @@ static inline struct pglist_data 
*lruvec_pgdat(struct lruvec *lruvec) #endif } +#ifdef CONFIG_HYPERHOLD_FILE_LRU +static inline int is_node_lruvec(struct lruvec *lruvec) +{ + return &lruvec_pgdat(lruvec)->__lruvec == lruvec; +} +#endif + +extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); + #ifdef CONFIG_HAVE_MEMORYLESS_NODES int local_memory_node(int node_id); #else diff --git a/include/linux/swap.h b/include/linux/swap.h index cb25db2a93dd1bd138479363829bed5ef566155f..8f4e2fcef6ed215dc69819d65f2a849c9b820544 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -426,6 +426,23 @@ extern int sysctl_min_slab_ratio; #define node_reclaim_mode 0 #endif +struct scan_control; + +extern unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, + struct lruvec *lruvec, + struct scan_control *sc); +extern bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru); +extern bool cgroup_reclaim(struct scan_control *sc); +extern void check_move_unevictable_pages(struct pagevec *pvec); +extern unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, + int priority); +extern bool writeback_throttling_sane(struct scan_control *sc); +extern inline bool should_continue_reclaim(struct pglist_data *pgdat, + unsigned long nr_reclaimed, + struct scan_control *sc); + +extern int current_may_throttle(void); + static inline bool node_reclaim_enabled(void) { /* Is any node_reclaim_mode bit set? */ @@ -457,6 +474,9 @@ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; extern atomic_t nr_rotate_swap; extern bool has_usable_swap(void); +#ifdef CONFIG_HYPERHOLD_ZSWAPD +extern bool free_swap_is_low(void); +#endif /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 8abfa124004003cb14d83083306cca68ab070086..d775f3ca9a38cfd1838668894972b2d6647d6640 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -155,6 +155,24 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, VMA_LOCK_ABORT, VMA_LOCK_RETRY, VMA_LOCK_MISS, +#endif +#ifdef CONFIG_HYPERHOLD_ZSWAPD + ZSWAPD_WAKEUP, + ZSWAPD_REFAULT, + ZSWAPD_MEDIUM_PRESS, + ZSWAPD_CRITICAL_PRESS, + ZSWAPD_MEMCG_RATIO_SKIP, + ZSWAPD_MEMCG_REFAULT_SKIP, + ZSWAPD_SWAPOUT, + ZSWAPD_EMPTY_ROUND, + ZSWAPD_EMPTY_ROUND_SKIP_TIMES, + ZSWAPD_SNAPSHOT_TIMES, + ZSWAPD_RECLAIMED, + ZSWAPD_SCANNED, +#endif +#ifdef CONFIG_HYPERHOLD_MEMCG + FREEZE_RECLAIMED, + FREEZE_RECLAIME_COUNT, #endif NR_VM_EVENT_ITEMS }; diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index d2123dd960d59b41408310d13310b5b41a2f40cc..bef2cf6f986d868b4c5b525b3c7e2b48a509ce37 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -350,6 +350,36 @@ TRACE_EVENT(mm_vmscan_write_folio, show_reclaim_flags(__entry->reclaim_flags)) ); +#ifdef CONFIG_HYPERHOLD_ZSWAPD +TRACE_EVENT(mm_vmscan_lru_zswapd_shrink_active, + + TP_PROTO(int nid, unsigned long nr_taken, + unsigned long nr_deactivated, int priority), + + TP_ARGS(nid, nr_taken, nr_deactivated, priority), + + TP_STRUCT__entry( + __field(int, nid) + __field(unsigned long, nr_taken) + __field(unsigned long, nr_deactivated) + __field(int, priority) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->nr_taken = nr_taken; + __entry->nr_deactivated = nr_deactivated; + __entry->priority = priority; + ), + + TP_printk("nid=%d nr_taken=%ld nr_deactivated=%ld priority=%d", + __entry->nid, + 
__entry->nr_taken, + __entry->nr_deactivated, + __entry->priority) +); +#endif + TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TP_PROTO(int nid, diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 76db6c67e39a9207dfecbc1a5fd375401e6a6142..fb9cd614f07de8a82497a32df34ca9683d4b58d5 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -511,7 +511,12 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, */ cred = of->file->f_cred; tcred = get_task_cred(task); +#ifdef CONFIG_HYPERHOLD + if (!uid_eq(cred->euid, GLOBAL_MEMMGR_UID) && + !uid_eq(cred->euid, GLOBAL_ROOT_UID) && +#else if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && +#endif !uid_eq(cred->euid, tcred->uid) && !uid_eq(cred->euid, tcred->suid)) ret = -EACCES; diff --git a/mm/Kconfig b/mm/Kconfig index 264a2df5ecf5b91a2883e4594bd7707219ca309c..2385c2a0a6b25ac512d801592647ac13308c4d76 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -458,6 +458,41 @@ config SPARSEMEM_MANUAL endchoice +config MEMORY_MONITOR + bool "ENABLE MEMORY_MONITOR" + depends on PROC_FS + default n + help + MEMORY_MONITOR provides a /proc interface for monitoring memory + reclaim activity. Currently it is used to report kswapd wake-ups. + +config HYPERHOLD_FILE_LRU + bool "Enable HyperHold FILE LRU" + depends on HYPERHOLD && MEMCG + select HYPERHOLD_MEMCG + default n + help + File-LRU is a mechanism that puts file pages on a global per-node + LRU list and anon pages on the memcg LRU lists (if MEMCG is + enabled). In addition, reclaim of anonymous pages and file pages + is separated. + +config HYPERHOLD_MEMCG + bool "Enable Memcg Management in HyperHold" + depends on HYPERHOLD && MEMCG + help + Add more attributes to the memory cgroup. These attributes are + used to show information, shrink memory, swap pages in, and so on. + +config HYPERHOLD_ZSWAPD + bool "Enable zswapd thread to reclaim anon pages in background" + depends on HYPERHOLD && ZRAM + default n + help + zswapd is a kernel thread that reclaims anonymous pages in the + background. When swap usage reaches the watermark and the refault + rate of anonymous pages is high, a certain percentage of the zram + content is written out to eswap.
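As a rough illustration of how the zswapd knobs described above fit together: later in this patch, mem_cgroup_css_alloc() gives each memcg ub_mem2zram_ratio (default 60) and ub_zram2ufs_ratio (default 10), and the standalone sketch below shows one way such percentage knobs could be turned into per-cycle reclaim targets. The helper name compute_targets() and the exact formula are illustrative assumptions only, treating both knobs as percentages in the same way ub_ufs2zram_ratio is handled in memcg_force_swapin_write(); the real policy also consults refault_threshold and free_swap_is_low() and lives in mm/zswapd.c, which is not part of this excerpt.

/* Illustrative userspace sketch, not part of the patch. */
#include <stdio.h>

struct zswapd_targets {
	unsigned long to_zram;	/* anon pages worth compressing into zram */
	unsigned long to_eswap;	/* zram pages worth writing back to eswap */
};

/* ratios are percentages, mirroring ub_mem2zram_ratio / ub_zram2ufs_ratio */
static struct zswapd_targets compute_targets(unsigned long anon_pages,
					      unsigned long zram_pages,
					      unsigned int mem2zram_ratio,
					      unsigned int zram2ufs_ratio)
{
	struct zswapd_targets t;

	t.to_zram = anon_pages * mem2zram_ratio / 100;
	t.to_eswap = zram_pages * zram2ufs_ratio / 100;
	return t;
}

int main(void)
{
	/* 60 and 10 are the defaults set in mem_cgroup_css_alloc() below */
	struct zswapd_targets t = compute_targets(25600, 5120, 60, 10);

	printf("compress up to %lu pages, write back up to %lu pages\n",
	       t.to_zram, t.to_eswap);
	return 0;
}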
+ config SPARSEMEM def_bool y depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL diff --git a/mm/Makefile b/mm/Makefile index ec65984e2adeee6719c54932280cbe0a9857d55c..f9fb7e07cdd854d764ef05768ab65483a9891df9 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -138,3 +138,7 @@ obj-$(CONFIG_IO_MAPPING) += io-mapping.o obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o +obj-$(CONFIG_HYPERHOLD_FILE_LRU) += memcg_reclaim.o +obj-$(CONFIG_HYPERHOLD_MEMCG) += memcg_control.o +obj-$(CONFIG_HYPERHOLD_ZSWAPD) += zswapd.o zswapd_control.o +obj-$(CONFIG_MEMORY_MONITOR) += memory_monitor.o diff --git a/mm/internal.h b/mm/internal.h index 30cf724ddbce3399999d6a9e9816fa133e9c5c4b..de98d81712393615dc7b7484a3b0519fc1d2eb90 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -10,8 +10,11 @@ #include #include #include +#include #include #include +#include +#include struct folio_batch; @@ -35,6 +38,130 @@ struct folio_batch; /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) +enum reclaim_invoker { + ALL, + KSWAPD, + ZSWAPD, + DIRECT_RECLAIM, + NODE_RECLAIM, + SOFT_LIMIT, + RCC_RECLAIM, + FILE_RECLAIM, + ANON_RECLAIM +}; + +struct scan_control { + /* How many pages shrink_list() should reclaim */ + unsigned long nr_to_reclaim; + + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; + + /* + * The memory cgroup that hit its limit and as a result is the + * primary target of this reclaim invocation. + */ + struct mem_cgroup *target_mem_cgroup; + + /* + * Scan pressure balancing between anon and file LRUs + */ + unsigned long anon_cost; + unsigned long file_cost; + + /* Can active folios be deactivated as part of reclaim? */ +#define DEACTIVATE_ANON 1 +#define DEACTIVATE_FILE 2 + unsigned int may_deactivate:2; + unsigned int force_deactivate:1; + unsigned int skipped_deactivate:1; + + /* Writepage batching in laptop mode; RECLAIM_WRITE */ + unsigned int may_writepage:1; + + /* Can mapped folios be reclaimed? */ + unsigned int may_unmap:1; + + /* Can folios be swapped as part of reclaim? */ + unsigned int may_swap:1; + + /* Proactive reclaim invoked by userspace through memory.reclaim */ + unsigned int proactive:1; + + /* + * Cgroup memory below memory.low is protected as long as we + * don't threaten to OOM. If any cgroup is reclaimed at + * reduced force or passed over entirely due to its memory.low + * setting (memcg_low_skipped), and nothing is reclaimed as a + * result, then go back for one more cycle that reclaims the protected + * memory (memcg_low_reclaim) to avert OOM. 
+ */ + unsigned int memcg_low_reclaim:1; + unsigned int memcg_low_skipped:1; + + unsigned int hibernation_mode:1; + + /* One of the zones is ready for compaction */ + unsigned int compaction_ready:1; + + /* There is easily reclaimable cold cache in the current node */ + unsigned int cache_trim_mode:1; + + /* The file folios on the current node are dangerously low */ + unsigned int file_is_tiny:1; + + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + + /* Allocation order */ + s8 order; + + /* Scan (total_size >> priority) pages at once */ + s8 priority; + + /* The highest zone to isolate folios for reclaim from */ + s8 reclaim_idx; + + /* This context's GFP mask */ + gfp_t gfp_mask; + + /* Incremented by the number of inactive pages that were scanned */ + unsigned long nr_scanned; + + /* Number of pages freed so far during a call to shrink_zones() */ + unsigned long nr_reclaimed; + + struct { + unsigned int dirty; + unsigned int unqueued_dirty; + unsigned int congested; + unsigned int writeback; + unsigned int immediate; + unsigned int file_taken; + unsigned int taken; + } nr; + + enum reclaim_invoker invoker; + u32 isolate_count; + unsigned long nr_scanned_anon; + unsigned long nr_scanned_file; + unsigned long nr_reclaimed_anon; + unsigned long nr_reclaimed_file; + + /* for recording the reclaimed slab by now */ + struct reclaim_state reclaim_state; +}; + +enum scan_balance { + SCAN_EQUAL, + SCAN_FRACT, + SCAN_ANON, + SCAN_FILE, +}; + /* * Different from WARN_ON_ONCE(), no warning will be issued * when we specify __GFP_NOWARN. @@ -198,11 +325,25 @@ extern unsigned long highest_memmap_pfn; /* * in mm/vmscan.c: */ +#ifdef CONFIG_MEMORY_MONITOR +extern void kswapd_monitor_wake_up_queue(void); +#endif bool isolate_lru_page(struct page *page); bool folio_isolate_lru(struct folio *folio); void putback_lru_page(struct page *page); void folio_putback_lru(struct folio *folio); extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason); +extern unsigned int shrink_folio_list(struct list_head *page_list, struct pglist_data *pgdat, + struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references); +extern unsigned long isolate_lru_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, + enum lru_list lru); +extern unsigned move_folios_to_lru(struct lruvec *lruvec, struct list_head *list); +extern void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru); +extern unsigned long shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc, enum lru_list lru); +extern void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc); /* * in mm/rmap.c: diff --git a/mm/memcg_control.c b/mm/memcg_control.c new file mode 100644 index 0000000000000000000000000000000000000000..4ca565174add4c5ec54ae12e58d916032b06b76a --- /dev/null +++ b/mm/memcg_control.c @@ -0,0 +1,488 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/memcg_control.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ +#include +#include +#include +#include +#include +#include "internal.h" + +#include "zswapd_internal.h" + +#ifdef CONFIG_HYPERHOLD_MEMCG + +struct list_head score_head; +bool score_head_inited; +DEFINE_RWLOCK(score_list_lock); +DEFINE_MUTEX(reclaim_para_lock); + +/** + * get_next_memcg - iterate over memory cgroup score_list + * @prev: previously returned memcg, NULL on first invocation + * + * Returns references to the next memg on score_list of @prev, + * or %NULL after a full round-trip. + * + * Caller must pass the return value in @prev on subsequent + * invocations for reference counting, or use get_next_memcg_break() + * to cancel a walk before the round-trip is complete. + */ +struct mem_cgroup *get_next_memcg(struct mem_cgroup *prev) +{ + struct mem_cgroup *memcg = NULL; + struct list_head *pos = NULL; + unsigned long flags; + + if (unlikely(!score_head_inited)) + return NULL; + + read_lock_irqsave(&score_list_lock, flags); + + if (unlikely(!prev)) + pos = &score_head; + else + pos = &(prev->score_node); + + if (list_empty(pos)) /* deleted node */ + goto unlock; + + if (pos->next == &score_head) + goto unlock; + + memcg = list_entry(pos->next, + struct mem_cgroup, score_node); + + if (!css_tryget(&memcg->css)) + memcg = NULL; + +unlock: + read_unlock_irqrestore(&score_list_lock, flags); + + if (prev) + css_put(&prev->css); + + return memcg; +} + +void get_next_memcg_break(struct mem_cgroup *memcg) +{ + if (memcg) + css_put(&memcg->css); +} + +struct mem_cgroup *get_prev_memcg(struct mem_cgroup *next) +{ + struct mem_cgroup *memcg = NULL; + struct list_head *pos = NULL; + unsigned long flags; + + if (unlikely(!score_head_inited)) + return NULL; + + read_lock_irqsave(&score_list_lock, flags); + + if (unlikely(!next)) + pos = &score_head; + else + pos = &next->score_node; + + if (list_empty(pos)) /* deleted node */ + goto unlock; + + if (pos->prev == &score_head) + goto unlock; + + memcg = list_entry(pos->prev, + struct mem_cgroup, score_node); + + if (unlikely(!memcg)) + goto unlock; + + if (!css_tryget(&memcg->css)) + memcg = NULL; + +unlock: + read_unlock_irqrestore(&score_list_lock, flags); + + if (next) + css_put(&next->css); + return memcg; +} + +void get_prev_memcg_break(struct mem_cgroup *memcg) +{ + if (memcg) + css_put(&memcg->css); +} + +void memcg_app_score_update(struct mem_cgroup *target) +{ + struct list_head *pos = NULL; + struct list_head *tmp; + unsigned long flags; + + write_lock_irqsave(&score_list_lock, flags); + list_for_each_prev_safe(pos, tmp, &score_head) { + struct mem_cgroup *memcg = list_entry(pos, + struct mem_cgroup, score_node); + if (atomic64_read(&memcg->memcg_reclaimed.app_score) < + atomic64_read(&target->memcg_reclaimed.app_score)) + break; + } + list_move_tail(&target->score_node, pos); + write_unlock_irqrestore(&score_list_lock, flags); +} + +static u64 mem_cgroup_app_score_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return atomic64_read(&memcg->memcg_reclaimed.app_score); +} + +static int mem_cgroup_app_score_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + if (val > MAX_APP_SCORE) + return -EINVAL; + + if (atomic64_read(&memcg->memcg_reclaimed.app_score) != val) { + atomic64_set(&memcg->memcg_reclaimed.app_score, val); + memcg_app_score_update(memcg); + } + + return 0; +} + +static unsigned long move_pages_to_page_list(struct lruvec *lruvec, enum lru_list lru, + struct list_head 
*page_list) +{ + struct list_head *src = &lruvec->lists[lru]; + unsigned long nr_isolated = 0; + struct page *page; + + while (!list_empty(src)) { + page = lru_to_page(src); + + if (PageUnevictable(page)) + continue; + + if (likely(get_page_unless_zero(page))) { + if (isolate_lru_page(page)) { + put_page(page); + continue; + } + put_page(page); + + } else { + continue; + } + + + if (PageUnevictable(page)) { + putback_lru_page(page); + continue; + } + + if (PageAnon(page) && !PageSwapBacked(page)) { + putback_lru_page(page); + continue; + } + + list_add(&page->lru, page_list); + nr_isolated++; + } + + return nr_isolated; +} + + +unsigned long reclaim_all_anon_memcg(struct pglist_data *pgdat, struct mem_cgroup *memcg) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long nr_reclaimed; + LIST_HEAD(page_list); + struct page *page; + struct reclaim_stat stat = {}; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + }; + +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_substage_start(RA_SHRINKANON); +#endif + count_vm_event(FREEZE_RECLAIME_COUNT); + move_pages_to_page_list(lruvec, LRU_INACTIVE_ANON, &page_list); + + nr_reclaimed = shrink_folio_list(&page_list, pgdat, &sc, &stat, true); + count_vm_event(FREEZE_RECLAIMED); + + while (!list_empty(&page_list)) { + page = lru_to_page(&page_list); + list_del(&page->lru); + putback_lru_page(page); + } + +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_substage_end(RA_SHRINKANON, nr_reclaimed, NULL); +#endif + + return nr_reclaimed; +} + +static ssize_t memcg_force_shrink_anon(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + struct pglist_data *pgdat; + int nid; + + for_each_online_node(nid) { + pgdat = NODE_DATA(nid); + reclaim_all_anon_memcg(pgdat, memcg); + } + + return nbytes; +} + +static int memcg_name_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "%s\n", memcg->name); + return 0; +} + +static ssize_t memcg_name_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + + buf = strstrip(buf); + if (nbytes >= MEM_CGROUP_NAME_MAX_LEN) + return -EINVAL; + + mutex_lock(&reclaim_para_lock); + if (memcg) + strcpy(memcg->name, buf); + mutex_unlock(&reclaim_para_lock); + + return nbytes; +} + +static int memcg_total_info_per_app_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = NULL; + struct mem_cgroup_per_node *mz = NULL; + struct lruvec *lruvec = NULL; + unsigned long anon_size; + unsigned long zram_compress_size; + unsigned long eswap_compress_size; + + + while ((memcg = get_next_memcg(memcg))) { + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) { + get_next_memcg_break(memcg); + return 0; + } + + lruvec = &mz->lruvec; + if (!lruvec) { + get_next_memcg_break(memcg); + return 0; + } + + anon_size = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + zram_compress_size = memcg_data_size(memcg, CACHE_SIZE); + eswap_compress_size = memcg_data_size(memcg, SWAP_SIZE); + anon_size *= PAGE_SIZE / SZ_1K; + zram_compress_size /= SZ_1K; + eswap_compress_size /= SZ_1K; + + if (!strlen(memcg->name)) + continue; + + seq_printf(m, "%s %lu %lu %lu\n", memcg->name, anon_size, + zram_compress_size, eswap_compress_size); + } + + return 0; +} + +static int 
memcg_ub_ufs2zram_ratio_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + const unsigned int ratio = 100; + + if (val > ratio) + return -EINVAL; + + atomic64_set(&memcg->memcg_reclaimed.ub_ufs2zram_ratio, val); + + return 0; +} + +static u64 memcg_ub_ufs2zram_ratio_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + + return atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio); +} + +static int memcg_force_swapin_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + u64 size; + const unsigned int ratio = 100; + + size = memcg_data_size(memcg, SWAP_SIZE); + size = div_u64(atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio) * size, ratio); + + swapin_memcg(memcg, size); + + return 0; +} + +#ifdef CONFIG_MEM_PURGEABLE +static unsigned long purgeable_memcg_node(pg_data_t *pgdata, + struct scan_control *sc, struct mem_cgroup *memcg) +{ + unsigned long nr = 0; + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdata); + if (!lruvec) + return 0; + + shrink_list(LRU_ACTIVE_PURGEABLE, -1, lruvec, sc); + nr += shrink_list(LRU_INACTIVE_PURGEABLE, -1, lruvec, sc); + + pr_info("reclaim %lu purgeable pages \n", nr); + return nr; +} + +static int memcg_force_shrink_purgeable_bysize(struct cgroup_subsys_state *css, + struct cftype *cft, u64 reclaim_size) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + if (!memcg) + return 0; + + if (reclaim_size == 0) { + pr_err("reclaim_size is zero, skip shrink\n"); + return 0; + } + + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .order = 0, + .priority = DEF_PRIORITY, + .may_deactivate = DEACTIVATE_ANON, + .may_writepage = 1, + .may_unmap = 1, + .may_swap = 1, + .reclaim_idx = MAX_NR_ZONES -1, + }; + int nid = 0; + sc.nr_to_reclaim = div_u64(reclaim_size, PAGE_SIZE); + + for_each_node_state(nid, N_MEMORY) + purgeable_memcg_node(NODE_DATA(nid), &sc, memcg); + return 0; +} +#endif + +static struct cftype memcg_policy_files[] = { + { + .name = "name", + .write = memcg_name_write, + .seq_show = memcg_name_show, + }, + { + .name = "ub_ufs2zram_ratio", + .write_u64 = memcg_ub_ufs2zram_ratio_write, + .read_u64 = memcg_ub_ufs2zram_ratio_read, + }, + { + .name = "total_info_per_app", + .seq_show = memcg_total_info_per_app_show, + }, + { + .name = "app_score", + .write_u64 = mem_cgroup_app_score_write, + .read_u64 = mem_cgroup_app_score_read, + }, + { + .name = "force_shrink_anon", + .write = memcg_force_shrink_anon + }, + { + .name = "force_swapin", + .write_u64 = memcg_force_swapin_write, + }, +#ifdef CONFIG_MEM_PURGEABLE + { + .name = "force_shrink_purgeable_bysize", + .write_u64 = memcg_force_shrink_purgeable_bysize, + }, +#endif + { }, /* terminate */ +}; + +static int __init memcg_policy_init(void) +{ + if (!mem_cgroup_disabled()) + WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, + memcg_policy_files)); + + return 0; +} +subsys_initcall(memcg_policy_init); +#else +struct mem_cgroup *get_next_memcg(struct mem_cgroup *prev) +{ + return NULL; +} + +void get_next_memcg_break(struct mem_cgroup *memcg) +{ +} + + +struct mem_cgroup *get_prev_memcg(struct mem_cgroup *next) +{ + return NULL; +} + +void get_prev_memcg_break(struct mem_cgroup *memcg) +{ +} + +static u64 mem_cgroup_app_score_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return 0; +} + +static int mem_cgroup_app_score_write(struct 
cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + return 0; +} + +void memcg_app_score_update(struct mem_cgroup *target) +{ +} +#endif diff --git a/mm/memcg_reclaim.c b/mm/memcg_reclaim.c new file mode 100644 index 0000000000000000000000000000000000000000..b68545da8e2f392f52050bb4b78a7bfa7155db98 --- /dev/null +++ b/mm/memcg_reclaim.c @@ -0,0 +1,539 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/memcg_reclaim.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ +#include +#include +#include +#include +#include + +#ifdef CONFIG_HYPERHOLD_FILE_LRU +#include +#include "internal.h" +#endif + +static inline bool is_swap_not_allowed(struct scan_control *sc, int swappiness) +{ + return !sc->may_swap || !swappiness || !get_nr_swap_pages(); +} + +/* + * From 0 .. 100. Higher means more swappy. + */ +#define HYPERHOLD_SWAPPINESS 100 + +static int get_hyperhold_swappiness(void) +{ + return is_hyperhold_enable() ? HYPERHOLD_SWAPPINESS : vm_swappiness; +} + +static void get_scan_count_hyperhold(struct pglist_data *pgdat, + struct scan_control *sc, unsigned long *nr, + unsigned long *lru_pages) +{ + int swappiness = get_hyperhold_swappiness(); + struct lruvec *lruvec = node_lruvec(pgdat); + u64 fraction[2]; + u64 denominator; + enum scan_balance scan_balance; + unsigned long ap, fp; + enum lru_list lru; + unsigned long pgdatfile; + unsigned long pgdatfree; + int z; + unsigned long anon_cost, file_cost, total_cost; + unsigned long total_high_wmark = 0; + + + if (cgroup_reclaim(sc) && !swappiness) { + scan_balance = SCAN_FILE; + goto out; + } + + /* + * Do not apply any pressure balancing cleverness when the + * system is close to OOM, scan both anon and file equally + * (unless the swappiness setting disagrees with swapping). + */ + if (!sc->priority && swappiness) { + scan_balance = SCAN_EQUAL; + goto out; + } + + if (!cgroup_reclaim(sc)) { + pgdatfree = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + pgdatfile = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + + if (!managed_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } + + if (unlikely(pgdatfile + pgdatfree <= total_high_wmark)) { + /* + * Force SCAN_ANON if there are enough inactive + * anonymous pages on the LRU in eligible zones. + * Otherwise, the small LRU gets thrashed. + */ + if (!inactive_is_low(lruvec, LRU_INACTIVE_ANON) && + (lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, + sc->reclaim_idx) >> + (unsigned int)sc->priority)) { + scan_balance = SCAN_ANON; + goto out; + } + } + } + + /* + * If there is enough inactive page cache, i.e. if the size of the + * inactive list is greater than that of the active list *and* the + * inactive list actually has some pages to scan on this priority, we + * do not reclaim anything from the anonymous working set right now. + * Without the second condition we could end up never scanning an + * lruvec even if it has plenty of old anonymous pages unless the + * system is under heavy pressure. + */ + + if (!IS_ENABLED(CONFIG_BALANCE_ANON_FILE_RECLAIM) && + !inactive_is_low(lruvec, LRU_INACTIVE_FILE) && + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) { + scan_balance = SCAN_FILE; + goto out; + } + + scan_balance = SCAN_FRACT; + + /* + * Calculate the pressure balance between anon and file pages. 
+ * + * The amount of pressure we put on each LRU is inversely + * proportional to the cost of reclaiming each list, as + * determined by the share of pages that are refaulting, times + * the relative IO cost of bringing back a swapped out + * anonymous page vs reloading a filesystem page (swappiness). + * + * Although we limit that influence to ensure no list gets + * left behind completely: at least a third of the pressure is + * applied, before swappiness. + * + * With swappiness at 100, anon and file have equal IO cost. + */ + total_cost = sc->anon_cost + sc->file_cost; + anon_cost = total_cost + sc->anon_cost; + file_cost = total_cost + sc->file_cost; + total_cost = anon_cost + file_cost; + + ap = swappiness * (total_cost + 1); + ap /= anon_cost + 1; + + fp = (200 - swappiness) * (total_cost + 1); + fp /= file_cost + 1; + + fraction[0] = ap; + fraction[1] = fp; + denominator = ap + fp; + +out: + *lru_pages = 0; + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long lruvec_size; + unsigned long scan; + + lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx); + scan = lruvec_size; + *lru_pages += scan; + scan >>= sc->priority; + + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. + * Make sure we don't miss the last page on + * the offlined memory cgroups because of a + * round-off error. + */ + scan = DIV64_U64_ROUND_UP(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) + scan = 0; + break; + default: + /* Look ma, no brain */ + BUG(); + } + + nr[lru] = scan; + } +} + +#define ISOLATE_LIMIT_CNT 5 +void shrink_anon_memcg(struct pglist_data *pgdat, + struct mem_cgroup *memcg, struct scan_control *sc, + unsigned long *nr) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long nr_to_scan; + enum lru_list lru; + unsigned long nr_reclaimed = 0; + struct blk_plug plug; + + blk_start_plug(&plug); + + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) { + for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + nr_reclaimed += + shrink_list(lru, nr_to_scan, + lruvec, sc); + } + } + if (sc->nr_reclaimed >= sc->nr_to_reclaim || + (sc->isolate_count > ISOLATE_LIMIT_CNT && + sc->invoker == DIRECT_RECLAIM)) + break; + } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; + sc->nr_reclaimed_anon += nr_reclaimed; +} + +static inline bool memcg_is_child_of(struct mem_cgroup *mcg, struct mem_cgroup *tmcg) +{ + if (tmcg == NULL) + return true; + + while (!mem_cgroup_is_root(mcg)) { + if (mcg == tmcg) + break; + + mcg = parent_mem_cgroup(mcg); + } + + return (mcg == tmcg); +} + +static void shrink_anon(struct pglist_data *pgdat, + struct scan_control *sc, unsigned long *nr) +{ + unsigned long reclaimed; + unsigned long scanned; + struct mem_cgroup *memcg = NULL; + struct mem_cgroup *target_memcg = sc->target_mem_cgroup; + unsigned long nr_memcg[NR_LRU_LISTS]; + unsigned long nr_node_active = lruvec_lru_size( + node_lruvec(pgdat), LRU_ACTIVE_ANON, MAX_NR_ZONES); + unsigned long nr_node_inactive = lruvec_lru_size( + node_lruvec(pgdat), LRU_INACTIVE_ANON, MAX_NR_ZONES); + + while ((memcg = get_next_memcg(memcg))) { + struct lruvec *lruvec = NULL; + + if (!memcg_is_child_of(memcg, target_memcg)) + continue; 
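+		/* Scale the node-wide anon scan target by this memcg's share of the node's anon LRU, so each group on the score list is scanned proportionally. */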
+ + lruvec = mem_cgroup_lruvec(memcg, pgdat); + + reclaimed = sc->nr_reclaimed; + scanned = sc->nr_scanned; + + nr_memcg[LRU_ACTIVE_ANON] = nr[LRU_ACTIVE_ANON] * + lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, + MAX_NR_ZONES) / (nr_node_active + 1); + nr_memcg[LRU_INACTIVE_ANON] = nr[LRU_INACTIVE_ANON] * + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, + MAX_NR_ZONES) / (nr_node_inactive + 1); + nr_memcg[LRU_ACTIVE_FILE] = 0; + nr_memcg[LRU_INACTIVE_FILE] = 0; + + /* + * This loop can become CPU-bound when target memcgs + * aren't eligible for reclaim - either because they + * don't have any reclaimable pages, or because their + * memory is explicitly protected. Avoid soft lockups. + */ + cond_resched(); + + mem_cgroup_calculate_protection(target_memcg, memcg); + + if (mem_cgroup_below_min(target_memcg, memcg)) { + /* + * Hard protection. + * If there is no reclaimable memory, OOM. + */ + continue; + } else if (mem_cgroup_below_low(target_memcg, memcg)) { + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. + */ + if (!sc->memcg_low_reclaim) { + sc->memcg_low_skipped = 1; + continue; + } + memcg_memory_event(memcg, MEMCG_LOW); + } + + shrink_anon_memcg(pgdat, memcg, sc, nr_memcg); + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, + sc->priority); + + vmpressure(sc->gfp_mask, memcg, false, + sc->nr_scanned - scanned, + sc->nr_reclaimed - reclaimed); + + if (sc->nr_reclaimed >= sc->nr_to_reclaim || + (sc->isolate_count > ISOLATE_LIMIT_CNT && + sc->invoker == DIRECT_RECLAIM)) { + get_next_memcg_break(memcg); + break; + } + } +} + +static void shrink_file(struct pglist_data *pgdat, + struct scan_control *sc, unsigned long *nr) +{ + struct lruvec *lruvec = node_lruvec(pgdat); + unsigned long nr_to_scan; + enum lru_list lru; + unsigned long nr_reclaimed = 0; + struct blk_plug plug; + + blk_start_plug(&plug); + + while (nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { + for (lru = LRU_INACTIVE_FILE; lru <= LRU_ACTIVE_FILE; lru++) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + nr_reclaimed += + shrink_list(lru, + nr_to_scan, + lruvec, sc); + } + } + } + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; + sc->nr_reclaimed_file += nr_reclaimed; +} + +bool shrink_node_hyperhold(struct pglist_data *pgdat, struct scan_control *sc) +{ + unsigned long nr_reclaimed; + struct lruvec *target_lruvec; + bool reclaimable = false; + unsigned long file; + + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); + do { + /* Get scan count for file and anon */ + unsigned long node_lru_pages = 0; + unsigned long nr[NR_LRU_LISTS] = {0}; + + memset(&sc->nr, 0, sizeof(sc->nr)); + nr_reclaimed = sc->nr_reclaimed; + + /* + * Determine the scan balance between anon and file LRUs. + */ + spin_lock_irq(&target_lruvec->lru_lock); + sc->anon_cost = mem_cgroup_lruvec(NULL, pgdat)->anon_cost; + sc->file_cost = node_lruvec(pgdat)->file_cost; + spin_unlock_irq(&target_lruvec->lru_lock); + + /* + * Target desirable inactive:active list ratios for the anon + * and file LRU lists. 
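+	 * With CONFIG_HYPERHOLD_FILE_LRU, file refaults are sampled from the node-level lruvec, since file pages live on the global file LRU rather than on the memcg LRUs.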
+ */ + if (!sc->force_deactivate) { + unsigned long refaults; + + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_ANON); + if (refaults != target_lruvec->refaults[0] || + inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) + sc->may_deactivate |= DEACTIVATE_ANON; + else + sc->may_deactivate &= ~DEACTIVATE_ANON; + + /* + * When refaults are being observed, it means a new + * workingset is being established. Deactivate to get + * rid of any stale active pages quickly. + */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + refaults = lruvec_page_state(node_lruvec(pgdat), + WORKINGSET_ACTIVATE_FILE); + if (refaults != node_lruvec(pgdat)->refaults[1] || + inactive_is_low(node_lruvec(pgdat), LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; +#else + refaults = lruvec_page_state(target_lruvec, + WORKINGSET_ACTIVATE_FILE); + if (refaults != target_lruvec->refaults[1] || + inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) + sc->may_deactivate |= DEACTIVATE_FILE; +#endif + else + sc->may_deactivate &= ~DEACTIVATE_FILE; + } else + sc->may_deactivate = DEACTIVATE_ANON | DEACTIVATE_FILE; + + /* + * If we have plenty of inactive file pages that aren't + * thrashing, try to reclaim those first before touching + * anonymous pages. + */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + file = lruvec_page_state(node_lruvec(pgdat), NR_INACTIVE_FILE); +#else + file = lruvec_page_state(target_lruvec, NR_INACTIVE_FILE); +#endif + if (file >> sc->priority && !(sc->may_deactivate & DEACTIVATE_FILE)) + sc->cache_trim_mode = 1; + else + sc->cache_trim_mode = 0; + + /* + * Prevent the reclaimer from falling into the cache trap: as + * cache pages start out inactive, every cache fault will tip + * the scan balance towards the file LRU. And as the file LRU + * shrinks, so does the window for rotation from references. + * This means we have a runaway feedback loop where a tiny + * thrashing file LRU becomes infinitely more attractive than + * anon pages. Try to detect this based on file LRU size. + */ + if (!cgroup_reclaim(sc)) { + unsigned long total_high_wmark = 0; + unsigned long free, anon; + int z; + + free = sum_zone_node_page_state(pgdat->node_id, NR_FREE_PAGES); + file = node_page_state(pgdat, NR_ACTIVE_FILE) + + node_page_state(pgdat, NR_INACTIVE_FILE); + + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = &pgdat->node_zones[z]; + + if (!managed_zone(zone)) + continue; + + total_high_wmark += high_wmark_pages(zone); + } + + /* + * Consider anon: if that's low too, this isn't a + * runaway file reclaim problem, but rather just + * extreme pressure. Reclaim as per usual then. + */ + anon = node_page_state(pgdat, NR_INACTIVE_ANON); + + sc->file_is_tiny = + file + free <= total_high_wmark && + !(sc->may_deactivate & DEACTIVATE_ANON) && + anon >> sc->priority; + } + + get_scan_count_hyperhold(pgdat, sc, nr, &node_lru_pages); + + if (!cgroup_reclaim(sc)) { + /* Shrink the Total-File-LRU */ + shrink_file(pgdat, sc, nr); + } + + /* Shrink Anon by iterating score_list */ + shrink_anon(pgdat, sc, nr); + + if (sc->nr_reclaimed - nr_reclaimed) + reclaimable = true; + + if (current_is_kswapd()) { + /* + * If reclaim is isolating dirty pages under writeback, + * it implies that the long-lived page allocation rate + * is exceeding the page laundering rate. Either the + * global limits are not being effective at throttling + * processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing + * device. 
The only option is to throttle from reclaim + * context which is not ideal as there is no guarantee + * the dirtying process is throttled in the same way + * balance_dirty_pages() manages. + * + * Once a node is flagged PGDAT_WRITEBACK, kswapd will + * count the number of pages under pages flagged for + * immediate reclaim and stall if any are encountered + * in the nr_immediate check below. + */ + if (sc->nr.writeback && sc->nr.writeback == sc->nr.taken) + set_bit(PGDAT_WRITEBACK, &pgdat->flags); + + /* Allow kswapd to start writing pages during reclaim. */ + if (sc->nr.unqueued_dirty == sc->nr.file_taken) + set_bit(PGDAT_DIRTY, &pgdat->flags); + + /* + * If kswapd scans pages marked for immediate + * reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU + * faster than they are written so also forcibly stall. + */ + if (sc->nr.immediate) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + } + /* + * Legacy memcg will stall in page writeback so avoid forcibly + * stalling in reclaim_throttle(). + */ + if ((current_is_kswapd() || + (cgroup_reclaim(sc) && writeback_throttling_sane(sc))) && + sc->nr.dirty && sc->nr.dirty == sc->nr.congested) + set_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags); + + /* + * Stall direct reclaim for IO completions if underlying BDIs + * and node is congested. Allow kswapd to continue until it + * starts encountering unqueued dirty pages or cycling through + * the LRU too quickly. + */ + if (!current_is_kswapd() && current_may_throttle() && + !sc->hibernation_mode && + test_bit(LRUVEC_NODE_CONGESTED, &target_lruvec->flags)) + reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK); + + } while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed, + sc)); + /* + * Kswapd gives up on balancing particular nodes after too + * many failures to reclaim anything from them and goes to + * sleep. On reclaim progress, reset the failure counter. A + * successful direct reclaim run will revive a dormant kswapd. + */ + if (reclaimable) + pgdat->kswapd_failures = 0; + + return reclaimable; +} diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dd854cc65fd9d052036792db6b1296a7739b95ec..e73a0df666da7c23fdfa2310e527181e42fb4161 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -71,6 +71,7 @@ #include "swap.h" #include +#include #include @@ -87,7 +88,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(int_active_memcg); static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ -static bool cgroup_memory_nokmem __ro_after_init; +static bool cgroup_memory_nokmem = true; /* BPF memory accounting disabled? 
*/ static bool cgroup_memory_nobpf __ro_after_init; @@ -467,7 +468,15 @@ static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz, static unsigned long soft_limit_excess(struct mem_cgroup *memcg) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + struct mem_cgroup_per_node *mz = mem_cgroup_nodeinfo(memcg, 0); + struct lruvec *lruvec = &mz->lruvec; + unsigned long nr_pages = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, + MAX_NR_ZONES) + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, + MAX_NR_ZONES); +#else unsigned long nr_pages = page_counter_read(&memcg->memory); +#endif unsigned long soft_limit = READ_ONCE(memcg->soft_limit); unsigned long excess = 0; @@ -845,8 +854,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); /* Update memcg and lruvec */ - if (!mem_cgroup_disabled()) + if (!mem_cgroup_disabled()) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return; +#endif __mod_memcg_lruvec_state(lruvec, idx, val); + } } void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, @@ -857,6 +871,13 @@ void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_file_page(page) && !is_prot_page(page)) { + __mod_node_page_state(pgdat, idx, val); + return; + } +#endif + rcu_read_lock(); memcg = page_memcg(head); /* Untracked pages have no memcg, no lruvec. Update only the node */ @@ -909,6 +930,10 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, if (mem_cgroup_disabled() || index < 0) return; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!memcg) + return; +#endif memcg_stats_lock(); __this_cpu_add(memcg->vmstats_percpu->events[index], count); @@ -1391,6 +1416,11 @@ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, if (mem_cgroup_disabled()) return; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (is_node_lruvec(lruvec)) + return; +#endif + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); lru_size = &mz->lru_zone_size[zid][lru]; @@ -5206,6 +5236,10 @@ static inline void mem_cgroup_id_put(struct mem_cgroup *memcg) struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { WARN_ON_ONCE(!rcu_read_lock_held()); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (id == -1) + return NULL; +#endif return idr_find(&mem_cgroup_idr, id); } @@ -5248,6 +5282,9 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) } lruvec_init(&pn->lruvec); +#if defined(CONFIG_HYPERHOLD_FILE_LRU) && defined(CONFIG_MEMCG) + pn->lruvec.pgdat = NODE_DATA(node); +#endif pn->memcg = memcg; memcg->nodeinfo[node] = pn; @@ -5340,6 +5377,18 @@ static struct mem_cgroup *mem_cgroup_alloc(void) INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue); memcg->deferred_split_queue.split_queue_len = 0; #endif + +#ifdef CONFIG_HYPERHOLD_MEMCG + if (unlikely(!score_head_inited)) { + INIT_LIST_HEAD(&score_head); + score_head_inited = true; + } +#endif + +#ifdef CONFIG_HYPERHOLD_MEMCG + INIT_LIST_HEAD(&memcg->score_node); +#endif + lru_gen_init_memcg(memcg); return memcg; fail: @@ -5360,6 +5409,14 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(memcg)) return ERR_CAST(memcg); +#ifdef CONFIG_HYPERHOLD_MEMCG + atomic64_set(&memcg->memcg_reclaimed.app_score, 300); +#endif +#ifdef CONFIG_HYPERHOLD_ZSWAPD + atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, 10); + atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, 60); + 
atomic_set(&memcg->memcg_reclaimed.refault_threshold, 50); +#endif page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) @@ -5416,6 +5473,11 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css) FLUSH_TIME); lru_gen_online_memcg(memcg); +#ifdef CONFIG_HYPERHOLD_MEMCG + memcg_app_score_update(memcg); + css_get(css); +#endif + /* Online state pins memcg ID, memcg ID pins CSS */ refcount_set(&memcg->id.ref, 1); css_get(css); @@ -5445,6 +5507,15 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup_event *event, *tmp; +#ifdef CONFIG_HYPERHOLD_MEMCG + unsigned long flags; + + write_lock_irqsave(&score_list_lock, flags); + list_del_init(&memcg->score_node); + write_unlock_irqrestore(&score_list_lock, flags); + css_put(css); +#endif + /* * Unregister events and notify userspace. * Notify userspace about cgroup removing only after rmdir of cgroup @@ -6616,6 +6687,9 @@ static int memory_stat_show(struct seq_file *m, void *v) memory_stat_format(memcg, &s); seq_puts(m, buf); kfree(buf); +#ifdef CONFIG_HYPERHOLD_DEBUG + memcg_eswap_info_show(m); +#endif return 0; } @@ -7362,6 +7436,8 @@ static int __init cgroup_memory(char *s) cgroup_memory_nokmem = true; if (!strcmp(token, "nobpf")) cgroup_memory_nobpf = true; + if (!strcmp(token, "kmem")) + cgroup_memory_nokmem = false; } return 1; } diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index f36525a595a93c8722fdf8c138a4096bc7be7930..ace4f004cdaa54041c3afdbd5d5220c0cf5d152f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -1208,6 +1209,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, kswapd_run(nid); kcompactd_run(nid); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + zswapd_run(nid); +#endif writeback_set_ratelimit(); @@ -2024,6 +2028,9 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, if (arg.status_change_nid >= 0) { kcompactd_stop(node); kswapd_stop(node); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + zswapd_stop(node); +#endif } writeback_set_ratelimit(); diff --git a/mm/memory_monitor.c b/mm/memory_monitor.c new file mode 100644 index 0000000000000000000000000000000000000000..88fb97466b247eba470a1125ac74418f0c9d7cb2 --- /dev/null +++ b/mm/memory_monitor.c @@ -0,0 +1,58 @@ +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +static atomic_t kswapd_monitor = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(kswapd_poll_wait); + +void kswapd_monitor_wake_up_queue(void) +{ + atomic_inc(&kswapd_monitor); + wake_up_interruptible(&kswapd_poll_wait); +} + +static __poll_t kswapd_monitor_poll(struct file *file, struct poll_table_struct *wait) +{ + struct seq_file *seq = file->private_data; + + poll_wait(file, &kswapd_poll_wait, wait); + + if (seq->poll_event != atomic_read(&kswapd_monitor)) { + seq->poll_event = atomic_read(&kswapd_monitor); + return EPOLLPRI; + } + + return EPOLLIN | EPOLLRDNORM; +} + +static int kswapd_monitor_show(struct seq_file *m, void *v) +{ + seq_printf(m, "kswapd_monitor_show kswapd_monitor %d\n", atomic_read(&kswapd_monitor)); + return 0; +} + +static int kswapd_monitor_open(struct inode *inode, struct file *file) +{ + return single_open(file, kswapd_monitor_show, NULL); +} + +static const struct proc_ops proc_kswapd_monitor_operations = { + .proc_open = kswapd_monitor_open, + .proc_poll = 
kswapd_monitor_poll, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, +}; + +static int __init memory_monitor_init(void) +{ + proc_create("kswapd_monitor", 0, NULL, &proc_kswapd_monitor_operations); + return 0; +} + +__initcall(memory_monitor_init) diff --git a/mm/mm_init.c b/mm/mm_init.c index 77fd04c83d046db29912b7f3afed3339a067318d..8b31d6a4390cc5b5ef51a294e5d6a521d1f36fbb 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1361,12 +1361,18 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat) init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + init_waitqueue_head(&pgdat->zswapd_wait); +#endif for (i = 0; i < NR_VMSCAN_THROTTLE; i++) init_waitqueue_head(&pgdat->reclaim_wait[i]); pgdat_page_ext_init(pgdat); lruvec_init(&pgdat->__lruvec); +#if defined(CONFIG_HYPERHOLD_FILE_LRU) && defined(CONFIG_MEMCG) + pgdat->__lruvec.pgdat = pgdat; +#endif } static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index afed33fd876128ac3b53db3e8da303b353c4c991..e378d89e3a165ce93c7f4e07db2955d056fff3e8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -51,6 +51,10 @@ #include #include #include +#include +#ifdef CONFIG_RECLAIM_ACCT +#include +#endif #include #include #include "internal.h" @@ -4196,6 +4200,11 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, might_alloc(gfp_mask); +#ifdef CONFIG_HYPERHOLD_ZSWAPD + if (gfp_mask & __GFP_KSWAPD_RECLAIM) + wake_all_zswapd(); +#endif + if (should_fail_alloc_page(gfp_mask, order)) return false; diff --git a/mm/swap.c b/mm/swap.c index cd8f0150ba3aa8cde8828d2760f34516a605fb1d..3d6a054168e201045738e81aeee39341ed318926 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -320,6 +320,13 @@ void lru_note_cost(struct lruvec *lruvec, bool file, void lru_note_cost_refault(struct folio *folio) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (page_is_file_lru(folio_page(folio, 0))) { + lru_note_cost(&(folio_pgdat(folio)->__lruvec), 1, folio_nr_pages(folio), 0); + return; + } +#endif + lru_note_cost(folio_lruvec(folio), folio_is_file_lru(folio), folio_nr_pages(folio), 0); } diff --git a/mm/swapfile.c b/mm/swapfile.c index 750314fff0c46fd6af28e74b0623448ff393dcfc..aa767f925d4d0d73c6a59f0e64bcc24c99197089 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -46,6 +46,7 @@ #include #include #include +#include #include "internal.h" #include "swap.h" @@ -3254,6 +3255,28 @@ void si_swapinfo(struct sysinfo *val) spin_unlock(&swap_lock); } +#ifdef CONFIG_HYPERHOLD_ZSWAPD +bool free_swap_is_low(void) +{ + unsigned int type; + unsigned long long freeswap = 0; + unsigned long nr_to_be_unused = 0; + + spin_lock(&swap_lock); + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *si = swap_info[type]; + + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) + nr_to_be_unused += si->inuse_pages; + } + freeswap = atomic_long_read(&nr_swap_pages) + nr_to_be_unused; + spin_unlock(&swap_lock); + + return (freeswap < get_free_swap_threshold()); +} +EXPORT_SYMBOL(free_swap_is_low); +#endif + /* * Verify that a swap entry is valid and increment its swap map count. 
* diff --git a/mm/vmscan.c b/mm/vmscan.c index 0dfb9a75dfa6d00dafbd2f9d1253f7fe57a2aa8d..3f48a713f020117e6d1c7111461084a217f3d1ce 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -71,103 +71,12 @@ #define CREATE_TRACE_POINTS #include -struct scan_control { - /* How many pages shrink_list() should reclaim */ - unsigned long nr_to_reclaim; - - /* - * Nodemask of nodes allowed by the caller. If NULL, all nodes - * are scanned. - */ - nodemask_t *nodemask; - - /* - * The memory cgroup that hit its limit and as a result is the - * primary target of this reclaim invocation. - */ - struct mem_cgroup *target_mem_cgroup; - - /* - * Scan pressure balancing between anon and file LRUs - */ - unsigned long anon_cost; - unsigned long file_cost; - - /* Can active folios be deactivated as part of reclaim? */ -#define DEACTIVATE_ANON 1 -#define DEACTIVATE_FILE 2 - unsigned int may_deactivate:2; - unsigned int force_deactivate:1; - unsigned int skipped_deactivate:1; - - /* Writepage batching in laptop mode; RECLAIM_WRITE */ - unsigned int may_writepage:1; - - /* Can mapped folios be reclaimed? */ - unsigned int may_unmap:1; - - /* Can folios be swapped as part of reclaim? */ - unsigned int may_swap:1; - - /* Proactive reclaim invoked by userspace through memory.reclaim */ - unsigned int proactive:1; - - /* - * Cgroup memory below memory.low is protected as long as we - * don't threaten to OOM. If any cgroup is reclaimed at - * reduced force or passed over entirely due to its memory.low - * setting (memcg_low_skipped), and nothing is reclaimed as a - * result, then go back for one more cycle that reclaims the protected - * memory (memcg_low_reclaim) to avert OOM. - */ - unsigned int memcg_low_reclaim:1; - unsigned int memcg_low_skipped:1; - - unsigned int hibernation_mode:1; - - /* One of the zones is ready for compaction */ - unsigned int compaction_ready:1; - - /* There is easily reclaimable cold cache in the current node */ - unsigned int cache_trim_mode:1; - - /* The file folios on the current node are dangerously low */ - unsigned int file_is_tiny:1; - - /* Always discard instead of demoting to lower tier memory */ - unsigned int no_demotion:1; - - /* Allocation order */ - s8 order; - - /* Scan (total_size >> priority) pages at once */ - s8 priority; - - /* The highest zone to isolate folios for reclaim from */ - s8 reclaim_idx; - - /* This context's GFP mask */ - gfp_t gfp_mask; - - /* Incremented by the number of inactive pages that were scanned */ - unsigned long nr_scanned; - - /* Number of pages freed so far during a call to shrink_zones() */ - unsigned long nr_reclaimed; - - struct { - unsigned int dirty; - unsigned int unqueued_dirty; - unsigned int congested; - unsigned int writeback; - unsigned int immediate; - unsigned int file_taken; - unsigned int taken; - } nr; - - /* for recording the reclaimed slab by now */ - struct reclaim_state reclaim_state; -}; +#ifdef CONFIG_HYPERHOLD_FILE_LRU +#include +#endif +#ifdef CONFIG_RECLAIM_ACCT +#include +#endif #ifdef ARCH_HAS_PREFETCHW #define prefetchw_prev_lru_folio(_folio, _base, _field) \ @@ -183,6 +92,10 @@ struct scan_control { #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) #endif +#ifdef CONFIG_HYPERHOLD_FILE_LRU +unsigned int enough_inactive_file = 1; +#endif + /* * From 0 .. 200. Higher means more swappy. */ @@ -430,7 +343,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) } /* Returns true for reclaim through cgroup limits or cgroup interfaces. 
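+ * Note: made non-static by this series (declared in linux/swap.h) so that mm/memcg_reclaim.c can reuse it.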
*/ -static bool cgroup_reclaim(struct scan_control *sc) +bool cgroup_reclaim(struct scan_control *sc) { return sc->target_mem_cgroup; } @@ -457,7 +370,7 @@ static bool root_reclaim(struct scan_control *sc) * This function tests whether the vmscan currently in progress can assume * that the normal dirty throttling mechanism is operational. */ -static bool writeback_throttling_sane(struct scan_control *sc) +bool writeback_throttling_sane(struct scan_control *sc) { if (!cgroup_reclaim(sc)) return true; @@ -489,7 +402,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, return 0; } -static bool cgroup_reclaim(struct scan_control *sc) +bool cgroup_reclaim(struct scan_control *sc) { return false; } @@ -499,7 +412,7 @@ static bool root_reclaim(struct scan_control *sc) return true; } -static bool writeback_throttling_sane(struct scan_control *sc) +bool writeback_throttling_sane(struct scan_control *sc) { return true; } @@ -651,12 +564,27 @@ unsigned long zone_reclaimable_pages(struct zone *zone) * @lru: lru to use * @zone_idx: zones to consider (use MAX_NR_ZONES - 1 for the whole LRU list) */ -static unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, +unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { unsigned long size = 0; int zid; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!mem_cgroup_disabled() && is_node_lruvec(lruvec)) { + for (zid = 0; zid <= zone_idx && zid < MAX_NR_ZONES; zid++) { + struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; + + if (!managed_zone(zone)) + continue; + + size += zone_page_state(zone, NR_ZONE_LRU_BASE + lru); + } + + return size; + } +#endif + for (zid = 0; zid <= zone_idx; zid++) { struct zone *zone = &lruvec_pgdat(lruvec)->node_zones[zid]; @@ -1030,7 +958,7 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, * * Returns the number of reclaimed slab objects. */ -static unsigned long shrink_slab(gfp_t gfp_mask, int nid, +unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, int priority) { @@ -1702,7 +1630,7 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) /* * shrink_folio_list() returns the number of reclaimed pages */ -static unsigned int shrink_folio_list(struct list_head *folio_list, +unsigned int shrink_folio_list(struct list_head *folio_list, struct pglist_data *pgdat, struct scan_control *sc, struct reclaim_stat *stat, bool ignore_references) { @@ -2301,7 +2229,7 @@ static bool skip_cma(struct folio *folio, struct scan_control *sc) * * returns how many pages were moved onto *@dst. */ -static unsigned long isolate_lru_folios(unsigned long nr_to_scan, +unsigned long isolate_lru_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, enum lru_list lru) @@ -2487,11 +2415,15 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, * * Returns the number of pages moved to the given lruvec. 
*/ -static unsigned int move_folios_to_lru(struct lruvec *lruvec, +unsigned int move_folios_to_lru(struct lruvec *lruvec, struct list_head *list) { int nr_pages, nr_moved = 0; LIST_HEAD(folios_to_free); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + bool prot; + bool file; +#endif while (!list_empty(list)) { struct folio *folio = lru_to_folio(list); @@ -2539,8 +2471,23 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, lruvec_add_folio(lruvec, folio); nr_pages = folio_nr_pages(folio); nr_moved += nr_pages; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (folio_test_active(folio)) { + prot = is_prot_page(folio_page(folio, 0)); + file = page_is_file_lru(folio_page(folio, 0)); + if (!prot && file) { + lruvec = folio_lruvec(folio); + workingset_age_nonresident(lruvec, + nr_pages); + } else { + workingset_age_nonresident(lruvec, + nr_pages); + } + } +#else if (folio_test_active(folio)) workingset_age_nonresident(lruvec, nr_pages); +#endif } /* @@ -2556,7 +2503,7 @@ static unsigned int move_folios_to_lru(struct lruvec *lruvec, * device by writing to the page cache it sets PF_LOCAL_THROTTLE. In this case * we should not throttle. Otherwise it is safe to do so. */ -static int current_may_throttle(void) +int current_may_throttle(void) { return !(current->flags & PF_LOCAL_THROTTLE); } @@ -2565,7 +2512,7 @@ static int current_may_throttle(void) * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages */ -static unsigned long shrink_inactive_list(unsigned long nr_to_scan, +unsigned long shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) { @@ -2583,6 +2530,9 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, if (stalled) return 0; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + sc->isolate_count++; +#endif /* wait a bit for the reclaimer. */ stalled = true; reclaim_throttle(pgdat, VMSCAN_THROTTLE_ISOLATED); @@ -2624,7 +2574,14 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed); spin_unlock_irq(&lruvec->lru_lock); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (file) + lru_note_cost(node_lruvec(pgdat), file, stat.nr_pageout, nr_scanned - nr_reclaimed); + else + lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); +#else lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed); +#endif mem_cgroup_uncharge_list(&folio_list); free_unref_page_list(&folio_list); @@ -2685,7 +2642,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan, * The downside is that we have to touch folio->_refcount against each folio. * But we had to alter folio->flags anyway. 
*/ -static void shrink_active_list(unsigned long nr_to_scan, +void shrink_active_list(unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) @@ -2841,7 +2798,7 @@ unsigned long reclaim_pages(struct list_head *folio_list) return nr_reclaimed; } -static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, +unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct lruvec *lruvec, struct scan_control *sc) { if (is_active_lru(lru)) { @@ -2883,7 +2840,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, * 1TB 101 10GB * 10TB 320 32GB */ -static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) +bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) { enum lru_list active_lru = inactive_lru + LRU_ACTIVE; unsigned long inactive, active; @@ -2902,13 +2859,6 @@ static bool inactive_is_low(struct lruvec *lruvec, enum lru_list inactive_lru) return inactive * inactive_ratio < active; } -enum scan_balance { - SCAN_EQUAL, - SCAN_FRACT, - SCAN_ANON, - SCAN_FILE, -}; - static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) { unsigned long file; @@ -5531,6 +5481,7 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) goto restart; } +#ifndef CONFIG_HYPERHOLD_FILE_LRU static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { struct blk_plug plug; @@ -5551,6 +5502,7 @@ static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc blk_finish_plug(&plug); } +#endif #else /* !CONFIG_MEMCG */ @@ -5559,10 +5511,12 @@ static void shrink_many(struct pglist_data *pgdat, struct scan_control *sc) BUILD_BUG(); } +#ifndef CONFIG_HYPERHOLD_FILE_LRU static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { BUILD_BUG(); } +#endif #endif @@ -6305,7 +6259,7 @@ static void lru_gen_shrink_node(struct pglist_data *pgdat, struct scan_control * #endif /* CONFIG_LRU_GEN */ -static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long targets[NR_LRU_LISTS]; @@ -6493,6 +6447,7 @@ static inline bool should_continue_reclaim(struct pglist_data *pgdat, return inactive_lru_pages > pages_for_compaction; } +#ifndef CONFIG_HYPERHOLD_FILE_LRU static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc) { struct mem_cgroup *target_memcg = sc->target_mem_cgroup; @@ -6661,6 +6616,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) if (reclaimable) pgdat->kswapd_failures = 0; } +#endif /* * Returns true if compaction should go ahead for a costly-order request, or @@ -6811,7 +6767,11 @@ static void shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (zone->zone_pgdat == last_pgdat) continue; last_pgdat = zone->zone_pgdat; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + shrink_node_hyperhold(zone->zone_pgdat, sc); +#else shrink_node(zone->zone_pgdat, sc); +#endif } if (first_pgdat) @@ -6828,10 +6788,19 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) { struct lruvec *target_lruvec; unsigned long refaults; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + struct lruvec *lruvec; +#endif if (lru_gen_enabled()) return; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + lruvec = node_lruvec(pgdat); + lruvec->refaults[0] = lruvec_page_state(lruvec, WORKINGSET_ACTIVATE_ANON); /* modified */ + lruvec->refaults[1] = 
lruvec_page_state(lruvec, WORKINGSET_ACTIVATE_FILE); /* modified */ +#endif + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); target_lruvec->refaults[WORKINGSET_ANON] = refaults; @@ -7133,6 +7102,9 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, }; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + unsigned long nr[NR_LRU_LISTS]; +#endif WARN_ON_ONCE(!current->reclaim_state); @@ -7149,7 +7121,17 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, * will pick up pages from other mem cgroup's as well. We hack * the priority and make it zero. */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + nr[LRU_ACTIVE_ANON] = lruvec_lru_size(lruvec, + LRU_ACTIVE_ANON, MAX_NR_ZONES); + nr[LRU_INACTIVE_ANON] = lruvec_lru_size(lruvec, + LRU_INACTIVE_ANON, MAX_NR_ZONES); + nr[LRU_ACTIVE_FILE] = 0; + nr[LRU_INACTIVE_FILE] = 0; + shrink_anon_memcg(pgdat, memcg, &sc, nr); +#else shrink_lruvec(lruvec, &sc); +#endif trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); @@ -7363,7 +7345,11 @@ static bool kswapd_shrink_node(pg_data_t *pgdat, * Historically care was taken to put equal pressure on all zones but * now pressure is applied based on node LRU order. */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + shrink_node_hyperhold(pgdat, sc); +#else shrink_node(pgdat, sc); +#endif /* * Fragmentation may mean that the system cannot be rebalanced for @@ -7813,6 +7799,9 @@ static int kswapd(void *p) */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx, alloc_order); +#ifdef CONFIG_MEMORY_MONITOR + kswapd_monitor_wake_up_queue(); +#endif reclaim_order = balance_pgdat(pgdat, alloc_order, highest_zoneidx); if (reclaim_order < alloc_order) @@ -8075,7 +8064,11 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in * priorities until we have enough memory freed. 
*/ do { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + shrink_node_hyperhold(pgdat, &sc); +#else shrink_node(pgdat, &sc); +#endif } while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 00e81e99c6ee24e419446b8c476faea0b8ef75bd..2142e2ac9470851804ceeac56d74c977acbf56be 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1408,6 +1408,24 @@ const char * const vmstat_text[] = { "vma_lock_retry", "vma_lock_miss", #endif +#ifdef CONFIG_HYPERHOLD_ZSWAPD + "zswapd_running", + "zswapd_hit_refaults", + "zswapd_medium_press", + "zswapd_critical_press", + "zswapd_memcg_ratio_skip", + "zswapd_memcg_refault_skip", + "zswapd_swapout", + "zswapd_empty_round", + "zswapd_empty_round_skip_times", + "zswapd_snapshot_times", + "zswapd_reclaimed", + "zswapd_scanned", +#endif +#ifdef CONFIG_HYPERHOLD_MEMCG + "freeze_reclaimed", + "freeze_reclaim_count", +#endif #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */ diff --git a/mm/workingset.c b/mm/workingset.c index 2559a1f2fc1cfa70352e75b3963ff762e2629d03..4983d0c6a073ceda488ca6535577e701dcd10c0d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -398,7 +398,16 @@ void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg) memcgid = mem_cgroup_id(lruvec_memcg(lruvec)); eviction = atomic_long_read(&lruvec->nonresident_age); eviction >>= bucket_order; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(folio_page(folio, 0)) && page_is_file_lru(folio_page(folio, 0))) { + lruvec = folio_lruvec(folio); + workingset_age_nonresident(lruvec, folio_nr_pages(folio)); + } else { + workingset_age_nonresident(lruvec, folio_nr_pages(folio)); + } +#else workingset_age_nonresident(lruvec, folio_nr_pages(folio)); +#endif return pack_shadow(memcgid, pgdat, eviction, folio_test_workingset(folio)); } @@ -447,9 +456,17 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset) * would be better if the root_mem_cgroup existed in all * configurations instead. */ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (memcgid != -1) { + eviction_memcg = mem_cgroup_from_id(memcgid); + if (!mem_cgroup_disabled() && !eviction_memcg) + return false; + } +#else eviction_memcg = mem_cgroup_from_id(memcgid); if (!mem_cgroup_disabled() && !eviction_memcg) return false; +#endif eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat); refault = atomic_long_read(&eviction_lruvec->nonresident_age); @@ -479,10 +496,21 @@ bool workingset_test_recent(void *shadow, bool file, bool *workingset) * workingset competition needs to consider anon or not depends * on having free swap space. 
*/ +#ifdef CONFIG_HYPERHOLD_FILE_LRU + workingset_size = lruvec_page_state(node_lruvec(pgdat), NR_ACTIVE_FILE); +#else workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE); +#endif + if (!file) { +#ifdef CONFIG_HYPERHOLD_FILE_LRU + workingset_size += lruvec_page_state(node_lruvec(pgdat), + NR_INACTIVE_FILE); +#else + workingset_size += lruvec_page_state(eviction_lruvec, NR_INACTIVE_FILE); +#endif } if (mem_cgroup_get_nr_swap_pages(eviction_memcg) > 0) { workingset_size += lruvec_page_state(eviction_lruvec, @@ -537,14 +565,33 @@ void workingset_refault(struct folio *folio, void *shadow) pgdat = folio_pgdat(folio); lruvec = mem_cgroup_lruvec(memcg, pgdat); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(folio_page(folio, 0)) && file) + mod_lruvec_state(node_lruvec(pgdat), + WORKINGSET_REFAULT_BASE + file, folio_nr_pages(folio)); + else + mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); +#else mod_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file, nr); +#endif if (!workingset_test_recent(shadow, file, &workingset)) goto out; folio_set_active(folio); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(folio_page(folio, 0)) && file) { + workingset_age_nonresident(node_lruvec(pgdat), + folio_nr_pages(folio)); + mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, folio_nr_pages(folio)); + } else { + workingset_age_nonresident(lruvec, nr); + mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr); + } +#else workingset_age_nonresident(lruvec, nr); mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file, nr); +#endif /* Folio was active prior to eviction */ if (workingset) { @@ -554,7 +601,14 @@ void workingset_refault(struct folio *folio, void *shadow) * putback */ lru_note_cost_refault(folio); +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(folio_page(folio, 0)) && file) + mod_lruvec_state(node_lruvec(pgdat), WORKINGSET_RESTORE_BASE + file, folio_nr_pages(folio)); + else + mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); +#else mod_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file, nr); +#endif } out: rcu_read_unlock(); @@ -567,6 +621,7 @@ void workingset_refault(struct folio *folio, void *shadow) void workingset_activation(struct folio *folio) { struct mem_cgroup *memcg; + struct lruvec *lruvec; rcu_read_lock(); /* @@ -579,7 +634,16 @@ void workingset_activation(struct folio *folio) memcg = folio_memcg_rcu(folio); if (!mem_cgroup_disabled() && !memcg) goto out; +#ifdef CONFIG_HYPERHOLD_FILE_LRU + if (!is_prot_page(folio_page(folio, 0)) && page_is_file_lru(folio_page(folio, 0))) { + lruvec = folio_lruvec(folio); + workingset_age_nonresident(lruvec, folio_nr_pages(folio)); + } else { + workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); + } +#else workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); +#endif out: rcu_read_unlock(); } @@ -660,6 +724,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE */ #ifdef CONFIG_MEMCG +#ifndef CONFIG_HYPERHOLD_FILE_LRU if (sc->memcg) { struct lruvec *lruvec; int i; @@ -674,6 +739,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, pages += lruvec_page_state_local( lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT; } else +#endif #endif pages = node_present_pages(sc->nid); diff --git a/mm/zswapd.c b/mm/zswapd.c new file mode 100644 index 0000000000000000000000000000000000000000..d80a00d9f1fd90b8571c0f45df2416f7712db03f --- /dev/null +++ b/mm/zswapd.c @@ -0,0 
+1,911 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/zswapd.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. + */ + +#include +#include +#include +#include +#include +#include +#ifdef CONFIG_RECLAIM_ACCT +#include +#endif + +#include "zswapd_internal.h" +#include "internal.h" + +#define UNSET_ZRAM_WM_RATIO 0 +#define ESWAP_PERCENT_CONSTANT 100 +#define DEFAULT_ZRAM_WM_RATIO 37 +#define SWAP_MORE_ZRAM (50 * (SZ_1M)) + +static wait_queue_head_t snapshotd_wait; +static atomic_t snapshotd_wait_flag; +static atomic_t snapshotd_init_flag = ATOMIC_INIT(0); +static struct task_struct *snapshotd_task; + +static pid_t zswapd_pid = -1; +static unsigned long long last_anon_pagefault; +static unsigned long long anon_refault_ratio; +static unsigned long long zswapd_skip_interval; +static unsigned long last_zswapd_time; +static unsigned long last_snapshot_time; +bool last_round_is_empty; + + +DECLARE_RWSEM(gs_lock); +LIST_HEAD(gs_list); + +void unregister_group_swap(struct group_swap_device *gsdev) +{ + down_write(&gs_lock); + list_del(&gsdev->list); + up_write(&gs_lock); + + kfree(gsdev); +} +EXPORT_SYMBOL(unregister_group_swap); + +struct group_swap_device *register_group_swap(struct group_swap_ops *ops, void *priv) +{ + struct group_swap_device *gsdev = kzalloc(sizeof(struct group_swap_device), GFP_KERNEL); + + if (!gsdev) + return NULL; + + gsdev->priv = priv; + gsdev->ops = ops; + + down_write(&gs_lock); + list_add(&gsdev->list, &gs_list); + up_write(&gs_lock); + + return gsdev; +} +EXPORT_SYMBOL(register_group_swap); + +u64 memcg_data_size(struct mem_cgroup *memcg, int type) +{ + struct group_swap_device *gsdev = NULL; + u64 size = 0; + + down_read(&gs_lock); + list_for_each_entry(gsdev, &gs_list, list) + size += gsdev->ops->group_data_size(memcg->id.id, type, gsdev->priv); + up_read(&gs_lock); + + return size; +} + +u64 swapin_memcg(struct mem_cgroup *memcg, u64 req_size) +{ + u64 swap_size = memcg_data_size(memcg, SWAP_SIZE); + u64 read_size = 0; + u64 ratio = atomic64_read(&memcg->memcg_reclaimed.ub_ufs2zram_ratio); + struct group_swap_device *gsdev = NULL; + + if (req_size > div_u64(swap_size * ratio, ESWAP_PERCENT_CONSTANT)) + req_size = div_u64(swap_size * ratio, ESWAP_PERCENT_CONSTANT); + down_read(&gs_lock); + list_for_each_entry(gsdev, &gs_list, list) { + read_size += gsdev->ops->group_read(memcg->id.id, req_size - read_size, + gsdev->priv); + if (read_size >= req_size) + break; + } + up_read(&gs_lock); + + return read_size; +} + +static u64 swapout_memcg(struct mem_cgroup *memcg, u64 req_size) +{ + u64 cache_size = memcg_data_size(memcg, CACHE_SIZE); + u64 swap_size = memcg_data_size(memcg, SWAP_SIZE); + u64 all_size = cache_size + swap_size; + u64 write_size = 0; + u32 ratio = atomic_read(&memcg->memcg_reclaimed.ub_zram2ufs_ratio); + struct group_swap_device *gsdev = NULL; + + if (div_u64(all_size * ratio, ESWAP_PERCENT_CONSTANT) <= swap_size) + return 0; + if (req_size > div_u64(all_size * ratio, ESWAP_PERCENT_CONSTANT) - swap_size) + req_size = div_u64(all_size * ratio, ESWAP_PERCENT_CONSTANT) - swap_size; + down_read(&gs_lock); + list_for_each_entry(gsdev, &gs_list, list) { + write_size += gsdev->ops->group_write(memcg->id.id, req_size - write_size, + gsdev->priv); + if (write_size >= req_size) + break; + } + up_read(&gs_lock); + + return write_size; +} + +static u64 swapout(u64 req_size) +{ + struct mem_cgroup *memcg = NULL; + u64 write_size = 0; + + while ((memcg = get_next_memcg(memcg)) != NULL) { + write_size += swapout_memcg(memcg, req_size - write_size); + if 
(write_size >= req_size) + break; + } + + return write_size; +} + +static unsigned long long get_zram_used_pages(void) +{ + struct mem_cgroup *memcg = NULL; + unsigned long long zram_pages = 0; + + while ((memcg = get_next_memcg(memcg)) != NULL) + zram_pages += memcg_data_size(memcg, CACHE_PAGE); + + return zram_pages; +} + +static unsigned long long get_eswap_used_pages(void) +{ + struct mem_cgroup *memcg = NULL; + unsigned long long eswap_pages = 0; + + while ((memcg = get_next_memcg(memcg)) != NULL) + eswap_pages += memcg_data_size(memcg, SWAP_PAGE); + + return eswap_pages; +} + +static unsigned long long get_zram_pagefault(void) +{ + struct mem_cgroup *memcg = NULL; + unsigned long long cache_fault = 0; + + while ((memcg = get_next_memcg(memcg)) != NULL) + cache_fault += memcg_data_size(memcg, CACHE_FAULT); + + return cache_fault; +} + +static unsigned int calc_sys_cur_avail_buffers(void) +{ + const unsigned int percent_constant = 100; + unsigned long freemem; + unsigned long active_file; + unsigned long inactive_file; + unsigned long buffers; + + freemem = global_zone_page_state(NR_FREE_PAGES) * PAGE_SIZE / SZ_1K; + active_file = global_node_page_state(NR_ACTIVE_FILE) * PAGE_SIZE / SZ_1K; + inactive_file = global_node_page_state(NR_INACTIVE_FILE) * PAGE_SIZE / SZ_1K; + + buffers = freemem + inactive_file * get_inactive_file_ratio() / percent_constant + + active_file * get_active_file_ratio() / percent_constant; + + return (buffers * SZ_1K / SZ_1M); /* kb to mb */ +} + +void zswapd_status_show(struct seq_file *m) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + seq_printf(m, "buffer_size:%u\n", buffers); + seq_printf(m, "recent_refault:%llu\n", anon_refault_ratio); +} + +pid_t get_zswapd_pid(void) +{ + return zswapd_pid; +} + +static bool min_buffer_is_suitable(void) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + if (buffers >= get_min_avail_buffers()) + return true; + + return false; +} + +static bool buffer_is_suitable(void) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + if (buffers >= get_avail_buffers()) + return true; + + return false; +} + +static bool high_buffer_is_suitable(void) +{ + unsigned int buffers = calc_sys_cur_avail_buffers(); + + if (buffers >= get_high_avail_buffers()) + return true; + + return false; +} + +static void snapshot_anon_refaults(void) +{ + struct mem_cgroup *memcg = NULL; + + while ((memcg = get_next_memcg(memcg)) != NULL) + memcg->memcg_reclaimed.reclaimed_pagefault = memcg_data_size(memcg, CACHE_FAULT); + + last_anon_pagefault = get_zram_pagefault(); + last_snapshot_time = jiffies; +} + +/* + * Return true if refault changes between two read operations. 
+ */ +static bool get_memcg_anon_refault_status(struct mem_cgroup *memcg) +{ + const unsigned int percent_constant = 100; + unsigned long long anon_pagefault; + unsigned long long anon_total; + unsigned long long ratio; + struct mem_cgroup_per_node *mz = NULL; + struct lruvec *lruvec = NULL; + + if (!memcg) + return false; + + anon_pagefault = memcg_data_size(memcg, CACHE_FAULT); + if (anon_pagefault == memcg->memcg_reclaimed.reclaimed_pagefault) + return false; + + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) + return false; + + lruvec = &mz->lruvec; + if (!lruvec) + return false; + + anon_total = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES) + + memcg_data_size(memcg, SWAP_PAGE) + memcg_data_size(memcg, CACHE_PAGE); + + ratio = div64_u64((anon_pagefault - memcg->memcg_reclaimed.reclaimed_pagefault) * + percent_constant, (anon_total + 1)); + if (ratio > atomic_read(&memcg->memcg_reclaimed.refault_threshold)) + return true; + + return false; +} + +static bool get_area_anon_refault_status(void) +{ + const unsigned int percent_constant = 1000; + unsigned long long anon_pagefault; + unsigned long long ratio; + unsigned long long time; + + anon_pagefault = get_zram_pagefault(); + time = jiffies; + if (anon_pagefault == last_anon_pagefault || time == last_snapshot_time) + return false; + + ratio = div_u64((anon_pagefault - last_anon_pagefault) * percent_constant, + (jiffies_to_msecs(time - last_snapshot_time) + 1)); + anon_refault_ratio = ratio; + + if (ratio > get_area_anon_refault_threshold()) + return true; + + return false; +} + +void wakeup_snapshotd(void) +{ + unsigned long snapshot_interval; + + snapshot_interval = jiffies_to_msecs(jiffies - last_snapshot_time); + if (snapshot_interval >= get_anon_refault_snapshot_min_interval()) { + atomic_set(&snapshotd_wait_flag, 1); + wake_up_interruptible(&snapshotd_wait); + } +} + +static int snapshotd(void *p) +{ + int ret; + + while (!kthread_should_stop()) { + ret = wait_event_interruptible(snapshotd_wait, atomic_read(&snapshotd_wait_flag)); + if (ret) + continue; + + atomic_set(&snapshotd_wait_flag, 0); + + snapshot_anon_refaults(); + count_vm_event(ZSWAPD_SNAPSHOT_TIMES); + } + + return 0; +} + +void set_snapshotd_init_flag(unsigned int val) +{ + atomic_set(&snapshotd_init_flag, val); +} + +/* + * This snapshotd start function will be called by init. 
+ */ +int snapshotd_run(void) +{ + atomic_set(&snapshotd_wait_flag, 0); + init_waitqueue_head(&snapshotd_wait); + + snapshotd_task = kthread_run(snapshotd, NULL, "snapshotd"); + if (IS_ERR(snapshotd_task)) { + pr_err("Failed to start snapshotd\n"); + return PTR_ERR(snapshotd_task); + } + + return 0; +} + +static int __init snapshotd_init(void) +{ + snapshotd_run(); + + return 0; +} +module_init(snapshotd_init); + +static int get_zswapd_eswap_policy(void) +{ + if (get_zram_wm_ratio() == UNSET_ZRAM_WM_RATIO) + return CHECK_BUFFER_ONLY; + else + return CHECK_BUFFER_ZRAMRATIO_BOTH; +} + +static unsigned int get_policy_zram_wm_ratio(void) +{ + enum zswapd_eswap_policy policy = get_zswapd_eswap_policy(); + + if (policy == CHECK_BUFFER_ONLY) + return DEFAULT_ZRAM_WM_RATIO; + else + return get_zram_wm_ratio(); +} + +int get_zram_current_watermark(void) +{ + long long diff_buffers; + const unsigned int percent_constant = 10; + u64 nr_total; + unsigned int zram_wm_ratio = get_policy_zram_wm_ratio(); + + nr_total = totalram_pages(); + /* B_target - B_current */ + diff_buffers = get_avail_buffers() - calc_sys_cur_avail_buffers(); + /* MB to page */ + diff_buffers *= SZ_1M / PAGE_SIZE; + /* after_comp to before_comp */ + diff_buffers *= get_compress_ratio(); + /* page to ratio */ + diff_buffers = div64_s64(diff_buffers * percent_constant, nr_total); + + return min((long long)zram_wm_ratio, zram_wm_ratio - diff_buffers); +} + +bool zram_watermark_ok(void) +{ + const unsigned int percent_constant = 100; + u64 nr_zram_used; + u64 nr_wm; + u64 ratio; + + ratio = get_zram_current_watermark(); + nr_zram_used = get_zram_used_pages(); + nr_wm = div_u64(totalram_pages() * ratio, percent_constant); + if (nr_zram_used > nr_wm) + return true; + + return false; +} + +bool zram_watermark_exceed(void) +{ + u64 nr_zram_used; + const unsigned long long nr_wm = get_zram_critical_threshold() * (SZ_1M / PAGE_SIZE); + + if (!nr_wm) + return false; + + nr_zram_used = get_zram_used_pages(); + if (nr_zram_used > nr_wm) + return true; + return false; +} + +void wakeup_zswapd(pg_data_t *pgdat) +{ + unsigned long interval; + + if (IS_ERR(pgdat->zswapd)) + return; + + if (!wq_has_sleeper(&pgdat->zswapd_wait)) + return; + + /* + * make anon pagefault snapshots + * wake up snapshotd + */ + if (atomic_read(&snapshotd_init_flag) == 1) + wakeup_snapshotd(); + + /* wake up when the buffer is lower than min_avail_buffer */ + if (min_buffer_is_suitable()) + return; + + interval = jiffies_to_msecs(jiffies - last_zswapd_time); + if (interval < zswapd_skip_interval) { + count_vm_event(ZSWAPD_EMPTY_ROUND_SKIP_TIMES); + return; + } + + atomic_set(&pgdat->zswapd_wait_flag, 1); + wake_up_interruptible(&pgdat->zswapd_wait); +} + +void wake_all_zswapd(void) +{ + pg_data_t *pgdat = NULL; + int nid; + + for_each_online_node(nid) { + pgdat = NODE_DATA(nid); + wakeup_zswapd(pgdat); + } +} + +#ifdef CONFIG_HYPERHOLD_FILE_LRU +static void zswapd_shrink_active_list(unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc, enum lru_list lru) +{ + unsigned int nr_deactivate; + unsigned long nr_scanned; + unsigned long nr_taken; + + struct page *page = NULL; + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + unsigned long *node_anon_cost = &pgdat->__lruvec.anon_cost; + unsigned long *anon_cost = &lruvec->anon_cost; + LIST_HEAD(l_inactive); + LIST_HEAD(l_hold); + + lru_add_drain(); + + spin_lock_irq(&lruvec->lru_lock); + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); + __mod_node_page_state(pgdat, 
NR_ISOLATED_ANON, nr_taken); + *anon_cost += nr_taken; + *node_anon_cost += nr_taken; + __count_vm_events(PGREFILL, nr_scanned); + count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned); + spin_unlock_irq(&lruvec->lru_lock); + + while (!list_empty(&l_hold)) { + cond_resched(); + page = lru_to_page(&l_hold); + list_del(&page->lru); + + if (unlikely(!folio_evictable(page_folio(page)))) { + putback_lru_page(page); + continue; + } + + ClearPageActive(page); + SetPageWorkingset(page); + list_add(&page->lru, &l_inactive); + } + + spin_lock_irq(&lruvec->lru_lock); + nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); + __mod_node_page_state(pgdat, NR_ISOLATED_ANON, -nr_taken); + spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_uncharge_list(&l_inactive); + free_unref_page_list(&l_inactive); + + trace_mm_vmscan_lru_zswapd_shrink_active(pgdat->node_id, nr_taken, + nr_deactivate, sc->priority); +} + +static unsigned long zswapd_shrink_list(enum lru_list lru, + unsigned long nr_to_scan, struct lruvec *lruvec, + struct scan_control *sc) +{ +#ifdef CONFIG_RECLAIM_ACCT + unsigned long nr_reclaimed; + + reclaimacct_substage_start(RA_SHRINKANON); +#endif + if (is_active_lru(lru)) { + if (sc->may_deactivate & (1 << is_file_lru(lru))) + zswapd_shrink_active_list(nr_to_scan, lruvec, sc, lru); + else + sc->skipped_deactivate = 1; +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_substage_end(RA_SHRINKANON, 0, NULL); +#endif + return 0; + } + +#ifdef CONFIG_RECLAIM_ACCT + nr_reclaimed = shrink_inactive_list(nr_to_scan, lruvec, sc, lru); + reclaimacct_substage_end(RA_SHRINKANON, nr_reclaimed, NULL); + return nr_reclaimed; +#else + return shrink_inactive_list(nr_to_scan, lruvec, sc, lru); +#endif +} + +static void zswapd_shrink_anon_memcg(struct pglist_data *pgdat, + struct mem_cgroup *memcg, struct scan_control *sc, unsigned long *nr) +{ + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + unsigned long nr_reclaimed = 0; + unsigned long nr_to_scan; + struct blk_plug plug; + enum lru_list lru; + + blk_start_plug(&plug); + + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_ANON]) { + for (lru = 0; lru <= LRU_ACTIVE_ANON; lru++) { + if (nr[lru]) { + nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); + nr[lru] -= nr_to_scan; + nr_reclaimed += zswapd_shrink_list(lru, + nr_to_scan, lruvec, sc); + } + } + } + + blk_finish_plug(&plug); + sc->nr_reclaimed += nr_reclaimed; +} +#endif + +static bool zswapd_shrink_anon(pg_data_t *pgdat, struct scan_control *sc) +{ + const unsigned int percent_constant = 100; + struct mem_cgroup *memcg = NULL; + unsigned long nr[NR_LRU_LISTS]; + + while ((memcg = get_next_memcg(memcg)) != NULL) { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + u64 nr_active, nr_inactive, nr_zram, nr_eswap, zram_ratio; + + /* reclaim and try to meet the high buffer watermark */ + if (high_buffer_is_suitable()) { + get_next_memcg_break(memcg); + break; + } + + if (get_memcg_anon_refault_status(memcg)) { + count_vm_event(ZSWAPD_MEMCG_REFAULT_SKIP); + continue; + } + + nr_active = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES); + nr_inactive = lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + nr_zram = memcg_data_size(memcg, CACHE_PAGE); + nr_eswap = memcg_data_size(memcg, SWAP_PAGE); + + zram_ratio = div64_u64((nr_zram + nr_eswap) * percent_constant, + (nr_inactive + nr_active + nr_zram + nr_eswap + 1)); + if (zram_ratio >= (u32)atomic_read(&memcg->memcg_reclaimed.ub_mem2zram_ratio)) { + count_vm_event(ZSWAPD_MEMCG_RATIO_SKIP); + continue; + } + + nr[LRU_ACTIVE_ANON] = 
nr_active >> (unsigned int)sc->priority; + nr[LRU_INACTIVE_ANON] = nr_inactive >> (unsigned int)sc->priority; + nr[LRU_ACTIVE_FILE] = 0; + nr[LRU_INACTIVE_FILE] = 0; + +#ifdef CONFIG_HYPERHOLD_FILE_LRU + zswapd_shrink_anon_memcg(pgdat, memcg, sc, nr); +#else + shrink_lruvec(lruvec, sc); +#endif + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); + + if (sc->nr_reclaimed >= sc->nr_to_reclaim) { + get_next_memcg_break(memcg); + break; + } + } + + return sc->nr_scanned >= sc->nr_to_reclaim; +} + +static u64 __calc_nr_to_reclaim(void) +{ + unsigned int buffers; + unsigned int high_buffers; + unsigned int max_reclaim_size; + u64 reclaim_size = 0; + + high_buffers = get_high_avail_buffers(); + buffers = calc_sys_cur_avail_buffers(); + max_reclaim_size = get_zswapd_max_reclaim_size(); + if (buffers < high_buffers) + reclaim_size = high_buffers - buffers; + + /* the per-round reclaim target is capped at max_reclaim_size */ + reclaim_size = min(reclaim_size, (u64)max_reclaim_size); + + /* MB to pages */ + return div_u64(reclaim_size * SZ_1M, PAGE_SIZE); +} + +static void zswapd_shrink_node(pg_data_t *pgdat) +{ + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .order = 0, + .priority = DEF_PRIORITY / 2, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = 1, + .reclaim_idx = MAX_NR_ZONES - 1, + }; + const unsigned int increase_rate = 2; + + do { + unsigned long nr_reclaimed = sc.nr_reclaimed; + bool raise_priority = true; + + /* reclaim and try to meet the high buffer watermark */ + if (high_buffer_is_suitable()) + break; + + sc.nr_scanned = 0; + sc.nr_to_reclaim = __calc_nr_to_reclaim(); + + if (zswapd_shrink_anon(pgdat, &sc)) + raise_priority = false; + count_vm_events(ZSWAPD_SCANNED, sc.nr_scanned); + count_vm_events(ZSWAPD_RECLAIMED, sc.nr_reclaimed); + if (try_to_freeze() || kthread_should_stop()) + break; + + nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + if (raise_priority || !nr_reclaimed) + sc.priority--; + } while (sc.priority >= 1); + + /* + * When the first empty round occurs, set the interval to t. + * If the following round is still empty, double it + * to 2t; further empty rounds double it again to 4t, 8t, and so on, + * but never let the interval exceed max_skip_interval. + * Once a non-empty round occurs, reset the interval to 0. 
+ */ + if (sc.nr_reclaimed < get_empty_round_check_threshold()) { + count_vm_event(ZSWAPD_EMPTY_ROUND); + if (last_round_is_empty) + zswapd_skip_interval = min(zswapd_skip_interval * + increase_rate, get_max_skip_interval()); + else + zswapd_skip_interval = get_empty_round_skip_interval(); + last_round_is_empty = true; + } else { + zswapd_skip_interval = 0; + last_round_is_empty = false; + } +} + +u64 zram_watermark_diff(void) +{ + const unsigned int percent_constant = 100; + u64 nr_zram_used; + u64 nr_wm; + u64 ratio; + + ratio = get_zram_current_watermark(); + nr_zram_used = get_zram_used_pages(); + nr_wm = div_u64(totalram_pages() * ratio, percent_constant); + if (nr_zram_used > nr_wm) + return (nr_zram_used - nr_wm) * PAGE_SIZE + SWAP_MORE_ZRAM; + + return 0; +} + +u64 zswapd_buffer_diff(void) +{ + u64 buffers; + u64 avail; + + buffers = calc_sys_cur_avail_buffers(); + avail = get_high_avail_buffers(); + if (buffers < avail) + return (avail - buffers) * SZ_1M; + + return 0; +} + +u64 get_do_eswap_size(bool refault) +{ + u64 size = 0; + enum zswapd_eswap_policy policy = get_zswapd_eswap_policy(); + + if (policy == CHECK_BUFFER_ZRAMRATIO_BOTH) + size = max(zram_watermark_diff(), zswapd_buffer_diff()); + else if (policy == CHECK_BUFFER_ONLY && (zram_watermark_ok() || refault)) + size = zswapd_buffer_diff(); + + return size; +} + +static int zswapd(void *p) +{ + struct task_struct *tsk = current; + pg_data_t *pgdat = (pg_data_t *)p; + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); +#ifdef CONFIG_RECLAIM_ACCT + struct reclaim_acct ra = {0}; +#endif + + /* save zswapd pid for schedule strategy */ + zswapd_pid = tsk->pid; + + + if (!cpumask_empty(cpumask)) + set_cpus_allowed_ptr(tsk, cpumask); + + set_freezable(); + + while (!kthread_should_stop()) { + bool refault = false; + u64 size = 0; + + (void)wait_event_freezable(pgdat->zswapd_wait, + atomic_read(&pgdat->zswapd_wait_flag)); + atomic_set(&pgdat->zswapd_wait_flag, 0); + count_vm_event(ZSWAPD_WAKEUP); + zswapd_pressure_report(LEVEL_LOW); + + if (get_area_anon_refault_status()) { + refault = true; + count_vm_event(ZSWAPD_REFAULT); + goto do_eswap; + } + +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_start(ZSWAPD_RECLAIM, &ra); +#endif + zswapd_shrink_node(pgdat); +#ifdef CONFIG_RECLAIM_ACCT + reclaimacct_end(ZSWAPD_RECLAIM); +#endif + last_zswapd_time = jiffies; + +do_eswap: + size = get_do_eswap_size(refault); + if (size >= SZ_1M) { + count_vm_event(ZSWAPD_SWAPOUT); + size = swapout(size); + } + + if (!buffer_is_suitable()) { + if (free_swap_is_low() || zram_watermark_exceed()) { + zswapd_pressure_report(LEVEL_CRITICAL); + count_vm_event(ZSWAPD_CRITICAL_PRESS); + pr_info("%s:zrampages:%llu, eswappages:%llu\n", __func__, + get_zram_used_pages(), get_eswap_used_pages()); + } else { + zswapd_pressure_report(LEVEL_MEDIUM); + count_vm_event(ZSWAPD_MEDIUM_PRESS); + } + } + } + + return 0; +} + +/* + * This zswapd start function will be called by init and node-hot-add. 
+ */ +int zswapd_run(int nid) +{ + const unsigned int priority_less = 5; + struct sched_param param = { + .sched_priority = MAX_PRIO - priority_less, + }; + pg_data_t *pgdat = NODE_DATA(nid); + + if (pgdat->zswapd) + return 0; + + atomic_set(&pgdat->zswapd_wait_flag, 0); + pgdat->zswapd = kthread_create(zswapd, pgdat, "zswapd%d", nid); + if (IS_ERR(pgdat->zswapd)) { + pr_err("Failed to start zswapd on node %d\n", nid); + return PTR_ERR(pgdat->zswapd); + } + + sched_setscheduler_nocheck(pgdat->zswapd, SCHED_NORMAL, &param); + set_user_nice(pgdat->zswapd, PRIO_TO_NICE(param.sched_priority)); + wake_up_process(pgdat->zswapd); + + return 0; +} + +/* + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold mem_hotplug_begin/end(). + */ +void zswapd_stop(int nid) +{ + struct task_struct *zswapd = NODE_DATA(nid)->zswapd; + + if (zswapd) { + kthread_stop(zswapd); + NODE_DATA(nid)->zswapd = NULL; + } + + zswapd_pid = -1; +} + +/* + * It's optimal to keep zswapd on the same CPUs as its node's memory, but + * not required for correctness. So if the last cpu in a node goes away, + * zswapd gets changed to run anywhere: as the first one comes back, restore + * its cpu bindings. + */ +static int zswapd_cpu_online(unsigned int cpu) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) { + pg_data_t *pgdat = NODE_DATA(nid); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) + /* One of our CPUs online: restore mask */ + set_cpus_allowed_ptr(pgdat->zswapd, mask); + } + + return 0; +} + +static int __init zswapd_init(void) +{ + int nid; + int ret; + + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "mm/zswapd:online", + zswapd_cpu_online, NULL); + if (ret < 0) { + pr_err("zswapd: failed to register hotplug callbacks.\n"); + return ret; + } + + for_each_node_state(nid, N_MEMORY) + zswapd_run(nid); + + return 0; +} +module_init(zswapd_init); diff --git a/mm/zswapd_control.c b/mm/zswapd_control.c new file mode 100644 index 0000000000000000000000000000000000000000..340b6830619a439f0ddb8bf1a1ab3d831836830e --- /dev/null +++ b/mm/zswapd_control.c @@ -0,0 +1,860 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * mm/zswapd_control.c + * + * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "zswapd_internal.h" + +#define ANON_REFAULT_SNAPSHOT_MIN_INTERVAL 200 +#define AREA_ANON_REFAULT_THRESHOLD 22000 +#define EMPTY_ROUND_CHECK_THRESHOLD 10 +#define EMPTY_ROUND_SKIP_INTERVAL 20 +#define ZSWAPD_MAX_LEVEL_NUM 10 +#define MAX_SKIP_INTERVAL 1000 +#define MAX_RECLAIM_SIZE 100 + +#define INACTIVE_FILE_RATIO 90 +#define ACTIVE_FILE_RATIO 70 +#define COMPRESS_RATIO 30 +#define ZRAM_WM_RATIO 0 +#define MAX_RATIO 100 + +#define CHECK_BUFFER_VALID(var1, var2) (((var2) != 0) && ((var1) > (var2))) + +struct zswapd_param { + unsigned int min_score; + unsigned int max_score; + unsigned int ub_mem2zram_ratio; + unsigned int ub_zram2ufs_ratio; + unsigned int refault_threshold; +}; + +static struct zswapd_param zswap_param[ZSWAPD_MAX_LEVEL_NUM]; +struct eventfd_ctx *zswapd_press_efd[LEVEL_COUNT]; +static DEFINE_MUTEX(pressure_event_lock); +static DEFINE_MUTEX(reclaim_para_lock); + +atomic_t avail_buffers = ATOMIC_INIT(0); +atomic_t min_avail_buffers = ATOMIC_INIT(0); +atomic_t high_avail_buffers = ATOMIC_INIT(0); +atomic_t max_reclaim_size = ATOMIC_INIT(MAX_RECLAIM_SIZE); + +atomic_t inactive_file_ratio = ATOMIC_INIT(INACTIVE_FILE_RATIO); +atomic_t active_file_ratio = ATOMIC_INIT(ACTIVE_FILE_RATIO); +atomic_t zram_wm_ratio = ATOMIC_INIT(ZRAM_WM_RATIO); +atomic_t compress_ratio = ATOMIC_INIT(COMPRESS_RATIO); + +atomic64_t zram_critical_threshold = ATOMIC_LONG_INIT(0); +atomic64_t free_swap_threshold = ATOMIC_LONG_INIT(0); +atomic64_t area_anon_refault_threshold = ATOMIC_LONG_INIT(AREA_ANON_REFAULT_THRESHOLD); +atomic64_t anon_refault_snapshot_min_interval = + ATOMIC_LONG_INIT(ANON_REFAULT_SNAPSHOT_MIN_INTERVAL); +atomic64_t empty_round_skip_interval = ATOMIC_LONG_INIT(EMPTY_ROUND_SKIP_INTERVAL); +atomic64_t max_skip_interval = ATOMIC_LONG_INIT(MAX_SKIP_INTERVAL); +atomic64_t empty_round_check_threshold = ATOMIC_LONG_INIT(EMPTY_ROUND_CHECK_THRESHOLD); + +inline unsigned int get_zram_wm_ratio(void) +{ + return atomic_read(&zram_wm_ratio); +} + +inline unsigned int get_compress_ratio(void) +{ + return atomic_read(&compress_ratio); +} + +inline unsigned int get_inactive_file_ratio(void) +{ + return atomic_read(&inactive_file_ratio); +} + +inline unsigned int get_active_file_ratio(void) +{ + return atomic_read(&active_file_ratio); +} + +inline unsigned int get_avail_buffers(void) +{ + return atomic_read(&avail_buffers); +} + +inline unsigned int get_min_avail_buffers(void) +{ + return atomic_read(&min_avail_buffers); +} + +inline unsigned int get_high_avail_buffers(void) +{ + return atomic_read(&high_avail_buffers); +} + +inline unsigned int get_zswapd_max_reclaim_size(void) +{ + return atomic_read(&max_reclaim_size); +} + +inline unsigned long long get_free_swap_threshold(void) +{ + return atomic64_read(&free_swap_threshold); +} + +inline unsigned long long get_area_anon_refault_threshold(void) +{ + return atomic64_read(&area_anon_refault_threshold); +} + +inline unsigned long long get_anon_refault_snapshot_min_interval(void) +{ + return atomic64_read(&anon_refault_snapshot_min_interval); +} + +inline unsigned long long get_empty_round_skip_interval(void) +{ + return atomic64_read(&empty_round_skip_interval); +} + +inline unsigned long long get_max_skip_interval(void) +{ + return atomic64_read(&max_skip_interval); +} + +inline unsigned long long get_empty_round_check_threshold(void) +{ + return atomic64_read(&empty_round_check_threshold); +} + +inline unsigned long long get_zram_critical_threshold(void) +{ + 
return atomic64_read(&zram_critical_threshold); +} + +static ssize_t avail_buffers_params_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned long long threshold; + unsigned int high_buffers; + unsigned int min_buffers; + unsigned int buffers; + + buf = strstrip(buf); + + if (sscanf(buf, "%u %u %u %llu", &buffers, &min_buffers, &high_buffers, &threshold) != 4) + return -EINVAL; + + if (CHECK_BUFFER_VALID(min_buffers, buffers) || + CHECK_BUFFER_VALID(min_buffers, high_buffers) || + CHECK_BUFFER_VALID(buffers, high_buffers)) + return -EINVAL; + + atomic_set(&avail_buffers, buffers); + atomic_set(&min_avail_buffers, min_buffers); + atomic_set(&high_avail_buffers, high_buffers); + atomic64_set(&free_swap_threshold, (threshold * (SZ_1M / PAGE_SIZE))); + + if (atomic_read(&min_avail_buffers) == 0) + set_snapshotd_init_flag(0); + else + set_snapshotd_init_flag(1); + + wake_all_zswapd(); + + return nbytes; +} + +static ssize_t zswapd_max_reclaim_size_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + u32 max; + int ret; + + buf = strstrip(buf); + ret = kstrtouint(buf, 10, &max); + if (ret) + return -EINVAL; + + atomic_set(&max_reclaim_size, max); + + return nbytes; +} + +static ssize_t buffers_ratio_params_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int inactive; + unsigned int active; + + buf = strstrip(buf); + + if (sscanf(buf, "%u %u", &inactive, &active) != 2) + return -EINVAL; + + if (inactive > MAX_RATIO || active > MAX_RATIO) + return -EINVAL; + + atomic_set(&inactive_file_ratio, inactive); + atomic_set(&active_file_ratio, active); + + return nbytes; +} + +static int area_anon_refault_threshold_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&area_anon_refault_threshold, val); + + return 0; +} + +static int empty_round_skip_interval_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&empty_round_skip_interval, val); + + return 0; +} + +static int max_skip_interval_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&max_skip_interval, val); + + return 0; +} + +static int empty_round_check_threshold_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&empty_round_check_threshold, val); + + return 0; +} + +static int anon_refault_snapshot_min_interval_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&anon_refault_snapshot_min_interval, val); + + return 0; +} + +static int zram_critical_thres_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 val) +{ + atomic64_set(&zram_critical_threshold, val); + + return 0; +} + +static ssize_t zswapd_pressure_event_control(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int level; + unsigned int efd; + struct fd efile; + int ret; + + buf = strstrip(buf); + if (sscanf(buf, "%u %u", &efd, &level) != 2) + return -EINVAL; + + if (level >= LEVEL_COUNT) + return -EINVAL; + + mutex_lock(&pressure_event_lock); + efile = fdget(efd); + if (!efile.file) { + ret = -EBADF; + goto out; + } + + zswapd_press_efd[level] = eventfd_ctx_fileget(efile.file); + if (IS_ERR(zswapd_press_efd[level])) { + ret = PTR_ERR(zswapd_press_efd[level]); + goto out_put_efile; + } + fdput(efile); + mutex_unlock(&pressure_event_lock); + return nbytes; + +out_put_efile: + fdput(efile); +out: + mutex_unlock(&pressure_event_lock); + + return ret; +} + 
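[Editor's note, not part of the patch] zswapd_pressure_event_control() above expects userspace to write "<eventfd> <level>" to the cgroup file registered as "zswapd_pressure", matching its sscanf("%u %u", &efd, &level). The following is a minimal userspace sketch of that registration; the mount path, the "memory." file prefix, and the numeric level value are assumptions (the pressure-level enum lives in zswapd_internal.h, which is not shown here).

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/eventfd.h>

int main(void)
{
	/* Hypothetical path: the "zswapd_pressure" cftype registered in this
	 * patch, assumed to surface at the memory cgroup root. */
	const char *ctrl = "/sys/fs/cgroup/memory/memory.zswapd_pressure";
	int efd = eventfd(0, 0);
	int cfd = open(ctrl, O_WRONLY);
	char buf[64];
	uint64_t count;

	if (efd < 0 || cfd < 0)
		return 1;

	/* Register "<eventfd> <level>"; level 2 is assumed to be the critical
	 * level and must be below LEVEL_COUNT. */
	snprintf(buf, sizeof(buf), "%d %d", efd, 2);
	if (write(cfd, buf, strlen(buf)) < 0)
		return 1;
	close(cfd);

	/* Block until zswapd_pressure_report() signals this level. */
	while (read(efd, &count, sizeof(count)) == sizeof(count))
		fprintf(stderr, "zswapd pressure event (count=%llu)\n",
			(unsigned long long)count);
	return 0;
}

The read() on the eventfd blocks until zswapd_pressure_report() calls eventfd_signal() for the registered level, which is what lets a userspace memory manager react to LOW/MEDIUM/CRITICAL pressure without polling.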
+void zswapd_pressure_report(enum zswapd_pressure_level level) +{ + int ret; + + if (zswapd_press_efd[level] == NULL) + return; + + ret = eventfd_signal(zswapd_press_efd[level], 1); + if (ret < 0) + pr_err("SWAP-MM: %s : level:%u, ret:%d ", __func__, level, ret); +} + +static u64 zswapd_pid_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + return get_zswapd_pid(); +} + +static void zswapd_memcgs_param_parse(int level_num) +{ + struct mem_cgroup *memcg = NULL; + u64 score; + int i; + + while ((memcg = get_next_memcg(memcg))) { + score = atomic64_read(&memcg->memcg_reclaimed.app_score); + for (i = 0; i < level_num; ++i) + if (score >= zswap_param[i].min_score && + score <= zswap_param[i].max_score) + break; + + atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, + zswap_param[i].ub_mem2zram_ratio); + atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, + zswap_param[i].ub_zram2ufs_ratio); + atomic_set(&memcg->memcg_reclaimed.refault_threshold, + zswap_param[i].refault_threshold); + } +} + +static ssize_t zswapd_memcgs_param_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + char *token = NULL; + int level_num; + int i; + + buf = strstrip(buf); + token = strsep(&buf, " "); + + if (!token) + return -EINVAL; + + if (kstrtoint(token, 0, &level_num)) + return -EINVAL; + + if (level_num > ZSWAPD_MAX_LEVEL_NUM) + return -EINVAL; + + mutex_lock(&reclaim_para_lock); + for (i = 0; i < level_num; ++i) { + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].min_score) || + zswap_param[i].min_score > MAX_APP_SCORE) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].max_score) || + zswap_param[i].max_score > MAX_APP_SCORE) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].ub_mem2zram_ratio) || + zswap_param[i].ub_mem2zram_ratio > MAX_RATIO) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].ub_zram2ufs_ratio) || + zswap_param[i].ub_zram2ufs_ratio > MAX_RATIO) + goto out; + + token = strsep(&buf, " "); + if (!token) + goto out; + + if (kstrtoint(token, 0, &zswap_param[i].refault_threshold)) + goto out; + } + + zswapd_memcgs_param_parse(level_num); + mutex_unlock(&reclaim_para_lock); + + return nbytes; + +out: + mutex_unlock(&reclaim_para_lock); + return -EINVAL; +} + +static ssize_t zswapd_single_memcg_param_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned int ub_mem2zram_ratio; + unsigned int ub_zram2ufs_ratio; + unsigned int refault_threshold; + + buf = strstrip(buf); + + if (sscanf(buf, "%u %u %u", &ub_mem2zram_ratio, &ub_zram2ufs_ratio, + &refault_threshold) != 3) + return -EINVAL; + + if (ub_mem2zram_ratio > MAX_RATIO || ub_zram2ufs_ratio > MAX_RATIO || + refault_threshold > MAX_RATIO) + return -EINVAL; + + atomic_set(&memcg->memcg_reclaimed.ub_mem2zram_ratio, + ub_mem2zram_ratio); + atomic_set(&memcg->memcg_reclaimed.ub_zram2ufs_ratio, + ub_zram2ufs_ratio); + atomic_set(&memcg->memcg_reclaimed.refault_threshold, + refault_threshold); + + return nbytes; +} + +static ssize_t mem_cgroup_zram_wm_ratio_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int ratio; + int ret; + + buf = strstrip(buf); + + ret = kstrtouint(buf, 10, &ratio); + if (ret) + return -EINVAL; + + if (ratio > MAX_RATIO) + 
return -EINVAL; + + atomic_set(&zram_wm_ratio, ratio); + + return nbytes; +} + +static ssize_t mem_cgroup_compress_ratio_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int ratio; + int ret; + + buf = strstrip(buf); + + ret = kstrtouint(buf, 10, &ratio); + if (ret) + return -EINVAL; + + if (ratio > MAX_RATIO) + return -EINVAL; + + atomic_set(&compress_ratio, ratio); + + return nbytes; +} + +static int zswapd_pressure_show(struct seq_file *m, void *v) +{ + zswapd_status_show(m); + + return 0; +} + +static int memcg_active_app_info_list_show(struct seq_file *m, void *v) +{ + struct mem_cgroup_per_node *mz = NULL; + struct mem_cgroup *memcg = NULL; + struct lruvec *lruvec = NULL; + unsigned long eswap_size; + unsigned long anon_size; + unsigned long zram_size; + + while ((memcg = get_next_memcg(memcg))) { + u64 score = atomic64_read(&memcg->memcg_reclaimed.app_score); + + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) { + get_next_memcg_break(memcg); + return 0; + } + + lruvec = &mz->lruvec; + if (!lruvec) { + get_next_memcg_break(memcg); + return 0; + } + + anon_size = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, + MAX_NR_ZONES) + lruvec_lru_size(lruvec, + LRU_INACTIVE_ANON, MAX_NR_ZONES); + eswap_size = memcg_data_size(memcg, SWAP_SIZE); + zram_size = memcg_data_size(memcg, CACHE_SIZE); + + if (anon_size + zram_size + eswap_size == 0) + continue; + + if (!strlen(memcg->name)) + continue; + + anon_size *= PAGE_SIZE / SZ_1K; + zram_size *= PAGE_SIZE / SZ_1K; + eswap_size *= PAGE_SIZE / SZ_1K; + + seq_printf(m, "%s %llu %lu %lu %lu %llu\n", memcg->name, score, + anon_size, zram_size, eswap_size, + memcg->memcg_reclaimed.reclaimed_pagefault); + } + return 0; +} + +#ifdef CONFIG_HYPERHOLD_DEBUG +static int avail_buffers_params_show(struct seq_file *m, void *v) +{ + seq_printf(m, "avail_buffers: %u\n", atomic_read(&avail_buffers)); + seq_printf(m, "min_avail_buffers: %u\n", atomic_read(&min_avail_buffers)); + seq_printf(m, "high_avail_buffers: %u\n", atomic_read(&high_avail_buffers)); + seq_printf(m, "free_swap_threshold: %llu\n", + atomic64_read(&free_swap_threshold) * PAGE_SIZE / SZ_1M); + + return 0; +} + +static int zswapd_max_reclaim_size_show(struct seq_file *m, void *v) +{ + seq_printf(m, "zswapd_max_reclaim_size: %u\n", + atomic_read(&max_reclaim_size)); + + return 0; +} + +static int buffers_ratio_params_show(struct seq_file *m, void *v) +{ + seq_printf(m, "inactive_file_ratio: %u\n", atomic_read(&inactive_file_ratio)); + seq_printf(m, "active_file_ratio: %u\n", atomic_read(&active_file_ratio)); + + return 0; +} + +static u64 area_anon_refault_threshold_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&area_anon_refault_threshold); +} + +static u64 empty_round_skip_interval_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&empty_round_skip_interval); +} + +static u64 max_skip_interval_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&max_skip_interval); +} + +static u64 empty_round_check_threshold_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&empty_round_check_threshold); +} + +static u64 anon_refault_snapshot_min_interval_read( + struct cgroup_subsys_state *css, struct cftype *cft) +{ + return atomic64_read(&anon_refault_snapshot_min_interval); +} + +static u64 zram_critical_threshold_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + return atomic64_read(&zram_critical_threshold); +} + 
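[Editor's note, not part of the patch] As a usage illustration for the write handlers above, this is a hedged sketch of programming the zswapd tuning knobs from userspace. The cgroup mount point, the "memory." prefix, and every numeric value are assumptions chosen only to satisfy the parsers shown in this file (bounds come from MAX_RATIO and MAX_APP_SCORE).

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

/* Hypothetical helper: write one space-separated tuple to a cgroup file. */
static int write_tuple(const char *path, const char *tuple)
{
	int fd = open(path, O_WRONLY);
	int ret;

	if (fd < 0)
		return -1;
	ret = dprintf(fd, "%s", tuple);
	close(fd);
	return ret < 0 ? -1 : 0;
}

int main(void)
{
	/* avail_buffers: "avail min high free_swap_threshold(MB)", matching
	 * the four-field sscanf in avail_buffers_params_write(); min <= avail
	 * <= high must hold or the write is rejected. */
	write_tuple("/sys/fs/cgroup/memory/memory.avail_buffers",
		    "300 250 350 1024");

	/* zswapd_memcgs_param: "<level_num>" followed by five fields per
	 * level (min_score max_score ub_mem2zram_ratio ub_zram2ufs_ratio
	 * refault_threshold), as parsed in zswapd_memcgs_param_write();
	 * scores must stay within MAX_APP_SCORE, ratios within MAX_RATIO. */
	write_tuple("/sys/fs/cgroup/memory/memory.zswapd_memcgs_param",
		    "1 0 300 60 10 20");
	return 0;
}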
+static int zswapd_memcgs_param_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; i < ZSWAPD_MAX_LEVEL_NUM; ++i) { + seq_printf(m, "level %d min score: %u\n", i, + zswap_param[i].min_score); + seq_printf(m, "level %d max score: %u\n", i, + zswap_param[i].max_score); + seq_printf(m, "level %d ub_mem2zram_ratio: %u\n", i, + zswap_param[i].ub_mem2zram_ratio); + seq_printf(m, "level %d ub_zram2ufs_ratio: %u\n", i, + zswap_param[i].ub_zram2ufs_ratio); + seq_printf(m, "level %d refault_threshold: %u\n", i, + zswap_param[i].refault_threshold); + } + + return 0; +} + +static int zswapd_single_memcg_param_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "memcg score: %llu\n", + atomic64_read(&memcg->memcg_reclaimed.app_score)); + seq_printf(m, "memcg ub_mem2zram_ratio: %u\n", + atomic_read(&memcg->memcg_reclaimed.ub_mem2zram_ratio)); + seq_printf(m, "memcg ub_zram2ufs_ratio: %u\n", + atomic_read(&memcg->memcg_reclaimed.ub_zram2ufs_ratio)); + seq_printf(m, "memcg refault_threshold: %u\n", + atomic_read(&memcg->memcg_reclaimed.refault_threshold)); + + return 0; +} + +static int zram_wm_ratio_show(struct seq_file *m, void *v) +{ + seq_printf(m, "zram_wm_ratio: %u\n", atomic_read(&zram_wm_ratio)); + + return 0; +} + +static int compress_ratio_show(struct seq_file *m, void *v) +{ + seq_printf(m, "compress_ratio: %u\n", atomic_read(&compress_ratio)); + + return 0; +} + +static int zswapd_vmstat_show(struct seq_file *m, void *v) +{ +#ifdef CONFIG_VM_EVENT_COUNTERS + unsigned long *vm_buf = NULL; + + vm_buf = kzalloc(sizeof(struct vm_event_state), GFP_KERNEL); + if (!vm_buf) + return -ENOMEM; + all_vm_events(vm_buf); + + seq_printf(m, "zswapd_wake_up:%lu\n", vm_buf[ZSWAPD_WAKEUP]); + seq_printf(m, "zswapd_area_refault:%lu\n", vm_buf[ZSWAPD_REFAULT]); + seq_printf(m, "zswapd_medium_press:%lu\n", vm_buf[ZSWAPD_MEDIUM_PRESS]); + seq_printf(m, "zswapd_critical_press:%lu\n", vm_buf[ZSWAPD_CRITICAL_PRESS]); + seq_printf(m, "zswapd_memcg_ratio_skip:%lu\n", vm_buf[ZSWAPD_MEMCG_RATIO_SKIP]); + seq_printf(m, "zswapd_memcg_refault_skip:%lu\n", vm_buf[ZSWAPD_MEMCG_REFAULT_SKIP]); + seq_printf(m, "zswapd_swapout:%lu\n", vm_buf[ZSWAPD_SWAPOUT]); + seq_printf(m, "zswapd_snapshot_times:%lu\n", vm_buf[ZSWAPD_SNAPSHOT_TIMES]); + seq_printf(m, "zswapd_reclaimed:%lu\n", vm_buf[ZSWAPD_RECLAIMED]); + seq_printf(m, "zswapd_scanned:%lu\n", vm_buf[ZSWAPD_SCANNED]); + + kfree(vm_buf); +#endif + + return 0; +} + +static int eswap_info_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long long eswap_size; + + eswap_size = memcg_data_size(memcg, WRITE_SIZE) / SZ_1K; + seq_printf(m, "Total Swapout Size: %llu kB\n", eswap_size); + + return 0; +} + +void memcg_eswap_info_show(struct seq_file *m) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + struct mem_cgroup_per_node *mz = NULL; + struct lruvec *lruvec = NULL; + unsigned long anon; + unsigned long file; + unsigned long zram; + unsigned long eswap; + + mz = mem_cgroup_nodeinfo(memcg, 0); + if (!mz) + return; + + lruvec = &mz->lruvec; + if (!lruvec) + return; + + anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES); + file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) + + lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES); + zram = memcg_data_size(memcg, CACHE_SIZE) / SZ_1K; + eswap = memcg_data_size(memcg, SWAP_SIZE) / SZ_1K; + anon *= 
+static struct cftype zswapd_policy_files[] = {
+	{
+		.name = "active_app_info_list",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.seq_show = memcg_active_app_info_list_show,
+	},
+	{
+		.name = "zram_wm_ratio",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.write = mem_cgroup_zram_wm_ratio_write,
+#ifdef CONFIG_HYPERHOLD_DEBUG
+		.seq_show = zram_wm_ratio_show,
+#endif
+	},
+	{
+		.name = "compress_ratio",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.write = mem_cgroup_compress_ratio_write,
+#ifdef CONFIG_HYPERHOLD_DEBUG
+		.seq_show = compress_ratio_show,
+#endif
+	},
+	{
+		.name = "zswapd_pressure",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.write = zswapd_pressure_event_control,
+	},
+	{
+		.name = "zswapd_pid",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.read_u64 = zswapd_pid_read,
+	},
+	{
+		.name = "avail_buffers",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.write = avail_buffers_params_write,
+#ifdef CONFIG_HYPERHOLD_DEBUG
+		.seq_show = avail_buffers_params_show,
+#endif
+	},
+	{
+		.name = "zswapd_max_reclaim_size",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.write = zswapd_max_reclaim_size_write,
+#ifdef CONFIG_HYPERHOLD_DEBUG
+		.seq_show = zswapd_max_reclaim_size_show,
+#endif
+	},
+	{
+		.name = "area_anon_refault_threshold",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.write_u64 = area_anon_refault_threshold_write,
+#ifdef CONFIG_HYPERHOLD_DEBUG
+		.read_u64 = area_anon_refault_threshold_read,
+#endif
+	},
+	{
+		.name = "empty_round_skip_interval",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.write_u64 = empty_round_skip_interval_write,
+#ifdef CONFIG_HYPERHOLD_DEBUG
+		.read_u64 = empty_round_skip_interval_read,
+#endif
+	},
+	{
+		.name = "max_skip_interval",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.write_u64 = max_skip_interval_write,
+#ifdef CONFIG_HYPERHOLD_DEBUG
+		.read_u64 = max_skip_interval_read,
+#endif
+	},
+	{
+		.name = "empty_round_check_threshold",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.write_u64 = empty_round_check_threshold_write,
+#ifdef CONFIG_HYPERHOLD_DEBUG
+		.read_u64 = empty_round_check_threshold_read,
+#endif
+	},
+	{
+		.name = "anon_refault_snapshot_min_interval",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.write_u64 = anon_refault_snapshot_min_interval_write,
+#ifdef CONFIG_HYPERHOLD_DEBUG
+		.read_u64 = anon_refault_snapshot_min_interval_read,
+#endif
+	},
+	{
+		.name = "zswapd_memcgs_param",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.write = zswapd_memcgs_param_write,
+#ifdef CONFIG_HYPERHOLD_DEBUG
+		.seq_show = zswapd_memcgs_param_show,
+#endif
+	},
+	{
+		.name = "zswapd_single_memcg_param",
+		.write = zswapd_single_memcg_param_write,
+#ifdef CONFIG_HYPERHOLD_DEBUG
+		.seq_show = zswapd_single_memcg_param_show,
+#endif
+	},
+	{
+		.name = "buffer_ratio_params",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.write = buffers_ratio_params_write,
+#ifdef CONFIG_HYPERHOLD_DEBUG
+		.seq_show = buffers_ratio_params_show,
+#endif
+	},
+	{
+		.name = "zswapd_pressure_show",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.seq_show = zswapd_pressure_show,
+	},
+	{
+		.name = "zram_critical_threshold",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.write_u64 = zram_critical_thres_write,
+#ifdef CONFIG_HYPERHOLD_DEBUG
+		.read_u64 = zram_critical_threshold_read,
+#endif
+	},
+
+#ifdef CONFIG_HYPERHOLD_DEBUG
+	{
+		.name = "zswapd_vmstat_show",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.seq_show = zswapd_vmstat_show,
+	},
+#endif
+	{
+		.name = "eswap_info",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.seq_show = eswap_info_show,
+	},
+
+	{ },	/* terminate */
+};
+
+static int __init zswapd_policy_init(void)
+{
+	if (!mem_cgroup_disabled())
+		WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, zswapd_policy_files));
+
+	return 0;
+}
+subsys_initcall(zswapd_policy_init);
diff --git a/mm/zswapd_internal.h b/mm/zswapd_internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..1447882ae49725663a160ed2d7a106690dd67e9b
--- /dev/null
+++ b/mm/zswapd_internal.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * mm/zswapd_internal.h
+ *
+ * Copyright (c) 2020-2022 Huawei Technologies Co., Ltd.
+ */
+
+#ifndef _ZSWAPD_INTERNAL_H
+#define _ZSWAPD_INTERNAL_H
+
+enum zswapd_pressure_level {
+	LEVEL_LOW = 0,
+	LEVEL_MEDIUM,
+	LEVEL_CRITICAL,
+	LEVEL_COUNT
+};
+
+enum zswapd_eswap_policy {
+	CHECK_BUFFER_ONLY = 0,
+	CHECK_BUFFER_ZRAMRATIO_BOTH
+};
+
+void zswapd_pressure_report(enum zswapd_pressure_level level);
+inline unsigned int get_zram_wm_ratio(void);
+inline unsigned int get_compress_ratio(void);
+inline unsigned int get_avail_buffers(void);
+inline unsigned int get_min_avail_buffers(void);
+inline unsigned int get_high_avail_buffers(void);
+inline unsigned int get_zswapd_max_reclaim_size(void);
+inline unsigned int get_inactive_file_ratio(void);
+inline unsigned int get_active_file_ratio(void);
+inline unsigned long long get_area_anon_refault_threshold(void);
+inline unsigned long long get_anon_refault_snapshot_min_interval(void);
+inline unsigned long long get_empty_round_skip_interval(void);
+inline unsigned long long get_max_skip_interval(void);
+inline unsigned long long get_empty_round_check_threshold(void);
+inline unsigned long long get_zram_critical_threshold(void);
+u64 memcg_data_size(struct mem_cgroup *memcg, int type);
+u64 swapin_memcg(struct mem_cgroup *memcg, u64 req_size);
+
+#endif /* _ZSWAPD_INTERNAL_H */
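
Editor's note: zswapd_internal.h only declares the parameter getters; their definitions belong to other parts of the series and are not in this excerpt. The fragment below is a minimal sketch, assuming each getter simply wraps the module-level atomic that the corresponding cgroup write handler above updates. The local atomic definitions, their initial values, and their placement are placeholders invented for illustration, not taken from the patch.

#include <linux/atomic.h>

#include "zswapd_internal.h"

/*
 * Placeholder state for the sketch.  In the real series these atomics are
 * the ones updated by mem_cgroup_zram_wm_ratio_write() and
 * mem_cgroup_compress_ratio_write(); the initial values here are invented.
 */
static atomic_t zram_wm_ratio = ATOMIC_INIT(0);
static atomic_t compress_ratio = ATOMIC_INIT(0);

/* Definitions follow the plain "inline" declarations in zswapd_internal.h. */
inline unsigned int get_zram_wm_ratio(void)
{
	return atomic_read(&zram_wm_ratio);
}

inline unsigned int get_compress_ratio(void)
{
	return atomic_read(&compress_ratio);
}

Which file actually hosts the real definitions and what their defaults are cannot be inferred from this hunk; the sketch only illustrates the atomic-backed getter pattern that the surrounding control-file code implies.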