diff --git a/fs/proc/etmem_swap.c b/fs/proc/etmem_swap.c
index 4aad6b9db9a6bf5b8c9720fd63c7e0138b670ee1..b4a35da9ac3d1851c782e72272150830790b83d8 100644
--- a/fs/proc/etmem_swap.c
+++ b/fs/proc/etmem_swap.c
@@ -11,6 +11,22 @@
 #include
 #include
 #include
+#include
+#include
+
+#define RECLAIM_SWAPCACHE_MAGIC 0x77
+#define SET_SWAPCACHE_WMARK _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x02, unsigned int)
+#define RECLAIM_SWAPCACHE_ON _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x01, unsigned int)
+#define RECLAIM_SWAPCACHE_OFF _IOW(RECLAIM_SWAPCACHE_MAGIC, 0x00, unsigned int)
+
+#define WATERMARK_MAX 100
+#define SWAP_SCAN_NUM_MAX 32
+
+static struct task_struct *reclaim_swapcache_tk;
+static bool enable_swapcache_reclaim;
+static unsigned long swapcache_watermark[ETMEM_SWAPCACHE_NR_WMARK];
+
+static DECLARE_WAIT_QUEUE_HEAD(reclaim_queue);
 
 static ssize_t swap_pages_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
@@ -82,6 +98,152 @@ static int swap_pages_release(struct inode *inode, struct file *file)
 
 extern struct file_operations proc_swap_pages_operations;
 
+/* check if swapcache meets the configured high-watermark requirement */
+static bool swapcache_balanced(void)
+{
+	return total_swapcache_pages() < swapcache_watermark[ETMEM_SWAPCACHE_WMARK_HIGH];
+}
+
+/* the flag is set while swapcache reclaim is running */
+static bool swapcache_reclaim_enabled(void)
+{
+	return READ_ONCE(enable_swapcache_reclaim);
+}
+
+static void start_swapcache_reclaim(void)
+{
+	if (swapcache_balanced())
+		return;
+	/* RECLAIM_SWAPCACHE_ON triggers the thread to start running.
+	 */
+	if (!waitqueue_active(&reclaim_queue))
+		return;
+
+	WRITE_ONCE(enable_swapcache_reclaim, true);
+	wake_up_interruptible(&reclaim_queue);
+}
+
+static void stop_swapcache_reclaim(void)
+{
+	WRITE_ONCE(enable_swapcache_reclaim, false);
+}
+
+static bool should_goto_sleep(void)
+{
+	if (swapcache_balanced())
+		stop_swapcache_reclaim();
+
+	if (swapcache_reclaim_enabled())
+		return false;
+
+	return true;
+}
+
+/* decode "high<<8 | low" percentage pair and convert to page counts */
+static int get_swapcache_watermark(unsigned int ratio)
+{
+	unsigned int low_watermark;
+	unsigned int high_watermark;
+
+	low_watermark = ratio & 0xFF;
+	high_watermark = (ratio >> 8) & 0xFF;
+	if (low_watermark > WATERMARK_MAX ||
+	    high_watermark > WATERMARK_MAX ||
+	    low_watermark > high_watermark)
+		return -EPERM;
+
+	swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW] = totalram_pages() *
+					low_watermark / WATERMARK_MAX;
+	swapcache_watermark[ETMEM_SWAPCACHE_WMARK_HIGH] = totalram_pages() *
+					high_watermark / WATERMARK_MAX;
+
+	return 0;
+}
+
+static void reclaim_swapcache_try_to_sleep(void)
+{
+	DEFINE_WAIT(wait);
+
+	if (freezing(current) || kthread_should_stop())
+		return;
+
+	prepare_to_wait(&reclaim_queue, &wait, TASK_INTERRUPTIBLE);
+	if (should_goto_sleep()) {
+		if (!kthread_should_stop())
+			schedule();
+	}
+	finish_wait(&reclaim_queue, &wait);
+}
+
+static void etmem_reclaim_swapcache(void)
+{
+	do_swapcache_reclaim(swapcache_watermark,
+			     ARRAY_SIZE(swapcache_watermark));
+	stop_swapcache_reclaim();
+}
+
+/* freezable kthread: sleep until woken, then reclaim swapcache once */
+static int reclaim_swapcache_proactive(void *para)
+{
+	set_freezable();
+
+	while (1) {
+		bool ret;
+
+		reclaim_swapcache_try_to_sleep();
+		ret = try_to_freeze();
+		if (kthread_freezable_should_stop(NULL))
+			break;
+
+		if (ret)
+			continue;
+
+		etmem_reclaim_swapcache();
+	}
+
+	return 0;
+}
+
+static int reclaim_swapcache_run(void)
+{
+	int ret = 0;
+
+	reclaim_swapcache_tk = kthread_run(reclaim_swapcache_proactive, NULL,
+					   "etmem_reclaim_swapcache");
+	if (IS_ERR(reclaim_swapcache_tk)) {
+		ret = PTR_ERR(reclaim_swapcache_tk);
+		reclaim_swapcache_tk = NULL;
+	}
+	return ret;
+}
+
+static long swap_page_ioctl(struct file *filp, unsigned int cmd,
+			    unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	unsigned int ratio;
+
+	switch (cmd) {
+	case RECLAIM_SWAPCACHE_ON:
+		if (swapcache_reclaim_enabled())
+			return 0;
+		start_swapcache_reclaim();
+		break;
+	case RECLAIM_SWAPCACHE_OFF:
+		stop_swapcache_reclaim();
+		break;
+	case SET_SWAPCACHE_WMARK:
+		if (get_user(ratio, (unsigned int __user *)argp))
+			return -EFAULT;
+
+		if (get_swapcache_watermark(ratio) != 0)
+			return -EFAULT;
+		break;
+	default:
+		return -EPERM;
+	}
+
+	return 0;
+}
+
+
 static int swap_pages_entry(void)
 {
 	proc_swap_pages_operations.flock(NULL, 1, NULL);
@@ -89,8 +251,12 @@ static int swap_pages_entry(void)
 	proc_swap_pages_operations.write = swap_pages_write;
 	proc_swap_pages_operations.open = swap_pages_open;
 	proc_swap_pages_operations.release = swap_pages_release;
+	proc_swap_pages_operations.unlocked_ioctl = swap_page_ioctl;
 	proc_swap_pages_operations.flock(NULL, 0, NULL);
 
+	enable_swapcache_reclaim = false;
+	reclaim_swapcache_run();
+
 	return 0;
 }
 
@@ -101,7 +267,14 @@ static void swap_pages_exit(void)
 	proc_swap_pages_operations.write = NULL;
 	proc_swap_pages_operations.open = NULL;
 	proc_swap_pages_operations.release = NULL;
+	proc_swap_pages_operations.unlocked_ioctl = NULL;
 	proc_swap_pages_operations.flock(NULL, 0, NULL);
+
+	if (reclaim_swapcache_tk) {
+		kthread_stop(reclaim_swapcache_tk);
+		reclaim_swapcache_tk = NULL;
+	}
+
 	return;
 }
 
 MODULE_LICENSE("GPL");
diff --git a/include/linux/etmem.h b/include/linux/etmem.h
index 9ec9657e56ed0531fe33bdf97a7cb48aa1e52f5b..c33542b339a0835443212be2742d4014225102dd 100644
--- a/include/linux/etmem.h
+++ b/include/linux/etmem.h
@@ -9,6 +9,28 @@
 #include
 
 #ifdef CONFIG_ETMEM
+/**
+ * list_for_each_entry_safe_reverse_from - iterate backwards over list from
+ * current point safe against removal
+ * @pos:	the type * to use as a loop cursor.
+ * @n:		another type * to use as temporary storage
+ * @head:	the head for your list.
+ * @member:	the name of the list_head within the struct.
+ *
+ * Iterate backwards over list of given type from current point, safe against
+ * removal of list entry.
+ */
+#define list_for_each_entry_safe_reverse_from(pos, n, head, member)	\
+	for (n = list_prev_entry(pos, member);				\
+	     !list_entry_is_head(pos, head, member);			\
+	     pos = n, n = list_prev_entry(n, member))
+
+
+enum etmem_swapcache_watermark_en {
+	ETMEM_SWAPCACHE_WMARK_LOW,
+	ETMEM_SWAPCACHE_WMARK_HIGH,
+	ETMEM_SWAPCACHE_NR_WMARK
+};
 
 #if IS_ENABLED(CONFIG_KVM)
 static inline struct kvm *mm_kvm(struct mm_struct *mm)
@@ -27,6 +49,8 @@ extern struct page *get_page_from_vaddr(struct mm_struct *mm,
 		unsigned long vaddr);
 extern struct kobj_attribute kernel_swap_enable_attr;
 extern bool kernel_swap_enabled(void);
+extern int do_swapcache_reclaim(unsigned long *swapcache_watermark,
+				unsigned int watermark_nr);
 #else /* !CONFIG_ETMEM */
 static inline int add_page_for_swap(struct page *page, struct list_head *pagelist)
 {
@@ -43,5 +67,10 @@ static inline bool kernel_swap_enabled(void)
 {
 	return true;
 }
+static inline int do_swapcache_reclaim(unsigned long *swapcache_watermark,
+				       unsigned int watermark_nr)
+{
+	return 0;
+}
 #endif /* #ifdef CONFIG_ETMEM */
 #endif /* define __MM_ETMEM_H_ */
diff --git a/include/linux/swap.h b/include/linux/swap.h
index b47d288e657b0a2223e62f1015f7cf575926a4c4..42bbdfd2fbb18ce77ccba89959215c4d2e8bad11 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -419,6 +419,8 @@ extern void lru_cache_add_inactive_or_unevictable(struct page *page,
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
+extern unsigned int reclaim_folio_list(struct list_head *folio_list,
+				       struct pglist_data *pgdat);
 extern unsigned long reclaim_pages(struct list_head *folio_list);
 
 #define MEMCG_RECLAIM_MAY_SWAP (1 << 1)
diff --git a/mm/etmem.c b/mm/etmem.c
index acd32e71a64311a70c09ce10a9641476612585ef..5accf8e0bbdffaa3c41e3aec35a1e4cb62febb1e 100644
--- a/mm/etmem.c
+++ b/mm/etmem.c
@@ -93,3 +93,176 @@ struct page *get_page_from_vaddr(struct mm_struct *mm, unsigned long vaddr)
 	return page;
 }
 EXPORT_SYMBOL_GPL(get_page_from_vaddr);
+
+#define SWAP_SCAN_NUM_MAX 32
+
+/* number of swapcache pages above the low watermark, 0 if already below */
+static unsigned long get_swapcache_reclaim_num(unsigned long *swapcache_watermark)
+{
+	return total_swapcache_pages() >
+		swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW] ?
+		(total_swapcache_pages() -
+		 swapcache_watermark[ETMEM_SWAPCACHE_WMARK_LOW]) : 0;
+}
+
+static int move_lru_folios_to_list(struct lruvec *lruvec,
+				   struct folio *folio, struct list_head *foliolist)
+{
+
+	if (!folio_test_large(folio)) {
+		/* If another process is also mapping this folio */
+		if (folio_mapcount(folio) > 1)
+			return -EACCES;
+	} else if (folio_test_hugetlb(folio)) {
+		/* Do not reclaim hugetlb folios */
+		return -EACCES;
+	} else {
+		/* Try to reclaim THP unless it is mapped by another process */
+		if (folio_entire_mapcount(folio) > 1)
+			return -EACCES;
+	}
+
+	/*
+	 * Try to get a reference to the folio; this
+	 * may fail if the folio has been freed/frozen.
+	 */
+	if (!(folio_try_get(folio)))
+		return -1;
+
+	/* racing with another isolation */
+	if (!folio_test_clear_lru(folio)) {
+		folio_put(folio);
+		return -1;
+	}
+
+	list_move(&folio->lru, foliolist);
+	update_lru_size(lruvec,
+			LRU_INACTIVE_ANON,
+			folio_zonenum(folio),
+			-folio_nr_pages(folio));
+	return 0;
+}
+
+/*
+ * For each node, scan the inactive anon lru, isolate and move
+ * appropriate candidates to swapcache_list[nid]
+ */
+static void memcg_reclaim_swapcache(struct list_head *swapcache_list,
+				    unsigned long swapcache_to_reclaim)
+{
+	struct mem_cgroup *memcg = NULL, *target_memcg = NULL;
+	struct lruvec *lruvec;
+	int nid;
+	pg_data_t *pgdat;
+	unsigned int scan_count = 0;
+	unsigned long swapcache_total_reclaimable = 0;
+	struct list_head *src = NULL;
+	struct folio *folio = NULL, *next = NULL, *pos = NULL;
+
+	for_each_node_state(nid, N_MEMORY) {
+		INIT_LIST_HEAD(&swapcache_list[nid]);
+		cond_resched();
+		pgdat = NODE_DATA(nid);
+
+		memcg = mem_cgroup_iter(target_memcg, NULL, NULL);
+		do {
+			cond_resched();
+			lruvec = mem_cgroup_lruvec(memcg, pgdat);
+			src = &(lruvec->lists[LRU_INACTIVE_ANON]);
+
+			spin_lock_irq(&lruvec->lru_lock);
+			pos = list_last_entry(src, struct folio, lru);
+			spin_unlock_irq(&lruvec->lru_lock);
+reverse_scan_lru:
+			cond_resched();
+			scan_count = 0;
+
+			spin_lock_irq(&lruvec->lru_lock);
+			if (!pos || list_entry_is_head(pos, src, lru)) {
+				spin_unlock_irq(&lruvec->lru_lock);
+				continue;
+			}
+
+			if (!folio_test_lru(pos) || folio_lru_list(pos) != LRU_INACTIVE_ANON) {
+				spin_unlock_irq(&lruvec->lru_lock);
+				continue;
+			}
+
+			folio = pos;
+
+			list_for_each_entry_safe_reverse_from(folio, next, src, lru) {
+				pos = next;
+				scan_count++;
+				if (scan_count >= SWAP_SCAN_NUM_MAX)
+					break;
+
+				if (!folio_test_swapcache(folio) || folio_mapped(folio))
+					continue;
+
+				if (move_lru_folios_to_list(lruvec,
+							    folio,
+							    &swapcache_list[nid]) != 0)
+					continue;
+
+				swapcache_total_reclaimable += folio_nr_pages(folio);
+			}
+			spin_unlock_irq(&lruvec->lru_lock);
+
+			if (swapcache_total_reclaimable >= swapcache_to_reclaim)
+				break;
+
+			if (scan_count >= SWAP_SCAN_NUM_MAX)
+				goto reverse_scan_lru;
+
+		} while ((memcg = mem_cgroup_iter(target_memcg, memcg, NULL)));
+	}
+}
+
+static int lru_gen_reclaim_swapcache(struct list_head *swapcache_list,
+				     unsigned long swapcache_to_reclaim)
+{
+	return 0;
+}
+
+int do_swapcache_reclaim(unsigned long *swapcache_watermark,
+			 unsigned int watermark_nr)
+{
+	int nid;
+	unsigned long swapcache_to_reclaim = 0;
+	struct list_head *swapcache_list = NULL, *folio_list = NULL;
+	struct folio *folio = NULL;
+
+	if (swapcache_watermark == NULL ||
+	    watermark_nr < ETMEM_SWAPCACHE_NR_WMARK)
+		return -EINVAL;
+
+	if (lru_gen_enabled())
+		return lru_gen_reclaim_swapcache(swapcache_list,
+						 swapcache_to_reclaim);
+
+	swapcache_to_reclaim = get_swapcache_reclaim_num(swapcache_watermark);
+
+	swapcache_list = kcalloc(MAX_NUMNODES, sizeof(struct list_head), GFP_KERNEL);
+	if (swapcache_list == NULL)
+		return -ENOMEM;
+
+	memcg_reclaim_swapcache(swapcache_list, swapcache_to_reclaim);
+
+	/* Reclaim all the swapcache we have scanned */
+	for_each_node_state(nid, N_MEMORY) {
+		cond_resched();
+		reclaim_folio_list(&swapcache_list[nid], NODE_DATA(nid));
+	}
+
+	/* Put back all the folios that were not reclaimed by shrink_folio_list */
+	for_each_node_state(nid, N_MEMORY) {
+		cond_resched();
+		folio_list = &swapcache_list[nid];
+		while (!list_empty(folio_list)) {
+			folio = lru_to_folio(folio_list);
+			list_del(&folio->lru);
+			folio_putback_lru(folio);
+		}
+	}
+
+	kfree(swapcache_list);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(do_swapcache_reclaim);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 3b670b1d2b6115bcfe4c79f54582b01ced5160e8..fc3d70abc78ed0fda466d162bc0febe1ca6c280a 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2791,7 +2791,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
 			nr_deactivate, nr_rotated, sc->priority, file);
 }
 
-static unsigned int reclaim_folio_list(struct list_head *folio_list,
+unsigned int reclaim_folio_list(struct list_head *folio_list,
 					struct pglist_data *pgdat)
 {
 	struct reclaim_stat dummy_stat;