diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 5ad5cd51d015e33bbc8cf43508aaa3df953bc698..de3afdddcf6a021f96f2c7d0db5121af70f6d3e4 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5543,6 +5543,12 @@
 			growing up) the main stack are reserved for no other
 			mapping. Default value is 256 pages.
 
+	stack_depot_disable= [KNL]
+			Setting this to true through the kernel command line
+			disables the stack depot, thereby saving the static
+			memory consumed by the stack hash table. By default
+			this is set to false.
+
 	stacktrace	[FTRACE]
 			Enabled the stack tracer on boot up.
 
diff --git a/Documentation/vm/page_owner.rst b/Documentation/vm/page_owner.rst
index 2175465c9bf2a6e804a1b2b66d63de0e5cdffe8e..a97f746368e2399846f87a1fa5c6abb2ab9d0d44 100644
--- a/Documentation/vm/page_owner.rst
+++ b/Documentation/vm/page_owner.rst
@@ -64,10 +64,22 @@
 pages are investigated and marked as allocated in initialization phase.
 Although it doesn't mean that they have the right owner information, at
 least, we can tell whether the page is allocated or not, more accurately.
 On 2GB memory x86-64 VM box, 13343 early allocated pages
-are catched and marked, although they are mostly allocated from struct
+are caught and marked, although they are mostly allocated from struct
 page extension feature. Anyway, after that, no page is left in
 un-tracking state.
 
+With the CONFIG_PAGE_OWNER_MODULE_STAT option, page owner can also track
+whether pages are allocated by modules. If a page is allocated by a module,
+the information dumped from /sys/kernel/debug/page_owner will show the
+module name, and users can analyze per-module allocations with the
+user-space helper. /sys/kernel/debug/page_owner_filter can be used to
+filter out the pages that are not allocated by modules. Valid values are
+"module" and "none". The default is "none", which means no page is
+filtered out.
+
+In addition, the top N modules that allocate the most pages are dumped
+when an OOM occurs or when users read
+/sys/kernel/debug/page_owner_module_stats. The N value can be configured
+through /sys/kernel/debug/page_owner_module_show_max. The default N is 20.
+
 Usage
 =====
 
@@ -85,5 +97,118 @@ Usage
 	cat /sys/kernel/debug/page_owner > page_owner_full.txt
 	./page_owner_sort page_owner_full.txt sorted_page_owner.txt
 
+	The general output of ``page_owner_full.txt`` is as follows::
+
+		Page allocated via order XXX, ...
+		PFN XXX ...
+		// Detailed stack
+
+		Page allocated via order XXX, ...
+		PFN XXX ...
+		// Detailed stack
+
+	The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows
+	in buf, uses regexp to extract the page order value, counts the times
+	and pages of buf, and finally sorts them according to the parameter(s).
+	See the result about who allocated each page in the
+	``sorted_page_owner.txt``. General output::
+
+		XXX times, XXX pages:
+		Page allocated via order XXX, ...
+		// Detailed stack
+
+	By default, ``page_owner_sort`` is sorted according to the times of buf.
+	If you want to sort by the page nums of buf, use the ``-m`` parameter.
+	The parameters related to modules require a kernel built with
+	CONFIG_PAGE_OWNER_MODULE_STAT. The detailed parameters are:
+
+	fundamental function::
+
+	Sort:
+		-a		Sort by memory allocation time.
+		-m		Sort by total memory.
+		-p		Sort by pid.
+		-P		Sort by tgid.
+		-n		Sort by task command name.
+		-r		Sort by memory release time.
+		-s		Sort by stack trace.
+		-t		Sort by times (default).
+		--sort <order>	Specify sorting order. Sorting syntax is [+|-]key[,[+|-]key[,...]].
+				Choose a key from the **STANDARD FORMAT SPECIFIERS** section. The "+"
+				is optional since the default direction is increasing numerical or
+				lexicographic order. Mixed use of abbreviated and complete-form of
+				keys is allowed.
+
+		Examples:
+			./page_owner_sort <input> <output> --sort=n,+pid,-tgid
+			./page_owner_sort <input> <output> --sort=at
+
+	additional function::
+
+	Cull:
+		--cull <rules>
+			Specify culling rules. Culling syntax is key[,key[,...]]. Choose a
+			multi-letter key from the **STANDARD FORMAT SPECIFIERS** section.
+
+		<rules> is a single argument in the form of a comma-separated list,
+		which offers a way to specify individual culling rules. The recognized
+		keywords are described in the **STANDARD FORMAT SPECIFIERS** section below.
+		<rules> can be specified by the sequence of keys k1,k2, ..., as described
+		in the **STANDARD FORMAT SPECIFIERS** section below. Mixed use of
+		abbreviated and complete-form of keys is allowed.
+
+		Examples:
+			./page_owner_sort <input> <output> --cull=stacktrace
+			./page_owner_sort <input> <output> --cull=st,pid,name
+			./page_owner_sort <input> <output> --cull=n,f
+
+	Filter:
+		-f		Filter out the information of blocks whose memory has
+				been released.
+		-M		Filter out the information of blocks whose memory isn't
+				allocated by modules.
+
+	Select:
+		--pid <pidlist>		Select by pid. This selects the blocks whose process ID
+					numbers appear in <pidlist>.
+		--tgid <tgidlist>	Select by tgid. This selects the blocks whose thread
+					group ID numbers appear in <tgidlist>.
+		--name <cmdlist>	Select by task command name. This selects the blocks
+					whose task command name appears in <cmdlist>.
+		--module <modlist>	Select by module. This selects the blocks whose memory
+					is allocated by the modules that appear in <modlist>.
+
+	<pidlist>, <tgidlist>, <cmdlist> and <modlist> are single arguments in the
+	form of a comma-separated list, which offers a way to specify individual
+	selecting rules.
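+
+	For instance, a module-focused round trip could look like this (an
+	illustrative sketch; it assumes a kernel built with
+	CONFIG_PAGE_OWNER_MODULE_STAT and booted with "page_owner=on")::
+
+		echo module > /sys/kernel/debug/page_owner_filter
+		cat /sys/kernel/debug/page_owner > page_owner_full.txt
+		./page_owner_sort -M -m page_owner_full.txt sorted_page_owner.txt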
+
+
+		Examples:
+			./page_owner_sort <input> <output> --pid=1
+			./page_owner_sort <input> <output> --tgid=1,2,3
+			./page_owner_sort <input> <output> --name name1,name2
+
+STANDARD FORMAT SPECIFIERS
+==========================
+::
+
+  For --sort option:
+
+	KEY		LONG		DESCRIPTION
+	p		pid		process ID
+	tg		tgid		thread group ID
+	n		name		task command name
+	st		stacktrace	stack trace of the page allocation
+	T		txt		full text of block
+	ft		free_ts		timestamp of the page when it was released
+	at		alloc_ts	timestamp of the page when it was allocated
+	ator		allocator	memory allocator for pages
+	mod		module		the name of the module that the page is allocated by
+
+  For --cull option:
+
+	KEY		LONG		DESCRIPTION
+	p		pid		process ID
+	tg		tgid		thread group ID
+	n		name		task command name
+	f		free		whether the page has been released or not
+	st		stacktrace	stack trace of the page allocation
+	ator		allocator	memory allocator for pages
+	mod		module		the name of the module that the page is allocated by
diff --git a/drivers/gpu/drm/drm_dp_mst_topology.c b/drivers/gpu/drm/drm_dp_mst_topology.c
index 27305f3398819046e257be95ba7d796e1154a6b1..58bff96e43a64b42ad1c3575f4d9937f4f82fdf8 100644
--- a/drivers/gpu/drm/drm_dp_mst_topology.c
+++ b/drivers/gpu/drm/drm_dp_mst_topology.c
@@ -5441,6 +5441,7 @@ int drm_dp_mst_topology_mgr_init(struct drm_dp_mst_topology_mgr *mgr,
 	mutex_init(&mgr->probe_lock);
 #if IS_ENABLED(CONFIG_DRM_DEBUG_DP_MST_TOPOLOGY_REFS)
 	mutex_init(&mgr->topology_ref_history_lock);
+	stack_depot_init();
 #endif
 	INIT_LIST_HEAD(&mgr->tx_msg_downq);
 	INIT_LIST_HEAD(&mgr->destroy_port_list);
diff --git a/drivers/gpu/drm/drm_mm.c b/drivers/gpu/drm/drm_mm.c
index a4a04d2461353924e9a7cb79071a630d0f3de7f1..52faf3369ce2eee6d5dcff8a9b057f398f3fec8a 100644
--- a/drivers/gpu/drm/drm_mm.c
+++ b/drivers/gpu/drm/drm_mm.c
@@ -983,6 +983,10 @@ void drm_mm_init(struct drm_mm *mm, u64 start, u64 size)
 	add_hole(&mm->head_node);
 
 	mm->scan_active = 0;
+
+#ifdef CONFIG_DRM_DEBUG_MM
+	stack_depot_init();
+#endif
 }
 EXPORT_SYMBOL(drm_mm_init);
diff --git a/drivers/gpu/drm/i915/intel_runtime_pm.c b/drivers/gpu/drm/i915/intel_runtime_pm.c
index 8b725efb2254c7d609bb346d34396fd12cd9e4fa..c90210ac5fb7d8754c22395ba6fc2a29fdc5f029 100644
--- a/drivers/gpu/drm/i915/intel_runtime_pm.c
+++ b/drivers/gpu/drm/i915/intel_runtime_pm.c
@@ -78,6 +78,7 @@ static void __print_depot_stack(depot_stack_handle_t stack,
 static void init_intel_runtime_pm_wakeref(struct intel_runtime_pm *rpm)
 {
 	spin_lock_init(&rpm->debug.lock);
+	stack_depot_init();
 }
 
 static noinline depot_stack_handle_t
diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h
index fabb2e1e087f49923cfce32bcc063284161214c1..ed27198cdaf46709423c33eeb8a16451747ec993 100644
--- a/include/linux/page_ext.h
+++ b/include/linux/page_ext.h
@@ -55,7 +55,8 @@ static inline void page_ext_init(void)
 }
 #endif
 
-struct page_ext *lookup_page_ext(const struct page *page);
+extern struct page_ext *page_ext_get(struct page *page);
+extern void page_ext_put(struct page_ext *page_ext);
 
 static inline struct page_ext *page_ext_next(struct page_ext *curr)
 {
@@ -71,11 +72,6 @@ static inline void pgdat_page_ext_init(struct pglist_data *pgdat)
 {
 }
 
-static inline struct page_ext *lookup_page_ext(const struct page *page)
-{
-	return NULL;
-}
-
 static inline void page_ext_init(void)
 {
 }
@@ -87,5 +83,14 @@ static inline void page_ext_init_flatmem_late(void)
 static inline void page_ext_init_flatmem(void)
 {
 }
+
+static inline struct page_ext *page_ext_get(struct page *page)
+{
+	return NULL;
+}
+
+static inline void page_ext_put(struct page_ext *page_ext)
+{
+}
 #endif /*
CONFIG_PAGE_EXTENSION */ #endif /* __LINUX_PAGE_EXT_H */ diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index d8a6aecf99cb9f0fc63bc49126b06fe5936d82e4..83ccdf07d29fbdea31a20b0f22345094c8b11163 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -43,66 +43,80 @@ static inline void clear_page_idle(struct page *page) * If there is not enough space to store Idle and Young bits in page flags, use * page ext flags instead. */ -extern struct page_ext_operations page_idle_ops; static inline bool page_is_young(struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); + bool page_young; if (unlikely(!page_ext)) return false; - return test_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_young = test_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); + + return page_young; } static inline void set_page_young(struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); } static inline bool test_and_clear_page_young(struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); + bool page_young; if (unlikely(!page_ext)) return false; - return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_young = test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); + + return page_young; } static inline bool page_is_idle(struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); + bool page_idle; if (unlikely(!page_ext)) return false; - return test_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_idle = test_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); + + return page_idle; } static inline void set_page_idle(struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); } static inline void clear_page_idle(struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; clear_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); } #endif /* CONFIG_64BIT */ diff --git a/include/linux/sched.h b/include/linux/sched.h index b4ab407cab37968510b38e8741c478b79dff4503..40022e4a48a6e7517048d93696fbdb62bef07021 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -884,6 +884,9 @@ struct task_struct { #ifdef CONFIG_IOMMU_SVA KABI_FILL_HOLE(unsigned pasid_activated:1) #endif +#ifdef CONFIG_PAGE_OWNER + KABI_FILL_HOLE(unsigned in_page_owner:1) +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index 1cbe5ad0577d5fe9ca32c1a665ec7cf2ed861ea8..563e927540351501d07de432cc8a89ee71d3985b 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -13,11 +13,43 @@ typedef u32 depot_stack_handle_t; +/* + * Every user of stack depot has to call stack_depot_init() during its own init + * when it's decided that it will be calling stack_depot_save() later. This is + * recommended for e.g. modules initialized later in the boot process, when + * slab_is_available() is true. 
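+ *
+ * For example, a user that initializes on demand might do (an illustrative
+ * sketch, not tied to any particular subsystem):
+ *
+ *	if (stack_depot_init())
+ *		return -ENOMEM;
+ *	...
+ *	handle = stack_depot_save(entries, nr_entries, GFP_KERNEL);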
+ *
+ * The alternative is to select STACKDEPOT_ALWAYS_INIT to have stack depot
+ * enabled as part of mm_init(), for subsystems where it's known at compile time
+ * that stack depot will be used.
+ *
+ * Another alternative is to call stack_depot_want_early_init(), when the
+ * decision to use stack depot is taken e.g. when evaluating kernel boot
+ * parameters, which precedes the enablement point in mm_init().
+ *
+ * stack_depot_init() and stack_depot_want_early_init() can be called regardless
+ * of CONFIG_STACKDEPOT and are no-op when disabled. The actual save/fetch/print
+ * functions should only be called from code that makes sure CONFIG_STACKDEPOT
+ * is enabled.
+ */
+#ifdef CONFIG_STACKDEPOT
+int stack_depot_init(void);
+void __init stack_depot_want_early_init(void);
+
+/* This is supposed to be called only from mm_init() */
+int __init stack_depot_early_init(void);
+#else
+static inline int stack_depot_init(void) { return 0; }
+
+static inline void stack_depot_want_early_init(void) { }
+
+static inline int stack_depot_early_init(void) { return 0; }
+#endif
+
 depot_stack_handle_t stack_depot_save(unsigned long *entries,
 				      unsigned int nr_entries, gfp_t gfp_flags);
 
 unsigned int stack_depot_fetch(depot_stack_handle_t handle,
 			       unsigned long **entries);
-
 #endif
diff --git a/init/main.c b/init/main.c
index f06fbe79a84af364e6a79320697f6bae155c1a30..b8306e52d046ebbfca58d167a24bf8bd9d952bff 100644
--- a/init/main.c
+++ b/init/main.c
@@ -98,6 +98,7 @@
 #include
 #include
 #include
+#include <linux/stackdepot.h>
 #include
 #include
@@ -829,10 +830,14 @@ static void __init mm_init(void)
 	init_debug_pagealloc();
 	kfence_alloc_pool();
 	report_meminit();
+	stack_depot_early_init();
 	mem_init();
-	/* page_owner must be initialized after buddy is ready */
-	page_ext_init_flatmem_late();
 	kmem_cache_init();
+	/*
+	 * page_owner must be initialized after buddy is ready, and also after
+	 * slab is ready so that stack_depot_init() works properly
+	 */
+	page_ext_init_flatmem_late();
 	kmemleak_init();
 	pgtable_init();
 	debug_objects_mem_init();
diff --git a/lib/Kconfig b/lib/Kconfig
index 8026964596fd72b6a5d4c46ea7f56d00c4f95c21..497304f1647ca7e14547591ecad5da38afbce3ca 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -664,6 +664,10 @@ config STACKDEPOT
 	bool
 	select STACKTRACE
 
+config STACKDEPOT_ALWAYS_INIT
+	bool
+	select STACKDEPOT
+
 config SBITMAP
 	bool
diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan
index 542a9c18398e158ccec96f5adf952bd23737fd16..24d309819b747bed1903aa176bca97cc15c8fb5d 100644
--- a/lib/Kconfig.kasan
+++ b/lib/Kconfig.kasan
@@ -49,7 +49,7 @@ config KASAN_GENERIC
 	depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB)
 	select SLUB_DEBUG if SLUB
 	select CONSTRUCTORS
-	select STACKDEPOT
+	select STACKDEPOT_ALWAYS_INIT
 	help
 	  Enables generic KASAN mode.
@@ -73,7 +73,7 @@ config KASAN_SW_TAGS
 	depends on (SLUB && SYSFS) || (SLAB && !DEBUG_SLAB)
 	select SLUB_DEBUG if SLUB
 	select CONSTRUCTORS
-	select STACKDEPOT
+	select STACKDEPOT_ALWAYS_INIT
 	help
 	  Enables software tag-based KASAN mode.
diff --git a/lib/stackdepot.c b/lib/stackdepot.c
index 3cab9ba618df5acf9f1c1deaf09ae025d9a0bb4d..2f73c861e81c47bbf20bdcf45e13b439de6a5068 100644
--- a/lib/stackdepot.c
+++ b/lib/stackdepot.c
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include <linux/mutex.h>
 #include
 #include
 #include
@@ -30,6 +31,8 @@
 #include
 #include
 #include
+#include <linux/memblock.h>
+extern unsigned long nr_free_buffer_pages(void);
 
 #define DEPOT_STACK_BITS (sizeof(depot_stack_handle_t) * 8)
@@ -64,6 +67,9 @@ struct stack_record {
 	unsigned long entries[1];	/* Variable-sized array of entries.
*/ }; +static bool __stack_depot_want_early_init __initdata = IS_ENABLED(CONFIG_STACKDEPOT_ALWAYS_INIT); +static bool __stack_depot_early_init_passed __initdata; + static void *stack_slabs[STACK_ALLOC_MAX_SLABS]; static int depot_index; @@ -141,14 +147,118 @@ static struct stack_record *depot_alloc_stack(unsigned long *entries, int size, return stack; } -#define STACK_HASH_ORDER 20 -#define STACK_HASH_SIZE (1L << STACK_HASH_ORDER) -#define STACK_HASH_MASK (STACK_HASH_SIZE - 1) +/* one hash table bucket entry per 16kB of memory */ +#define STACK_HASH_SCALE 14 +/* limited between 4k and 1M buckets */ +#define STACK_HASH_ORDER_MIN 12 +#define STACK_HASH_ORDER_MAX 20 #define STACK_HASH_SEED 0x9747b28c -static struct stack_record *stack_table[STACK_HASH_SIZE] = { - [0 ... STACK_HASH_SIZE - 1] = NULL -}; +static unsigned int stack_hash_order; +static unsigned int stack_hash_mask; + +static bool stack_depot_disable; +static struct stack_record **stack_table; + +static int __init is_stack_depot_disabled(char *str) +{ + int ret; + + ret = kstrtobool(str, &stack_depot_disable); + if (!ret && stack_depot_disable) { + pr_info("Stack Depot is disabled\n"); + stack_table = NULL; + } + return 0; +} +early_param("stack_depot_disable", is_stack_depot_disabled); + +void __init stack_depot_want_early_init(void) +{ + /* Too late to request early init now */ + WARN_ON(__stack_depot_early_init_passed); + + __stack_depot_want_early_init = true; +} + +int __init stack_depot_early_init(void) +{ + unsigned long entries = 0; + + /* This is supposed to be called only once, from mm_init() */ + if (WARN_ON(__stack_depot_early_init_passed)) + return 0; + + __stack_depot_early_init_passed = true; + + if (IS_ENABLED(CONFIG_KASAN) && !stack_hash_order) + stack_hash_order = STACK_HASH_ORDER_MAX; + + if (!__stack_depot_want_early_init || stack_depot_disable) + return 0; + + if (stack_hash_order) + entries = 1UL << stack_hash_order; + stack_table = alloc_large_system_hash("stackdepot", + sizeof(struct stack_record *), + entries, + STACK_HASH_SCALE, + HASH_EARLY | HASH_ZERO, + NULL, + &stack_hash_mask, + 1UL << STACK_HASH_ORDER_MIN, + 1UL << STACK_HASH_ORDER_MAX); + + if (!stack_table) { + pr_err("Stack Depot hash table allocation failed, disabling\n"); + stack_depot_disable = true; + return -ENOMEM; + } + + return 0; +} + +int stack_depot_init(void) +{ + static DEFINE_MUTEX(stack_depot_init_mutex); + int ret = 0; + + mutex_lock(&stack_depot_init_mutex); + if (!stack_depot_disable && !stack_table) { + unsigned long entries; + int scale = STACK_HASH_SCALE; + + if (stack_hash_order) { + entries = 1UL << stack_hash_order; + } else { + entries = nr_free_buffer_pages(); + entries = roundup_pow_of_two(entries); + + if (scale > PAGE_SHIFT) + entries >>= (scale - PAGE_SHIFT); + else + entries <<= (PAGE_SHIFT - scale); + } + + if (entries < 1UL << STACK_HASH_ORDER_MIN) + entries = 1UL << STACK_HASH_ORDER_MIN; + if (entries > 1UL << STACK_HASH_ORDER_MAX) + entries = 1UL << STACK_HASH_ORDER_MAX; + + pr_info("Stack Depot allocating hash table of %lu entries with kvcalloc\n", + entries); + stack_table = kvcalloc(entries, sizeof(struct stack_record *), GFP_KERNEL); + if (!stack_table) { + pr_err("Stack Depot hash table allocation failed, disabling\n"); + stack_depot_disable = true; + ret = -ENOMEM; + } + stack_hash_mask = entries - 1; + } + mutex_unlock(&stack_depot_init_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(stack_depot_init); /* Calculate hash for a stack */ static inline u32 hash_stack(unsigned long *entries, unsigned int size) @@ 
-242,11 +352,11 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries,
 	unsigned long flags;
 	u32 hash;
 
-	if (unlikely(nr_entries == 0))
+	if (unlikely(nr_entries == 0) || stack_depot_disable)
 		goto fast_exit;
 
 	hash = hash_stack(entries, nr_entries);
-	bucket = &stack_table[hash & STACK_HASH_MASK];
+	bucket = &stack_table[hash & stack_hash_mask];
 
 	/*
 	 * Fast path: look the stack trace up without locking.
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 864f129f193704ef7b0465e4f076126bb7070da9..154ece4e7fc5c2711094c2042010ff3530a88f1c 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -62,6 +62,15 @@ config PAGE_OWNER
 
 	  If unsure, say N.
 
+config PAGE_OWNER_MODULE_STAT
+	bool "Track module allocation with page owner"
+	depends on PAGE_OWNER && MODULES
+	help
+	  This tracks whether a page was allocated by a module and may help
+	  to find alloc_pages() problems in modules. Even if you include this
+	  feature in your build, it is disabled by default. Pass "page_owner=on"
+	  on the kernel command line to enable it.
+
 config PAGE_POISONING
 	bool "Poison pages after freeing"
 	select PAGE_POISONING_NO_SANITY if HIBERNATION
diff --git a/mm/Makefile b/mm/Makefile
index a014a5e08f7b6a011a27088ed197208f7d4ad442..7194a39e2a900931e293811043ecb2d26dec061b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -101,6 +101,7 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
 obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
 obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
 obj-$(CONFIG_PAGE_OWNER) += page_owner.o
+obj-$(CONFIG_PAGE_OWNER_MODULE_STAT) += page_owner_module.o
 obj-$(CONFIG_CLEANCACHE) += cleancache.o
 obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
 obj-$(CONFIG_ZPOOL) += zpool.o
diff --git a/mm/page_ext.c b/mm/page_ext.c
index 8e59da0f4367ac6b7b3c6e96d0484139ea9efcc1..d521c93b3ed67a90a93600a4466b0ff19a811423 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include <linux/rcupdate.h>
 
 /*
  * struct page extension
@@ -58,17 +59,21 @@
  * can utilize this callback to initialize the state of it correctly.
  */
 
+#ifdef CONFIG_SPARSEMEM
+#define PAGE_EXT_INVALID (0x1)
+#endif
+
 #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
 static bool need_page_idle(void)
 {
 	return true;
 }
-struct page_ext_operations page_idle_ops = {
+static struct page_ext_operations page_idle_ops __initdata = {
 	.need = need_page_idle,
 };
 #endif
 
-static struct page_ext_operations *page_ext_ops[] = {
+static struct page_ext_operations *page_ext_ops[] __initdata = {
 #ifdef CONFIG_PAGE_OWNER
 	&page_owner_ops,
 #endif
@@ -80,6 +85,7 @@
 unsigned long page_ext_size = sizeof(struct page_ext);
 
 static unsigned long total_usage;
+static struct page_ext *lookup_page_ext(const struct page *page);
 
 static bool __init invoke_need_callbacks(void)
 {
@@ -121,6 +127,48 @@ static inline struct page_ext *get_entry(void *base, unsigned long index)
 	return base + page_ext_size * index;
 }
 
+/**
+ * page_ext_get() - Get the extended information for a page.
+ * @page: The page we're interested in.
+ *
+ * Ensures that the page_ext will remain valid until page_ext_put()
+ * is called.
+ *
+ * Return: NULL if no page_ext exists for this page.
+ * Context: Any context. Caller may not sleep until they have called
+ * page_ext_put().
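+ *
+ * A typical caller pattern (an illustrative sketch):
+ *
+ *	page_ext = page_ext_get(page);
+ *	if (page_ext) {
+ *		...read or update the extension data...
+ *		page_ext_put(page_ext);
+ *	}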
+ */
+struct page_ext *page_ext_get(struct page *page)
+{
+	struct page_ext *page_ext;
+
+	rcu_read_lock();
+	page_ext = lookup_page_ext(page);
+	if (!page_ext) {
+		rcu_read_unlock();
+		return NULL;
+	}
+
+	return page_ext;
+}
+
+/**
+ * page_ext_put() - Working with page extended information is done.
+ * @page_ext: Page extended information received from page_ext_get().
+ *
+ * The page extended information of the page may not be valid after this
+ * function is called.
+ *
+ * Return: None.
+ * Context: Any context in which the matching page_ext_get() was called.
+ */
+void page_ext_put(struct page_ext *page_ext)
+{
+	if (unlikely(!page_ext))
+		return;
+
+	rcu_read_unlock();
+}
 #ifndef CONFIG_SPARSEMEM
 
@@ -129,12 +177,13 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
 	pgdat->node_page_ext = NULL;
 }
 
-struct page_ext *lookup_page_ext(const struct page *page)
+static struct page_ext *lookup_page_ext(const struct page *page)
 {
 	unsigned long pfn = page_to_pfn(page);
 	unsigned long index;
 	struct page_ext *base;
 
+	WARN_ON_ONCE(!rcu_read_lock_held());
 	base = NODE_DATA(page_to_nid(page))->node_page_ext;
 	/*
 	 * The sanity checks the page allocator does upon freeing a
@@ -203,19 +252,27 @@ void __init page_ext_init_flatmem(void)
 
 #else /* CONFIG_FLAT_NODE_MEM_MAP */
 
-struct page_ext *lookup_page_ext(const struct page *page)
+static bool page_ext_invalid(struct page_ext *page_ext)
+{
+	return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID);
+}
+
+static struct page_ext *lookup_page_ext(const struct page *page)
 {
 	unsigned long pfn = page_to_pfn(page);
 	struct mem_section *section = __pfn_to_section(pfn);
+	struct page_ext *page_ext = READ_ONCE(section->page_ext);
+
+	WARN_ON_ONCE(!rcu_read_lock_held());
 	/*
 	 * The sanity checks the page allocator does upon freeing a
 	 * page can reach here before the page_ext arrays are
 	 * allocated when feeding a range of pages to the allocator
 	 * for the first time during bootup or memory hotplug.
 	 */
-	if (!section->page_ext)
+	if (page_ext_invalid(page_ext))
 		return NULL;
-	return get_entry(section->page_ext, pfn);
+	return get_entry(page_ext, pfn);
 }
 
 static void *__meminit alloc_page_ext(size_t size, int nid)
@@ -294,9 +351,30 @@ static void __free_page_ext(unsigned long pfn)
 	ms = __pfn_to_section(pfn);
 	if (!ms || !ms->page_ext)
 		return;
-	base = get_entry(ms->page_ext, pfn);
+
+	base = READ_ONCE(ms->page_ext);
+	/*
+	 * The page_ext here can still be valid while doing the roll-back
+	 * operation in online_page_ext().
+	 */
+	if (page_ext_invalid(base))
+		base = (void *)base - PAGE_EXT_INVALID;
+	WRITE_ONCE(ms->page_ext, NULL);
+
+	base = get_entry(base, pfn);
 	free_page_ext(base);
-	ms->page_ext = NULL;
+}
+
+static void __invalidate_page_ext(unsigned long pfn)
+{
+	struct mem_section *ms;
+	void *val;
+
+	ms = __pfn_to_section(pfn);
+	if (!ms || !ms->page_ext)
+		return;
+	val = (void *)ms->page_ext + PAGE_EXT_INVALID;
+	WRITE_ONCE(ms->page_ext, val);
 }
 
 static int __meminit online_page_ext(unsigned long start_pfn,
@@ -339,6 +417,20 @@ static int __meminit offline_page_ext(unsigned long start_pfn,
 	start = SECTION_ALIGN_DOWN(start_pfn);
 	end = SECTION_ALIGN_UP(start_pfn + nr_pages);
 
+	/*
+	 * Freeing of page_ext is done in 3 steps to avoid
+	 * use-after-free of it:
+	 * 1) Traverse all the sections and mark their page_ext
+	 *    as invalid.
+	 * 2) Wait for all the existing users of page_ext who
+	 *    started before invalidation to finish.
+	 * 3) Free the page_ext.
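+	 *
+	 * The read side is page_ext_get()/page_ext_put(), which wraps the
+	 * lookup in rcu_read_lock()/rcu_read_unlock(), so the
+	 * synchronize_rcu() in step 2 guarantees that no user who saw a
+	 * valid page_ext before step 1 can still be dereferencing it by
+	 * step 3.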
+ */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __invalidate_page_ext(pfn); + + synchronize_rcu(); + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); return 0; diff --git a/mm/page_owner.c b/mm/page_owner.c index 5b93fc85dc739fb85a863208641b435b562e07b1..ed948099c7fda687b039d0ce54b2dc0bd7c4de10 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -10,8 +10,10 @@ #include #include #include +#include #include +#include "page_owner.h" #include "internal.h" /* @@ -20,17 +22,6 @@ */ #define PAGE_OWNER_STACK_DEPTH (16) -struct page_owner { - unsigned short order; - short last_migrate_reason; - gfp_t gfp_mask; - depot_stack_handle_t handle; - depot_stack_handle_t free_handle; - u64 ts_nsec; - u64 free_ts_nsec; - pid_t pid; -}; - static bool page_owner_enabled = false; DEFINE_STATIC_KEY_FALSE(page_owner_inited); @@ -48,11 +39,14 @@ static int __init early_page_owner_param(char *buf) if (strcmp(buf, "on") == 0) page_owner_enabled = true; + if (page_owner_enabled) + stack_depot_want_early_init(); + return 0; } early_param("page_owner", early_page_owner_param); -static bool need_page_owner(void) +static __init bool need_page_owner(void) { return page_owner_enabled; } @@ -81,7 +75,7 @@ static noinline void register_early_stack(void) early_handle = create_dummy_stack(); } -static void init_page_owner(void) +static __init void init_page_owner(void) { if (!page_owner_enabled) return; @@ -104,42 +98,30 @@ static inline struct page_owner *get_page_owner(struct page_ext *page_ext) return (void *)page_ext + page_owner_ops.offset; } -static inline bool check_recursive_alloc(unsigned long *entries, - unsigned int nr_entries, - unsigned long ip) -{ - unsigned int i; - - for (i = 0; i < nr_entries; i++) { - if (entries[i] == ip) - return true; - } - return false; -} - static noinline depot_stack_handle_t save_stack(gfp_t flags) { unsigned long entries[PAGE_OWNER_STACK_DEPTH]; depot_stack_handle_t handle; unsigned int nr_entries; - nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); - /* - * We need to check recursion here because our request to - * stackdepot could trigger memory allocation to save new - * entry. New memory allocation would reach here and call - * stack_depot_save_entries() again if we don't catch it. There is - * still not enough memory in stackdepot so it would try to - * allocate memory again and loop forever. + * Avoid recursion. 
+ * + * Sometimes page metadata allocation tracking requires more + * memory to be allocated: + * - when new stack trace is saved to stack depot + * - when backtrace itself is calculated (ia64) */ - if (check_recursive_alloc(entries, nr_entries, _RET_IP_)) + if (current->in_page_owner) return dummy_handle; + current->in_page_owner = 1; + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2); handle = stack_depot_save(entries, nr_entries, flags); if (!handle) handle = failure_handle; + current->in_page_owner = 0; return handle; } @@ -150,10 +132,12 @@ void __reset_page_owner(struct page *page, unsigned int order) depot_stack_handle_t handle = 0; struct page_owner *page_owner; u64 free_ts_nsec = local_clock(); + char mod_name[MODULE_NAME_LEN] = {0}; handle = save_stack(GFP_NOWAIT | __GFP_NOWARN); + po_find_module_name_with_update(handle, mod_name, MODULE_NAME_LEN, -(1 << order)); - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; for (i = 0; i < (1 << order); i++) { @@ -161,8 +145,10 @@ void __reset_page_owner(struct page *page, unsigned int order) page_owner = get_page_owner(page_ext); page_owner->free_handle = handle; page_owner->free_ts_nsec = free_ts_nsec; + po_set_module_name(page_owner, mod_name); page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } static inline void __set_page_owner_handle(struct page *page, @@ -171,6 +157,10 @@ static inline void __set_page_owner_handle(struct page *page, { struct page_owner *page_owner; int i; + u64 ts_nsec = local_clock(); + char mod_name[MODULE_NAME_LEN] = {0}; + + po_find_module_name_with_update(handle, mod_name, MODULE_NAME_LEN, 1 << order); for (i = 0; i < (1 << order); i++) { page_owner = get_page_owner(page_ext); @@ -179,7 +169,11 @@ static inline void __set_page_owner_handle(struct page *page, page_owner->gfp_mask = gfp_mask; page_owner->last_migrate_reason = -1; page_owner->pid = current->pid; - page_owner->ts_nsec = local_clock(); + page_owner->tgid = current->tgid; + page_owner->ts_nsec = ts_nsec; + strscpy(page_owner->comm, current->comm, + sizeof(page_owner->comm)); + po_set_module_name(page_owner, mod_name); __set_bit(PAGE_EXT_OWNER, &page_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags); @@ -190,19 +184,22 @@ static inline void __set_page_owner_handle(struct page *page, noinline void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext; depot_stack_handle_t handle; + handle = save_stack(gfp_mask); + + page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; - handle = save_stack(gfp_mask); __set_page_owner_handle(page, page_ext, handle, order, gfp_mask); + page_ext_put(page_ext); } void __set_page_owner_migrate_reason(struct page *page, int reason) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) @@ -210,12 +207,13 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_owner = get_page_owner(page_ext); page_owner->last_migrate_reason = reason; + page_ext_put(page_ext); } void __split_page_owner(struct page *page, unsigned int nr) { int i; - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) @@ -226,17 +224,25 @@ void __split_page_owner(struct page *page, unsigned int nr) page_owner->order = 0; page_ext 
= page_ext_next(page_ext); } + page_ext_put(page_ext); } void __copy_page_owner(struct page *oldpage, struct page *newpage) { - struct page_ext *old_ext = lookup_page_ext(oldpage); - struct page_ext *new_ext = lookup_page_ext(newpage); + struct page_ext *old_ext; + struct page_ext *new_ext; struct page_owner *old_page_owner, *new_page_owner; - if (unlikely(!old_ext || !new_ext)) + old_ext = page_ext_get(oldpage); + if (unlikely(!old_ext)) return; + new_ext = page_ext_get(newpage); + if (unlikely(!new_ext)) { + page_ext_put(old_ext); + return; + } + old_page_owner = get_page_owner(old_ext); new_page_owner = get_page_owner(new_ext); new_page_owner->order = old_page_owner->order; @@ -245,8 +251,11 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) old_page_owner->last_migrate_reason; new_page_owner->handle = old_page_owner->handle; new_page_owner->pid = old_page_owner->pid; + new_page_owner->tgid = old_page_owner->tgid; new_page_owner->ts_nsec = old_page_owner->ts_nsec; new_page_owner->free_ts_nsec = old_page_owner->ts_nsec; + strcpy(new_page_owner->comm, old_page_owner->comm); + po_copy_module_name(new_page_owner, old_page_owner); /* * We don't clear the bit on the oldpage as it's going to be freed @@ -259,6 +268,8 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) */ __set_bit(PAGE_EXT_OWNER, &new_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags); + page_ext_put(new_ext); + page_ext_put(old_ext); } void pagetypeinfo_showmixedcount_print(struct seq_file *m, @@ -315,12 +326,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, if (PageReserved(page)) continue; - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) - continue; + goto ext_put_continue; page_owner = get_page_owner(page_ext); page_mt = gfp_migratetype(page_owner->gfp_mask); @@ -331,9 +342,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, count[pageblock_mt]++; pfn = block_end_pfn; + page_ext_put(page_ext); break; } pfn += (1UL << page_owner->order) - 1; +ext_put_continue: + page_ext_put(page_ext); } } @@ -344,6 +358,45 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, seq_putc(m, '\n'); } +/* + * Looking for memcg information and print it out + */ +static inline int print_page_owner_memcg(char *kbuf, size_t count, int ret, + struct page *page) +{ +#ifdef CONFIG_MEMCG + unsigned long memcg_data; + struct mem_cgroup *memcg; + bool online; + char name[80]; + + rcu_read_lock(); + memcg_data = READ_ONCE(page->memcg_data); + if (!memcg_data) + goto out_unlock; + + if (memcg_data & MEMCG_DATA_OBJCGS) + ret += scnprintf(kbuf + ret, count - ret, + "Slab cache page\n"); + + memcg = page_memcg_check(page); + if (!memcg) + goto out_unlock; + + online = (memcg->css.flags & CSS_ONLINE); + cgroup_name(memcg->css.cgroup, name, sizeof(name)); + ret += scnprintf(kbuf + ret, count - ret, + "Charged %sto %smemcg %s\n", + PageMemcgKmem(page) ? "(via objcg) " : "", + online ? 
"" : "offline ", + name); +out_unlock: + rcu_read_unlock(); +#endif /* CONFIG_MEMCG */ + + return ret; +} + static ssize_t print_page_owner(char __user *buf, size_t count, unsigned long pfn, struct page *page, struct page_owner *page_owner, @@ -359,19 +412,17 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, if (!kbuf) return -ENOMEM; - ret = snprintf(kbuf, count, - "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns, free_ts %llu ns\n", + ret = scnprintf(kbuf, count, + "Page allocated via order %u, mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu ns, free_ts %llu ns\n", page_owner->order, page_owner->gfp_mask, &page_owner->gfp_mask, page_owner->pid, + page_owner->tgid, page_owner->comm, page_owner->ts_nsec, page_owner->free_ts_nsec); - if (ret >= count) - goto err; - /* Print information relevant to grouping pages by mobility */ pageblock_mt = get_pageblock_migratetype(page); page_mt = gfp_migratetype(page_owner->gfp_mask); - ret += snprintf(kbuf + ret, count - ret, + ret += scnprintf(kbuf + ret, count - ret, "PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n", pfn, migratetype_names[page_mt], @@ -379,8 +430,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, migratetype_names[pageblock_mt], page->flags, &page->flags); - if (ret >= count) - goto err; + ret += po_module_name_snprint(page_owner, kbuf + ret, count - ret); nr_entries = stack_depot_fetch(handle, &entries); ret += stack_trace_snprint(kbuf + ret, count - ret, entries, nr_entries, 0); @@ -388,13 +438,13 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, goto err; if (page_owner->last_migrate_reason != -1) { - ret += snprintf(kbuf + ret, count - ret, + ret += scnprintf(kbuf + ret, count - ret, "Page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); - if (ret >= count) - goto err; } + ret = print_page_owner_memcg(kbuf, count, ret, page); + ret += snprintf(kbuf + ret, count - ret, "\n"); if (ret >= count) goto err; @@ -412,7 +462,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn, void __dump_page_owner(struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; depot_stack_handle_t handle; unsigned long *entries; @@ -431,6 +481,7 @@ void __dump_page_owner(struct page *page) if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not present (never set?)\n"); + page_ext_put(page_ext); return; } @@ -439,9 +490,10 @@ void __dump_page_owner(struct page *page) else pr_alert("page_owner tracks the page as freed\n"); - pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu, free_ts %llu\n", + pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, tgid %d (%s), ts %llu, free_ts %llu\n", page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask, - page_owner->pid, page_owner->ts_nsec, page_owner->free_ts_nsec); + page_owner->pid, page_owner->tgid, page_owner->comm, + page_owner->ts_nsec, page_owner->free_ts_nsec); handle = READ_ONCE(page_owner->handle); if (!handle) { @@ -463,6 +515,7 @@ void __dump_page_owner(struct page *page) if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); + page_ext_put(page_ext); } static ssize_t @@ -484,10 +537,16 @@ read_page_owner(struct file *file, char __user 
*buf, size_t count, loff_t *ppos) while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) pfn++; - drain_all_pages(NULL); - /* Find an allocated page */ for (; pfn < max_pfn; pfn++) { + /* + * This temporary page_owner is required so + * that we can avoid the context switches while holding + * the rcu lock and copying the page owner information to + * user through copy_to_user() or GFP_KERNEL allocations. + */ + struct page_owner page_owner_tmp; + /* * If the new page is in a new MAX_ORDER_NR_PAGES area, * validate the area as existing, skip it if not @@ -510,7 +569,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) continue; } - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; @@ -519,14 +578,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) * because we don't hold the zone lock. */ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) - continue; + goto ext_put_continue; /* * Although we do have the info about past allocation of free * pages, it's not relevant for current memory usage. */ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) - continue; + goto ext_put_continue; page_owner = get_page_owner(page_ext); @@ -535,7 +594,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) * would inflate the stats. */ if (!IS_ALIGNED(pfn, 1 << page_owner->order)) - continue; + goto ext_put_continue; /* * Access to page_ext->handle isn't synchronous so we should @@ -543,13 +602,20 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) */ handle = READ_ONCE(page_owner->handle); if (!handle) - continue; + goto ext_put_continue; + + if (po_is_filtered(page_owner)) + goto ext_put_continue; /* Record the next PFN to read in the file offset */ *ppos = (pfn - min_low_pfn) + 1; + page_owner_tmp = *page_owner; + page_ext_put(page_ext); return print_page_owner(buf, count, pfn, page, - page_owner, handle); + &page_owner_tmp, handle); +ext_put_continue: + page_ext_put(page_ext); } return 0; @@ -607,18 +673,20 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) if (PageReserved(page)) continue; - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; /* Maybe overlapping zone */ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) - continue; + goto ext_put_continue; /* Found early allocated page */ __set_page_owner_handle(page, page_ext, early_handle, 0, 0); count++; +ext_put_continue: + page_ext_put(page_ext); } cond_resched(); } @@ -662,6 +730,7 @@ static int __init pageowner_init(void) debugfs_create_file("page_owner", 0400, NULL, NULL, &proc_page_owner_operations); + po_module_stat_init(); return 0; } late_initcall(pageowner_init) diff --git a/mm/page_owner.h b/mm/page_owner.h new file mode 100644 index 0000000000000000000000000000000000000000..a8517d38d3de7860b5ee70f8c49545a8f38a089f --- /dev/null +++ b/mm/page_owner.h @@ -0,0 +1,71 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved. 
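+ *
+ * Internal definitions shared between page_owner.c and the optional
+ * module-tracking code in page_owner_module.c.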
+ */
+
+#ifndef __MM_PAGE_OWNER_H
+#define __MM_PAGE_OWNER_H
+
+#include
+
+struct page_owner {
+	unsigned short order;
+	short last_migrate_reason;
+	gfp_t gfp_mask;
+	depot_stack_handle_t handle;
+	depot_stack_handle_t free_handle;
+	u64 ts_nsec;
+	u64 free_ts_nsec;
+	char comm[TASK_COMM_LEN];
+	pid_t pid;
+	pid_t tgid;
+#ifdef CONFIG_PAGE_OWNER_MODULE_STAT
+	char module_name[MODULE_NAME_LEN];
+#endif
+};
+
+#ifdef CONFIG_PAGE_OWNER_MODULE_STAT
+void po_find_module_name_with_update(depot_stack_handle_t handle, char *mod_name,
+				     size_t size, long nr_pages);
+void po_set_module_name(struct page_owner *page_owner, char *mod_name);
+int po_module_name_snprint(struct page_owner *page_owner, char *kbuf, size_t size);
+void po_module_stat_init(void);
+bool po_is_filtered(struct page_owner *page_owner);
+
+static inline void po_copy_module_name(struct page_owner *dst,
+				       struct page_owner *src)
+{
+	po_set_module_name(dst, src->module_name);
+}
+
+#else
+static inline void po_find_module_name_with_update(depot_stack_handle_t handle,
+						   char *mod_name, size_t size,
+						   long nr_pages)
+{
+}
+
+static inline void po_set_module_name(struct page_owner *page_owner, char *mod_name)
+{
+}
+
+static inline int po_module_name_snprint(struct page_owner *page_owner,
+					 char *kbuf, size_t size)
+{
+	return 0;
+}
+
+static inline void po_copy_module_name(struct page_owner *dst, struct page_owner *src)
+{
+}
+
+static inline void po_module_stat_init(void)
+{
+}
+
+static inline bool po_is_filtered(struct page_owner *page_owner)
+{
+	return false;
+}
+#endif
+
+#endif
diff --git a/mm/page_owner_module.c b/mm/page_owner_module.c
new file mode 100644
index 0000000000000000000000000000000000000000..cb878bd3ddcb626dbee4adc32894644160810aaa
--- /dev/null
+++ b/mm/page_owner_module.c
@@ -0,0 +1,402 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (C) Huawei Technologies Co., Ltd. 2023. All rights reserved.
+ * + * page_owner_module core file + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "page_owner.h" + +#define PAGE_OWNER_FILTER_BUF_SIZE 16 +#define PAGE_OWNER_NONE_FILTER 0 +#define PAGE_OWNER_MODULE_FILTER 1 + +#define PO_MODULE_DEFAULT_TOPN 20 + +static unsigned int page_owner_filter = PAGE_OWNER_NONE_FILTER; + +struct po_module { + struct list_head list; + struct module *mod; + long nr_pages_used; +}; + +struct leaked_po_module { + struct list_head list; + char module_name[MODULE_NAME_LEN]; + long nr_pages_used; + u64 unload_ns; +}; + +LIST_HEAD(po_module_list); +LIST_HEAD(leaked_po_module_list); +DEFINE_SPINLOCK(po_module_list_lock); + +static unsigned int po_module_topn = PO_MODULE_DEFAULT_TOPN; + +static int po_module_cmp(void *priv, const struct list_head *h1, + const struct list_head *h2) +{ + struct po_module *lhs, *rhs; + + lhs = container_of(h1, struct po_module, list); + rhs = container_of(h2, struct po_module, list); + + return lhs->nr_pages_used < rhs->nr_pages_used; +} + +static inline struct po_module *po_find_module(const struct module *mod) +{ + struct po_module *po_mod; + + lockdep_assert_held(&po_module_list_lock); + list_for_each_entry(po_mod, &po_module_list, list) { + if (po_mod->mod == mod) + return po_mod; + } + + pr_warn("page_owner_module: failed to find module %s in po_module list\n", + mod->name); + return NULL; +} + +void po_update_module_pages(const struct module *mod, long nr_pages) +{ + struct po_module *po_mod; + unsigned long flags; + + if (unlikely(!mod)) + return; + + spin_lock_irqsave(&po_module_list_lock, flags); + po_mod = po_find_module(mod); + if (po_mod) + po_mod->nr_pages_used += nr_pages; + spin_unlock_irqrestore(&po_module_list_lock, flags); +} + + +void po_find_module_name_with_update(depot_stack_handle_t handle, char *mod_name, + size_t size, long nr_pages) +{ + int i; + struct module *mod = NULL; + unsigned long *entries; + unsigned int nr_entries; + + if (unlikely(!mod_name)) + return; + + nr_entries = stack_depot_fetch(handle, &entries); + if (!in_task()) + nr_entries = filter_irq_stacks(entries, nr_entries); + for (i = 0; i < nr_entries; i++) { + if (core_kernel_text(entries[i])) + continue; + + preempt_disable(); + mod = __module_address(entries[i]); + preempt_enable(); + + if (!mod) + continue; + + strscpy(mod_name, mod->name, size); + po_update_module_pages(mod, nr_pages); + return; + } +} + +void po_set_module_name(struct page_owner *page_owner, char *mod_name) +{ + if (unlikely(!page_owner || !mod_name)) + return; + + if (strlen(mod_name) != 0) + strscpy(page_owner->module_name, mod_name, MODULE_NAME_LEN); + else + memset(page_owner->module_name, 0, MODULE_NAME_LEN); +} + +static inline bool po_is_module(struct page_owner *page_owner) +{ + return strlen(page_owner->module_name) != 0; +} + +int po_module_name_snprint(struct page_owner *page_owner, + char *kbuf, size_t size) +{ + if (unlikely(!page_owner || !kbuf)) + return 0; + + if (po_is_module(page_owner)) + return scnprintf(kbuf, size, "Page allocated by module %s\n", + page_owner->module_name); + + return 0; +} + +static ssize_t read_page_owner_filter(struct file *file, + char __user *user_buf, size_t count, loff_t *ppos) +{ + char kbuf[PAGE_OWNER_FILTER_BUF_SIZE]; + int kcount; + + if (page_owner_filter & PAGE_OWNER_MODULE_FILTER) + kcount = snprintf(kbuf, sizeof(kbuf), "module\n"); + else + kcount = snprintf(kbuf, sizeof(kbuf), "none\n"); + + return simple_read_from_buffer(user_buf, count, ppos, kbuf, kcount); +} + +static ssize_t 
write_page_owner_filter(struct file *file,
+		const char __user *user_buf, size_t count, loff_t *ppos)
+{
+	char kbuf[PAGE_OWNER_FILTER_BUF_SIZE];
+	char *p_kbuf;
+	size_t kbuf_size;
+
+	kbuf_size = min(count, sizeof(kbuf) - 1);
+	if (copy_from_user(kbuf, user_buf, kbuf_size))
+		return -EFAULT;
+
+	kbuf[kbuf_size] = '\0';
+	p_kbuf = strstrip(kbuf);
+
+	if (!strcmp(p_kbuf, "module"))
+		page_owner_filter = PAGE_OWNER_MODULE_FILTER;
+	else if (!strcmp(p_kbuf, "none"))
+		page_owner_filter = PAGE_OWNER_NONE_FILTER;
+	else
+		return -EINVAL;
+
+	return count;
+}
+
+static const struct file_operations page_owner_filter_ops = {
+	.read = read_page_owner_filter,
+	.write = write_page_owner_filter,
+	.llseek = default_llseek,
+};
+
+bool po_is_filtered(struct page_owner *page_owner)
+{
+	if (unlikely(!page_owner))
+		return false;
+
+	if (page_owner_filter & PAGE_OWNER_MODULE_FILTER &&
+	    !po_is_module(page_owner))
+		return true;
+
+	return false;
+}
+
+static int po_module_coming(struct module *mod)
+{
+	struct po_module *po_mod;
+	unsigned long flags;
+
+	po_mod = kmalloc(sizeof(*po_mod), GFP_KERNEL);
+	if (!po_mod)
+		return -ENOMEM;
+
+	po_mod->nr_pages_used = 0;
+	po_mod->mod = mod;
+	INIT_LIST_HEAD(&po_mod->list);
+	spin_lock_irqsave(&po_module_list_lock, flags);
+	list_add_tail(&po_mod->list, &po_module_list);
+	spin_unlock_irqrestore(&po_module_list_lock, flags);
+
+	return 0;
+}
+
+static void create_leaked_node(struct po_module *po_mod)
+{
+	struct leaked_po_module *leaked_po_mod;
+	unsigned long flags;
+
+	leaked_po_mod = kmalloc(sizeof(struct leaked_po_module), GFP_KERNEL);
+	if (!leaked_po_mod)
+		return;
+
+	leaked_po_mod->unload_ns = local_clock();
+	strscpy(leaked_po_mod->module_name, po_mod->mod->name, MODULE_NAME_LEN);
+	leaked_po_mod->nr_pages_used = po_mod->nr_pages_used;
+	INIT_LIST_HEAD(&leaked_po_mod->list);
+	spin_lock_irqsave(&po_module_list_lock, flags);
+	list_add_tail(&leaked_po_mod->list, &leaked_po_module_list);
+	spin_unlock_irqrestore(&po_module_list_lock, flags);
+}
+
+static void po_module_going(struct module *mod)
+{
+	struct po_module *po_mod;
+	unsigned long flags;
+
+	spin_lock_irqsave(&po_module_list_lock, flags);
+	po_mod = po_find_module(mod);
+	if (!po_mod) {
+		/* po_find_module() has already warned about this. */
+		spin_unlock_irqrestore(&po_module_list_lock, flags);
+		return;
+	}
+	list_del(&po_mod->list);
+	spin_unlock_irqrestore(&po_module_list_lock, flags);
+
+	if (unlikely(po_mod->nr_pages_used))
+		create_leaked_node(po_mod);
+
+	kfree(po_mod);
+}
+
+static int po_module_notify(struct notifier_block *self,
+			    unsigned long val, void *data)
+{
+	struct module *mod = data;
+	int ret = 0;
+
+	switch (val) {
+	case MODULE_STATE_COMING:
+		ret = po_module_coming(mod);
+		break;
+	case MODULE_STATE_GOING:
+		po_module_going(mod);
+		break;
+	}
+
+	return notifier_from_errno(ret);
+}
+
+static struct notifier_block po_module_nb = {
+	.notifier_call = po_module_notify,
+	.priority = 0
+};
+
+static void print_list(unsigned int nr, struct seq_file *m)
+{
+	struct po_module *po_mod;
+
+	lockdep_assert_held(&po_module_list_lock);
+
+	if (list_empty(&po_module_list))
+		return;
+
+	list_sort(NULL, &po_module_list, po_module_cmp);
+	list_for_each_entry(po_mod, &po_module_list, list) {
+		if (m)
+			seq_printf(m, "%s %ld\n", po_mod->mod->name,
+				   po_mod->nr_pages_used);
+		else
+			pr_info("\tModule %s allocated %ld pages\n",
+				po_mod->mod->name, po_mod->nr_pages_used);
+		--nr;
+		if (!nr)
+			break;
+	}
+}
+
+static void print_leaked_list(struct seq_file *m)
+{
+	struct leaked_po_module *leaked_po_mod;
+
+	lockdep_assert_held(&po_module_list_lock);
+
+	if (list_empty(&leaked_po_module_list))
+		return;
+
+	
list_for_each_entry(leaked_po_mod, &leaked_po_module_list, list) { + if (m) + seq_printf(m, "[unloaded %llu]%s %ld\n", leaked_po_mod->unload_ns, + leaked_po_mod->module_name, leaked_po_mod->nr_pages_used); + else + pr_info("\t[unloaded %llu]Module %s allocated %ld pages\n", + leaked_po_mod->unload_ns, leaked_po_mod->module_name, + leaked_po_mod->nr_pages_used); + } +} + +static int po_oom_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + unsigned long flags; + unsigned int nr = po_module_topn; + int ret = notifier_from_errno(0); + + if (!nr) + return ret; + + spin_lock_irqsave(&po_module_list_lock, flags); + pr_info("Top modules allocating pages:\n"); + + print_list(nr, NULL); + print_leaked_list(NULL); + + spin_unlock_irqrestore(&po_module_list_lock, flags); + + return ret; +} + +static struct notifier_block po_oom_nb = { + .notifier_call = po_oom_notify, + .priority = 0 +}; + +static int po_module_topn_set(void *data, u64 val) +{ + po_module_topn = val; + return 0; +} + +static int po_module_topn_get(void *data, u64 *val) +{ + *val = po_module_topn; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(po_module_topn_fops, po_module_topn_get, + po_module_topn_set, "%llu\n"); + +static int page_owner_module_stats_show(struct seq_file *m, void *v) +{ + unsigned long flags; + unsigned int nr = po_module_topn; + + if (!nr) + return 0; + + spin_lock_irqsave(&po_module_list_lock, flags); + + print_list(nr, m); + print_leaked_list(m); + + spin_unlock_irqrestore(&po_module_list_lock, flags); + return 0; +} +DEFINE_SHOW_ATTRIBUTE(page_owner_module_stats); + + +void po_module_stat_init(void) +{ + int ret; + + debugfs_create_file("page_owner_filter", 0600, NULL, NULL, + &page_owner_filter_ops); + + ret = register_module_notifier(&po_module_nb); + if (ret) { + pr_warn("Failed to register page owner module enter notifier\n"); + return; + } + + ret = register_oom_notifier(&po_oom_nb); + if (ret) + pr_warn("Failed to register page owner oom notifier\n"); + + debugfs_create_file("page_owner_module_show_max", 0600, NULL, NULL, &po_module_topn_fops); + debugfs_create_file("page_owner_module_stats", 0400, NULL, NULL, + &page_owner_module_stats_fops); +} diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index 85eb65ea16d30b476845bb135d62e0e64c62ba21..b4936e78d7d5fb35a1e8924a3b4b41cc335b13df 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -5,6 +5,8 @@ * Example use: * cat /sys/kernel/debug/page_owner > page_owner_full.txt * ./page_owner_sort page_owner_full.txt sorted_page_owner.txt + * Or sort by total memory: + * ./page_owner_sort -m page_owner_full.txt sorted_page_owner.txt * * See Documentation/vm/page_owner.rst */ @@ -16,29 +18,113 @@ #include #include #include +#include +#include +#include +#include + +#define bool int +#define true 1 +#define false 0 +#define TASK_COMM_LEN 16 +#define MODULE_NAME_LEN (64 - sizeof(unsigned long)) struct block_list { char *txt; + char *comm; // task command name + char *stacktrace; + __u64 ts_nsec; + __u64 free_ts_nsec; int len; int num; + int page_num; + pid_t pid; + pid_t tgid; + int allocator; + char *module; }; - - +enum FILTER_BIT { + FILTER_UNRELEASE = 1<<1, + FILTER_PID = 1<<2, + FILTER_TGID = 1<<3, + FILTER_COMM = 1<<4, + FILTER_MODULE = 1<<5 +}; +enum CULL_BIT { + CULL_UNRELEASE = 1<<1, + CULL_PID = 1<<2, + CULL_TGID = 1<<3, + CULL_COMM = 1<<4, + CULL_STACKTRACE = 1<<5, + CULL_ALLOCATOR = 1<<6, + CULL_MODULE = 1 << 7 +}; +enum ALLOCATOR_BIT { + ALLOCATOR_CMA = 1<<1, + ALLOCATOR_SLAB = 1<<2, + 
ALLOCATOR_VMALLOC = 1<<3, + ALLOCATOR_OTHERS = 1<<4 +}; +enum ARG_TYPE { + ARG_TXT, ARG_COMM, ARG_STACKTRACE, ARG_ALLOC_TS, ARG_FREE_TS, + ARG_CULL_TIME, ARG_PAGE_NUM, ARG_PID, ARG_TGID, ARG_UNKNOWN, ARG_FREE, + ARG_ALLOCATOR, ARG_MODULE +}; +enum SORT_ORDER { + SORT_ASC = 1, + SORT_DESC = -1, +}; +struct filter_condition { + pid_t *pids; + pid_t *tgids; + char **comms; + char **modules; + int pids_size; + int tgids_size; + int comms_size; + int modules_size; +}; +struct sort_condition { + int (**cmps)(const void *, const void *); + int *signs; + int size; +}; +static struct filter_condition fc; +static struct sort_condition sc; +static regex_t order_pattern; +static regex_t pid_pattern; +static regex_t tgid_pattern; +static regex_t comm_pattern; +static regex_t ts_nsec_pattern; +static regex_t free_ts_nsec_pattern; +static regex_t module_pattern; static struct block_list *list; static int list_size; static int max_size; +static int cull; +static int filter; +static bool debug_on; -struct block_list *block_head; +static void set_single_cmp(int (*cmp)(const void *, const void *), int sign); -int read_block(char *buf, int buf_size, FILE *fin) +int read_block(char *buf, char *ext_buf, char *mod_buf, int buf_size, FILE *fin) { char *curr = buf, *const buf_end = buf + buf_size; + char *mod_string = "Page allocated by module"; + mod_buf[0] = '\0'; while (buf_end - curr > 1 && fgets(curr, buf_end - curr, fin)) { - if (*curr == '\n') /* empty line */ + if (*curr == '\n') { /* empty line */ return curr - buf; - if (!strncmp(curr, "PFN", 3)) + } + if (!strncmp(curr, "PFN", 3)) { + strcpy(ext_buf, curr); + continue; + } + if (!strncmp(curr, mod_string, strlen(mod_string))) { + strcpy(mod_buf, curr); continue; + } curr += strlen(curr); } @@ -52,102 +138,840 @@ static int compare_txt(const void *p1, const void *p2) return strcmp(l1->txt, l2->txt); } +static int compare_stacktrace(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return strcmp(l1->stacktrace, l2->stacktrace); +} + static int compare_num(const void *p1, const void *p2) { const struct block_list *l1 = p1, *l2 = p2; - return l2->num - l1->num; + return l1->num - l2->num; +} + +static int compare_page_num(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->page_num - l2->page_num; +} + +static int compare_pid(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->pid - l2->pid; +} + +static int compare_tgid(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->tgid - l2->tgid; +} + +static int compare_allocator(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->allocator - l2->allocator; +} + +static int compare_comm(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return strcmp(l1->comm, l2->comm); +} + +static int compare_ts(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->ts_nsec < l2->ts_nsec ? -1 : 1; +} + +static int compare_free_ts(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + return l1->free_ts_nsec < l2->free_ts_nsec ? -1 : 1; +} + +static int compare_release(const void *p1, const void *p2) +{ + const struct block_list *l1 = p1, *l2 = p2; + + if (!l1->free_ts_nsec && !l2->free_ts_nsec) + return 0; + if (l1->free_ts_nsec && l2->free_ts_nsec) + return 0; + return l1->free_ts_nsec ? 
+static int compare_cull_condition(const void *p1, const void *p2)
+{
+	if (cull == 0)
+		return compare_txt(p1, p2);
+	if ((cull & CULL_STACKTRACE) && compare_stacktrace(p1, p2))
+		return compare_stacktrace(p1, p2);
+	if ((cull & CULL_PID) && compare_pid(p1, p2))
+		return compare_pid(p1, p2);
+	if ((cull & CULL_TGID) && compare_tgid(p1, p2))
+		return compare_tgid(p1, p2);
+	if ((cull & CULL_COMM) && compare_comm(p1, p2))
+		return compare_comm(p1, p2);
+	if ((cull & CULL_UNRELEASE) && compare_release(p1, p2))
+		return compare_release(p1, p2);
+	if ((cull & CULL_ALLOCATOR) && compare_allocator(p1, p2))
+		return compare_allocator(p1, p2);
+	if ((cull & CULL_MODULE) && compare_module(p1, p2))
+		return compare_module(p1, p2);
+	return 0;
+}
+
+static int compare_sort_condition(const void *p1, const void *p2)
+{
+	int cmp = 0;
+
+	for (int i = 0; i < sc.size; ++i)
+		if (cmp == 0)
+			cmp = sc.signs[i] * sc.cmps[i](p1, p2);
+	return cmp;
 }
 
-static void add_list(char *buf, int len)
+static int search_pattern(regex_t *pattern, char *pattern_str, char *buf)
+{
+	int err, val_len;
+	regmatch_t pmatch[2];
+
+	err = regexec(pattern, buf, 2, pmatch, REG_NOTBOL);
+	if (err != 0 || pmatch[1].rm_so == -1) {
+		if (debug_on)
+			fprintf(stderr, "no matching pattern in %s\n", buf);
+		return -1;
+	}
+	val_len = pmatch[1].rm_eo - pmatch[1].rm_so;
+
+	memcpy(pattern_str, buf + pmatch[1].rm_so, val_len);
+
+	return 0;
+}
+
+static bool check_regcomp(regex_t *pattern, const char *regex)
+{
+	int err;
+
+	err = regcomp(pattern, regex, REG_EXTENDED | REG_NEWLINE);
+	if (err != 0 || pattern->re_nsub != 1) {
+		fprintf(stderr, "Invalid pattern %s code %d\n", regex, err);
+		return false;
+	}
+	return true;
+}
+
+static char **explode(char sep, const char *str, int *size)
+{
+	int count = 0, len = strlen(str);
+	int lastindex = -1, j = 0;
+
+	for (int i = 0; i < len; i++)
+		if (str[i] == sep)
+			count++;
+	char **ret = calloc(++count, sizeof(char *));
+
+	for (int i = 0; i < len; i++) {
+		if (str[i] == sep) {
+			ret[j] = calloc(i - lastindex, sizeof(char));
+			memcpy(ret[j++], str + lastindex + 1, i - lastindex - 1);
+			lastindex = i;
+		}
+	}
+	if (lastindex <= len - 1) {
+		ret[j] = calloc(len - lastindex, sizeof(char));
+		memcpy(ret[j++], str + lastindex + 1, strlen(str) - 1 - lastindex);
+	}
+	*size = j;
+	return ret;
+}
+
+static void free_explode(char **arr, int size)
+{
+	for (int i = 0; i < size; i++)
+		free(arr[i]);
+	free(arr);
+}
+
+# define FIELD_BUFF 25
+
+static int get_page_num(char *buf)
+{
+	int order_val;
+	char order_str[FIELD_BUFF] = {0};
+	char *endptr;
+
+	search_pattern(&order_pattern, order_str, buf);
+	errno = 0;
+	order_val = strtol(order_str, &endptr, 10);
+	if (order_val > 64 || errno != 0 || endptr == order_str || *endptr != '\0') {
+		if (debug_on)
+			fprintf(stderr, "wrong order in follow buf:\n%s\n", buf);
+		return 0;
+	}
+
+	return 1 << order_val;
+}
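get_page_num() and the get_*() helpers that follow all share one shape: run a pre-compiled single-capture regex over the record, then parse the captured text with strtol()/strtoull() under full error checking. A standalone sketch of just the extraction step, using [[:space:]] as the portable spelling of the \s used by the patterns compiled in main(), against a made-up sample record:

	#include <regex.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	int main(void)
	{
		static const char buf[] =
			"Page allocated via order 3, mask 0x0(), pid 1, tgid 1 (init), ts 100 ns";
		regex_t re;
		regmatch_t m[2];
		char val[25] = { 0 };	/* like FIELD_BUFF above */

		if (regcomp(&re, "order[[:space:]]*([0-9]*),", REG_EXTENDED | REG_NEWLINE))
			return 1;
		if (!regexec(&re, buf, 2, m, 0)) {
			memcpy(val, buf + m[1].rm_so, m[1].rm_eo - m[1].rm_so);
			printf("order %s -> %d pages\n", val, 1 << atoi(val));	/* order 3 -> 8 pages */
		}
		regfree(&re);
		return 0;
	}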
+static pid_t get_pid(char *buf)
+{
+	pid_t pid;
+	char pid_str[FIELD_BUFF] = {0};
+	char *endptr;
+
+	search_pattern(&pid_pattern, pid_str, buf);
+	errno = 0;
+	pid = strtol(pid_str, &endptr, 10);
+	if (errno != 0 || endptr == pid_str || *endptr != '\0') {
+		if (debug_on)
+			fprintf(stderr, "wrong/invalid pid in follow buf:\n%s\n", buf);
+		return -1;
+	}
+
+	return pid;
+}
+
+static pid_t get_tgid(char *buf)
+{
+	pid_t tgid;
+	char tgid_str[FIELD_BUFF] = {0};
+	char *endptr;
+
+	search_pattern(&tgid_pattern, tgid_str, buf);
+	errno = 0;
+	tgid = strtol(tgid_str, &endptr, 10);
+	if (errno != 0 || endptr == tgid_str || *endptr != '\0') {
+		if (debug_on)
+			fprintf(stderr, "wrong/invalid tgid in follow buf:\n%s\n", buf);
+		return -1;
+	}
+
+	return tgid;
+}
+
+static __u64 get_ts_nsec(char *buf)
+{
+	__u64 ts_nsec;
+	char ts_nsec_str[FIELD_BUFF] = {0};
+	char *endptr;
+
+	search_pattern(&ts_nsec_pattern, ts_nsec_str, buf);
+	errno = 0;
+	ts_nsec = strtoull(ts_nsec_str, &endptr, 10);
+	if (errno != 0 || endptr == ts_nsec_str || *endptr != '\0') {
+		if (debug_on)
+			fprintf(stderr, "wrong ts_nsec in follow buf:\n%s\n", buf);
+		return -1;
+	}
+
+	return ts_nsec;
+}
+
+static __u64 get_free_ts_nsec(char *buf)
+{
+	__u64 free_ts_nsec;
+	char free_ts_nsec_str[FIELD_BUFF] = {0};
+	char *endptr;
+
+	search_pattern(&free_ts_nsec_pattern, free_ts_nsec_str, buf);
+	errno = 0;
+	free_ts_nsec = strtoull(free_ts_nsec_str, &endptr, 10);
+	if (errno != 0 || endptr == free_ts_nsec_str || *endptr != '\0') {
+		if (debug_on)
+			fprintf(stderr, "wrong free_ts_nsec in follow buf:\n%s\n", buf);
+		return -1;
+	}
+
+	return free_ts_nsec;
+}
+
+static char *get_comm(char *buf)
+{
+	char *comm_str = malloc(TASK_COMM_LEN);
+
+	memset(comm_str, 0, TASK_COMM_LEN);
+
+	/* on no match, comm_str stays empty; callers treat that as "no comm" */
+	if (search_pattern(&comm_pattern, comm_str, buf) != 0 && debug_on)
+		fprintf(stderr, "wrong comm in follow buf:\n%s\n", buf);
+
+	return comm_str;
+}
+
+static char *get_module(char *buf)
+{
+	char *mod = malloc(MODULE_NAME_LEN);
+
+	memset(mod, 0, MODULE_NAME_LEN);
+	search_pattern(&module_pattern, mod, buf);
+
+	return mod;
+}
+
+static int get_arg_type(const char *arg)
+{
+	if (!strcmp(arg, "pid") || !strcmp(arg, "p"))
+		return ARG_PID;
+	else if (!strcmp(arg, "tgid") || !strcmp(arg, "tg"))
+		return ARG_TGID;
+	else if (!strcmp(arg, "name") || !strcmp(arg, "n"))
+		return ARG_COMM;
+	else if (!strcmp(arg, "stacktrace") || !strcmp(arg, "st"))
+		return ARG_STACKTRACE;
+	else if (!strcmp(arg, "free") || !strcmp(arg, "f"))
+		return ARG_FREE;
+	else if (!strcmp(arg, "txt") || !strcmp(arg, "T"))
+		return ARG_TXT;
+	else if (!strcmp(arg, "free_ts") || !strcmp(arg, "ft"))
+		return ARG_FREE_TS;
+	else if (!strcmp(arg, "alloc_ts") || !strcmp(arg, "at"))
+		return ARG_ALLOC_TS;
+	else if (!strcmp(arg, "allocator") || !strcmp(arg, "ator"))
+		return ARG_ALLOCATOR;
+	else if (!strcmp(arg, "module") || !strcmp(arg, "mod"))
+		return ARG_MODULE;
+	else {
+		return ARG_UNKNOWN;
+	}
+}
+
+static int get_allocator(const char *buf, const char *migrate_info)
+{
+	char *tmp, *first_line, *second_line;
+	int allocator = 0;
+
+	if (strstr(migrate_info, "CMA"))
+		allocator |= ALLOCATOR_CMA;
+	if (strstr(migrate_info, "slab"))
+		allocator |= ALLOCATOR_SLAB;
+	tmp = strstr(buf, "__vmalloc_node_range");
+	if (tmp) {
+		second_line = tmp;
+		while (*tmp != '\n')
+			tmp--;
+		tmp--;
+		while (*tmp != '\n')
+			tmp--;
+		first_line = ++tmp;
+		tmp = strstr(tmp, "alloc_pages");
+		if (tmp && first_line <= tmp && tmp < second_line)
+			allocator |= ALLOCATOR_VMALLOC;
+	}
+	if (allocator == 0)
+		allocator = ALLOCATOR_OTHERS;
+	return allocator;
+}
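The vmalloc branch of get_allocator() deserves a note: there is no single telltale frame, so it checks whether the stack frame directly above __vmalloc_node_range is an alloc_pages frame, by walking back over two newlines. A toy version of that walk, on an invented stack snippet:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		static const char stack[] =
			"get_page_from_freelist+0x2d8/0x1780\n"
			"alloc_pages+0x15c/0x330\n"
			"__vmalloc_node_range+0x328/0x580\n";
		const char *hit = strstr(stack, "__vmalloc_node_range");
		const char *tmp = hit;

		/* step back over the newline that ends the previous frame ... */
		while (tmp > stack && *tmp != '\n')
			tmp--;
		tmp--;
		/* ... then to the start of that frame */
		while (tmp > stack && *(tmp - 1) != '\n')
			tmp--;
		if (!strncmp(tmp, "alloc_pages", strlen("alloc_pages")))
			printf("vmalloc-backed allocation\n");
		return 0;
	}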
+static bool match_num_list(int num, int *list, int list_size)
+{
+	for (int i = 0; i < list_size; ++i)
+		if (list[i] == num)
+			return true;
+	return false;
+}
+
+static bool match_str_list(const char *str, char **list, int list_size)
+{
+	for (int i = 0; i < list_size; ++i)
+		if (!strcmp(list[i], str))
+			return true;
+	return false;
+}
+
+static bool is_module_filtered(char *mod_buf)
+{
+	char *mod = get_module(mod_buf);
+	bool ret = true;
+
+	if (!strlen(mod))
+		goto out;
+
+	if (fc.modules_size == 0 ||
+	    match_str_list(mod, fc.modules, fc.modules_size))
+		ret = false;
+
+out:
+	free(mod);
+	return ret;
+}
+
+static bool is_need(char *buf, char *mod_buf)
+{
+	__u64 ts_nsec, free_ts_nsec;
+
+	ts_nsec = get_ts_nsec(buf);
+	free_ts_nsec = get_free_ts_nsec(buf);
+
+	if ((filter & FILTER_UNRELEASE) && free_ts_nsec != 0 && ts_nsec < free_ts_nsec)
+		return false;
+	if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size))
+		return false;
+	if ((filter & FILTER_TGID) &&
+	    !match_num_list(get_tgid(buf), fc.tgids, fc.tgids_size))
+		return false;
+
+	if ((filter & FILTER_MODULE) && is_module_filtered(mod_buf))
+		return false;
+
+	char *comm = get_comm(buf);
+
+	if ((filter & FILTER_COMM) &&
+	    !match_str_list(comm, fc.comms, fc.comms_size)) {
+		free(comm);
+		return false;
+	}
+	free(comm);
+	return true;
+}
+
+static bool add_list(char *buf, int len, char *ext_buf, char *mod_buf)
 {
 	if (list_size != 0 &&
-	    len == list[list_size-1].len &&
-	    memcmp(buf, list[list_size-1].txt, len) == 0) {
+		len == list[list_size-1].len &&
+		memcmp(buf, list[list_size-1].txt, len) == 0) {
 		list[list_size-1].num++;
-		return;
+		list[list_size-1].page_num += get_page_num(buf);
+		return true;
 	}
 	if (list_size == max_size) {
-		printf("max_size too small??\n");
-		exit(1);
+		fprintf(stderr, "max_size too small??\n");
+		return false;
 	}
+	if (!is_need(buf, mod_buf))
+		return true;
+	list[list_size].pid = get_pid(buf);
+	list[list_size].tgid = get_tgid(buf);
+	list[list_size].comm = get_comm(buf);
 	list[list_size].txt = malloc(len+1);
-	list[list_size].len = len;
-	list[list_size].num = 1;
+	if (!list[list_size].txt) {
+		fprintf(stderr, "Out of memory\n");
+		return false;
+	}
 	memcpy(list[list_size].txt, buf, len);
 	list[list_size].txt[len] = 0;
+	list[list_size].len = len;
+	list[list_size].num = 1;
+	list[list_size].page_num = get_page_num(buf);
+
+	list[list_size].stacktrace = strchr(list[list_size].txt, '\n') ?: "";
+	if (*list[list_size].stacktrace == '\n')
+		list[list_size].stacktrace++;
+	list[list_size].ts_nsec = get_ts_nsec(buf);
+	list[list_size].free_ts_nsec = get_free_ts_nsec(buf);
+	list[list_size].allocator = get_allocator(buf, ext_buf);
+	list[list_size].module = get_module(mod_buf);
 	list_size++;
 	if (list_size % 1000 == 0) {
 		printf("loaded %d\r", list_size);
 		fflush(stdout);
 	}
+	return true;
+}
+
+static bool parse_cull_args(const char *arg_str)
+{
+	int size = 0;
+	char **args = explode(',', arg_str, &size);
+
+	for (int i = 0; i < size; ++i) {
+		int arg_type = get_arg_type(args[i]);
+
+		if (arg_type == ARG_PID)
+			cull |= CULL_PID;
+		else if (arg_type == ARG_TGID)
+			cull |= CULL_TGID;
+		else if (arg_type == ARG_COMM)
+			cull |= CULL_COMM;
+		else if (arg_type == ARG_STACKTRACE)
+			cull |= CULL_STACKTRACE;
+		else if (arg_type == ARG_FREE)
+			cull |= CULL_UNRELEASE;
+		else if (arg_type == ARG_ALLOCATOR)
+			cull |= CULL_ALLOCATOR;
+		else if (arg_type == ARG_MODULE)
+			cull |= CULL_MODULE;
+		else {
+			free_explode(args, size);
+			return false;
+		}
+	}
+	free_explode(args, size);
+	if (sc.size == 0)
+		set_single_cmp(compare_num, SORT_DESC);
+	return true;
+}
+
+static void set_single_cmp(int (*cmp)(const void *, const void *), int sign)
+{
+	if (sc.signs == NULL || sc.size < 1)
+		sc.signs = calloc(1, sizeof(int));
+	sc.signs[0] = sign;
+	if (sc.cmps == NULL || sc.size < 1)
+		sc.cmps = calloc(1, sizeof(int *));
+	sc.cmps[0] = cmp;
+	sc.size = 1;
+}
+static bool parse_sort_args(const char *arg_str)
+{
+	int size = 0;
+
+	if (sc.size != 0) { /* reset sort_condition */
+		free(sc.signs);
+		free(sc.cmps);
+		size = 0;
+	}
+
+	char **args = explode(',', arg_str, &size);
+
+	sc.signs = calloc(size, sizeof(int));
+	sc.cmps = calloc(size, sizeof(int *));
+	for (int i = 0; i < size; ++i) {
+		int offset = 0;
+
+		sc.signs[i] = SORT_ASC;
+		if (args[i][0] == '-' || args[i][0] == '+') {
+			if (args[i][0] == '-')
+				sc.signs[i] = SORT_DESC;
+			offset = 1;
+		}
+
+		int arg_type = get_arg_type(args[i]+offset);
+
+		if (arg_type == ARG_PID)
+			sc.cmps[i] = compare_pid;
+		else if (arg_type == ARG_TGID)
+			sc.cmps[i] = compare_tgid;
+		else if (arg_type == ARG_COMM)
+			sc.cmps[i] = compare_comm;
+		else if (arg_type == ARG_STACKTRACE)
+			sc.cmps[i] = compare_stacktrace;
+		else if (arg_type == ARG_ALLOC_TS)
+			sc.cmps[i] = compare_ts;
+		else if (arg_type == ARG_FREE_TS)
+			sc.cmps[i] = compare_free_ts;
+		else if (arg_type == ARG_TXT)
+			sc.cmps[i] = compare_txt;
+		else if (arg_type == ARG_ALLOCATOR)
+			sc.cmps[i] = compare_allocator;
+		else if (arg_type == ARG_MODULE)
+			sc.cmps[i] = compare_module;
+		else {
+			free_explode(args, size);
+			sc.size = 0;
+			return false;
+		}
+	}
+	sc.size = size;
+	free_explode(args, size);
+	return true;
+}
+
+static int *parse_nums_list(char *arg_str, int *list_size)
+{
+	int size = 0;
+	char **args = explode(',', arg_str, &size);
+	int *list = calloc(size, sizeof(int));
+
+	for (int i = 0; i < size; ++i) {
+		char *endptr = NULL;
+
+		errno = 0;
+		list[i] = strtol(args[i], &endptr, 10);
+		if (errno != 0 || endptr == args[i] || *endptr != '\0') {
+			free_explode(args, size);
+			free(list);
+			return NULL;
+		}
+	}
+	*list_size = size;
+	free_explode(args, size);
+	return list;
+}
+
+static void print_allocator(FILE *out, int allocator)
+{
+	fprintf(out, "allocated by ");
+	if (allocator & ALLOCATOR_CMA)
+		fprintf(out, "CMA ");
+	if (allocator & ALLOCATOR_SLAB)
+		fprintf(out, "SLAB ");
+	if (allocator & ALLOCATOR_VMALLOC)
+		fprintf(out, "VMALLOC ");
+	if (allocator & ALLOCATOR_OTHERS)
+		fprintf(out, "OTHERS ");
 }
 
 #define BUF_SIZE	(128 * 1024)
 
+static void usage(void)
+{
+	printf("Usage: ./page_owner_sort [OPTIONS] <input> <output>\n"
+		"-m\t\tSort by total memory.\n"
+		"-s\t\tSort by the stack trace.\n"
+		"-t\t\tSort by times (default).\n"
+		"-p\t\tSort by pid.\n"
+		"-P\t\tSort by tgid.\n"
+		"-n\t\tSort by task command name.\n"
+		"-a\t\tSort by memory allocate time.\n"
+		"-r\t\tSort by memory release time.\n"
+		"-f\t\tFilter out the information of blocks whose memory has been released.\n"
+		"-M\t\tFilter out the information of blocks whose memory isn't allocated by modules.\n"
+		"-d\t\tPrint debug information.\n"
+		"--pid <pidlist>\tSelect by pid. This selects the information of blocks whose process ID numbers appear in <pidlist>.\n"
+		"--tgid <tgidlist>\tSelect by tgid. This selects the information of blocks whose Thread Group ID numbers appear in <tgidlist>.\n"
+		"--name <cmdlist>\n\t\tSelect by command name. This selects the information of blocks whose command name appears in <cmdlist>.\n"
+		"--cull <rules>\tCull by user-defined rules. <rules> is a single argument in the form of a comma-separated list with some common fields predefined\n"
+		"--sort <order>\tSpecify sort order as: [+|-]key[,[+|-]key[,...]]\n"
+		"--module <modlist>\tSelect by module. This selects the information of blocks whose memory is allocated by modules that appear in <modlist>.\n"
+	);
+}
 int main(int argc, char **argv)
 {
 	FILE *fin, *fout;
-	char *buf;
-	int ret, i, count;
-	struct block_list *list2;
+	char *buf, *ext_buf, *mod_buf;
+	int i, count;
 	struct stat st;
+	int opt;
+	struct option longopts[] = {
+		{ "pid", required_argument, NULL, 1 },
+		{ "tgid", required_argument, NULL, 2 },
+		{ "name", required_argument, NULL, 3 },
+		{ "cull", required_argument, NULL, 4 },
+		{ "sort", required_argument, NULL, 5 },
+		{ "module", required_argument, NULL, 6 },
+		{ 0, 0, 0, 0},
+	};
 
-	if (argc < 3) {
-		printf("Usage: ./program <mapfile> <outfile>\n");
-		perror("open: ");
+	while ((opt = getopt_long(argc, argv, "adfmnprstPM", longopts, NULL)) != -1)
+		switch (opt) {
+		case 'a':
+			set_single_cmp(compare_ts, SORT_ASC);
+			break;
+		case 'd':
+			debug_on = true;
+			break;
+		case 'f':
+			filter = filter | FILTER_UNRELEASE;
+			break;
+		case 'm':
+			set_single_cmp(compare_page_num, SORT_DESC);
+			break;
+		case 'p':
+			set_single_cmp(compare_pid, SORT_ASC);
+			break;
+		case 'r':
+			set_single_cmp(compare_free_ts, SORT_ASC);
+			break;
+		case 's':
+			set_single_cmp(compare_stacktrace, SORT_ASC);
+			break;
+		case 't':
+			set_single_cmp(compare_num, SORT_DESC);
+			break;
+		case 'P':
+			set_single_cmp(compare_tgid, SORT_ASC);
+			break;
+		case 'n':
+			set_single_cmp(compare_comm, SORT_ASC);
+			break;
+		case 'M':
+			filter = filter | FILTER_MODULE;
+			fc.modules_size = 0;
+			fc.modules = NULL;
+			break;
+		case 1:
+			filter = filter | FILTER_PID;
+			fc.pids = parse_nums_list(optarg, &fc.pids_size);
+			if (fc.pids == NULL) {
+				fprintf(stderr, "wrong/invalid pid from the command line:%s\n",
+						optarg);
+				exit(1);
+			}
+			break;
+		case 2:
+			filter = filter | FILTER_TGID;
+			fc.tgids = parse_nums_list(optarg, &fc.tgids_size);
+			if (fc.tgids == NULL) {
+				fprintf(stderr, "wrong/invalid tgid from the command line:%s\n",
+						optarg);
+				exit(1);
+			}
+			break;
+		case 3:
+			filter = filter | FILTER_COMM;
+			fc.comms = explode(',', optarg, &fc.comms_size);
+			break;
+		case 4:
+			if (!parse_cull_args(optarg)) {
+				fprintf(stderr, "wrong argument after --cull option:%s\n",
+						optarg);
+				exit(1);
+			}
+			break;
+		case 5:
+			if (!parse_sort_args(optarg)) {
+				fprintf(stderr, "wrong argument after --sort option:%s\n",
+						optarg);
+				exit(1);
+			}
+			break;
+		case 6:
+			filter = filter | FILTER_MODULE;
+			fc.modules = explode(',', optarg, &fc.modules_size);
+			break;
+		default:
+			usage();
+			exit(1);
+		}
+
+	if (optind >= (argc - 1)) {
+		usage();
 		exit(1);
 	}
 
-	fin = fopen(argv[1], "r");
-	fout = fopen(argv[2], "w");
+	fin = fopen(argv[optind], "r");
+	fout = fopen(argv[optind + 1], "w");
 	if (!fin || !fout) {
-		printf("Usage: ./program <mapfile> <outfile>\n");
+		usage();
 		perror("open: ");
 		exit(1);
 	}
 
+	if (!check_regcomp(&order_pattern, "order\\s*([0-9]*),"))
+		goto out_order;
+	if (!check_regcomp(&pid_pattern, "pid\\s*([0-9]*),"))
+		goto out_pid;
+	if (!check_regcomp(&tgid_pattern, "tgid\\s*([0-9]*) "))
+		goto out_tgid;
+	if (!check_regcomp(&comm_pattern, "tgid\\s*[0-9]*\\s*\\((.*)\\),\\s*ts"))
+		goto out_comm;
+	if (!check_regcomp(&ts_nsec_pattern, "ts\\s*([0-9]*)\\s*ns,"))
+		goto out_ts;
+	if (!check_regcomp(&free_ts_nsec_pattern, "free_ts\\s*([0-9]*)\\s*ns"))
+		goto out_free_ts;
+	if (!check_regcomp(&module_pattern, "Page allocated by module (.*)"))
+		goto out_module;
+
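The chain of check_regcomp() calls above pairs with the regfree() labels at the end of main(): each compiled pattern gains one more cleanup label, and a later failure jumps into the ladder so that everything acquired so far is released in reverse order. The pattern, condensed to two resources for illustration (note this sketch frees only patterns that compiled successfully):

	#include <regex.h>
	#include <stdio.h>

	int main(void)
	{
		regex_t a, b;
		int ret = 1;

		if (regcomp(&a, "order ([0-9]+),", REG_EXTENDED))
			goto out;
		if (regcomp(&b, "pid ([0-9]+),", REG_EXTENDED))
			goto out_a;

		puts("both patterns compiled");
		ret = 0;

		regfree(&b);
	out_a:
		regfree(&a);
	out:
		return ret;
	}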
 	fstat(fileno(fin), &st);
 	max_size = st.st_size / 100; /* hack ... */
 
 	list = malloc(max_size * sizeof(*list));
 	buf = malloc(BUF_SIZE);
-	if (!list || !buf) {
-		printf("Out of memory\n");
-		exit(1);
+	ext_buf = malloc(BUF_SIZE);
+	mod_buf = malloc(BUF_SIZE);
+	if (!list || !buf || !ext_buf || !mod_buf) {
+		fprintf(stderr, "Out of memory\n");
+		goto out_free;
 	}
 
 	for ( ; ; ) {
-		ret = read_block(buf, BUF_SIZE, fin);
-		if (ret < 0)
-			break;
+		int buf_len = read_block(buf, ext_buf, mod_buf, BUF_SIZE, fin);
 
-		add_list(buf, ret);
+		if (buf_len < 0)
+			break;
+		if (!add_list(buf, buf_len, ext_buf, mod_buf))
+			goto out_free;
 	}
 
 	printf("loaded %d\n", list_size);
 
 	printf("sorting ....\n");
 
-	qsort(list, list_size, sizeof(list[0]), compare_txt);
-
-	list2 = malloc(sizeof(*list) * list_size);
+	qsort(list, list_size, sizeof(list[0]), compare_cull_condition);
 
 	printf("culling\n");
 
 	for (i = count = 0; i < list_size; i++) {
 		if (count == 0 ||
-		    strcmp(list2[count-1].txt, list[i].txt) != 0) {
-			list2[count++] = list[i];
+		    compare_cull_condition((void *)(&list[count-1]), (void *)(&list[i])) != 0) {
+			list[count++] = list[i];
 		} else {
-			list2[count-1].num += list[i].num;
+			list[count-1].num += list[i].num;
+			list[count-1].page_num += list[i].page_num;
 		}
 	}
 
-	qsort(list2, count, sizeof(list[0]), compare_num);
+	qsort(list, count, sizeof(list[0]), compare_sort_condition);
+
+	for (i = 0; i < count; i++) {
+		if (cull == 0) {
+			fprintf(fout, "%d times, %d pages, ", list[i].num, list[i].page_num);
+			if (strlen(list[i].module) != 0)
+				fprintf(fout, "allocated by module %s, ", list[i].module);
+			print_allocator(fout, list[i].allocator);
+			fprintf(fout, " :\n%s\n", list[i].txt);
+		} else {
+			fprintf(fout, "%d times, %d pages",
+					list[i].num, list[i].page_num);
+			if (cull & CULL_PID || filter & FILTER_PID)
+				fprintf(fout, ", PID %d", list[i].pid);
+			if (cull & CULL_TGID || filter & FILTER_TGID)
+				fprintf(fout, ", TGID %d", list[i].tgid);
+			if (cull & CULL_COMM || filter & FILTER_COMM)
+				fprintf(fout, ", task_comm_name: %s", list[i].comm);
+			if (cull & CULL_MODULE || filter & FILTER_MODULE)
+				fprintf(fout, ", module: %s", list[i].module);
+			if (cull & CULL_ALLOCATOR) {
+				fprintf(fout, ", ");
+				print_allocator(fout, list[i].allocator);
+			}
+			if (cull & CULL_UNRELEASE)
+				fprintf(fout, " (%s)",
+						list[i].free_ts_nsec ? "UNRELEASED" : "RELEASED");
+			if (cull & CULL_STACKTRACE)
+				fprintf(fout, ":\n%s", list[i].stacktrace);
+			fprintf(fout, "\n");
+		}
+	}
 
-	for (i = 0; i < count; i++)
-		fprintf(fout, "%d times:\n%s\n", list2[i].num, list2[i].txt);
+out_free:
+	if (mod_buf)
+		free(mod_buf);
+	if (ext_buf)
+		free(ext_buf);
+	if (buf)
+		free(buf);
+	if (list)
+		free(list);
+out_module:
+	regfree(&module_pattern);
+out_free_ts:
+	regfree(&free_ts_nsec_pattern);
+out_ts:
+	regfree(&ts_nsec_pattern);
+out_comm:
+	regfree(&comm_pattern);
+out_tgid:
+	regfree(&tgid_pattern);
+out_pid:
+	regfree(&pid_pattern);
+out_order:
+	regfree(&order_pattern);
 
 	return 0;
 }
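As a worked example of the new module switches against a dump collected the usual way (the module-specific output depends on the kernel being built with CONFIG_PAGE_OWNER_MODULE_STAT):

	cat /sys/kernel/debug/page_owner > page_owner_full.txt
	./page_owner_sort -m -M --cull=module page_owner_full.txt sorted_page_owner.txt

-M drops every block that was not allocated by a module, --cull=module merges the survivors per module, and -m ranks the result by total pages, descending.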