diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 1928d5719e417d5b5899a7c2b48c73a3da3294bd..32c16a68815dbfd1925f522a5bbca573f925aabb 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -631,12 +631,11 @@ void __init paging_init(void) zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } -#ifdef CONFIG_MEMORY_HOTPLUG -pg_data_t *arch_alloc_nodedata(int nid) +pg_data_t * __init arch_alloc_nodedata(int nid) { unsigned long size = compute_pernodesize(nid); - return kzalloc(size, GFP_KERNEL); + return _va(memblock_alloc(size, SMP_CACHE_BYTES)); } void arch_free_nodedata(pg_data_t *pgdat) @@ -649,7 +648,6 @@ void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat) pgdat_list[update_node] = update_pgdat; scatter_node_data(); } -#endif #ifdef CONFIG_SPARSEMEM_VMEMMAP int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index fa150855647cc9c85ef2e2b126bf30bcaba06994..2284f279a1afcee4e25cbdb404e032a8db6c8d61 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -722,21 +722,6 @@ void __init x86_numa_init(void) numa_init(dummy_numa_init); } -static void __init init_memory_less_node(int nid) -{ - unsigned long zones_size[MAX_NR_ZONES] = {0}; - unsigned long zholes_size[MAX_NR_ZONES] = {0}; - - /* Allocate and initialize node data. Memory-less node is now online.*/ - alloc_node_data(nid); - free_area_init_node(nid, zones_size, 0, zholes_size); - - /* - * All zonelists will be built later in start_kernel() after per cpu - * areas are initialized. - */ -} - /* * Setup early cpu_to_node. * @@ -764,8 +749,17 @@ void __init init_cpu_to_node(void) if (node == NUMA_NO_NODE) continue; + /* + * Exclude this node from + * bringup_nonboot_cpus + * cpu_up + * __try_online_node + * register_one_node + * because node_subsys is not initialized yet. + * TODO remove dependency on node_online + */ if (!node_online(node)) - init_memory_less_node(node); + node_set_online(node); numa_set_node(cpu, node); } diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index bc433d459c8611b25e5cf50029d7df83912f5fdc..4ad765a6e7347adefc085c93c320cca9670a8c37 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -15,6 +15,66 @@ struct memory_block; struct resource; struct vmem_altmap; +#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION +/* + * For supporting node-hotadd, we have to allocate a new pgdat. + * + * If an arch has generic style NODE_DATA(), + * node_data[nid] = kzalloc() works well. But it depends on the architecture. + * + * In general, generic_alloc_nodedata() is used. + * Now, arch_free_nodedata() is just defined for error path of node_hot_add. + * + */ +extern pg_data_t *arch_alloc_nodedata(int nid); +extern void arch_free_nodedata(pg_data_t *pgdat); +extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); + +#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ + +#define hotadd_add_pgdatarch_alloc_nodedata(nid) generic_alloc_nodedata(nid) +#define arch_free_nodedata(pgdat) generic_free_nodedata(pgdat) + +#ifdef CONFIG_NUMA +/* + * XXX: node aware allocation can't work well to get new node's memory at this time. + * Because, pgdat for the new node is not allocated/initialized yet itself. + * To use new node's memory, more consideration will be necessary. + */ +#define generic_alloc_nodedata(nid) \ +({ \ + __va(memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES)); \ +}) +/* + * This definition is just for error path in node hotadd. + * For node hotremove, we have to replace this. + */ +#define generic_free_nodedata(pgdat) kfree(pgdat) + +extern pg_data_t *node_data[]; +static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) +{ + node_data[nid] = pgdat; +} + +#else /* !CONFIG_NUMA */ + +/* never called */ +static inline pg_data_t *generic_alloc_nodedata(int nid) +{ + BUG(); + return NULL; +} +static inline void generic_free_nodedata(pg_data_t *pgdat) +{ +} +static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) +{ +} +#endif /* CONFIG_NUMA */ +#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ + + #ifdef CONFIG_MEMORY_HOTPLUG /* * Return page for the valid pfn only if the page is online. All pfn @@ -146,66 +206,6 @@ static inline int memory_add_physaddr_to_nid(u64 start) } #endif -#ifdef CONFIG_HAVE_ARCH_NODEDATA_EXTENSION -/* - * For supporting node-hotadd, we have to allocate a new pgdat. - * - * If an arch has generic style NODE_DATA(), - * node_data[nid] = kzalloc() works well. But it depends on the architecture. - * - * In general, generic_alloc_nodedata() is used. - * Now, arch_free_nodedata() is just defined for error path of node_hot_add. - * - */ -extern pg_data_t *arch_alloc_nodedata(int nid); -extern void arch_free_nodedata(pg_data_t *pgdat); -extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); - -#else /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - -#define arch_alloc_nodedata(nid) generic_alloc_nodedata(nid) -#define arch_free_nodedata(pgdat) generic_free_nodedata(pgdat) - -#ifdef CONFIG_NUMA -/* - * If ARCH_HAS_NODEDATA_EXTENSION=n, this func is used to allocate pgdat. - * XXX: kmalloc_node() can't work well to get new node's memory at this time. - * Because, pgdat for the new node is not allocated/initialized yet itself. - * To use new node's memory, more consideration will be necessary. - */ -#define generic_alloc_nodedata(nid) \ -({ \ - kzalloc(sizeof(pg_data_t), GFP_KERNEL); \ -}) -/* - * This definition is just for error path in node hotadd. - * For node hotremove, we have to replace this. - */ -#define generic_free_nodedata(pgdat) kfree(pgdat) - -extern pg_data_t *node_data[]; -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ - node_data[nid] = pgdat; -} - -#else /* !CONFIG_NUMA */ - -/* never called */ -static inline pg_data_t *generic_alloc_nodedata(int nid) -{ - BUG(); - return NULL; -} -static inline void generic_free_nodedata(pg_data_t *pgdat) -{ -} -static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) -{ -} -#endif /* CONFIG_NUMA */ -#endif /* CONFIG_HAVE_ARCH_NODEDATA_EXTENSION */ - #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE extern void __init register_page_bootmem_info_node(struct pglist_data *pgdat); #else diff --git a/mm/internal.h b/mm/internal.h index deffd247b010a9458f8212c8a7a5499782651dd0..72f77379b58f330174b37b0846966374a0ce1e53 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -614,4 +614,6 @@ extern int is_pagecache_reading_kernel_recovery_enable(void); extern int is_get_user_kernel_recovery_enable(void); #endif +DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 3df9285b73b18c08e8a3599a5df3cb55b2cf5c73..3b961b0fffccd8eb81c1c95f9350fc9d878f2d9f 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -920,18 +920,20 @@ static void reset_node_present_pages(pg_data_t *pgdat) } /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ -static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) +static pg_data_t __ref *hotadd_init_pgdat(int nid, u64 start) { struct pglist_data *pgdat; unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); - if (!pgdat) { - pgdat = arch_alloc_nodedata(nid); - if (!pgdat) - return NULL; - arch_refresh_nodedata(nid, pgdat); + /* + * NODE_DATA is preallocated (free_area_init) but its internal + * state is not allocated completely. Add missing pieces. + * Completely offline nodes stay around and they just need + * reintialization. + */ + if (pgdat->per_cpu_nodestats == &boot_nodestats) { } else { /* * Reset the nr_zones, order and classzone_idx before reuse. @@ -943,10 +945,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) pgdat->kswapd_classzone_idx = 0; } - /* we can use NODE_DATA(nid) from here */ - - pgdat->node_id = nid; - pgdat->node_start_pfn = start_pfn; + pgdat->node_start_pfn = 0; /* init node's zones as empty zones, we don't have any present pages.*/ free_area_init_core_hotplug(nid); @@ -1000,7 +999,7 @@ static int __try_online_node(int nid, u64 start, bool set_node_online) if (node_online(nid)) return 0; - pgdat = hotadd_new_pgdat(nid, start); + pgdat = hotadd_init_pgdat(nid, start); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; @@ -1123,9 +1122,6 @@ int __ref add_memory_resource(int nid, struct resource *res) return ret; error: - /* rollback pgdat allocation and others */ - if (new_node) - rollback_node_hotadd(nid); memblock_remove(start, size); mem_hotplug_done(); return ret; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fc8be4b00125b3c7cf71b3846c11bebfa1da13d4..1d946043ee63725c73b1c5736d1130efc060887a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5819,7 +5819,7 @@ static void build_zonelists(pg_data_t *pgdat) */ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); -static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); +DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void __build_all_zonelists(void *data) { @@ -5854,7 +5854,11 @@ static void __build_all_zonelists(void *data) if (self && !node_online(self->node_id)) { build_zonelists(self); } else { - for_each_online_node(nid) { + /* + * All possible nodes have pgdat preallocated + * in free_area_init + */ + for_each_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); @@ -7441,10 +7445,39 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) mminit_verify_pageflags_layout(); setup_nr_node_ids(); zero_resv_unavail(); - for_each_online_node(nid) { - pg_data_t *pgdat = NODE_DATA(nid); - free_area_init_node(nid, NULL, - find_min_pfn_for_node(nid), NULL); + for_each_node(nid) { + pg_data_t *pgdat; + + if (!node_online(nid)) { + pr_info("Initializing node %d as memoryless\n", nid); + + /* Allocator not initialized yet */ + pgdat = arch_alloc_nodedata(nid); + if (!pgdat) { + pr_err("Cannot allocate %zuB for node %d.\n", + sizeof(*pgdat), nid); + continue; + } + memset(pgdat, 0, sizeof(*pgdat)); + arch_refresh_nodedata(nid, pgdat); + free_area_init_node(nid, NULL, 0, NULL); + + /* + * We do not want to confuse userspace by sysfs + * files/directories for node without any memory + * attached to it, so this node is not marked as + * N_MEMORY and not marked online so that no sysfs + * hierarchy will be created via register_one_node for + * it. The pgdat will get fully initialized by + * hotadd_init_pgdat() when memory is hotplugged into + * this node. + */ + continue; + } + + pgdat = NODE_DATA(nid); + free_area_init_node(nid, NULL, find_min_pfn_for_node(nid), + NULL); /* Any memory on that node */ if (pgdat->node_present_pages) {