From 16785bd7743104d57257a455001172b75afa7614 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 22 Mar 2022 14:41:47 -0700 Subject: mm: merge pte_mkhuge() call into arch_make_huge_pte() Each call into pte_mkhuge() is invariably followed by arch_make_huge_pte(). Instead arch_make_huge_pte() can accommodate pte_mkhuge() at the beginning. This updates generic fallback stub for arch_make_huge_pte() and available platforms definitions. This makes huge pte creation much cleaner and easier to follow. Link: https://lkml.kernel.org/r/1643860669-26307-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Reviewed-by: Christophe Leroy Acked-by: Mike Kravetz Acked-by: Catalin Marinas Cc: Will Deacon Cc: Michael Ellerman Cc: Paul Mackerras Cc: "David S. Miller" Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/mm/hugetlbpage.c | 1 + arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h | 4 ++-- arch/sparc/mm/hugetlbpage.c | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index ffb9c229610a..228226c5fa80 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -347,6 +347,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { size_t pagesize = 1UL << shift; + entry = pte_mkhuge(entry); if (pagesize == CONT_PTE_SIZE) { entry = pte_mkcont(entry); } else if (pagesize == CONT_PMD_SIZE) { diff --git a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h index 64b6c608eca4..de092b04ee1a 100644 --- a/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/hugetlb-8xx.h @@ -71,9 +71,9 @@ static inline pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags size_t size = 1UL << shift; if (size == SZ_16K) - return __pte(pte_val(entry) & ~_PAGE_HUGE); + return __pte(pte_val(entry) | 
_PAGE_SPS); else - return entry; + return __pte(pte_val(entry) | _PAGE_SPS | _PAGE_HUGE); } #define arch_make_huge_pte arch_make_huge_pte #endif diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 0f49fada2093..d8e0e3c7038d 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -181,6 +181,7 @@ pte_t arch_make_huge_pte(pte_t entry, unsigned int shift, vm_flags_t flags) { pte_t pte; + entry = pte_mkhuge(entry); pte = hugepage_shift_to_tte(entry, shift); #ifdef CONFIG_SPARC64 -- cgit v1.2.3 From d6d224429a86a62263d0944f79c36dce010a4ebb Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Tue, 22 Mar 2022 14:41:50 -0700 Subject: mm: remove mmu_gathers storage from remaining architectures Originally the mmu_gathers were removed in commit 1c3951769621 ("mm: now that all old mmu_gather code is gone, remove the storage"). However, the openrisc and hexagon architecture were merged around the same time and mmu_gathers was not removed. This patch removes them from openrisc, hexagon and nds32: Noticed while cleaning this warning: arch/openrisc/mm/init.c:41:1: warning: symbol 'mmu_gathers' was not declared. Should it be static? 
Link: https://lkml.kernel.org/r/20220205141956.3315419-1-shorne@gmail.com Signed-off-by: Stafford Horne Acked-by: Mike Rapoport Cc: Brian Cain Cc: Nick Hu Cc: Greentime Hu Cc: Vincent Chen Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Russell King Cc: David Hildenbrand Cc: Dave Hansen Cc: Kefeng Wang Cc: Christophe Leroy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/hexagon/mm/init.c | 2 -- arch/nds32/mm/init.c | 1 - arch/openrisc/mm/init.c | 2 -- 3 files changed, 5 deletions(-) (limited to 'arch') diff --git a/arch/hexagon/mm/init.c b/arch/hexagon/mm/init.c index f01e91e10d95..3167a3b5c97b 100644 --- a/arch/hexagon/mm/init.c +++ b/arch/hexagon/mm/init.c @@ -29,8 +29,6 @@ int max_kernel_seg = 0x303; /* indicate pfn's of high memory */ unsigned long highstart_pfn, highend_pfn; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - /* Default cache attribute for newly created page tables */ unsigned long _dflt_cache_att = CACHEDEF; diff --git a/arch/nds32/mm/init.c b/arch/nds32/mm/init.c index f63f839738c4..825c85cab1a1 100644 --- a/arch/nds32/mm/init.c +++ b/arch/nds32/mm/init.c @@ -18,7 +18,6 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); DEFINE_SPINLOCK(anon_alias_lock); extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 97305bde1b16..3a021ab6f1ae 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -38,8 +38,6 @@ int mem_init_done; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - static void __init zone_sizes_init(void) { unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; -- cgit v1.2.3 From e16faf26780fc0c8dd693ea9ee8420a7706cb2f5 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 22 Mar 2022 14:43:17 -0700 Subject: cma: factor out minimum alignment requirement Patch series "mm: enforce pageblock_order < MAX_ORDER". 
Having pageblock_order >= MAX_ORDER seems to be able to happen in corner cases and some parts of the kernel are not prepared for it. For example, Aneesh has shown [1] that such kernels can be compiled on ppc64 with 64k base pages by setting FORCE_MAX_ZONEORDER=8, which will run into a WARN_ON_ONCE(order >= MAX_ORDER) in compaction code right during boot. We can get pageblock_order >= MAX_ORDER when the default hugetlb size is bigger than the maximum allocation granularity of the buddy, in which case we are no longer talking about huge pages but instead gigantic pages. Having pageblock_order >= MAX_ORDER can only make alloc_contig_range() of such gigantic pages more likely to succeed. Reliable use of gigantic pages either requires boot time allocation or CMA, no need to overcomplicate some places in the kernel to optimize for corner cases that are broken in other areas of the kernel. This patch (of 2): Let's enforce pageblock_order < MAX_ORDER and simplify. Especially patch #1 can be regarded as a cleanup before: [PATCH v5 0/6] Use pageblock_order for cma and alloc_contig_range alignment. [2] [1] https://lkml.kernel.org/r/87r189a2ks.fsf@linux.ibm.com [2] https://lkml.kernel.org/r/20220211164135.1803616-1-zi.yan@sent.com Link: https://lkml.kernel.org/r/20220214174132.219303-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Zi Yan Acked-by: Rob Herring Cc: Aneesh Kumar K.V Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Frank Rowand Cc: Michael S. 
Tsirkin Cc: Christoph Hellwig Cc: Marek Szyprowski Cc: Robin Murphy Cc: Minchan Kim Cc: Vlastimil Babka Cc: John Garry via iommu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/fadump-internal.h | 5 ----- arch/powerpc/kernel/fadump.c | 2 +- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h index 52189928ec08..81bcb9abb371 100644 --- a/arch/powerpc/include/asm/fadump-internal.h +++ b/arch/powerpc/include/asm/fadump-internal.h @@ -19,11 +19,6 @@ #define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt) -/* Alignment per CMA requirement. */ -#define FADUMP_CMA_ALIGNMENT (PAGE_SIZE << \ - max_t(unsigned long, MAX_ORDER - 1, \ - pageblock_order)) - /* FAD commands */ #define FADUMP_REGISTER 1 #define FADUMP_UNREGISTER 2 diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index d03e488cfe9c..7eb67201ea41 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -544,7 +544,7 @@ int __init fadump_reserve_mem(void) if (!fw_dump.nocma) { fw_dump.boot_memory_size = ALIGN(fw_dump.boot_memory_size, - FADUMP_CMA_ALIGNMENT); + CMA_MIN_ALIGNMENT_BYTES); } #endif -- cgit v1.2.3 From 1ca75fa7f19d694c58af681fa023295072b03120 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 22 Mar 2022 14:43:51 -0700 Subject: arch/x86/mm/numa: Do not initialize nodes twice On x86, prior to ("mm: handle uninitialized numa nodes gracefully"), NUMA nodes could be allocated at three different places. - numa_register_memblks - init_cpu_to_node - init_gi_nodes All these calls happen at setup_arch, and have the following order: setup_arch ... x86_numa_init numa_init numa_register_memblks ... 
init_cpu_to_node init_memory_less_node alloc_node_data free_area_init_memoryless_node init_gi_nodes init_memory_less_node alloc_node_data free_area_init_memoryless_node numa_register_memblks() is only interested in those nodes which have memory, so it skips over any memoryless node it finds. Later on, when we have read ACPI's SRAT table, we call init_cpu_to_node() and init_gi_nodes(), which initialize any memoryless node we might have that have either CPU or Initiator affinity, meaning we allocate pg_data_t struct for them and we mark them as ONLINE. So far so good, but the thing is that after ("mm: handle uninitialized numa nodes gracefully"), we allocate all possible NUMA nodes in free_area_init(), meaning we have a picture like the following: setup_arch x86_numa_init numa_init numa_register_memblks <-- allocate non-memoryless node x86_init.paging.pagetable_init ... free_area_init free_area_init_memoryless <-- allocate memoryless node init_cpu_to_node alloc_node_data <-- allocate memoryless node with CPU free_area_init_memoryless_node init_gi_nodes alloc_node_data <-- allocate memoryless node with Initiator free_area_init_memoryless_node free_area_init() already allocates all possible NUMA nodes, but init_cpu_to_node() and init_gi_nodes() are clueless about that, so they go ahead and allocate a new pg_data_t struct without checking anything, meaning we end up allocating twice. It should be made clear that this only happens in the case where a memoryless NUMA node happens to have a CPU/Initiator affinity. So get rid of init_memory_less_node() and just set the node online. Note that setting the node online is needed, otherwise we choke down the chain when bringup_nonboot_cpus() ends up calling __try_online_node()->register_one_node()->... and we blow up in bus_add_device(). 
As can be seen here: BUG: kernel NULL pointer dereference, address: 0000000000000060 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC PTI CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.17.0-rc4-1-default+ #45 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.0.0-prebuilt.qemu-project.org 04/4 RIP: 0010:bus_add_device+0x5a/0x140 Code: 8b 74 24 20 48 89 df e8 84 96 ff ff 85 c0 89 c5 75 38 48 8b 53 50 48 85 d2 0f 84 bb 00 004 RSP: 0000:ffffc9000022bd10 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff888100987400 RCX: ffff8881003e4e19 RDX: ffff8881009a5e00 RSI: ffff888100987400 RDI: ffff888100987400 RBP: 0000000000000000 R08: ffff8881003e4e18 R09: ffff8881003e4c98 R10: 0000000000000000 R11: ffff888100402bc0 R12: ffffffff822ceba0 R13: 0000000000000000 R14: ffff888100987400 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff88853fc00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000060 CR3: 000000000200a001 CR4: 00000000001706b0 Call Trace: device_add+0x4c0/0x910 __register_one_node+0x97/0x2d0 __try_online_node+0x85/0xc0 try_online_node+0x25/0x40 cpu_up+0x4f/0x100 bringup_nonboot_cpus+0x4f/0x60 smp_init+0x26/0x79 kernel_init_freeable+0x130/0x2f1 kernel_init+0x17/0x150 ret_from_fork+0x22/0x30 The reason is simple, by the time bringup_nonboot_cpus() gets called, we did not register the node_subsys bus yet, so we crash when bus_add_device() tries to dereference bus()->p. The following shows the order of the calls: kernel_init_freeable smp_init bringup_nonboot_cpus ... bus_add_device() <- we did not register node_subsys yet do_basic_setup do_initcalls postcore_initcall(register_node_type); register_node_type subsys_system_register subsys_register bus_register <- register node_subsys bus Why setting the node online saves us then? 
Well, simply because __try_online_node() backs off when the node is online, meaning we do not end up calling register_one_node() in the first place. This is subtle, broken and deserves a deep analysis and thought about how to put this into shape, but for now let us have this easy fix for the leaking memory issue. [osalvador@suse.de: add comments] Link: https://lkml.kernel.org/r/20220221142649.3457-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20220218224302.5282-2-osalvador@suse.de Fixes: da4490c958ad ("mm: handle uninitialized numa nodes gracefully") Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Cc: David Hildenbrand Cc: Rafael Aquini Cc: Dave Hansen Cc: Wei Yang Cc: Dennis Zhou Cc: Alexey Makhalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/numa.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index c6b1213086d6..e8b061557887 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -738,17 +738,6 @@ void __init x86_numa_init(void) numa_init(dummy_numa_init); } -static void __init init_memory_less_node(int nid) -{ - /* Allocate and initialize node data. Memory-less node is now online.*/ - alloc_node_data(nid); - free_area_init_memoryless_node(nid); - - /* - * All zonelists will be built later in start_kernel() after per cpu - * areas are initialized. - */ -} /* * A node may exist which has one or more Generic Initiators but no CPUs and no @@ -766,9 +755,18 @@ void __init init_gi_nodes(void) { int nid; + /* + * Exclude this node from + * bringup_nonboot_cpus + * cpu_up + * __try_online_node + * register_one_node + * because node_subsys is not initialized yet. 
+ * TODO remove dependency on node_online + */ for_each_node_state(nid, N_GENERIC_INITIATOR) if (!node_online(nid)) - init_memory_less_node(nid); + node_set_online(nid); } /* @@ -798,8 +796,17 @@ void __init init_cpu_to_node(void) if (node == NUMA_NO_NODE) continue; + /* + * Exclude this node from + * bringup_nonboot_cpus + * cpu_up + * __try_online_node + * register_one_node + * because node_subsys is not initialized yet. + * TODO remove dependency on node_online + */ if (!node_online(node)) - init_memory_less_node(node); + node_set_online(node); numa_set_node(cpu, node); } -- cgit v1.2.3 From d1fe111fb62a1cf0446a2919f5effbb33ad0702c Mon Sep 17 00:00:00 2001 From: luofei Date: Tue, 22 Mar 2022 14:44:38 -0700 Subject: mm/hwpoison: avoid the impact of hwpoison_filter() return value on mce handler When the hwpoison page meets the filter conditions, it should not be regarded as successful memory_failure() processing for mce handler, but should return a distinct value, otherwise mce handler regards the error page has been identified and isolated, which may lead to calling set_mce_nospec() to change page attribute, etc. Here memory_failure() return -EOPNOTSUPP to indicate that the error event is filtered, mce handler should not take any action for this situation and hwpoison injector should treat as correct. Link: https://lkml.kernel.org/r/20220223082135.2769649-1-luofei@unicloud.com Signed-off-by: luofei Acked-by: Borislav Petkov Cc: Dave Hansen Cc: H. 
Peter Anvin Cc: Ingo Molnar Cc: Miaohe Lin Cc: Naoya Horiguchi Cc: Thomas Gleixner Cc: Tony Luck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/mce/core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index 5818b837fd4d..05c6469db769 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -1304,10 +1304,12 @@ static void kill_me_maybe(struct callback_head *cb) /* * -EHWPOISON from memory_failure() means that it already sent SIGBUS - * to the current process with the proper error info, so no need to - * send SIGBUS here again. + * to the current process with the proper error info, + * -EOPNOTSUPP means hwpoison_filter() filtered the error event, + * + * In both cases, no further processing is required. */ - if (ret == -EHWPOISON) + if (ret == -EHWPOISON || ret == -EOPNOTSUPP) return; pr_err("Memory error not recovered"); -- cgit v1.2.3 From 07431506e8d752ff21c3d5fba0927fe8be4ed18f Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Tue, 22 Mar 2022 14:45:15 -0700 Subject: mm/hugetlb: generalize ARCH_WANT_GENERAL_HUGETLB ARCH_WANT_GENERAL_HUGETLB config has duplicate definitions on platforms that subscribe it. Instead make it a generic config option which can be selected on applicable platforms when required. 
Link: https://lkml.kernel.org/r/1643718465-4324-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Russell King Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/Kconfig | 4 +--- arch/riscv/Kconfig | 4 +--- arch/x86/Kconfig | 4 +--- 3 files changed, 3 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4c97cb40eebb..ba6ba78a9cb6 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -37,6 +37,7 @@ config ARM select ARCH_USE_CMPXCHG_LOCKREF select ARCH_USE_MEMTEST select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU + select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_IPC_PARSE_VERSION select ARCH_WANT_LD_ORPHAN_WARN select BINFMT_FLAT_ARGVP_ENVP_ON_STACK @@ -1508,9 +1509,6 @@ config HW_PERF_EVENTS def_bool y depends on ARM_PMU -config ARCH_WANT_GENERAL_HUGETLB - def_bool y - config ARM_MODULE_PLTS bool "Use PLTs to allow module memory to spill over into vmalloc area" depends on MODULES diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 5adcbd9b5e88..0804b9a11934 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -40,6 +40,7 @@ config RISCV select ARCH_USE_MEMTEST select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU select ARCH_WANT_FRAME_POINTERS + select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE if 64BIT select BINFMT_FLAT_NO_DATA_START_OFFSET if !MMU select BUILDTIME_TABLE_SORT if MMU @@ -171,9 +172,6 @@ config ARCH_SPARSEMEM_ENABLE config ARCH_SELECT_MEMORY_MODEL def_bool ARCH_SPARSEMEM_ENABLE -config ARCH_WANT_GENERAL_HUGETLB - def_bool y - config ARCH_SUPPORTS_UPROBES def_bool y diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9f5bd41bf660..37372cd5c9a7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -118,6 +118,7 @@ config X86 select ARCH_WANT_DEFAULT_BPF_JIT if X86_64 select ARCH_WANTS_DYNAMIC_TASK_STRUCT select 
ARCH_WANTS_NO_INSTR + select ARCH_WANT_GENERAL_HUGETLB select ARCH_WANT_HUGE_PMD_SHARE select ARCH_WANT_LD_ORPHAN_WARN select ARCH_WANTS_THP_SWAP if X86_64 @@ -347,9 +348,6 @@ config ARCH_NR_GPIO config ARCH_SUSPEND_POSSIBLE def_bool y -config ARCH_WANT_GENERAL_HUGETLB - def_bool y - config AUDIT_ARCH def_bool y if X86_64 -- cgit v1.2.3 From ee97347fe058d02035f354d59a5aa5aa6e1be4cc Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Tue, 22 Mar 2022 14:46:17 -0700 Subject: powerpc/fadump: opt out from freeing pages on cma activation failure With commit a4e92ce8e4c8 ("powerpc/fadump: Reservationless firmware assisted dump"), Linux kernel's Contiguous Memory Allocator (CMA) based reservation was introduced in fadump. That change was aimed at using CMA to let applications utilize the memory reserved for fadump while blocking it from being used for kernel pages. The assumption was, even if CMA activation fails for whatever reason, the memory still remains reserved to avoid it from being used for kernel pages. But commit 072355c1cf2d ("mm/cma: expose all pages to the buddy if activation of an area fails") breaks this assumption as it started exposing all pages to buddy allocator on CMA activation failure. It led to warning messages like below while running crash-utility on vmcore of a kernel having above two commits: crash: seek error: kernel virtual address: To fix this problem, opt out from exposing pages to buddy allocator on CMA activation failure for fadump reserved memory. 
Link: https://lkml.kernel.org/r/20220117075246.36072-3-hbathini@linux.ibm.com Signed-off-by: Hari Bathini Acked-by: David Hildenbrand Acked-by: Michael Ellerman Cc: Mahesh Salgaonkar Cc: Mike Kravetz Cc: Oscar Salvador Cc: Sourabh Jain Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/kernel/fadump.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 7eb67201ea41..4fdb7c77fda1 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -112,6 +112,12 @@ static int __init fadump_cma_init(void) return 1; } + /* + * If CMA activation fails, keep the pages reserved, instead of + * exposing them to buddy allocator. Same as 'fadump=nocma' case. + */ + cma_reserve_pages_on_error(fadump_cma); + /* * So we now have successfully initialized cma area for fadump. */ -- cgit v1.2.3 From e930d999715073a70d306fb59a394ea8b84d0b45 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 22 Mar 2022 14:46:51 -0700 Subject: mm, memory_hotplug: make arch_alloc_nodedata independent on CONFIG_MEMORY_HOTPLUG Patch series "mm, memory_hotplug: handle unitialized numa node gracefully". The core of the fix is patch 2 which also links existing bug reports. The high level goal is to have all possible numa nodes have their pgdat allocated and initialized so for_each_possible_node(nid) NODE_DATA(nid) will never return garbage. This has proven to be problem in several places when an offline numa node is used for an allocation just to realize that node_data and therefore allocation fallback zonelists are not initialized and such an allocation request blows up. There were attempts to address that by checking node_online in several places including the page allocator. This patchset approaches the problem from a different perspective and instead of special casing, which just adds a runtime overhead, it allocates pglist_data for each possible node. 
This can add some memory overhead for platforms with a high number of possible nodes if they do not contain any memory. This should be a rather rare configuration though. How to test this? David has provided an excellent howto: http://lkml.kernel.org/r/6e5ebc19-890c-b6dd-1924-9f25c441010d@redhat.com Patches 1 and 3-6 are mostly cleanups. The patchset has been reviewed by Rafael (thanks!) and the core fix tested by Rafael and Alexey (thanks to both). David has tested as per instructions above and hasn't found any fallouts in the memory hotplug scenarios. This patch (of 6): This is a preparatory patch and it doesn't introduce any functional change. It merely pulls out arch_alloc_nodedata (and co) outside of CONFIG_MEMORY_HOTPLUG because the following patch will need to call this from the generic MM code. Link: https://lkml.kernel.org/r/20220127085305.20890-1-mhocko@kernel.org Link: https://lkml.kernel.org/r/20220127085305.20890-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Rafael Aquini Acked-by: David Hildenbrand Acked-by: Mike Rapoport Reviewed-by: Oscar Salvador Reviewed-by: Wei Yang Cc: Alexey Makhalov Cc: Christoph Lameter Cc: Dennis Zhou Cc: Eric Dumazet Cc: Nico Pache Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/discontig.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 791d4176e4a6..8dc8a554f774 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -608,7 +608,6 @@ void __init paging_init(void) zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } -#ifdef CONFIG_MEMORY_HOTPLUG pg_data_t *arch_alloc_nodedata(int nid) { unsigned long size = compute_pernodesize(nid); @@ -626,7 +625,6 @@ void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat) pgdat_list[update_node] = update_pgdat; scatter_node_data(); } -#endif #ifdef CONFIG_SPARSEMEM_VMEMMAP int __meminit vmemmap_populate(unsigned 
long start, unsigned long end, int node, -- cgit v1.2.3 From 09f49dca570a917a8c6bccd7e8c61f5141534e3a Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 22 Mar 2022 14:46:54 -0700 Subject: mm: handle uninitialized numa nodes gracefully We have had several reports [1][2][3] that page allocator blows up when an allocation from a possible node is requested. The underlying reason is that NODE_DATA for the specific node is not allocated. NUMA specific initialization is arch specific and it can vary a lot. E.g. x86 tries to initialize all nodes that have some cpu affinity (see init_cpu_to_node) but this can be insufficient because the node might be cpuless for example. One way to address this problem would be to check for !node_online nodes when trying to get a zonelist and silently fall back to another node. That is unfortunately adding a branch into allocator hot path and it doesn't handle any other potential NODE_DATA users. This patch takes a different approach (following a lead of [3]) and it pre allocates pgdat for all possible nodes in an arch independent code - free_area_init. All uninitialized nodes are treated as memoryless nodes. node_state of the node is not changed because that would lead to other side effects - e.g. sysfs representation of such a node and from past discussions [4] it is known that some tools might have problems digesting that. Newly allocated pgdat only gets a minimal initialization and the rest of the work is expected to be done by the memory hotplug - hotadd_new_pgdat (renamed to hotadd_init_pgdat). generic_alloc_nodedata is changed to use the memblock allocator because neither page nor slab allocators are available at the stage when all pgdats are allocated. Hotplug doesn't allocate pgdat anymore so we can use the early boot allocator. The only arch specific implementation is ia64 and that is changed to use the early allocator as well. 
[1] http://lkml.kernel.org/r/20211101201312.11589-1-amakhalov@vmware.com [2] http://lkml.kernel.org/r/20211207224013.880775-1-npache@redhat.com [3] http://lkml.kernel.org/r/20190114082416.30939-1-mhocko@kernel.org [4] http://lkml.kernel.org/r/20200428093836.27190-1-srikar@linux.vnet.ibm.com [akpm@linux-foundation.org: replace comment, per Mike] Link: https://lkml.kernel.org/r/Yfe7RBeLCijnWBON@dhcp22.suse.cz Reported-by: Alexey Makhalov Tested-by: Alexey Makhalov Reported-by: Nico Pache Acked-by: Rafael Aquini Tested-by: Rafael Aquini Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Acked-by: Mike Rapoport Signed-off-by: Michal Hocko Cc: Christoph Lameter Cc: Dennis Zhou Cc: Eric Dumazet Cc: Tejun Heo Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/discontig.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index 8dc8a554f774..dd0cf4834eaa 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -608,11 +608,11 @@ void __init paging_init(void) zero_page_memmap_ptr = virt_to_page(ia64_imva(empty_zero_page)); } -pg_data_t *arch_alloc_nodedata(int nid) +pg_data_t * __init arch_alloc_nodedata(int nid) { unsigned long size = compute_pernodesize(nid); - return kzalloc(size, GFP_KERNEL); + return memblock_alloc(size, SMP_CACHE_BYTES); } void arch_free_nodedata(pg_data_t *pgdat) -- cgit v1.2.3 From 390511e1476eb1cc41d420a7661b33f4d8584c3f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 22 Mar 2022 14:46:57 -0700 Subject: mm, memory_hotplug: drop arch_free_nodedata Prior to "mm: handle uninitialized numa nodes gracefully" memory hotplug used to allocate pgdat when memory has been added to a node (hotadd_init_pgdat). arch_free_nodedata has been only used in the failure path because once the pgdat is exported (to be visible by NODE_DATA(nid)) it cannot really be freed because there is no synchronization available for 
that. pgdat is allocated for each possible node now, so memory hotplug never needs to use arch_free_nodedata; drop it. This patch doesn't introduce any functional change. Link: https://lkml.kernel.org/r/20220127085305.20890-4-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Rafael Aquini Acked-by: David Hildenbrand Acked-by: Mike Rapoport Reviewed-by: Oscar Salvador Cc: Alexey Makhalov Cc: Christoph Lameter Cc: Dennis Zhou Cc: Eric Dumazet Cc: Nico Pache Cc: Tejun Heo Cc: Wei Yang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/mm/discontig.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'arch') diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index dd0cf4834eaa..73d0db36edb6 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -615,11 +615,6 @@ pg_data_t * __init arch_alloc_nodedata(int nid) return memblock_alloc(size, SMP_CACHE_BYTES); } -void arch_free_nodedata(pg_data_t *pgdat) -{ - kfree(pgdat); -} - void arch_refresh_nodedata(int update_node, pg_data_t *update_pgdat) { pgdat_list[update_node] = update_pgdat; -- cgit v1.2.3 From 2848a28b0a6052a4c8450397d2647d7d8e3f6f06 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 22 Mar 2022 14:47:13 -0700 Subject: drivers/base/node: consolidate node device subsystem initialization in node_dev_init() ... and call node_dev_init() after memory_dev_init() from driver_init(), so before any of the existing arch/subsys calls. All online nodes should be known at that point: early during boot, arch code determines node and zone ranges and sets the relevant nodes online; usually this happens in setup_arch(). This is in line with memory_dev_init(), which initializes the memory device subsystem and creates all memory block devices. Similar to memory_dev_init(), panic() if anything goes wrong, we don't want to continue with such basic initialization errors. 
The important part is that node_dev_init() gets called after memory_dev_init() and after cpu_dev_init(), but before any of the relevant archs call register_cpu() to register the new cpu device under the node device. The latter should be the case for the current users of topology_init(). Link: https://lkml.kernel.org/r/20220203105212.30385-1-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Oscar Salvador Tested-by: Anatoly Pugachev (sparc64) Cc: Greg Kroah-Hartman Cc: Michal Hocko Cc: Oscar Salvador Cc: Mike Rapoport Cc: Catalin Marinas Cc: Will Deacon Cc: Thomas Bogendoerfer Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Yoshinori Sato Cc: Rich Felker Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Borislav Petkov Cc: Dave Hansen Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/kernel/setup.c | 3 --- arch/ia64/kernel/topology.c | 10 ---------- arch/mips/kernel/topology.c | 5 ----- arch/powerpc/kernel/sysfs.c | 17 ----------------- arch/riscv/kernel/setup.c | 3 --- arch/s390/kernel/numa.c | 7 ------- arch/sh/kernel/topology.c | 5 ----- arch/sparc/kernel/sysfs.c | 12 ------------ arch/x86/kernel/topology.c | 5 ----- 9 files changed, 67 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index f70573928f1b..3505789cf4bd 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -406,9 +406,6 @@ static int __init topology_init(void) { int i; - for_each_online_node(i) - register_one_node(i); - for_each_possible_cpu(i) { struct cpu *cpu = &per_cpu(cpu_data.cpu, i); cpu->hotpluggable = cpu_can_disable(i); diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index e4992917a24b..94a848b06f15 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -70,16 +70,6 @@ static int __init 
topology_init(void) { int i, err = 0; -#ifdef CONFIG_NUMA - /* - * MCD - Do we want to register all ONLINE nodes, or all POSSIBLE nodes? - */ - for_each_online_node(i) { - if ((err = register_one_node(i))) - goto out; - } -#endif - sysfs_cpus = kcalloc(NR_CPUS, sizeof(struct ia64_cpu), GFP_KERNEL); if (!sysfs_cpus) panic("kzalloc in topology_init failed - NR_CPUS too big?"); diff --git a/arch/mips/kernel/topology.c b/arch/mips/kernel/topology.c index 08ad6371fbe0..9429d85a4703 100644 --- a/arch/mips/kernel/topology.c +++ b/arch/mips/kernel/topology.c @@ -12,11 +12,6 @@ static int __init topology_init(void) { int i, ret; -#ifdef CONFIG_NUMA - for_each_online_node(i) - register_one_node(i); -#endif /* CONFIG_NUMA */ - for_each_present_cpu(i) { struct cpu *c = &per_cpu(cpu_devices, i); diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index d45a415d5374..2069bbb90a9a 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -1110,14 +1110,6 @@ EXPORT_SYMBOL_GPL(cpu_remove_dev_attr_group); /* NUMA stuff */ #ifdef CONFIG_NUMA -static void __init register_nodes(void) -{ - int i; - - for (i = 0; i < MAX_NUMNODES; i++) - register_one_node(i); -} - int sysfs_add_device_to_node(struct device *dev, int nid) { struct node *node = node_devices[nid]; @@ -1132,13 +1124,6 @@ void sysfs_remove_device_from_node(struct device *dev, int nid) sysfs_remove_link(&node->dev.kobj, kobject_name(&dev->kobj)); } EXPORT_SYMBOL_GPL(sysfs_remove_device_from_node); - -#else -static void __init register_nodes(void) -{ - return; -} - #endif /* Only valid if CPU is present. 
*/ @@ -1155,8 +1140,6 @@ static int __init topology_init(void) { int cpu, r; - register_nodes(); - for_each_possible_cpu(cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index b42bfdc67482..834eb652a7b9 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -301,9 +301,6 @@ static int __init topology_init(void) { int i, ret; - for_each_online_node(i) - register_one_node(i); - for_each_possible_cpu(i) { struct cpu *cpu = &per_cpu(cpu_devices, i); diff --git a/arch/s390/kernel/numa.c b/arch/s390/kernel/numa.c index 51c5a9f6e525..23ab9f02f278 100644 --- a/arch/s390/kernel/numa.c +++ b/arch/s390/kernel/numa.c @@ -33,10 +33,3 @@ void __init numa_setup(void) NODE_DATA(0)->node_spanned_pages = memblock_end_of_DRAM() >> PAGE_SHIFT; NODE_DATA(0)->node_id = 0; } - -static int __init numa_init_late(void) -{ - register_one_node(0); - return 0; -} -arch_initcall(numa_init_late); diff --git a/arch/sh/kernel/topology.c b/arch/sh/kernel/topology.c index 76af6db9daa2..2d2a7509b565 100644 --- a/arch/sh/kernel/topology.c +++ b/arch/sh/kernel/topology.c @@ -46,11 +46,6 @@ static int __init topology_init(void) { int i, ret; -#ifdef CONFIG_NUMA - for_each_online_node(i) - register_one_node(i); -#endif - for_each_present_cpu(i) { struct cpu *c = &per_cpu(cpu_devices, i); diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c index 6d60d416f0dd..f19487e4cc71 100644 --- a/arch/sparc/kernel/sysfs.c +++ b/arch/sparc/kernel/sysfs.c @@ -244,22 +244,10 @@ static void __init check_mmu_stats(void) mmu_stats_supported = 1; } -static void register_nodes(void) -{ -#ifdef CONFIG_NUMA - int i; - - for (i = 0; i < MAX_NUMNODES; i++) - register_one_node(i); -#endif -} - static int __init topology_init(void) { int cpu, ret; - register_nodes(); - check_mmu_stats(); for_each_possible_cpu(cpu) { diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index bd83748e2bde..8617d1ed9d31 100644 --- 
a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -154,11 +154,6 @@ static int __init topology_init(void) { int i; -#ifdef CONFIG_NUMA - for_each_online_node(i) - register_one_node(i); -#endif - for_each_present_cpu(i) arch_register_cpu(i); -- cgit v1.2.3