aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-09-26 13:23:15 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2022-09-26 13:23:15 -0700
commit3800a713b6070d4f03fb43613a7b7d536a99b2b7 (patch)
tree28250666422f7316cb2434a985137efa90023a3e
parent3a710532cfe586710738670fb7e1ee44fc3c9b5c (diff)
parent59298997df89e19aad426d4ae0a7e5037074da5a (diff)
downloadlinux-3800a713b6070d4f03fb43613a7b7d536a99b2b7.tar.gz
Merge tag 'mm-hotfixes-stable-2022-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull last (?) hotfixes from Andrew Morton: "26 hotfixes. 8 are for issues which were introduced during this -rc cycle, 18 are for earlier issues, and are cc:stable" * tag 'mm-hotfixes-stable-2022-09-26' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (26 commits) x86/uaccess: avoid check_object_size() in copy_from_user_nmi() mm/page_isolation: fix isolate_single_pageblock() isolation behavior mm,hwpoison: check mm when killing accessing process mm/hugetlb: correct demote page offset logic mm: prevent page_frag_alloc() from corrupting the memory mm: bring back update_mmu_cache() to finish_fault() frontswap: don't call ->init if no ops are registered mm/huge_memory: use pfn_to_online_page() in split_huge_pages_all() mm: fix madivse_pageout mishandling on non-LRU page powerpc/64s/radix: don't need to broadcast IPI for radix pmd collapse flush mm: gup: fix the fast GUP race against THP collapse mm: fix dereferencing possible ERR_PTR vmscan: check folio_test_private(), not folio_get_private() mm: fix VM_BUG_ON in __delete_from_swap_cache() tools: fix compilation after gfp_types.h split mm/damon/dbgfs: fix memory leak when using debugfs_lookup() mm/migrate_device.c: copy pte dirty bit to page mm/migrate_device.c: add missing flush_cache_page() mm/migrate_device.c: flush TLB while holding PTL x86/mm: disable instrumentations of mm/pgprot.c ...
-rw-r--r--arch/powerpc/mm/book3s64/radix_pgtable.c9
-rw-r--r--arch/x86/lib/usercopy.c2
-rw-r--r--arch/x86/mm/Makefile3
-rw-r--r--fs/ntfs/super.c3
-rw-r--r--fs/xfs/xfs_notify_failure.c6
-rw-r--r--include/linux/memremap.h5
-rw-r--r--mm/damon/dbgfs.c19
-rw-r--r--mm/frontswap.c3
-rw-r--r--mm/gup.c34
-rw-r--r--mm/huge_memory.c6
-rw-r--r--mm/hugetlb.c14
-rw-r--r--mm/khugepaged.c10
-rw-r--r--mm/madvise.c7
-rw-r--r--mm/memory-failure.c27
-rw-r--r--mm/memory.c14
-rw-r--r--mm/migrate_device.c16
-rw-r--r--mm/page_alloc.c65
-rw-r--r--mm/page_isolation.c25
-rw-r--r--mm/secretmem.c2
-rw-r--r--mm/swap_state.c2
-rw-r--r--mm/vmscan.c4
-rw-r--r--tools/include/linux/gfp.h21
-rw-r--r--tools/include/linux/gfp_types.h1
23 files changed, 192 insertions, 106 deletions
diff --git a/arch/powerpc/mm/book3s64/radix_pgtable.c b/arch/powerpc/mm/book3s64/radix_pgtable.c
index 698274109c91..e712f80fe189 100644
--- a/arch/powerpc/mm/book3s64/radix_pgtable.c
+++ b/arch/powerpc/mm/book3s64/radix_pgtable.c
@@ -937,15 +937,6 @@ pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long addre
pmd = *pmdp;
pmd_clear(pmdp);
- /*
- * pmdp collapse_flush need to ensure that there are no parallel gup
- * walk after this call. This is needed so that we can have stable
- * page ref count when collapsing a page. We don't allow a collapse page
- * if we have gup taken on the page. We can ensure that by sending IPI
- * because gup walk happens with IRQ disabled.
- */
- serialize_against_pte_lookup(vma->vm_mm);
-
radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
return pmd;
diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c
index ad0139d25401..f1bb18617156 100644
--- a/arch/x86/lib/usercopy.c
+++ b/arch/x86/lib/usercopy.c
@@ -44,7 +44,7 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
* called from other contexts.
*/
pagefault_disable();
- ret = __copy_from_user_inatomic(to, from, n);
+ ret = raw_copy_from_user(to, from, n);
pagefault_enable();
return ret;
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index f8220fd2c169..829c1409ffbd 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -4,10 +4,12 @@ KCOV_INSTRUMENT_tlb.o := n
KCOV_INSTRUMENT_mem_encrypt.o := n
KCOV_INSTRUMENT_mem_encrypt_amd.o := n
KCOV_INSTRUMENT_mem_encrypt_identity.o := n
+KCOV_INSTRUMENT_pgprot.o := n
KASAN_SANITIZE_mem_encrypt.o := n
KASAN_SANITIZE_mem_encrypt_amd.o := n
KASAN_SANITIZE_mem_encrypt_identity.o := n
+KASAN_SANITIZE_pgprot.o := n
# Disable KCSAN entirely, because otherwise we get warnings that some functions
# reference __initdata sections.
@@ -17,6 +19,7 @@ ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_mem_encrypt.o = -pg
CFLAGS_REMOVE_mem_encrypt_amd.o = -pg
CFLAGS_REMOVE_mem_encrypt_identity.o = -pg
+CFLAGS_REMOVE_pgprot.o = -pg
endif
obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o mmap.o \
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 5ae8de09b271..001f4e053c85 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2092,7 +2092,8 @@ get_ctx_vol_failed:
// TODO: Initialize security.
/* Get the extended system files' directory inode. */
vol->extend_ino = ntfs_iget(sb, FILE_Extend);
- if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) {
+ if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) ||
+ !S_ISDIR(vol->extend_ino->i_mode)) {
if (!IS_ERR(vol->extend_ino))
iput(vol->extend_ino);
ntfs_error(sb, "Failed to load $Extend.");
diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c
index 69d9c83ea4b2..5b1f9a24ed59 100644
--- a/fs/xfs/xfs_notify_failure.c
+++ b/fs/xfs/xfs_notify_failure.c
@@ -175,13 +175,13 @@ xfs_dax_notify_failure(
u64 ddev_start;
u64 ddev_end;
- if (!(mp->m_sb.sb_flags & SB_BORN)) {
+ if (!(mp->m_super->s_flags & SB_BORN)) {
xfs_warn(mp, "filesystem is not ready for notify_failure()!");
return -EIO;
}
if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_daxdev == dax_dev) {
- xfs_warn(mp,
+ xfs_debug(mp,
"notify_failure() not supported on realtime device!");
return -EOPNOTSUPP;
}
@@ -194,7 +194,7 @@ xfs_dax_notify_failure(
}
if (!xfs_has_rmapbt(mp)) {
- xfs_warn(mp, "notify_failure() needs rmapbt enabled!");
+ xfs_debug(mp, "notify_failure() needs rmapbt enabled!");
return -EOPNOTSUPP;
}
diff --git a/include/linux/memremap.h b/include/linux/memremap.h
index 19010491a603..c3b4cc84877b 100644
--- a/include/linux/memremap.h
+++ b/include/linux/memremap.h
@@ -139,6 +139,11 @@ struct dev_pagemap {
};
};
+static inline bool pgmap_has_memory_failure(struct dev_pagemap *pgmap)
+{
+ return pgmap->ops && pgmap->ops->memory_failure;
+}
+
static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap)
{
if (pgmap->flags & PGMAP_ALTMAP_VALID)
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index cfdf63132d5a..4e51466c4e74 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -884,6 +884,7 @@ static int dbgfs_rm_context(char *name)
struct dentry *root, *dir, **new_dirs;
struct damon_ctx **new_ctxs;
int i, j;
+ int ret = 0;
if (damon_nr_running_ctxs())
return -EBUSY;
@@ -898,14 +899,16 @@ static int dbgfs_rm_context(char *name)
new_dirs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_dirs),
GFP_KERNEL);
- if (!new_dirs)
- return -ENOMEM;
+ if (!new_dirs) {
+ ret = -ENOMEM;
+ goto out_dput;
+ }
new_ctxs = kmalloc_array(dbgfs_nr_ctxs - 1, sizeof(*dbgfs_ctxs),
GFP_KERNEL);
if (!new_ctxs) {
- kfree(new_dirs);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_new_dirs;
}
for (i = 0, j = 0; i < dbgfs_nr_ctxs; i++) {
@@ -925,7 +928,13 @@ static int dbgfs_rm_context(char *name)
dbgfs_ctxs = new_ctxs;
dbgfs_nr_ctxs--;
- return 0;
+ goto out_dput;
+
+out_new_dirs:
+ kfree(new_dirs);
+out_dput:
+ dput(dir);
+ return ret;
}
static ssize_t dbgfs_rm_context_write(struct file *file,
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 1a97610308cb..279e55b4ed87 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -125,6 +125,9 @@ void frontswap_init(unsigned type, unsigned long *map)
* p->frontswap set to something valid to work properly.
*/
frontswap_map_set(sis, map);
+
+ if (!frontswap_enabled())
+ return;
frontswap_ops->init(type);
}
diff --git a/mm/gup.c b/mm/gup.c
index 5abdaf487460..00926abb4426 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2345,8 +2345,28 @@ static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already). To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -2392,7 +2412,8 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
goto pte_unmap;
}
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+ unlikely(pte_val(pte) != pte_val(*ptep))) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
}
@@ -2439,8 +2460,9 @@ pte_unmap:
* get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
return 0;
}
@@ -2764,7 +2786,7 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e9414ee57c5b..f42bb51e023a 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2894,11 +2894,9 @@ static void split_huge_pages_all(void)
max_zone_pfn = zone_end_pfn(zone);
for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) {
int nr_pages;
- if (!pfn_valid(pfn))
- continue;
- page = pfn_to_page(pfn);
- if (!get_page_unless_zero(page))
+ page = pfn_to_online_page(pfn);
+ if (!page || !get_page_unless_zero(page))
continue;
if (zone != page_zone(page))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e070b8593b37..0bdfc7e1c933 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3420,6 +3420,7 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
{
int i, nid = page_to_nid(page);
struct hstate *target_hstate;
+ struct page *subpage;
int rc = 0;
target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
@@ -3453,15 +3454,16 @@ static int demote_free_huge_page(struct hstate *h, struct page *page)
mutex_lock(&target_hstate->resize_lock);
for (i = 0; i < pages_per_huge_page(h);
i += pages_per_huge_page(target_hstate)) {
+ subpage = nth_page(page, i);
if (hstate_is_gigantic(target_hstate))
- prep_compound_gigantic_page_for_demote(page + i,
+ prep_compound_gigantic_page_for_demote(subpage,
target_hstate->order);
else
- prep_compound_page(page + i, target_hstate->order);
- set_page_private(page + i, 0);
- set_page_refcounted(page + i);
- prep_new_huge_page(target_hstate, page + i, nid);
- put_page(page + i);
+ prep_compound_page(subpage, target_hstate->order);
+ set_page_private(subpage, 0);
+ set_page_refcounted(subpage);
+ prep_new_huge_page(target_hstate, subpage, nid);
+ put_page(subpage);
}
mutex_unlock(&target_hstate->resize_lock);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 01f71786d530..70b7ac66411c 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1083,10 +1083,12 @@ static void collapse_huge_page(struct mm_struct *mm,
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
- * After this gup_fast can't run anymore. This also removes
- * any huge TLB entry from the CPU so we won't allow
- * huge and small TLB entries for the same virtual address
- * to avoid the risk of CPU bugs in that area.
+ * This removes any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address to
+ * avoid the risk of CPU bugs in that area.
+ *
+ * Parallel fast GUP is fine since fast GUP will back off when
+ * it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
diff --git a/mm/madvise.c b/mm/madvise.c
index 5f0f0948a50e..9ff51650f4f0 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -451,8 +451,11 @@ regular_page:
continue;
}
- /* Do not interfere with other mappings of this page */
- if (page_mapcount(page) != 1)
+ /*
+ * Do not interfere with other mappings of this page and
+ * non-LRU page.
+ */
+ if (!PageLRU(page) || page_mapcount(page) != 1)
continue;
VM_BUG_ON_PAGE(PageTransCompound(page), page);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 14439806b5ef..e7ac570dda75 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -345,13 +345,17 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma,
* not much we can do. We just print a message and ignore otherwise.
*/
+#define FSDAX_INVALID_PGOFF ULONG_MAX
+
/*
* Schedule a process for later kill.
* Uses GFP_ATOMIC allocations to avoid potential recursions in the VM.
*
- * Notice: @fsdax_pgoff is used only when @p is a fsdax page.
- * In other cases, such as anonymous and file-backend page, the address to be
- * killed can be caculated by @p itself.
+ * Note: @fsdax_pgoff is used only when @p is a fsdax page and a
+ * filesystem with a memory failure handler has claimed the
+ * memory_failure event. In all other cases, page->index and
+ * page->mapping are sufficient for mapping the page back to its
+ * corresponding user virtual address.
*/
static void add_to_kill(struct task_struct *tsk, struct page *p,
pgoff_t fsdax_pgoff, struct vm_area_struct *vma,
@@ -367,11 +371,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
tk->addr = page_address_in_vma(p, vma);
if (is_zone_device_page(p)) {
- /*
- * Since page->mapping is not used for fsdax, we need
- * calculate the address based on the vma.
- */
- if (p->pgmap->type == MEMORY_DEVICE_FS_DAX)
+ if (fsdax_pgoff != FSDAX_INVALID_PGOFF)
tk->addr = vma_pgoff_address(fsdax_pgoff, 1, vma);
tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr);
} else
@@ -523,7 +523,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill,
if (!page_mapped_in_vma(page, vma))
continue;
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, 0, vma, to_kill);
+ add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma,
+ to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -559,7 +560,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill,
* to be informed of all such data corruptions.
*/
if (vma->vm_mm == t->mm)
- add_to_kill(t, page, 0, vma, to_kill);
+ add_to_kill(t, page, FSDAX_INVALID_PGOFF, vma,
+ to_kill);
}
}
read_unlock(&tasklist_lock);
@@ -743,6 +745,9 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
};
priv.tk.tsk = p;
+ if (!p->mm)
+ return -EFAULT;
+
mmap_read_lock(p->mm);
ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
(void *)&priv);
@@ -1928,7 +1933,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
* Call driver's implementation to handle the memory failure, otherwise
* fall back to generic handler.
*/
- if (pgmap->ops->memory_failure) {
+ if (pgmap_has_memory_failure(pgmap)) {
rc = pgmap->ops->memory_failure(pgmap, pfn, 1, flags);
/*
* Fall back to generic handler too if operation is not
diff --git a/mm/memory.c b/mm/memory.c
index 4ba73f5aa8bb..a78814413ac0 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4386,14 +4386,20 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- ret = 0;
+
/* Re-check under ptl */
- if (likely(!vmf_pte_changed(vmf)))
+ if (likely(!vmf_pte_changed(vmf))) {
do_set_pte(vmf, page, vmf->address);
- else
+
+ /* no need to invalidate: a not-present page won't be cached */
+ update_mmu_cache(vma, vmf->address, vmf->pte);
+
+ ret = 0;
+ } else {
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
ret = VM_FAULT_NOPAGE;
+ }
- update_mmu_tlb(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
}
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 27fb37d65476..dbf6c7a7a7c9 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -7,6 +7,7 @@
#include <linux/export.h>
#include <linux/memremap.h>
#include <linux/migrate.h>
+#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/oom.h>
@@ -193,10 +194,10 @@ again:
bool anon_exclusive;
pte_t swp_pte;
+ flush_cache_page(vma, addr, pte_pfn(*ptep));
anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
if (anon_exclusive) {
- flush_cache_page(vma, addr, pte_pfn(*ptep));
- ptep_clear_flush(vma, addr, ptep);
+ pte = ptep_clear_flush(vma, addr, ptep);
if (page_try_share_anon_rmap(page)) {
set_pte_at(mm, addr, ptep, pte);
@@ -206,11 +207,15 @@ again:
goto next;
}
} else {
- ptep_get_and_clear(mm, addr, ptep);
+ pte = ptep_get_and_clear(mm, addr, ptep);
}
migrate->cpages++;
+ /* Set the dirty flag on the folio now the pte is gone. */
+ if (pte_dirty(pte))
+ folio_mark_dirty(page_folio(page));
+
/* Setup special migration page table entry */
if (mpfn & MIGRATE_PFN_WRITE)
entry = make_writable_migration_entry(
@@ -254,13 +259,14 @@ next:
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = mpfn;
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(ptep - 1, ptl);
/* Only flush the TLB if we actually modified any entries */
if (unmapped)
flush_tlb_range(walk->vma, start, end);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
return 0;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e5486d47406e..d04211f0ef0b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -4708,6 +4708,30 @@ void fs_reclaim_release(gfp_t gfp_mask)
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
+/*
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
+ * have been rebuilt so allocation retries. Reader side does not lock and
+ * retries the allocation if zonelist changes. Writer side is protected by the
+ * embedded spin_lock.
+ */
+static DEFINE_SEQLOCK(zonelist_update_seq);
+
+static unsigned int zonelist_iter_begin(void)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqbegin(&zonelist_update_seq);
+
+ return 0;
+}
+
+static unsigned int check_retry_zonelist(unsigned int seq)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqretry(&zonelist_update_seq, seq);
+
+ return seq;
+}
+
/* Perform direct synchronous page reclaim */
static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -5001,6 +5025,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
int compaction_retries;
int no_progress_loops;
unsigned int cpuset_mems_cookie;
+ unsigned int zonelist_iter_cookie;
int reserve_flags;
/*
@@ -5011,11 +5036,12 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
-retry_cpuset:
+restart:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist_iter_cookie = zonelist_iter_begin();
/*
* The fast path uses conservative alloc_flags to succeed only until
@@ -5187,9 +5213,13 @@ retry:
goto retry;
- /* Deal with possible cpuset update races before we start OOM killing */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * a unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -5209,9 +5239,13 @@ retry:
}
nopage:
- /* Deal with possible cpuset update races before we fail */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * a unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
@@ -5706,6 +5740,18 @@ refill:
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
offset = size - fragsz;
+ if (unlikely(offset < 0)) {
+ /*
+ * The caller is trying to allocate a fragment
+ * with fragsz > PAGE_SIZE but the cache isn't big
+ * enough to satisfy the request, this may
+ * happen in low memory conditions.
+ * We don't release the cache page because
+ * it could make memory pressure worse
+ * so we simply return NULL here.
+ */
+ return NULL;
+ }
}
nc->pagecnt_bias--;
@@ -6514,9 +6560,8 @@ static void __build_all_zonelists(void *data)
int nid;
int __maybe_unused cpu;
pg_data_t *self = data;
- static DEFINE_SPINLOCK(lock);
- spin_lock(&lock);
+ write_seqlock(&zonelist_update_seq);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
@@ -6553,7 +6598,7 @@ static void __build_all_zonelists(void *data)
#endif
}
- spin_unlock(&lock);
+ write_sequnlock(&zonelist_update_seq);
}
static noinline void __init
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 9d73dc38e3d7..eb3a68ca92ad 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -288,6 +288,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* @isolate_before: isolate the pageblock before the boundary_pfn
* @skip_isolation: the flag to skip the pageblock isolation in second
* isolate_single_pageblock()
+ * @migratetype: migrate type to set in error recovery.
*
* Free and in-use pages can be as big as MAX_ORDER-1 and contain more than one
* pageblock. When not all pageblocks within a page are isolated at the same
@@ -302,9 +303,9 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* the in-use page then splitting the free page.
*/
static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
- gfp_t gfp_flags, bool isolate_before, bool skip_isolation)
+ gfp_t gfp_flags, bool isolate_before, bool skip_isolation,
+ int migratetype)
{
- unsigned char saved_mt;
unsigned long start_pfn;
unsigned long isolate_pageblock;
unsigned long pfn;
@@ -328,13 +329,13 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
start_pfn = max(ALIGN_DOWN(isolate_pageblock, MAX_ORDER_NR_PAGES),
zone->zone_start_pfn);
- saved_mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
+ if (skip_isolation) {
+ int mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock));
- if (skip_isolation)
- VM_BUG_ON(!is_migrate_isolate(saved_mt));
- else {
- ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt, flags,
- isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
+ VM_BUG_ON(!is_migrate_isolate(mt));
+ } else {
+ ret = set_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype,
+ flags, isolate_pageblock, isolate_pageblock + pageblock_nr_pages);
if (ret)
return ret;
@@ -475,7 +476,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
failed:
/* restore the original migratetype */
if (!skip_isolation)
- unset_migratetype_isolate(pfn_to_page(isolate_pageblock), saved_mt);
+ unset_migratetype_isolate(pfn_to_page(isolate_pageblock), migratetype);
return -EBUSY;
}
@@ -537,7 +538,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
bool skip_isolation = false;
/* isolate [isolate_start, isolate_start + pageblock_nr_pages) pageblock */
- ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false, skip_isolation);
+ ret = isolate_single_pageblock(isolate_start, flags, gfp_flags, false,
+ skip_isolation, migratetype);
if (ret)
return ret;
@@ -545,7 +547,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
skip_isolation = true;
/* isolate [isolate_end - pageblock_nr_pages, isolate_end) pageblock */
- ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true, skip_isolation);
+ ret = isolate_single_pageblock(isolate_end, flags, gfp_flags, true,
+ skip_isolation, migratetype);
if (ret) {
unset_migratetype_isolate(pfn_to_page(isolate_start), migratetype);
return ret;
diff --git a/mm/secretmem.c b/mm/secretmem.c
index e3e9590c6fb3..3f7154099795 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -285,7 +285,7 @@ static int secretmem_init(void)
secretmem_mnt = kern_mount(&secretmem_fs);
if (IS_ERR(secretmem_mnt))
- ret = PTR_ERR(secretmem_mnt);
+ return PTR_ERR(secretmem_mnt);
/* prevent secretmem mappings from ever getting PROT_EXEC */
secretmem_mnt->mnt_flags |= MNT_NOEXEC;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index e166051566f4..41afa6d45b23 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -151,7 +151,7 @@ void __delete_from_swap_cache(struct folio *folio,
for (i = 0; i < nr; i++) {
void *entry = xas_store(&xas, shadow);
- VM_BUG_ON_FOLIO(entry != folio, folio);
+ VM_BUG_ON_PAGE(entry != folio, entry);
set_page_private(folio_page(folio, i), 0);
xas_next(&xas);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b2b1431352dc..382dbe97329f 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2550,8 +2550,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
}
if (unlikely(buffer_heads_over_limit)) {
- if (folio_get_private(folio) && folio_trylock(folio)) {
- if (folio_get_private(folio))
+ if (folio_test_private(folio) && folio_trylock(folio)) {
+ if (folio_test_private(folio))
filemap_release_folio(folio, 0);
folio_unlock(folio);
}
diff --git a/tools/include/linux/gfp.h b/tools/include/linux/gfp.h
index b238dbc9eb85..6a10ff5f5be9 100644
--- a/tools/include/linux/gfp.h
+++ b/tools/include/linux/gfp.h
@@ -3,26 +3,7 @@
#define _TOOLS_INCLUDE_LINUX_GFP_H
#include <linux/types.h>
-
-#define __GFP_BITS_SHIFT 26
-#define __GFP_BITS_MASK ((gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
-
-#define __GFP_HIGH 0x20u
-#define __GFP_IO 0x40u
-#define __GFP_FS 0x80u
-#define __GFP_NOWARN 0x200u
-#define __GFP_ZERO 0x8000u
-#define __GFP_ATOMIC 0x80000u
-#define __GFP_ACCOUNT 0x100000u
-#define __GFP_DIRECT_RECLAIM 0x400000u
-#define __GFP_KSWAPD_RECLAIM 0x2000000u
-
-#define __GFP_RECLAIM (__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM)
-
-#define GFP_ZONEMASK 0x0fu
-#define GFP_ATOMIC (__GFP_HIGH | __GFP_ATOMIC | __GFP_KSWAPD_RECLAIM)
-#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS)
-#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM)
+#include <linux/gfp_types.h>
static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags)
{
diff --git a/tools/include/linux/gfp_types.h b/tools/include/linux/gfp_types.h
new file mode 100644
index 000000000000..5f9f1ed190a0
--- /dev/null
+++ b/tools/include/linux/gfp_types.h
@@ -0,0 +1 @@
+#include "../../../include/linux/gfp_types.h"