aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLinaro CI <ci-bot@linaro.org>2020-03-04 23:35:13 +0000
committerLinaro CI <ci-bot@linaro.org>2020-03-04 23:35:13 +0000
commitad23e27c3243eeb4955a9bd29b5e12629777fe5c (patch)
tree30241dfe32201a3ac5818cec55ace5daf3b36cf5
parent0ae42ff9e2a258b13fc16b5e5f2f04a2e4ec9778 (diff)
parentc98dd3b1dfa42f55545451d290a2c30cd1130121 (diff)
downloadhikey-linaro-ad23e27c3243eeb4955a9bd29b5e12629777fe5c.tar.gz
Merge remote-tracking branch 'aosp/mirror-android-4.19' into android-hikey-linaro-4.19-lkftv4.19.107-1766-gc98dd3b1dfa4-20200304-49
-rw-r--r--Documentation/sysctl/vm.txt21
-rw-r--r--include/linux/gfp.h9
-rw-r--r--include/linux/highmem.h5
-rw-r--r--include/linux/mm.h1
-rw-r--r--include/linux/mmzone.h24
-rw-r--r--kernel/sysctl.c8
-rw-r--r--mm/compaction.c124
-rw-r--r--mm/internal.h14
-rw-r--r--mm/page_alloc.c264
-rw-r--r--mm/vmscan.c133
10 files changed, 529 insertions, 74 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index a48baf202265..e33b6808db6b 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -64,6 +64,7 @@ Currently, these files are in /proc/sys/vm:
- swappiness
- user_reserve_kbytes
- vfs_cache_pressure
+- watermark_boost_factor
- watermark_scale_factor
- zone_reclaim_mode
@@ -872,6 +873,26 @@ ten times more freeable objects than there are.
=============================================================
+watermark_boost_factor:
+
+This factor controls the level of reclaim when memory is being fragmented.
+It defines the percentage of the high watermark of a zone that will be
+reclaimed if pages of different mobility are being mixed within pageblocks.
+The intent is that compaction has less work to do in the future and to
+increase the success rate of future high-order allocations such as SLUB
+allocations, THP and hugetlbfs pages.
+
+To make it sensible with respect to the watermark_scale_factor parameter,
+the unit is in fractions of 10,000. The default value of 15,000 means
+that up to 150% of the high watermark will be reclaimed in the event of
+a pageblock being mixed due to fragmentation. The level of reclaim is
+determined by the number of fragmentation events that occurred in the
+recent past. If this value is smaller than a pageblock then a pageblock's
+worth of pages will be reclaimed (e.g. 2MB on 64-bit x86). A boost factor
+of 0 will disable the feature.
+
+=============================================================
+
watermark_scale_factor:
This factor controls the aggressiveness of kswapd. It defines the
diff --git a/include/linux/gfp.h b/include/linux/gfp.h
index f78d1e89593f..194da88f128f 100644
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -44,6 +44,7 @@ struct vm_area_struct;
#else
#define ___GFP_NOLOCKDEP 0
#endif
+#define ___GFP_CMA 0x1000000u
/* If the above are modified, __GFP_BITS_SHIFT may need updating */
/*
@@ -57,6 +58,7 @@ struct vm_area_struct;
#define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM)
#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32)
#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */
+#define __GFP_CMA ((__force gfp_t)___GFP_CMA)
#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE)
/**
@@ -217,8 +219,13 @@ struct vm_area_struct;
#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP)
/* Room for N __GFP_FOO bits */
-#define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP))
+#define __GFP_BITS_SHIFT (25)
+#ifdef CONFIG_LOCKDEP
#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
+#else
+#define __GFP_BITS_MASK (((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) & \
+ ~0x800000u)
+#endif
/**
* DOC: Useful GFP flag combinations
diff --git a/include/linux/highmem.h b/include/linux/highmem.h
index 0690679832d4..e80602ee359b 100644
--- a/include/linux/highmem.h
+++ b/include/linux/highmem.h
@@ -181,7 +181,12 @@ static inline struct page *
alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
unsigned long vaddr)
{
+#ifndef CONFIG_CMA
return __alloc_zeroed_user_highpage(__GFP_MOVABLE, vma, vaddr);
+#else
+ return __alloc_zeroed_user_highpage(__GFP_MOVABLE|__GFP_CMA, vma,
+ vaddr);
+#endif
}
static inline void clear_highpage(struct page *page)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a3ece90256c0..c06305ce27d5 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2239,6 +2239,7 @@ extern void zone_pcp_reset(struct zone *zone);
/* page_alloc.c */
extern int min_free_kbytes;
+extern int watermark_boost_factor;
extern int watermark_scale_factor;
/* nommu.c */
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fc0cfa808e34..3394901d0cf6 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -40,8 +40,6 @@ enum migratetype {
MIGRATE_UNMOVABLE,
MIGRATE_MOVABLE,
MIGRATE_RECLAIMABLE,
- MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
- MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_CMA
/*
* MIGRATE_CMA migration type is designed to mimic the way
@@ -58,6 +56,8 @@ enum migratetype {
*/
MIGRATE_CMA,
#endif
+ MIGRATE_PCPTYPES, /* the number of types on the pcp lists */
+ MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES,
#ifdef CONFIG_MEMORY_ISOLATION
MIGRATE_ISOLATE, /* can't allocate from here */
#endif
@@ -70,9 +70,11 @@ extern char * const migratetype_names[MIGRATE_TYPES];
#ifdef CONFIG_CMA
# define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA)
# define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA)
+# define get_cma_migrate_type() MIGRATE_CMA
#else
# define is_migrate_cma(migratetype) false
# define is_migrate_cma_page(_page) false
+# define get_cma_migrate_type() MIGRATE_MOVABLE
#endif
static inline bool is_migrate_movable(int mt)
@@ -271,9 +273,10 @@ enum zone_watermarks {
NR_WMARK
};
-#define min_wmark_pages(z) (z->watermark[WMARK_MIN])
-#define low_wmark_pages(z) (z->watermark[WMARK_LOW])
-#define high_wmark_pages(z) (z->watermark[WMARK_HIGH])
+#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost)
+#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost)
+#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost)
+#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost)
struct per_cpu_pages {
int count; /* number of pages in the list */
@@ -364,7 +367,8 @@ struct zone {
/* Read-mostly fields */
/* zone watermarks, access with *_wmark_pages(zone) macros */
- unsigned long watermark[NR_WMARK];
+ unsigned long _watermark[NR_WMARK];
+ unsigned long watermark_boost;
unsigned long nr_reserved_highatomic;
@@ -385,6 +389,10 @@ struct zone {
struct pglist_data *zone_pgdat;
struct per_cpu_pageset __percpu *pageset;
+#ifdef CONFIG_CMA
+ bool cma_alloc;
+#endif
+
#ifndef CONFIG_SPARSEMEM
/*
* Flags for a pageblock_nr_pages block. See pageblock-flags.h.
@@ -486,6 +494,8 @@ struct zone {
unsigned long compact_cached_free_pfn;
/* pfn where async and sync compaction migration scanner should start */
unsigned long compact_cached_migrate_pfn[2];
+ unsigned long compact_init_migrate_pfn;
+ unsigned long compact_init_free_pfn;
#endif
#ifdef CONFIG_COMPACTION
@@ -888,6 +898,8 @@ static inline int is_highmem(struct zone *zone)
struct ctl_table;
int min_free_kbytes_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
+int watermark_boost_factor_sysctl_handler(struct ctl_table *, int,
+ void __user *, size_t *, loff_t *);
int watermark_scale_factor_sysctl_handler(struct ctl_table *, int,
void __user *, size_t *, loff_t *);
extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES];
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 521c38e9ac14..62949e56064a 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1495,6 +1495,14 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
+ .procname = "watermark_boost_factor",
+ .data = &watermark_boost_factor,
+ .maxlen = sizeof(watermark_boost_factor),
+ .mode = 0644,
+ .proc_handler = watermark_boost_factor_sysctl_handler,
+ .extra1 = &zero,
+ },
+ {
.procname = "watermark_scale_factor",
.data = &watermark_scale_factor,
.maxlen = sizeof(watermark_scale_factor),
diff --git a/mm/compaction.c b/mm/compaction.c
index 120e5559e6b4..b8ced23f6c5c 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -237,6 +237,70 @@ static bool pageblock_skip_persistent(struct page *page)
return false;
}
+/*
+ * Decide whether the pageblock containing @pfn is worth rescanning and, if
+ * so, clear its skip hint.
+ *
+ * @zone:         zone being reset; pages belonging to another zone are ignored
+ * @pfn:          any pfn inside the pageblock of interest
+ * @check_source: sample the block for PageLRU pages (migration source)
+ * @check_target: sample the block for PageBuddy pages (migration target)
+ *
+ * Returns true if the block is usable as a scanner restart point, i.e. its
+ * skip hint was cleared here or was already clear while both checks were
+ * requested.
+ */
+static bool
+__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
+							bool check_target)
+{
+	struct page *page = pfn_to_online_page(pfn);
+	struct page *block_page;
+	unsigned long block_pfn;
+	unsigned long end_pfn;
+
+	if (!page)
+		return false;
+	if (zone != page_zone(page))
+		return false;
+	if (pageblock_skip_persistent(page))
+		return false;
+
+	/*
+	 * If skip is already cleared do no further checking once the
+	 * restart points have been set.
+	 */
+	if (check_source && check_target && !get_pageblock_skip(page))
+		return true;
+
+	/*
+	 * If clearing skip for the target scanner, do not select a
+	 * non-movable pageblock as the starting point.
+	 */
+	if (!check_source && check_target &&
+	    get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
+		return false;
+
+	/*
+	 * Sample from the start of the pageblock (clamped to the zone) when
+	 * that page is online and in this zone; otherwise keep sampling from
+	 * the already validated @pfn. The end is an exclusive pfn bound so no
+	 * struct page is ever looked up for a pfn beyond the block or zone.
+	 */
+	block_pfn = pageblock_start_pfn(pfn);
+	end_pfn = min(block_pfn + pageblock_nr_pages, zone_end_pfn(zone));
+	block_pfn = max(block_pfn, zone->zone_start_pfn);
+	block_page = pfn_to_online_page(block_pfn);
+	if (block_page && zone == page_zone(block_page)) {
+		page = block_page;
+		pfn = block_pfn;
+	}
+
+	/*
+	 * Only clear the hint if a sample indicates there is either a
+	 * free page or an LRU page in the block. One or other condition
+	 * is necessary for the block to be a migration source/target.
+	 */
+	do {
+		/* pfn tracks page, so the validity check matches the sample */
+		if (pfn_valid_within(pfn)) {
+			if (check_source && PageLRU(page)) {
+				clear_pageblock_skip(page);
+				return true;
+			}
+
+			if (check_target && PageBuddy(page)) {
+				clear_pageblock_skip(page);
+				return true;
+			}
+		}
+
+		page += (1 << PAGE_ALLOC_COSTLY_ORDER);
+		pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
+	} while (pfn < end_pfn);
+
+	return false;
+}
+
+
/*
* This function is called to clear all cached information on pageblocks that
* should be skipped for page isolation when the migrate and free page scanner
@@ -244,30 +308,54 @@ static bool pageblock_skip_persistent(struct page *page)
*/
static void __reset_isolation_suitable(struct zone *zone)
{
- unsigned long start_pfn = zone->zone_start_pfn;
- unsigned long end_pfn = zone_end_pfn(zone);
- unsigned long pfn;
+ unsigned long migrate_pfn = zone->zone_start_pfn;
+ unsigned long free_pfn = zone_end_pfn(zone);
+ unsigned long reset_migrate = free_pfn;
+ unsigned long reset_free = migrate_pfn;
+ bool source_set = false;
+ bool free_set = false;
+
+ if (!zone->compact_blockskip_flush)
+ return;
zone->compact_blockskip_flush = false;
- /* Walk the zone and mark every pageblock as suitable for isolation */
- for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
- struct page *page;
-
+ /*
+ * Walk the zone and update pageblock skip information. Source looks
+ * for PageLRU while target looks for PageBuddy. When the scanner
+ * is found, both PageBuddy and PageLRU are checked as the pageblock
+ * is suitable as both source and target.
+ */
+ for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages,
+ free_pfn -= pageblock_nr_pages) {
cond_resched();
- page = pfn_to_online_page(pfn);
- if (!page)
- continue;
- if (zone != page_zone(page))
- continue;
- if (pageblock_skip_persistent(page))
- continue;
+ /* Update the migrate PFN */
+ if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) &&
+ migrate_pfn < reset_migrate) {
+ source_set = true;
+ reset_migrate = migrate_pfn;
+ zone->compact_init_migrate_pfn = reset_migrate;
+ zone->compact_cached_migrate_pfn[0] = reset_migrate;
+ zone->compact_cached_migrate_pfn[1] = reset_migrate;
+ }
- clear_pageblock_skip(page);
+ /* Update the free PFN */
+ if (__reset_isolation_pfn(zone, free_pfn, free_set, true) &&
+ free_pfn > reset_free) {
+ free_set = true;
+ reset_free = free_pfn;
+ zone->compact_init_free_pfn = reset_free;
+ zone->compact_cached_free_pfn = reset_free;
+ }
}
- reset_cached_positions(zone);
+ /* Leave no distance if no suitable block was reset */
+ if (reset_migrate >= reset_free) {
+ zone->compact_cached_migrate_pfn[0] = migrate_pfn;
+ zone->compact_cached_migrate_pfn[1] = migrate_pfn;
+ zone->compact_cached_free_pfn = free_pfn;
+ }
}
void reset_isolation_suitable(pg_data_t *pgdat)
@@ -1431,7 +1519,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
if (is_via_compact_memory(order))
return COMPACT_CONTINUE;
- watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
+ watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
/*
* If watermarks for high-order allocation are already met, there
* should be no need for compaction at all.
@@ -1591,7 +1679,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
- if (cc->migrate_pfn == start_pfn)
+ if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn)
cc->whole_zone = true;
}
diff --git a/mm/internal.h b/mm/internal.h
index 397183c8fe47..5b9734aff4a5 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -490,10 +490,16 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
#define ALLOC_OOM ALLOC_NO_WATERMARKS
#endif
-#define ALLOC_HARDER 0x10 /* try to alloc harder */
-#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
-#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
-#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
+#define ALLOC_HARDER 0x10 /* try to alloc harder */
+#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
+#ifdef CONFIG_ZONE_DMA32
+#define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */
+#else
+#define ALLOC_NOFRAGMENT 0x0
+#endif
+#define ALLOC_KSWAPD 0x200 /* allow waking of kswapd */
enum ttu_flags;
struct tlbflush_unmap_batch;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index e8137c9a013f..b26a1a67fb1a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -291,10 +291,10 @@ char * const migratetype_names[MIGRATE_TYPES] = {
"Unmovable",
"Movable",
"Reclaimable",
- "HighAtomic",
#ifdef CONFIG_CMA
"CMA",
#endif
+ "HighAtomic",
#ifdef CONFIG_MEMORY_ISOLATION
"Isolate",
#endif
@@ -318,6 +318,7 @@ compound_page_dtor * const compound_page_dtors[] = {
*/
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
+int watermark_boost_factor __read_mostly = 15000;
int watermark_scale_factor = 10;
/*
@@ -2218,6 +2219,21 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
return false;
}
+/*
+ * Raise zone->watermark_boost by one pageblock worth of pages, capped at
+ * watermark_boost_factor/10000 of the zone's high watermark (but never
+ * capped below one pageblock). Called when a fallback allocation steals
+ * from another migratetype. Does nothing when watermark_boost_factor is 0.
+ */
+static inline void boost_watermark(struct zone *zone)
+{
+	unsigned long max_boost;
+
+	if (!watermark_boost_factor)
+		return;
+
+	max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
+			watermark_boost_factor, 10000);
+
+	/*
+	 * A zero cap means the high watermark itself is zero, e.g. the
+	 * watermarks have not been set up yet. Do not fall through and
+	 * boost by a pageblock in that case: there is no sensible target
+	 * for reclaim to restore.
+	 */
+	if (!max_boost)
+		return;
+
+	max_boost = max(pageblock_nr_pages, max_boost);
+
+	zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
+		max_boost);
+}
+
+
/*
* This function implements actual steal behaviour. If order is large enough,
* we can steal whole pageblock. If not, we first move freepages in this
@@ -2227,7 +2243,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
* itself, so pages freed in the future will be put on the correct free list.
*/
static void steal_suitable_fallback(struct zone *zone, struct page *page,
- int start_type, bool whole_block)
+ unsigned int alloc_flags, int start_type, bool whole_block)
{
unsigned int current_order = page_order(page);
struct free_area *area;
@@ -2249,6 +2265,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
goto single_page;
}
+ /*
+ * Boost watermarks to increase reclaim pressure to reduce the
+ * likelihood of future fallbacks. Wake kswapd now as the node
+ * may be balanced overall and kswapd will not wake naturally.
+ */
+ boost_watermark(zone);
+ if (alloc_flags & ALLOC_KSWAPD)
+ wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+
/* We are not allowed to try stealing from the whole block */
if (!whole_block)
goto single_page;
@@ -2464,20 +2489,30 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
* condition simpler.
*/
static __always_inline bool
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+ unsigned int alloc_flags)
{
struct free_area *area;
int current_order;
+ int min_order = order;
struct page *page;
int fallback_mt;
bool can_steal;
/*
+ * Do not steal pages from freelists belonging to other pageblocks
+ * i.e. orders < pageblock_order. If there are no local zones free,
+ * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
+ */
+ if (alloc_flags & ALLOC_NOFRAGMENT)
+ min_order = pageblock_order;
+
+ /*
* Find the largest available free page in the other list. This roughly
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
- for (current_order = MAX_ORDER - 1; current_order >= order;
+ for (current_order = MAX_ORDER - 1; current_order >= min_order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2522,7 +2557,8 @@ do_steal:
page = list_first_entry(&area->free_list[fallback_mt],
struct page, lru);
- steal_suitable_fallback(zone, page, start_migratetype, can_steal);
+ steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
+ can_steal);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
@@ -2536,24 +2572,40 @@ do_steal:
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
-__rmqueue(struct zone *zone, unsigned int order, int migratetype)
+__rmqueue(struct zone *zone, unsigned int order, int migratetype,
+ unsigned int alloc_flags)
{
struct page *page;
retry:
page = __rmqueue_smallest(zone, order, migratetype);
- if (unlikely(!page)) {
- if (migratetype == MIGRATE_MOVABLE)
- page = __rmqueue_cma_fallback(zone, order);
- if (!page && __rmqueue_fallback(zone, order, migratetype))
- goto retry;
- }
+ if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype,
+ alloc_flags))
+ goto retry;
trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
+#ifdef CONFIG_CMA
+/*
+ * Take a page of the given order from the zone's CMA free lists.
+ *
+ * Returns NULL without touching the free lists while zone->cma_alloc is
+ * set, which alloc_contig_range() does for the duration of its work.
+ * The result (including NULL) is reported through the
+ * mm_page_alloc_zone_locked tracepoint with MIGRATE_CMA.
+ */
+static struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
+{
+	struct page *page = NULL;
+
+	if (!zone->cma_alloc)
+		page = __rmqueue_cma_fallback(zone, order);
+
+	trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA);
+	return page;
+}
+#else
+/* Without CMA there are no CMA free lists to allocate from. */
+static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order)
+{
+	return NULL;
+}
+#endif
+
/*
* Obtain a specified number of elements from the buddy allocator, all under
* a single hold of the lock, for efficiency. Add them to the supplied list.
@@ -2561,13 +2613,24 @@ retry:
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
- int migratetype)
+ int migratetype, unsigned int alloc_flags)
{
int i, alloced = 0;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
- struct page *page = __rmqueue(zone, order, migratetype);
+ struct page *page;
+
+ /*
+ * If migrate type CMA is being requested only try to
+ * satisfy the request with CMA pages to try and increase
+ * CMA utilization.
+ */
+ if (is_migrate_cma(migratetype))
+ page = __rmqueue_cma(zone, order);
+ else
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
+
if (unlikely(page == NULL))
break;
@@ -2602,6 +2665,28 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order,
return alloced;
}
+/*
+ * Look up the per-cpu list for @migratetype, refilling it from the buddy
+ * allocator with one batch when it is empty.
+ *
+ * Returns the list if it holds at least one page afterwards, or NULL if
+ * the refill could not add any pages.
+ */
+static struct list_head *get_populated_pcp_list(struct zone *zone,
+			unsigned int order, struct per_cpu_pages *pcp,
+			int migratetype, unsigned int alloc_flags)
+{
+	struct list_head *pcp_list = &pcp->lists[migratetype];
+
+	/* Fast path: pages are already cached for this type */
+	if (!list_empty(pcp_list))
+		return pcp_list;
+
+	pcp->count += rmqueue_bulk(zone, order, pcp->batch, pcp_list,
+				   migratetype, alloc_flags);
+
+	return list_empty(pcp_list) ? NULL : pcp_list;
+}
+
+
#ifdef CONFIG_NUMA
/*
* Called from the vmstat counter updater to drain pagesets of this
@@ -3023,17 +3108,30 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z)
/* Remove page from the per-cpu list, caller must protect the list */
static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+ unsigned int alloc_flags,
struct per_cpu_pages *pcp,
- struct list_head *list)
+ gfp_t gfp_flags)
{
- struct page *page;
+ struct page *page = NULL;
+ struct list_head *list = NULL;
do {
- if (list_empty(list)) {
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, list,
- migratetype);
- if (unlikely(list_empty(list)))
+ /* First try to get CMA pages */
+ if (migratetype == MIGRATE_MOVABLE &&
+ gfp_flags & __GFP_CMA) {
+ list = get_populated_pcp_list(zone, 0, pcp,
+ get_cma_migrate_type(), alloc_flags);
+ }
+
+ if (list == NULL) {
+ /*
+ * Either CMA is not suitable or there are no
+ * free CMA pages.
+ */
+ list = get_populated_pcp_list(zone, 0, pcp,
+ migratetype, alloc_flags);
+ if (unlikely(list == NULL) ||
+ unlikely(list_empty(list)))
return NULL;
}
@@ -3048,17 +3146,17 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int migratetype)
+ gfp_t gfp_flags, int migratetype,
+ unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
- struct list_head *list;
struct page *page;
unsigned long flags;
local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
- list = &pcp->lists[migratetype];
- page = __rmqueue_pcplist(zone, migratetype, pcp, list);
+ page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp,
+ gfp_flags);
if (page) {
__count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
zone_statistics(preferred_zone, zone);
@@ -3081,7 +3179,7 @@ struct page *rmqueue(struct zone *preferred_zone,
if (likely(order == 0)) {
page = rmqueue_pcplist(preferred_zone, zone, order,
- gfp_flags, migratetype);
+ gfp_flags, migratetype, alloc_flags);
goto out;
}
@@ -3094,14 +3192,21 @@ struct page *rmqueue(struct zone *preferred_zone,
do {
page = NULL;
+
if (alloc_flags & ALLOC_HARDER) {
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (page)
trace_mm_page_alloc_zone_locked(page, order, migratetype);
}
+
+ if (!page && migratetype == MIGRATE_MOVABLE &&
+ gfp_flags & __GFP_CMA)
+ page = __rmqueue_cma(zone, order);
+
if (!page)
- page = __rmqueue(zone, order, migratetype);
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
} while (page && check_new_pages(page, order));
+
spin_unlock(&zone->lock);
if (!page)
goto failed;
@@ -3343,6 +3448,40 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
#endif /* CONFIG_NUMA */
/*
+ * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
+ * fragmentation is subtle. If the preferred zone was HIGHMEM then
+ * premature use of a lower zone may cause lowmem pressure problems that
+ * are worse than fragmentation. If the next zone is ZONE_DMA then it is
+ * probably too small. It only makes sense to spread allocations to avoid
+ * fragmentation between the Normal and DMA32 zones.
+ */
+/*
+ * Compute the extra alloc_flags for the first allocation attempt.
+ *
+ * @zone:     preferred zone for the allocation, may be NULL
+ * @gfp_mask: gfp mask of the request
+ *
+ * ALLOC_KSWAPD is set when the request allows waking kswapd. With
+ * CONFIG_ZONE_DMA32, ALLOC_NOFRAGMENT is additionally set when the
+ * preferred zone is ZONE_NORMAL and, on multi-node systems, the zone
+ * below it is populated.
+ */
+static inline unsigned int
+alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
+{
+	unsigned int alloc_flags = 0;
+
+	if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+		alloc_flags |= ALLOC_KSWAPD;
+
+#ifdef CONFIG_ZONE_DMA32
+	/* No preferred zone: nothing to inspect, and zone_idx() needs one */
+	if (!zone)
+		return alloc_flags;
+
+	if (zone_idx(zone) != ZONE_NORMAL)
+		return alloc_flags;
+
+	/*
+	 * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
+	 * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
+	 * on UMA that if Normal is populated then so is DMA32.
+	 */
+	BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
+	if (nr_online_nodes > 1 && !populated_zone(--zone))
+		return alloc_flags;
+
+	/* Without this the no-fragmentation first pass never takes effect */
+	alloc_flags |= ALLOC_NOFRAGMENT;
+#endif /* CONFIG_ZONE_DMA32 */
+	return alloc_flags;
+}
+
+
+/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
@@ -3350,14 +3489,18 @@ static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
- struct zoneref *z = ac->preferred_zoneref;
+ struct zoneref *z;
struct zone *zone;
struct pglist_data *last_pgdat_dirty_limit = NULL;
+ bool no_fallback;
+retry:
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
+ no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
+ z = ac->preferred_zoneref;
for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
struct page *page;
@@ -3396,7 +3539,23 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
}
}
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
+ if (no_fallback && nr_online_nodes > 1 &&
+ zone != ac->preferred_zoneref->zone) {
+ int local_nid;
+
+ /*
+ * If moving to a remote node, retry but allow
+ * fragmenting fallbacks. Locality is more important
+ * than fragmentation avoidance.
+ */
+ local_nid = zone_to_nid(ac->preferred_zoneref->zone);
+ if (zone_to_nid(zone) != local_nid) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
+ goto retry;
+ }
+ }
+
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
ac_classzone_idx(ac), alloc_flags)) {
int ret;
@@ -3463,6 +3622,15 @@ try_this_zone:
}
}
+ /*
+ * It's possible on a UMA machine to get through all zones that are
+ * fragmented. If avoiding fragmentation, reset and try again.
+ */
+ if (no_fallback) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
+ goto retry;
+ }
+
return NULL;
}
@@ -3964,6 +4132,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
+ if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ alloc_flags |= ALLOC_KSWAPD;
+
#ifdef CONFIG_CMA
if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
@@ -4195,7 +4366,7 @@ retry_cpuset:
if (!ac->preferred_zoneref->zone)
goto nopage;
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
/*
@@ -4253,7 +4424,7 @@ retry_cpuset:
retry:
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
@@ -4472,6 +4643,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid,
finalise_ac(gfp_mask, &ac);
+ /*
+ * Forbid the first pass from falling back to types that fragment
+ * memory until all local zones are considered.
+ */
+ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
+
/* First allocation attempt */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page))
@@ -4812,7 +4989,7 @@ long si_mem_available(void)
pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
for_each_zone(zone)
- wmark_low += zone->watermark[WMARK_LOW];
+ wmark_low += low_wmark_pages(zone);
/*
* Estimate the amount of memory available for userspace allocations,
@@ -7386,13 +7563,13 @@ static void __setup_per_zone_wmarks(void)
min_pages = zone->managed_pages / 1024;
min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
- zone->watermark[WMARK_MIN] = min_pages;
+ zone->_watermark[WMARK_MIN] = min_pages;
} else {
/*
* If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->watermark[WMARK_MIN] = min;
+ zone->_watermark[WMARK_MIN] = min;
}
/*
@@ -7404,10 +7581,11 @@ static void __setup_per_zone_wmarks(void)
mult_frac(zone->managed_pages,
watermark_scale_factor, 10000));
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) +
+ zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) +
low + min;
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) +
+ zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) +
low + min * 2;
+ zone->watermark_boost = 0;
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -7508,6 +7686,18 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
+/*
+ * sysctl handler for vm.watermark_boost_factor. The value is parsed and
+ * range-checked by proc_dointvec_minmax(); its result (0 on success, an
+ * error code otherwise) is passed straight back to the caller.
+ */
+int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
+	void __user *buffer, size_t *length, loff_t *ppos)
+{
+	return proc_dointvec_minmax(table, write, buffer, length, ppos);
+}
+
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
@@ -8033,6 +8223,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
if (ret)
return ret;
+ cc.zone->cma_alloc = 1;
/*
* In case of -EBUSY, we'd like to know which page causes problem.
* So, just fall through. test_pages_isolated() has a tracepoint
@@ -8115,6 +8306,7 @@ int alloc_contig_range(unsigned long start, unsigned long end,
done:
undo_isolate_page_range(pfn_max_align_down(start),
pfn_max_align_up(end), migratetype);
+ cc.zone->cma_alloc = 0;
return ret;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 32b2424a3203..84d1b7488a48 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -87,6 +87,9 @@ struct scan_control {
/* Can pages be swapped as part of reclaim? */
unsigned int may_swap:1;
+ /* e.g. boosted watermark reclaim leaves slabs alone */
+ unsigned int may_shrinkslab:1;
+
/*
* Cgroups are not reclaimed below their configured memory.low,
* unless we threaten to OOM. If any cgroups are skipped due to
@@ -2739,8 +2742,10 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc)
shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
node_lru_pages += lru_pages;
- shrink_slab(sc->gfp_mask, pgdat->node_id,
+ if (sc->may_shrinkslab) {
+ shrink_slab(sc->gfp_mask, pgdat->node_id,
memcg, sc->priority);
+ }
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
@@ -3218,6 +3223,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = 1,
+ .may_shrinkslab = 1,
};
/*
@@ -3262,6 +3268,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
.may_unmap = 1,
.reclaim_idx = MAX_NR_ZONES - 1,
.may_swap = !noswap,
+ .may_shrinkslab = 1,
};
unsigned long lru_pages;
@@ -3308,6 +3315,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = may_swap,
+ .may_shrinkslab = 1,
};
/*
@@ -3358,6 +3366,30 @@ static void age_active_anon(struct pglist_data *pgdat,
} while (memcg);
}
+/*
+ * Returns true if any managed zone of @pgdat at or below @classzone_idx
+ * currently has a non-zero watermark boost.
+ */
+static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
+{
+	int zid = classzone_idx;
+
+	/*
+	 * Walk the zones top-down because the higher zones are the ones
+	 * more likely to be boosted. Boosts are deliberately checked
+	 * separately from the watermarks: checking both at the same time
+	 * would start reclaim prematurely when nothing is boosted and a
+	 * lower zone is balanced.
+	 */
+	while (zid >= 0) {
+		struct zone *z = pgdat->node_zones + zid;
+
+		zid--;
+		if (managed_zone(z) && z->watermark_boost)
+			return true;
+	}
+
+	return false;
+}
+
/*
* Returns true if there is an eligible zone balanced for the request order
* and classzone_idx
@@ -3368,6 +3400,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
unsigned long mark = -1;
struct zone *zone;
+ /*
+ * Check watermarks bottom-up as lower zones are more likely to
+ * meet watermarks.
+ */
for (i = 0; i <= classzone_idx; i++) {
zone = pgdat->node_zones + i;
@@ -3496,14 +3532,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
unsigned long pflags;
+ unsigned long nr_boost_reclaim;
+ unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
+ bool boosted;
struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
- .priority = DEF_PRIORITY,
- .may_writepage = !laptop_mode,
.may_unmap = 1,
- .may_swap = 1,
};
psi_memstall_enter(&pflags);
@@ -3511,9 +3547,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
count_vm_event(PAGEOUTRUN);
+ /*
+ * Account for the reclaim boost. Note that the zone boost is left in
+ * place so that parallel allocations that are near the watermark will
+ * stall or direct reclaim until kswapd is finished.
+ */
+ nr_boost_reclaim = 0;
+ for (i = 0; i <= classzone_idx; i++) {
+ zone = pgdat->node_zones + i;
+ if (!managed_zone(zone))
+ continue;
+
+ nr_boost_reclaim += zone->watermark_boost;
+ zone_boosts[i] = zone->watermark_boost;
+ }
+ boosted = nr_boost_reclaim;
+
+restart:
+ sc.priority = DEF_PRIORITY;
do {
unsigned long nr_reclaimed = sc.nr_reclaimed;
bool raise_priority = true;
+ bool balanced;
bool ret;
sc.reclaim_idx = classzone_idx;
@@ -3540,13 +3595,40 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
}
/*
- * Only reclaim if there are no eligible zones. Note that
- * sc.reclaim_idx is not used as buffer_heads_over_limit may
- * have adjusted it.
+ * If the pgdat is imbalanced then ignore boosting and preserve
+ * the watermarks for a later time and restart. Note that the
+ * zone watermarks will still be reset at the end of balancing
+ * on the grounds that the normal reclaim should be enough to
+ * re-evaluate if boosting is required when kswapd next wakes.
+ */
+ balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
+ if (!balanced && nr_boost_reclaim) {
+ nr_boost_reclaim = 0;
+ goto restart;
+ }
+
+ /*
+ * If boosting is not active then only reclaim if there are no
+ * eligible zones. Note that sc.reclaim_idx is not used as
+ * buffer_heads_over_limit may have adjusted it.
*/
- if (pgdat_balanced(pgdat, sc.order, classzone_idx))
+ if (!nr_boost_reclaim && balanced)
goto out;
+ /* Limit the priority of boosting to avoid reclaim writeback */
+ if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
+ raise_priority = false;
+
+ /*
+ * Do not writeback or swap pages for boosted reclaim. The
+ * intent is to relieve pressure not issue sub-optimal IO
+ * from reclaim context. If no pages are reclaimed, the
+ * reclaim will be aborted.
+ */
+ sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
+ sc.may_swap = !nr_boost_reclaim;
+ sc.may_shrinkslab = !nr_boost_reclaim;
+
/*
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming. All
@@ -3598,6 +3680,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
* progress in reclaiming pages
*/
nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+ nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
+
+ /*
+ * If reclaim made no progress for a boost, stop reclaim as
+ * IO cannot be queued and it could be an infinite loop in
+ * extreme circumstances.
+ */
+ if (nr_boost_reclaim && !nr_reclaimed)
+ break;
+
if (raise_priority || !nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1);
@@ -3606,6 +3698,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
pgdat->kswapd_failures++;
out:
+ /* If reclaim was boosted, account for the reclaim done in this pass */
+ if (boosted) {
+ unsigned long flags;
+
+ for (i = 0; i <= classzone_idx; i++) {
+ if (!zone_boosts[i])
+ continue;
+
+ /* Increments are under the zone lock */
+ zone = pgdat->node_zones + i;
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ /*
+ * As there is now likely space, wake up kcompactd to defragment
+ * pageblocks.
+ */
+ wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
+ }
+
snapshot_refaults(NULL, pgdat);
__fs_reclaim_release();
psi_memstall_leave(&pflags);
@@ -3837,7 +3951,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
/* Hopeless node, leave it to direct reclaim if possible */
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
- pgdat_balanced(pgdat, order, classzone_idx)) {
+ (pgdat_balanced(pgdat, order, classzone_idx) &&
+ !pgdat_watermark_boosted(pgdat, classzone_idx))) {
/*
* There may be plenty of free memory available, but it's too
* fragmented for high-order allocations. Wake up kcompactd