diff options
author | Linaro CI <ci-bot@linaro.org> | 2020-03-04 23:35:13 +0000 |
---|---|---|
committer | Linaro CI <ci-bot@linaro.org> | 2020-03-04 23:35:13 +0000 |
commit | ad23e27c3243eeb4955a9bd29b5e12629777fe5c (patch) | |
tree | 30241dfe32201a3ac5818cec55ace5daf3b36cf5 | |
parent | 0ae42ff9e2a258b13fc16b5e5f2f04a2e4ec9778 (diff) | |
parent | c98dd3b1dfa42f55545451d290a2c30cd1130121 (diff) | |
download | hikey-linaro-ad23e27c3243eeb4955a9bd29b5e12629777fe5c.tar.gz |
Merge remote-tracking branch 'aosp/mirror-android-4.19' into android-hikey-linaro-4.19-lkftv4.19.107-1766-gc98dd3b1dfa4-20200304-49
-rw-r--r-- | Documentation/sysctl/vm.txt | 21 | ||||
-rw-r--r-- | include/linux/gfp.h | 9 | ||||
-rw-r--r-- | include/linux/highmem.h | 5 | ||||
-rw-r--r-- | include/linux/mm.h | 1 | ||||
-rw-r--r-- | include/linux/mmzone.h | 24 | ||||
-rw-r--r-- | kernel/sysctl.c | 8 | ||||
-rw-r--r-- | mm/compaction.c | 124 | ||||
-rw-r--r-- | mm/internal.h | 14 | ||||
-rw-r--r-- | mm/page_alloc.c | 264 | ||||
-rw-r--r-- | mm/vmscan.c | 133 |
10 files changed, 529 insertions, 74 deletions
diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index a48baf202265..e33b6808db6b 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -64,6 +64,7 @@ Currently, these files are in /proc/sys/vm: - swappiness - user_reserve_kbytes - vfs_cache_pressure +- watermark_boost_factor - watermark_scale_factor - zone_reclaim_mode @@ -872,6 +873,26 @@ ten times more freeable objects than there are. ============================================================= +watermark_boost_factor: + +This factor controls the level of reclaim when memory is being fragmented. +It defines the percentage of the high watermark of a zone that will be +reclaimed if pages of different mobility are being mixed within pageblocks. +The intent is that compaction has less work to do in the future and to +increase the success rate of future high-order allocations such as SLUB +allocations, THP and hugetlbfs pages. + +To make it sensible with respect to the watermark_scale_factor parameter, +the unit is in fractions of 10,000. The default value of 15,000 means +that up to 150% of the high watermark will be reclaimed in the event of +a pageblock being mixed due to fragmentation. The level of reclaim is +determined by the number of fragmentation events that occurred in the +recent past. If this value is smaller than a pageblock then a pageblocks +worth of pages will be reclaimed (e.g. 2MB on 64-bit x86). A boost factor +of 0 will disable the feature. + +============================================================= + watermark_scale_factor: This factor controls the aggressiveness of kswapd. It defines the diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f78d1e89593f..194da88f128f 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -44,6 +44,7 @@ struct vm_area_struct; #else #define ___GFP_NOLOCKDEP 0 #endif +#define ___GFP_CMA 0x1000000u /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -57,6 +58,7 @@ struct vm_area_struct; #define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM) #define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32) #define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */ +#define __GFP_CMA ((__force gfp_t)___GFP_CMA) #define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) /** @@ -217,8 +219,13 @@ struct vm_area_struct; #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT (25) +#ifdef CONFIG_LOCKDEP #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) +#else +#define __GFP_BITS_MASK (((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) & \ + ~0x800000u) +#endif /** * DOC: Useful GFP flag combinations diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 0690679832d4..e80602ee359b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -181,7 +181,12 @@ static inline struct page * alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma, unsigned long vaddr) { +#ifndef CONFIG_CMA return __alloc_zeroed_user_highpage(__GFP_MOVABLE, vma, vaddr); +#else + return __alloc_zeroed_user_highpage(__GFP_MOVABLE|__GFP_CMA, vma, + vaddr); +#endif } static inline void clear_highpage(struct page *page) diff --git a/include/linux/mm.h b/include/linux/mm.h index a3ece90256c0..c06305ce27d5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2239,6 +2239,7 @@ extern void zone_pcp_reset(struct zone *zone); /* page_alloc.c */ extern int min_free_kbytes; +extern int watermark_boost_factor; extern int watermark_scale_factor; /* nommu.c */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fc0cfa808e34..3394901d0cf6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -40,8 +40,6 @@ enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, - MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ - MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way @@ -58,6 +56,8 @@ enum migratetype { */ MIGRATE_CMA, #endif + MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ + MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_MEMORY_ISOLATION MIGRATE_ISOLATE, /* can't allocate from here */ #endif @@ -70,9 +70,11 @@ extern char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) +# define get_cma_migrate_type() MIGRATE_CMA #else # define is_migrate_cma(migratetype) false # define is_migrate_cma_page(_page) false +# define get_cma_migrate_type() MIGRATE_MOVABLE #endif static inline bool is_migrate_movable(int mt) @@ -271,9 +273,10 @@ enum zone_watermarks { NR_WMARK }; -#define min_wmark_pages(z) (z->watermark[WMARK_MIN]) -#define low_wmark_pages(z) (z->watermark[WMARK_LOW]) -#define high_wmark_pages(z) (z->watermark[WMARK_HIGH]) +#define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) +#define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) +#define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) +#define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) struct per_cpu_pages { int count; /* number of pages in the list */ @@ -364,7 +367,8 @@ struct zone { /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ - unsigned long watermark[NR_WMARK]; + unsigned long _watermark[NR_WMARK]; + unsigned long watermark_boost; unsigned long nr_reserved_highatomic; @@ -385,6 +389,10 @@ struct zone { struct pglist_data *zone_pgdat; struct per_cpu_pageset __percpu *pageset; +#ifdef CONFIG_CMA + bool cma_alloc; +#endif + #ifndef CONFIG_SPARSEMEM /* * Flags for a pageblock_nr_pages block. See pageblock-flags.h. @@ -486,6 +494,8 @@ struct zone { unsigned long compact_cached_free_pfn; /* pfn where async and sync compaction migration scanner should start */ unsigned long compact_cached_migrate_pfn[2]; + unsigned long compact_init_migrate_pfn; + unsigned long compact_init_free_pfn; #endif #ifdef CONFIG_COMPACTION @@ -888,6 +898,8 @@ static inline int is_highmem(struct zone *zone) struct ctl_table; int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); +int watermark_boost_factor_sysctl_handler(struct ctl_table *, int, + void __user *, size_t *, loff_t *); int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 521c38e9ac14..62949e56064a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1495,6 +1495,14 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { + .procname = "watermark_boost_factor", + .data = &watermark_boost_factor, + .maxlen = sizeof(watermark_boost_factor), + .mode = 0644, + .proc_handler = watermark_boost_factor_sysctl_handler, + .extra1 = &zero, + }, + { .procname = "watermark_scale_factor", .data = &watermark_scale_factor, .maxlen = sizeof(watermark_scale_factor), diff --git a/mm/compaction.c b/mm/compaction.c index 120e5559e6b4..b8ced23f6c5c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -237,6 +237,70 @@ static bool pageblock_skip_persistent(struct page *page) return false; } +static bool +__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source, + bool check_target) +{ + struct page *page = pfn_to_online_page(pfn); + struct page *end_page; + unsigned long block_pfn; + + if (!page) + return false; + if (zone != page_zone(page)) + return false; + if (pageblock_skip_persistent(page)) + return false; + + /* + * If skip is already cleared do no further checking once the + * restart points have been set. + */ + if (check_source && check_target && !get_pageblock_skip(page)) + return true; + + /* + * If clearing skip for the target scanner, do not select a + * non-movable pageblock as the starting point. + */ + if (!check_source && check_target && + get_pageblock_migratetype(page) != MIGRATE_MOVABLE) + return false; + + /* + * Only clear the hint if a sample indicates there is either a + * free page or an LRU page in the block. One or other condition + * is necessary for the block to be a migration source/target. + */ + block_pfn = pageblock_start_pfn(pfn); + pfn = max(block_pfn, zone->zone_start_pfn); + page = pfn_to_page(pfn); + if (zone != page_zone(page)) + return false; + pfn = block_pfn + pageblock_nr_pages; + pfn = min(pfn, zone_end_pfn(zone)); + end_page = pfn_to_page(pfn); + + do { + if (pfn_valid_within(pfn)) { + if (check_source && PageLRU(page)) { + clear_pageblock_skip(page); + return true; + } + + if (check_target && PageBuddy(page)) { + clear_pageblock_skip(page); + return true; + } + } + + page += (1 << PAGE_ALLOC_COSTLY_ORDER); + pfn += (1 << PAGE_ALLOC_COSTLY_ORDER); + } while (page < end_page); + + return false; +} + /* * This function is called to clear all cached information on pageblocks that * should be skipped for page isolation when the migrate and free page scanner @@ -244,30 +308,54 @@ static bool pageblock_skip_persistent(struct page *page) */ static void __reset_isolation_suitable(struct zone *zone) { - unsigned long start_pfn = zone->zone_start_pfn; - unsigned long end_pfn = zone_end_pfn(zone); - unsigned long pfn; + unsigned long migrate_pfn = zone->zone_start_pfn; + unsigned long free_pfn = zone_end_pfn(zone); + unsigned long reset_migrate = free_pfn; + unsigned long reset_free = migrate_pfn; + bool source_set = false; + bool free_set = false; + + if (!zone->compact_blockskip_flush) + return; zone->compact_blockskip_flush = false; - /* Walk the zone and mark every pageblock as suitable for isolation */ - for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { - struct page *page; - + /* + * Walk the zone and update pageblock skip information. Source looks + * for PageLRU while target looks for PageBuddy. When the scanner + * is found, both PageBuddy and PageLRU are checked as the pageblock + * is suitable as both source and target. + */ + for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages, + free_pfn -= pageblock_nr_pages) { cond_resched(); - page = pfn_to_online_page(pfn); - if (!page) - continue; - if (zone != page_zone(page)) - continue; - if (pageblock_skip_persistent(page)) - continue; + /* Update the migrate PFN */ + if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) && + migrate_pfn < reset_migrate) { + source_set = true; + reset_migrate = migrate_pfn; + zone->compact_init_migrate_pfn = reset_migrate; + zone->compact_cached_migrate_pfn[0] = reset_migrate; + zone->compact_cached_migrate_pfn[1] = reset_migrate; + } - clear_pageblock_skip(page); + /* Update the free PFN */ + if (__reset_isolation_pfn(zone, free_pfn, free_set, true) && + free_pfn > reset_free) { + free_set = true; + reset_free = free_pfn; + zone->compact_init_free_pfn = reset_free; + zone->compact_cached_free_pfn = reset_free; + } } - reset_cached_positions(zone); + /* Leave no distance if no suitable block was reset */ + if (reset_migrate >= reset_free) { + zone->compact_cached_migrate_pfn[0] = migrate_pfn; + zone->compact_cached_migrate_pfn[1] = migrate_pfn; + zone->compact_cached_free_pfn = free_pfn; + } } void reset_isolation_suitable(pg_data_t *pgdat) @@ -1431,7 +1519,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order, if (is_via_compact_memory(order)) return COMPACT_CONTINUE; - watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; + watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); /* * If watermarks for high-order allocation are already met, there * should be no need for compaction at all. @@ -1591,7 +1679,7 @@ static enum compact_result compact_zone(struct zone *zone, struct compact_contro zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } - if (cc->migrate_pfn == start_pfn) + if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn) cc->whole_zone = true; } diff --git a/mm/internal.h b/mm/internal.h index 397183c8fe47..5b9734aff4a5 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -490,10 +490,16 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_OOM ALLOC_NO_WATERMARKS #endif -#define ALLOC_HARDER 0x10 /* try to alloc harder */ -#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ -#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ -#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ +#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ +#ifdef CONFIG_ZONE_DMA32 +#define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */ +#else +#define ALLOC_NOFRAGMENT 0x0 +#endif +#define ALLOC_KSWAPD 0x200 /* allow waking of kswapd */ enum ttu_flags; struct tlbflush_unmap_batch; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e8137c9a013f..b26a1a67fb1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -291,10 +291,10 @@ char * const migratetype_names[MIGRATE_TYPES] = { "Unmovable", "Movable", "Reclaimable", - "HighAtomic", #ifdef CONFIG_CMA "CMA", #endif + "HighAtomic", #ifdef CONFIG_MEMORY_ISOLATION "Isolate", #endif @@ -318,6 +318,7 @@ compound_page_dtor * const compound_page_dtors[] = { */ int min_free_kbytes = 1024; int user_min_free_kbytes = -1; +int watermark_boost_factor __read_mostly = 15000; int watermark_scale_factor = 10; /* @@ -2218,6 +2219,21 @@ static bool can_steal_fallback(unsigned int order, int start_mt) return false; } +static inline void boost_watermark(struct zone *zone) +{ + unsigned long max_boost; + + if (!watermark_boost_factor) + return; + + max_boost = mult_frac(zone->_watermark[WMARK_HIGH], + watermark_boost_factor, 10000); + max_boost = max(pageblock_nr_pages, max_boost); + + zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages, + max_boost); +} + /* * This function implements actual steal behaviour. If order is large enough, * we can steal whole pageblock. If not, we first move freepages in this @@ -2227,7 +2243,7 @@ static bool can_steal_fallback(unsigned int order, int start_mt) * itself, so pages freed in the future will be put on the correct free list. */ static void steal_suitable_fallback(struct zone *zone, struct page *page, - int start_type, bool whole_block) + unsigned int alloc_flags, int start_type, bool whole_block) { unsigned int current_order = page_order(page); struct free_area *area; @@ -2249,6 +2265,15 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page, goto single_page; } + /* + * Boost watermarks to increase reclaim pressure to reduce the + * likelihood of future fallbacks. Wake kswapd now as the node + * may be balanced overall and kswapd will not wake naturally. + */ + boost_watermark(zone); + if (alloc_flags & ALLOC_KSWAPD) + wakeup_kswapd(zone, 0, 0, zone_idx(zone)); + /* We are not allowed to try stealing from the whole block */ if (!whole_block) goto single_page; @@ -2464,20 +2489,30 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * condition simpler. */ static __always_inline bool -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) +__rmqueue_fallback(struct zone *zone, int order, int start_migratetype, + unsigned int alloc_flags) { struct free_area *area; int current_order; + int min_order = order; struct page *page; int fallback_mt; bool can_steal; /* + * Do not steal pages from freelists belonging to other pageblocks + * i.e. orders < pageblock_order. If there are no local zones free, + * the zonelists will be reiterated without ALLOC_NOFRAGMENT. + */ + if (alloc_flags & ALLOC_NOFRAGMENT) + min_order = pageblock_order; + + /* * Find the largest available free page in the other list. This roughly * approximates finding the pageblock with the most free pages, which * would be too costly to do exactly. */ - for (current_order = MAX_ORDER - 1; current_order >= order; + for (current_order = MAX_ORDER - 1; current_order >= min_order; --current_order) { area = &(zone->free_area[current_order]); fallback_mt = find_suitable_fallback(area, current_order, @@ -2522,7 +2557,8 @@ do_steal: page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - steal_suitable_fallback(zone, page, start_migratetype, can_steal); + steal_suitable_fallback(zone, page, alloc_flags, start_migratetype, + can_steal); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, fallback_mt); @@ -2536,24 +2572,40 @@ do_steal: * Call me with the zone->lock already held. */ static __always_inline struct page * -__rmqueue(struct zone *zone, unsigned int order, int migratetype) +__rmqueue(struct zone *zone, unsigned int order, int migratetype, + unsigned int alloc_flags) { struct page *page; retry: page = __rmqueue_smallest(zone, order, migratetype); - if (unlikely(!page)) { - if (migratetype == MIGRATE_MOVABLE) - page = __rmqueue_cma_fallback(zone, order); - if (!page && __rmqueue_fallback(zone, order, migratetype)) - goto retry; - } + if (unlikely(!page) && __rmqueue_fallback(zone, order, migratetype, + alloc_flags)) + goto retry; trace_mm_page_alloc_zone_locked(page, order, migratetype); return page; } +#ifdef CONFIG_CMA +static struct page *__rmqueue_cma(struct zone *zone, unsigned int order) +{ + struct page *page = 0; + + if (IS_ENABLED(CONFIG_CMA)) + if (!zone->cma_alloc) + page = __rmqueue_cma_fallback(zone, order); + trace_mm_page_alloc_zone_locked(page, order, MIGRATE_CMA); + return page; +} +#else +static inline struct page *__rmqueue_cma(struct zone *zone, unsigned int order) +{ + return NULL; +} +#endif + /* * Obtain a specified number of elements from the buddy allocator, all under * a single hold of the lock, for efficiency. Add them to the supplied list. @@ -2561,13 +2613,24 @@ retry: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype) + int migratetype, unsigned int alloc_flags) { int i, alloced = 0; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - struct page *page = __rmqueue(zone, order, migratetype); + struct page *page; + + /* + * If migrate type CMA is being requested only try to + * satisfy the request with CMA pages to try and increase + * CMA utlization. + */ + if (is_migrate_cma(migratetype)) + page = __rmqueue_cma(zone, order); + else + page = __rmqueue(zone, order, migratetype, alloc_flags); + if (unlikely(page == NULL)) break; @@ -2602,6 +2665,28 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, return alloced; } +/* + * Return the pcp list that corresponds to the migrate type if that list isn't + * empty. + * If the list is empty return NULL. + */ +static struct list_head *get_populated_pcp_list(struct zone *zone, + unsigned int order, struct per_cpu_pages *pcp, + int migratetype, unsigned int alloc_flags) +{ + struct list_head *list = &pcp->lists[migratetype]; + + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, order, + pcp->batch, list, + migratetype, alloc_flags); + + if (list_empty(list)) + list = NULL; + } + return list; +} + #ifdef CONFIG_NUMA /* * Called from the vmstat counter updater to drain pagesets of this @@ -3023,17 +3108,30 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) /* Remove page from the per-cpu list, caller must protect the list */ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, + unsigned int alloc_flags, struct per_cpu_pages *pcp, - struct list_head *list) + gfp_t gfp_flags) { - struct page *page; + struct page *page = NULL; + struct list_head *list = NULL; do { - if (list_empty(list)) { - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, list, - migratetype); - if (unlikely(list_empty(list))) + /* First try to get CMA pages */ + if (migratetype == MIGRATE_MOVABLE && + gfp_flags & __GFP_CMA) { + list = get_populated_pcp_list(zone, 0, pcp, + get_cma_migrate_type(), alloc_flags); + } + + if (list == NULL) { + /* + * Either CMA is not suitable or there are no + * free CMA pages. + */ + list = get_populated_pcp_list(zone, 0, pcp, + migratetype, alloc_flags); + if (unlikely(list == NULL) || + unlikely(list_empty(list))) return NULL; } @@ -3048,17 +3146,17 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, /* Lock and remove page from the per-cpu list */ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct zone *zone, unsigned int order, - gfp_t gfp_flags, int migratetype) + gfp_t gfp_flags, int migratetype, + unsigned int alloc_flags) { struct per_cpu_pages *pcp; - struct list_head *list; struct page *page; unsigned long flags; local_irq_save(flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; - list = &pcp->lists[migratetype]; - page = __rmqueue_pcplist(zone, migratetype, pcp, list); + page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, + gfp_flags); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); @@ -3081,7 +3179,7 @@ struct page *rmqueue(struct zone *preferred_zone, if (likely(order == 0)) { page = rmqueue_pcplist(preferred_zone, zone, order, - gfp_flags, migratetype); + gfp_flags, migratetype, alloc_flags); goto out; } @@ -3094,14 +3192,21 @@ struct page *rmqueue(struct zone *preferred_zone, do { page = NULL; + if (alloc_flags & ALLOC_HARDER) { page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (page) trace_mm_page_alloc_zone_locked(page, order, migratetype); } + + if (!page && migratetype == MIGRATE_MOVABLE && + gfp_flags & __GFP_CMA) + page = __rmqueue_cma(zone, order); + if (!page) - page = __rmqueue(zone, order, migratetype); + page = __rmqueue(zone, order, migratetype, alloc_flags); } while (page && check_new_pages(page, order)); + spin_unlock(&zone->lock); if (!page) goto failed; @@ -3343,6 +3448,40 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) #endif /* CONFIG_NUMA */ /* + * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid + * fragmentation is subtle. If the preferred zone was HIGHMEM then + * premature use of a lower zone may cause lowmem pressure problems that + * are worse than fragmentation. If the next zone is ZONE_DMA then it is + * probably too small. It only makes sense to spread allocations to avoid + * fragmentation between the Normal and DMA32 zones. + */ +static inline unsigned int +alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask) +{ + unsigned int alloc_flags = 0; + + if (gfp_mask & __GFP_KSWAPD_RECLAIM) + alloc_flags |= ALLOC_KSWAPD; + +#ifdef CONFIG_ZONE_DMA32 + if (zone_idx(zone) != ZONE_NORMAL) + goto out; + + /* + * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and + * the pointer is within zone->zone_pgdat->node_zones[]. Also assume + * on UMA that if Normal is populated then so is DMA32. + */ + BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1); + if (nr_online_nodes > 1 && !populated_zone(--zone)) + goto out; + +out: +#endif /* CONFIG_ZONE_DMA32 */ + return alloc_flags; +} + +/* * get_page_from_freelist goes through the zonelist trying to allocate * a page. */ @@ -3350,14 +3489,18 @@ static struct page * get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, const struct alloc_context *ac) { - struct zoneref *z = ac->preferred_zoneref; + struct zoneref *z; struct zone *zone; struct pglist_data *last_pgdat_dirty_limit = NULL; + bool no_fallback; +retry: /* * Scan zonelist, looking for a zone with enough free. * See also __cpuset_node_allowed() comment in kernel/cpuset.c. */ + no_fallback = alloc_flags & ALLOC_NOFRAGMENT; + z = ac->preferred_zoneref; for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx, ac->nodemask) { struct page *page; @@ -3396,7 +3539,23 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags, } } - mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; + if (no_fallback && nr_online_nodes > 1 && + zone != ac->preferred_zoneref->zone) { + int local_nid; + + /* + * If moving to a remote node, retry but allow + * fragmenting fallbacks. Locality is more important + * than fragmentation avoidance. + */ + local_nid = zone_to_nid(ac->preferred_zoneref->zone); + if (zone_to_nid(zone) != local_nid) { + alloc_flags &= ~ALLOC_NOFRAGMENT; + goto retry; + } + } + + mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK); if (!zone_watermark_fast(zone, order, mark, ac_classzone_idx(ac), alloc_flags)) { int ret; @@ -3463,6 +3622,15 @@ try_this_zone: } } + /* + * It's possible on a UMA machine to get through all zones that are + * fragmented. If avoiding fragmentation, reset and try again. + */ + if (no_fallback) { + alloc_flags &= ~ALLOC_NOFRAGMENT; + goto retry; + } + return NULL; } @@ -3964,6 +4132,9 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; + if (gfp_mask & __GFP_KSWAPD_RECLAIM) + alloc_flags |= ALLOC_KSWAPD; + #ifdef CONFIG_CMA if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) alloc_flags |= ALLOC_CMA; @@ -4195,7 +4366,7 @@ retry_cpuset: if (!ac->preferred_zoneref->zone) goto nopage; - if (gfp_mask & __GFP_KSWAPD_RECLAIM) + if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); /* @@ -4253,7 +4424,7 @@ retry_cpuset: retry: /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ - if (gfp_mask & __GFP_KSWAPD_RECLAIM) + if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); @@ -4472,6 +4643,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, finalise_ac(gfp_mask, &ac); + /* + * Forbid the first pass from falling back to types that fragment + * memory until all local zones are considered. + */ + alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask); + /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); if (likely(page)) @@ -4812,7 +4989,7 @@ long si_mem_available(void) pages[lru] = global_node_page_state(NR_LRU_BASE + lru); for_each_zone(zone) - wmark_low += zone->watermark[WMARK_LOW]; + wmark_low += low_wmark_pages(zone); /* * Estimate the amount of memory available for userspace allocations, @@ -7386,13 +7563,13 @@ static void __setup_per_zone_wmarks(void) min_pages = zone->managed_pages / 1024; min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL); - zone->watermark[WMARK_MIN] = min_pages; + zone->_watermark[WMARK_MIN] = min_pages; } else { /* * If it's a lowmem zone, reserve a number of pages * proportionate to the zone's size. */ - zone->watermark[WMARK_MIN] = min; + zone->_watermark[WMARK_MIN] = min; } /* @@ -7404,10 +7581,11 @@ static void __setup_per_zone_wmarks(void) mult_frac(zone->managed_pages, watermark_scale_factor, 10000)); - zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + + zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + low + min; - zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + + zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + low + min * 2; + zone->watermark_boost = 0; spin_unlock_irqrestore(&zone->lock, flags); } @@ -7508,6 +7686,18 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write, return 0; } +int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int rc; + + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (rc) + return rc; + + return 0; +} + int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -8033,6 +8223,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, if (ret) return ret; + cc.zone->cma_alloc = 1; /* * In case of -EBUSY, we'd like to know which page causes problem. * So, just fall through. test_pages_isolated() has a tracepoint @@ -8115,6 +8306,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, done: undo_isolate_page_range(pfn_max_align_down(start), pfn_max_align_up(end), migratetype); + cc.zone->cma_alloc = 0; return ret; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 32b2424a3203..84d1b7488a48 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -87,6 +87,9 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ unsigned int may_swap:1; + /* e.g. boosted watermark reclaim leaves slabs alone */ + unsigned int may_shrinkslab:1; + /* * Cgroups are not reclaimed below their configured memory.low, * unless we threaten to OOM. If any cgroups are skipped due to @@ -2739,8 +2742,10 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) shrink_node_memcg(pgdat, memcg, sc, &lru_pages); node_lru_pages += lru_pages; - shrink_slab(sc->gfp_mask, pgdat->node_id, + if (sc->may_shrinkslab) { + shrink_slab(sc->gfp_mask, pgdat->node_id, memcg, sc->priority); + } /* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, @@ -3218,6 +3223,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = 1, + .may_shrinkslab = 1, }; /* @@ -3262,6 +3268,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, .may_unmap = 1, .reclaim_idx = MAX_NR_ZONES - 1, .may_swap = !noswap, + .may_shrinkslab = 1, }; unsigned long lru_pages; @@ -3308,6 +3315,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = may_swap, + .may_shrinkslab = 1, }; /* @@ -3358,6 +3366,30 @@ static void age_active_anon(struct pglist_data *pgdat, } while (memcg); } +static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx) +{ + int i; + struct zone *zone; + + /* + * Check for watermark boosts top-down as the higher zones + * are more likely to be boosted. Both watermarks and boosts + * should not be checked at the time time as reclaim would + * start prematurely when there is no boosting and a lower + * zone is balanced. + */ + for (i = classzone_idx; i >= 0; i--) { + zone = pgdat->node_zones + i; + if (!managed_zone(zone)) + continue; + + if (zone->watermark_boost) + return true; + } + + return false; +} + /* * Returns true if there is an eligible zone balanced for the request order * and classzone_idx @@ -3368,6 +3400,10 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx) unsigned long mark = -1; struct zone *zone; + /* + * Check watermarks bottom-up as lower zones are more likely to + * meet watermarks. + */ for (i = 0; i <= classzone_idx; i++) { zone = pgdat->node_zones + i; @@ -3496,14 +3532,14 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; unsigned long pflags; + unsigned long nr_boost_reclaim; + unsigned long zone_boosts[MAX_NR_ZONES] = { 0, }; + bool boosted; struct zone *zone; struct scan_control sc = { .gfp_mask = GFP_KERNEL, .order = order, - .priority = DEF_PRIORITY, - .may_writepage = !laptop_mode, .may_unmap = 1, - .may_swap = 1, }; psi_memstall_enter(&pflags); @@ -3511,9 +3547,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) count_vm_event(PAGEOUTRUN); + /* + * Account for the reclaim boost. Note that the zone boost is left in + * place so that parallel allocations that are near the watermark will + * stall or direct reclaim until kswapd is finished. + */ + nr_boost_reclaim = 0; + for (i = 0; i <= classzone_idx; i++) { + zone = pgdat->node_zones + i; + if (!managed_zone(zone)) + continue; + + nr_boost_reclaim += zone->watermark_boost; + zone_boosts[i] = zone->watermark_boost; + } + boosted = nr_boost_reclaim; + +restart: + sc.priority = DEF_PRIORITY; do { unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; + bool balanced; bool ret; sc.reclaim_idx = classzone_idx; @@ -3540,13 +3595,40 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) } /* - * Only reclaim if there are no eligible zones. Note that - * sc.reclaim_idx is not used as buffer_heads_over_limit may - * have adjusted it. + * If the pgdat is imbalanced then ignore boosting and preserve + * the watermarks for a later time and restart. Note that the + * zone watermarks will be still reset at the end of balancing + * on the grounds that the normal reclaim should be enough to + * re-evaluate if boosting is required when kswapd next wakes. + */ + balanced = pgdat_balanced(pgdat, sc.order, classzone_idx); + if (!balanced && nr_boost_reclaim) { + nr_boost_reclaim = 0; + goto restart; + } + + /* + * If boosting is not active then only reclaim if there are no + * eligible zones. Note that sc.reclaim_idx is not used as + * buffer_heads_over_limit may have adjusted it. */ - if (pgdat_balanced(pgdat, sc.order, classzone_idx)) + if (!nr_boost_reclaim && balanced) goto out; + /* Limit the priority of boosting to avoid reclaim writeback */ + if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2) + raise_priority = false; + + /* + * Do not writeback or swap pages for boosted reclaim. The + * intent is to relieve pressure not issue sub-optimal IO + * from reclaim context. If no pages are reclaimed, the + * reclaim will be aborted. + */ + sc.may_writepage = !laptop_mode && !nr_boost_reclaim; + sc.may_swap = !nr_boost_reclaim; + sc.may_shrinkslab = !nr_boost_reclaim; + /* * Do some background aging of the anon list, to give * pages a chance to be referenced before reclaiming. All @@ -3598,6 +3680,16 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) * progress in reclaiming pages */ nr_reclaimed = sc.nr_reclaimed - nr_reclaimed; + nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed); + + /* + * If reclaim made no progress for a boost, stop reclaim as + * IO cannot be queued and it could be an infinite loop in + * extreme circumstances. + */ + if (nr_boost_reclaim && !nr_reclaimed) + break; + if (raise_priority || !nr_reclaimed) sc.priority--; } while (sc.priority >= 1); @@ -3606,6 +3698,28 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) pgdat->kswapd_failures++; out: + /* If reclaim was boosted, account for the reclaim done in this pass */ + if (boosted) { + unsigned long flags; + + for (i = 0; i <= classzone_idx; i++) { + if (!zone_boosts[i]) + continue; + + /* Increments are under the zone lock */ + zone = pgdat->node_zones + i; + spin_lock_irqsave(&zone->lock, flags); + zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]); + spin_unlock_irqrestore(&zone->lock, flags); + } + + /* + * As there is now likely space, wakeup kcompact to defragment + * pageblocks. + */ + wakeup_kcompactd(pgdat, pageblock_order, classzone_idx); + } + snapshot_refaults(NULL, pgdat); __fs_reclaim_release(); psi_memstall_leave(&pflags); @@ -3837,7 +3951,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order, /* Hopeless node, leave it to direct reclaim if possible */ if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES || - pgdat_balanced(pgdat, order, classzone_idx)) { + (pgdat_balanced(pgdat, order, classzone_idx) && + !pgdat_watermark_boosted(pgdat, classzone_idx))) { /* * There may be plenty of free memory available, but it's too * fragmented for high-order allocations. Wake up kcompactd |