From 534bcc4a0bb5b24600891ce793f0295a142e9dae Mon Sep 17 00:00:00 2001
From: Yu Zhao <yuzhao@google.com>
Date: Mon, 5 Apr 2021 04:17:41 -0600
Subject: [PATCH 05/10] mm: multigenerational lru: mm_struct list

To scan PTEs for accessed pages, a mm_struct list is maintained for
each memcg. When multiple threads traverse the same memcg->mm_list,
each of them gets a unique mm_struct and therefore they can run
walk_page_range() concurrently to reach page tables of all processes
of this memcg.

This infrastructure also provides the following optimizations:
  1) it allows walkers to skip processes that have been sleeping since
  the last walk by tracking the usage of mm_struct between context
  switches.
  2) it allows walkers to add interesting items they find during a
  walk to a Bloom filter so that they can skip uninteresting items
  during the next walk by testing whether an item is in this Bloom
  filter.

Signed-off-by: Yu Zhao <yuzhao@google.com>
Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
Change-Id: I25d9eda8c6bdc7c3653b9f210a159d6c247c81e8
---
 fs/exec.c                  |   2 +
 include/linux/memcontrol.h |   4 +
 include/linux/mm_inline.h  |   6 +
 include/linux/mm_types.h   |  75 +++++++++
 include/linux/mmzone.h     |  63 +++++++
 kernel/exit.c              |   1 +
 kernel/fork.c              |   9 +
 kernel/sched/core.c        |   1 +
 mm/memcontrol.c            |  25 +++
 mm/vmscan.c                | 331 +++++++++++++++++++++++++++++++++++++
 10 files changed, 517 insertions(+)

--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1013,6 +1013,7 @@ static int exec_mmap(struct mm_struct *m
 	active_mm = tsk->active_mm;
 	tsk->active_mm = mm;
 	tsk->mm = mm;
+	lru_gen_add_mm(mm);
 	/*
 	 * This prevents preemption while active_mm is being loaded and
 	 * it and mm are being updated, which could cause problems for
@@ -1023,6 +1024,7 @@ static int exec_mmap(struct mm_struct *m
 	if (!IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
 		local_irq_enable();
 	activate_mm(active_mm, mm);
+	lru_gen_activate_mm(mm);
 	if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM))
 		local_irq_enable();
 	tsk->mm->vmacache_seqnum = 0;
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -348,6 +348,10 @@ struct mem_cgroup {
 	struct deferred_split deferred_split_queue;
 #endif
 
+#ifdef CONFIG_LRU_GEN
+	struct lru_gen_mm_list mm_list;
+#endif
+
 	struct mem_cgroup_per_node *nodeinfo[];
 };
 
--- a/include/linux/mm_inline.h
+++ b/include/linux/mm_inline.h
@@ -100,6 +100,12 @@ static inline int lru_gen_from_seq(unsig
 	return seq % MAX_NR_GENS;
 }
 
+/* Return a proper index regardless whether we keep stats for historical generations. */
+static inline int lru_hist_from_seq(unsigned long seq)
+{
+	return seq % NR_HIST_GENS;
+}
+
 /* The youngest and the second youngest generations are counted as active. */
 static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
 {
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -3,6 +3,7 @@
 #define _LINUX_MM_TYPES_H
 
 #include <linux/mm_types_task.h>
+#include <linux/sched.h>
 
 #include <linux/auxvec.h>
 #include <linux/list.h>
@@ -15,6 +16,8 @@
 #include <linux/page-flags-layout.h>
 #include <linux/workqueue.h>
 #include <linux/seqlock.h>
+#include <linux/nodemask.h>
+#include <linux/mmdebug.h>
 
 #include <asm/mmu.h>
 
@@ -580,6 +583,18 @@ struct mm_struct {
 #ifdef CONFIG_IOMMU_SUPPORT
 		u32 pasid;
 #endif
+#ifdef CONFIG_LRU_GEN
+		struct {
+			/* the node of a global or per-memcg mm_struct list */
+			struct list_head list;
+#ifdef CONFIG_MEMCG
+			/* points to the memcg of the owner task above */
+			struct mem_cgroup *memcg;
+#endif
+			/* whether this mm_struct has been used since the last walk */
+			nodemask_t nodes;
+		} lrugen;
+#endif /* CONFIG_LRU_GEN */
 	} __randomize_layout;
 
 	/*
@@ -606,6 +621,66 @@ static inline cpumask_t *mm_cpumask(stru
 	return (struct cpumask *)&mm->cpu_bitmap;
 }
 
+#ifdef CONFIG_LRU_GEN
+
+struct lru_gen_mm_list {
+	/* a global or per-memcg mm_struct list */
+	struct list_head fifo;
+	/* protects the list above */
+	spinlock_t lock;
+};
+
+void lru_gen_add_mm(struct mm_struct *mm);
+void lru_gen_del_mm(struct mm_struct *mm);
+#ifdef CONFIG_MEMCG
+void lru_gen_migrate_mm(struct mm_struct *mm);
+#endif
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+	INIT_LIST_HEAD(&mm->lrugen.list);
+#ifdef CONFIG_MEMCG
+	mm->lrugen.memcg = NULL;
+#endif
+	nodes_clear(mm->lrugen.nodes);
+}
+
+/* Track the usage of each mm_struct so that we can skip inactive ones. */
+static inline void lru_gen_activate_mm(struct mm_struct *mm)
+{
+	/* unlikely but not a bug when racing with lru_gen_migrate_mm() */
+	VM_WARN_ON(list_empty(&mm->lrugen.list));
+
+	if (!(current->flags & PF_KTHREAD) && !nodes_full(mm->lrugen.nodes))
+		nodes_setall(mm->lrugen.nodes);
+}
+
+#else /* !CONFIG_LRU_GEN */
+
+static inline void lru_gen_add_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_del_mm(struct mm_struct *mm)
+{
+}
+
+#ifdef CONFIG_MEMCG
+static inline void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+}
+#endif
+
+static inline void lru_gen_init_mm(struct mm_struct *mm)
+{
+}
+
+static inline void lru_gen_activate_mm(struct mm_struct *mm)
+{
+}
+
+#endif /* CONFIG_LRU_GEN */
+
 struct mmu_gather;
 extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
 extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -318,6 +318,13 @@ struct lruvec;
 #define MIN_NR_GENS		2
 #define MAX_NR_GENS		((unsigned int)CONFIG_NR_LRU_GENS)
 
+/* Whether to keep stats for historical generations. */
+#ifdef CONFIG_LRU_GEN_STATS
+#define NR_HIST_GENS		((unsigned int)CONFIG_NR_LRU_GENS)
+#else
+#define NR_HIST_GENS		1U
+#endif
+
 struct lrugen {
 	/* the aging increments the max generation number */
 	unsigned long max_seq;
@@ -333,13 +340,63 @@ struct lrugen {
 	bool enabled[ANON_AND_FILE];
 };
 
+enum {
+	MM_LEAF_TOTAL,		/* total leaf entries */
+	MM_LEAF_OLD,		/* old leaf entries */
+	MM_LEAF_YOUNG,		/* young leaf entries */
+	MM_NONLEAF_TOTAL,	/* total non-leaf entries */
+	MM_NONLEAF_PREV,	/* previously worthy non-leaf entries */
+	MM_NONLEAF_CUR,		/* currently worthy non-leaf entries */
+	NR_MM_STATS
+};
+
+/* mnemonic codes for the stats above */
+#define MM_STAT_CODES		"toydpc"
+
+/* double buffering bloom filters */
+#define NR_BLOOM_FILTERS	2
+
+struct lru_gen_mm_walk {
+	/* set to max_seq after each round of walk */
+	unsigned long seq;
+	/* the next mm_struct on the list to walk */
+	struct list_head *head;
+	/* the first mm_struct never walked before */
+	struct list_head *tail;
+	/* to wait for the last walker to finish */
+	struct wait_queue_head wait;
+	/* bloom filters flip after each round of walk */
+	unsigned long *filters[NR_BLOOM_FILTERS];
+	/* page table stats for debugging */
+	unsigned long stats[NR_HIST_GENS][NR_MM_STATS];
+	/* the number of concurrent walkers */
+	int nr_walkers;
+};
+
+#define MIN_BATCH_SIZE		64
 #define MAX_BATCH_SIZE		8192
 
+struct mm_walk_args {
+	struct mem_cgroup *memcg;
+	unsigned long max_seq;
+	unsigned long start_pfn;
+	unsigned long end_pfn;
+	unsigned long next_addr;
+	unsigned long bitmap[BITS_TO_LONGS(MIN_BATCH_SIZE)];
+	int node_id;
+	int swappiness;
+	int batch_size;
+	int nr_pages[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
+	int mm_stats[NR_MM_STATS];
+	bool use_filter;
+};
+
 void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
 void lru_gen_change_state(bool enable, bool main, bool swap);
 
 #ifdef CONFIG_MEMCG
 void lru_gen_init_memcg(struct mem_cgroup *memcg);
+void lru_gen_free_memcg(struct mem_cgroup *memcg);
 #endif
 
 #else /* !CONFIG_LRU_GEN */
@@ -356,6 +413,10 @@ static inline void lru_gen_change_state(
 static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
 {
 }
+
+static inline void lru_gen_free_memcg(struct mem_cgroup *memcg)
+{
+}
 #endif
 
 #endif /* CONFIG_LRU_GEN */
@@ -380,6 +441,8 @@ struct lruvec {
 #ifdef CONFIG_LRU_GEN
 	/* unevictable pages are on LRU_UNEVICTABLE */
 	struct lrugen			evictable;
+	/* state for mm list and page table walks */
+	struct lru_gen_mm_walk		mm_walk;
 #endif
 #ifdef CONFIG_MEMCG
 	struct pglist_data *pgdat;
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -422,6 +422,7 @@ assign_new_owner:
 		goto retry;
 	}
 	WRITE_ONCE(mm->owner, c);
+	lru_gen_migrate_mm(mm);
 	task_unlock(c);
 	put_task_struct(c);
 }
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1080,6 +1080,7 @@ static struct mm_struct *mm_init(struct
 		goto fail_nocontext;
 
 	mm->user_ns = get_user_ns(user_ns);
+	lru_gen_init_mm(mm);
 	return mm;
 
 fail_nocontext:
@@ -1122,6 +1123,7 @@ static inline void __mmput(struct mm_str
 	}
 	if (mm->binfmt)
 		module_put(mm->binfmt->module);
+	lru_gen_del_mm(mm);
 	mmdrop(mm);
 }
 
@@ -2617,6 +2619,13 @@ pid_t kernel_clone(struct kernel_clone_a
 		get_task_struct(p);
 	}
 
+	if (IS_ENABLED(CONFIG_LRU_GEN) && !(clone_flags & CLONE_VM)) {
+		/* lock the task to synchronize with memcg migration */
+		task_lock(p);
+		lru_gen_add_mm(p->mm);
+		task_unlock(p);
+	}
+
 	wake_up_new_task(p);
 
 	/* forking complete and child started to run, tell ptracer */
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4978,6 +4978,7 @@ context_switch(struct rq *rq, struct tas
 		 * finish_task_switch()'s mmdrop().
 		 */
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
+		lru_gen_activate_mm(next->mm);
 
 		if (!prev->mm) {                        // from kernel
 			/* will mmdrop() in finish_task_switch(). */
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5163,6 +5163,7 @@ static void __mem_cgroup_free(struct mem
 
 static void mem_cgroup_free(struct mem_cgroup *memcg)
 {
+	lru_gen_free_memcg(memcg);
 	memcg_wb_domain_exit(memcg);
 	__mem_cgroup_free(memcg);
 }
@@ -6195,6 +6196,29 @@ static void mem_cgroup_move_task(void)
 }
 #endif
 
+#ifdef CONFIG_LRU_GEN
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+	struct cgroup_subsys_state *css;
+	struct task_struct *task = NULL;
+
+	cgroup_taskset_for_each_leader(task, css, tset)
+		break;
+
+	if (!task)
+		return;
+
+	task_lock(task);
+	if (task->mm && task->mm->owner == task)
+		lru_gen_migrate_mm(task->mm);
+	task_unlock(task);
+}
+#else
+static void mem_cgroup_attach(struct cgroup_taskset *tset)
+{
+}
+#endif /* CONFIG_LRU_GEN */
+
 static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
 {
 	if (value == PAGE_COUNTER_MAX)
@@ -6538,6 +6562,7 @@ struct cgroup_subsys memory_cgrp_subsys
 	.css_reset = mem_cgroup_css_reset,
 	.css_rstat_flush = mem_cgroup_css_rstat_flush,
 	.can_attach = mem_cgroup_can_attach,
+	.attach = mem_cgroup_attach,
 	.cancel_attach = mem_cgroup_cancel_attach,
 	.post_attach = mem_cgroup_move_task,
 	.dfl_cftypes = memory_files,
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2929,6 +2929,306 @@ static bool __maybe_unused seq_is_valid(
 }
 
 /******************************************************************************
+ *                          mm_struct list
+ ******************************************************************************/
+
+static struct lru_gen_mm_list *get_mm_list(struct mem_cgroup *memcg)
+{
+	static struct lru_gen_mm_list mm_list = {
+		.fifo = LIST_HEAD_INIT(mm_list.fifo),
+		.lock = __SPIN_LOCK_UNLOCKED(mm_list.lock),
+	};
+
+#ifdef CONFIG_MEMCG
+	if (memcg)
+		return &memcg->mm_list;
+#endif
+	return &mm_list;
+}
+
+void lru_gen_add_mm(struct mm_struct *mm)
+{
+	int nid;
+	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
+
+	VM_BUG_ON_MM(!list_empty(&mm->lrugen.list), mm);
+#ifdef CONFIG_MEMCG
+	VM_BUG_ON_MM(mm->lrugen.memcg, mm);
+	mm->lrugen.memcg = memcg;
+#endif
+	spin_lock(&mm_list->lock);
+
+	list_add_tail(&mm->lrugen.list, &mm_list->fifo);
+
+	for_each_node(nid) {
+		struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+		if (!lruvec)
+			continue;
+
+		if (lruvec->mm_walk.tail == &mm_list->fifo)
+			lruvec->mm_walk.tail = lruvec->mm_walk.tail->prev;
+	}
+
+	spin_unlock(&mm_list->lock);
+}
+
+void lru_gen_del_mm(struct mm_struct *mm)
+{
+	int nid;
+	struct lru_gen_mm_list *mm_list;
+	struct mem_cgroup *memcg = NULL;
+
+	if (list_empty(&mm->lrugen.list))
+		return;
+
+#ifdef CONFIG_MEMCG
+	memcg = mm->lrugen.memcg;
+#endif
+	mm_list = get_mm_list(memcg);
+
+	spin_lock(&mm_list->lock);
+
+	for_each_node(nid) {
+		struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+		if (!lruvec)
+			continue;
+
+		if (lruvec->mm_walk.tail == &mm->lrugen.list)
+			lruvec->mm_walk.tail = lruvec->mm_walk.tail->next;
+
+		if (lruvec->mm_walk.head != &mm->lrugen.list)
+			continue;
+
+		lruvec->mm_walk.head = lruvec->mm_walk.head->next;
+		if (lruvec->mm_walk.head == &mm_list->fifo)
+			WRITE_ONCE(lruvec->mm_walk.seq, lruvec->mm_walk.seq + 1);
+	}
+
+	list_del_init(&mm->lrugen.list);
+
+	spin_unlock(&mm_list->lock);
+
+#ifdef CONFIG_MEMCG
+	mem_cgroup_put(mm->lrugen.memcg);
+	mm->lrugen.memcg = NULL;
+#endif
+}
+
+#ifdef CONFIG_MEMCG
+void lru_gen_migrate_mm(struct mm_struct *mm)
+{
+	struct mem_cgroup *memcg;
+
+	lockdep_assert_held(&mm->owner->alloc_lock);
+
+	if (mem_cgroup_disabled())
+		return;
+
+	rcu_read_lock();
+	memcg = mem_cgroup_from_task(mm->owner);
+	rcu_read_unlock();
+	if (memcg == mm->lrugen.memcg)
+		return;
+
+	VM_BUG_ON_MM(!mm->lrugen.memcg, mm);
+	VM_BUG_ON_MM(list_empty(&mm->lrugen.list), mm);
+
+	lru_gen_del_mm(mm);
+	lru_gen_add_mm(mm);
+}
+#endif
+
+#define BLOOM_FILTER_SHIFT	15
+
+static inline int filter_gen_from_seq(unsigned long seq)
+{
+	return seq % NR_BLOOM_FILTERS;
+}
+
+static void get_item_key(void *item, int *key)
+{
+	u32 hash = hash_ptr(item, BLOOM_FILTER_SHIFT * 2);
+
+	BUILD_BUG_ON(BLOOM_FILTER_SHIFT * 2 > BITS_PER_TYPE(u32));
+
+	key[0] = hash & (BIT(BLOOM_FILTER_SHIFT) - 1);
+	key[1] = hash >> BLOOM_FILTER_SHIFT;
+}
+
+static void clear_bloom_filter(struct lruvec *lruvec, unsigned long seq)
+{
+	unsigned long *filter;
+	int gen = filter_gen_from_seq(seq);
+
+	lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
+
+	filter = lruvec->mm_walk.filters[gen];
+	if (filter) {
+		bitmap_clear(filter, 0, BIT(BLOOM_FILTER_SHIFT));
+		return;
+	}
+
+	filter = bitmap_zalloc(BIT(BLOOM_FILTER_SHIFT), GFP_ATOMIC);
+	WRITE_ONCE(lruvec->mm_walk.filters[gen], filter);
+}
+
+static void set_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+	int key[2];
+	unsigned long *filter;
+	int gen = filter_gen_from_seq(seq);
+
+	filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
+	if (!filter)
+		return;
+
+	get_item_key(item, key);
+
+	if (!test_bit(key[0], filter))
+		set_bit(key[0], filter);
+	if (!test_bit(key[1], filter))
+		set_bit(key[1], filter);
+}
+
+static bool test_bloom_filter(struct lruvec *lruvec, unsigned long seq, void *item)
+{
+	int key[2];
+	unsigned long *filter;
+	int gen = filter_gen_from_seq(seq);
+
+	filter = READ_ONCE(lruvec->mm_walk.filters[gen]);
+	if (!filter)
+		return false;
+
+	get_item_key(item, key);
+
+	return test_bit(key[0], filter) && test_bit(key[1], filter);
+}
+
+static void reset_mm_stats(struct lruvec *lruvec, bool last, struct mm_walk_args *args)
+{
+	int i;
+	int hist = lru_hist_from_seq(args->max_seq);
+
+	lockdep_assert_held(&get_mm_list(lruvec_memcg(lruvec))->lock);
+
+	for (i = 0; i < NR_MM_STATS; i++) {
+		WRITE_ONCE(lruvec->mm_walk.stats[hist][i],
+			   lruvec->mm_walk.stats[hist][i] + args->mm_stats[i]);
+		args->mm_stats[i] = 0;
+	}
+
+	if (!last || NR_HIST_GENS == 1)
+		return;
+
+	hist = lru_hist_from_seq(args->max_seq + 1);
+	for (i = 0; i < NR_MM_STATS; i++)
+		WRITE_ONCE(lruvec->mm_walk.stats[hist][i], 0);
+}
+
+static bool should_skip_mm(struct mm_struct *mm, struct mm_walk_args *args)
+{
+	int type;
+	unsigned long size = 0;
+
+	if (cpumask_empty(mm_cpumask(mm)) && !node_isset(args->node_id, mm->lrugen.nodes))
+		return true;
+
+	if (mm_is_oom_victim(mm))
+		return true;
+
+	for (type = !args->swappiness; type < ANON_AND_FILE; type++) {
+		size += type ? get_mm_counter(mm, MM_FILEPAGES) :
+			       get_mm_counter(mm, MM_ANONPAGES) +
+			       get_mm_counter(mm, MM_SHMEMPAGES);
+	}
+
+	if (size < MIN_BATCH_SIZE)
+		return true;
+
+	if (!mmget_not_zero(mm))
+		return true;
+
+	node_clear(args->node_id, mm->lrugen.nodes);
+
+	return false;
+}
+
+/* To support multiple walkers that concurrently walk an mm_struct list. */
+static bool get_next_mm(struct lruvec *lruvec, struct mm_walk_args *args,
+			struct mm_struct **iter)
+{
+	bool first = false;
+	bool last = true;
+	struct mm_struct *mm = NULL;
+	struct lru_gen_mm_walk *mm_walk = &lruvec->mm_walk;
+	struct lru_gen_mm_list *mm_list = get_mm_list(args->memcg);
+
+	if (*iter)
+		mmput_async(*iter);
+	else if (args->max_seq <= READ_ONCE(mm_walk->seq))
+		return false;
+
+	spin_lock(&mm_list->lock);
+
+	VM_BUG_ON(args->max_seq > mm_walk->seq + 1);
+	VM_BUG_ON(*iter && args->max_seq < mm_walk->seq);
+	VM_BUG_ON(*iter && !mm_walk->nr_walkers);
+
+	if (args->max_seq <= mm_walk->seq) {
+		if (!*iter)
+			last = false;
+		goto done;
+	}
+
+	if (mm_walk->head == &mm_list->fifo) {
+		VM_BUG_ON(mm_walk->nr_walkers);
+		mm_walk->head = mm_walk->head->next;
+		first = true;
+	}
+
+	while (!mm && mm_walk->head != &mm_list->fifo) {
+		mm = list_entry(mm_walk->head, struct mm_struct, lrugen.list);
+
+		mm_walk->head = mm_walk->head->next;
+
+		if (mm_walk->tail == &mm->lrugen.list) {
+			mm_walk->tail = mm_walk->tail->next;
+			args->use_filter = false;
+		}
+
+		if (should_skip_mm(mm, args))
+			mm = NULL;
+	}
+
+	if (mm_walk->head == &mm_list->fifo)
+		WRITE_ONCE(mm_walk->seq, mm_walk->seq + 1);
+done:
+	if (*iter && !mm)
+		mm_walk->nr_walkers--;
+	if (!*iter && mm)
+		mm_walk->nr_walkers++;
+
+	if (mm_walk->nr_walkers)
+		last = false;
+
+	if (mm && first)
+		clear_bloom_filter(lruvec, args->max_seq + 1);
+
+	if (*iter || last)
+		reset_mm_stats(lruvec, last, args);
+
+	spin_unlock(&mm_list->lock);
+
+	*iter = mm;
+
+	return last;
+}
+
+/******************************************************************************
  *                          state change
  ******************************************************************************/
 
@@ -3112,6 +3412,7 @@ void lru_gen_init_state(struct mem_cgrou
 	int i;
 	int gen, type, zone;
 	struct lrugen *lrugen = &lruvec->evictable;
+	struct lru_gen_mm_list *mm_list = get_mm_list(memcg);
 
 	lrugen->max_seq = MIN_NR_GENS + 1;
 	lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
@@ -3122,6 +3423,17 @@ void lru_gen_init_state(struct mem_cgrou
 
 	for_each_gen_type_zone(gen, type, zone)
 		INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
+
+	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
+		spin_lock(&mm_list->lock);
+
+	lruvec->mm_walk.seq = MIN_NR_GENS;
+	lruvec->mm_walk.head = &mm_list->fifo;
+	lruvec->mm_walk.tail = &mm_list->fifo;
+	init_waitqueue_head(&lruvec->mm_walk.wait);
+
+	if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && !memcg)
+		spin_unlock(&mm_list->lock);
 }
 
 #ifdef CONFIG_MEMCG
@@ -3129,18 +3441,37 @@ void lru_gen_init_memcg(struct mem_cgrou
 {
 	int nid;
 
+	INIT_LIST_HEAD(&memcg->mm_list.fifo);
+	spin_lock_init(&memcg->mm_list.lock);
+
 	for_each_node(nid) {
 		struct lruvec *lruvec = get_lruvec(nid, memcg);
 
 		lru_gen_init_state(memcg, lruvec);
 	}
 }
+
+void lru_gen_free_memcg(struct mem_cgroup *memcg)
+{
+	int nid;
+
+	for_each_node(nid) {
+		int i;
+		struct lruvec *lruvec = get_lruvec(nid, memcg);
+
+		for (i = 0; i < NR_BLOOM_FILTERS; i++) {
+			bitmap_free(lruvec->mm_walk.filters[i]);
+			lruvec->mm_walk.filters[i] = NULL;
+		}
+	}
+}
 #endif
 
 static int __init init_lru_gen(void)
 {
 	BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
 	BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
+	BUILD_BUG_ON(sizeof(MM_STAT_CODES) != NR_MM_STATS + 1);
 
 	return 0;
 };
