* [PATCH v4 1/6] mm/memcg: Revert ("mm/memcg: optimize user context object stock access")
2022-02-21 18:25 [PATCH v4 0/6] mm/memcg: Address PREEMPT_RT problems instead of disabling it Sebastian Andrzej Siewior
@ 2022-02-21 18:25 ` Sebastian Andrzej Siewior
2022-02-21 18:25 ` [PATCH v4 2/6] mm/memcg: Disable threshold event handlers on PREEMPT_RT Sebastian Andrzej Siewior
` (4 subsequent siblings)
5 siblings, 0 replies; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-21 18:25 UTC (permalink / raw)
To: cgroups, linux-mm
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Michal Koutný,
Peter Zijlstra, Thomas Gleixner, Vladimir Davydov, Waiman Long,
Michal Hocko, Sebastian Andrzej Siewior, Roman Gushchin,
Shakeel Butt
From: Michal Hocko <mhocko@suse.com>
The optimisation is based on a micro benchmark where local_irq_save() is
more expensive than a preempt_disable(). There is no evidence that it is
visible in a real-world workload and there are CPUs where the opposite is
true (local_irq_save() is cheaper than preempt_disable()).
Based on micro benchmarks, the optimisation makes sense on PREEMPT_NONE
where preempt_disable() is optimized away. There is no improvement with
PREEMPT_DYNAMIC since the preemption counter is always available.
The optimisation also makes the PREEMPT_RT integration more complicated
since most of its assumptions do not hold on PREEMPT_RT.
Revert the optimisation since it complicates the PREEMPT_RT integration
and the improvement is hardly visible.
[ bigeasy: Patch body around Michal's diff ]
Link: https://lore.kernel.org/all/YgOGkXXCrD%2F1k+p4@dhcp22.suse.cz
Link: https://lkml.kernel.org/r/YdX+INO9gQje6d0S@linutronix.de
Signed-off-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
---
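A minimal sketch of the access pattern before and after this revert, for
illustration only (the helpers and structures appear in the diff below; the
two wrapper functions here are made-up names):

	/* Before: user context takes the cheaper preempt_disable() path. */
	static void before_revert_access_sketch(void)
	{
		unsigned long flags;
		struct obj_stock *stock = get_obj_stock(&flags);

		/* ... operate on the task or irq object stock ... */
		put_obj_stock(flags);
	}

	/* After: one rule for every context, interrupts are disabled. */
	static void after_revert_access_sketch(void)
	{
		struct memcg_stock_pcp *stock;
		unsigned long flags;

		local_irq_save(flags);
		stock = this_cpu_ptr(&memcg_stock);
		/* ... operate on the single per-CPU stock ... */
		local_irq_restore(flags);
	}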
mm/memcontrol.c | 94 ++++++++++++++-----------------------------------
1 file changed, 27 insertions(+), 67 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3c4816147273a..8ab2dc75e70ec 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2078,23 +2078,17 @@ void unlock_page_memcg(struct page *page)
folio_memcg_unlock(page_folio(page));
}
-struct obj_stock {
+struct memcg_stock_pcp {
+ struct mem_cgroup *cached; /* this never be root cgroup */
+ unsigned int nr_pages;
+
#ifdef CONFIG_MEMCG_KMEM
struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat;
unsigned int nr_bytes;
int nr_slab_reclaimable_b;
int nr_slab_unreclaimable_b;
-#else
- int dummy[0];
#endif
-};
-
-struct memcg_stock_pcp {
- struct mem_cgroup *cached; /* this never be root cgroup */
- unsigned int nr_pages;
- struct obj_stock task_obj;
- struct obj_stock irq_obj;
struct work_struct work;
unsigned long flags;
@@ -2104,13 +2098,13 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
#ifdef CONFIG_MEMCG_KMEM
-static void drain_obj_stock(struct obj_stock *stock);
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg);
static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
#else
-static inline void drain_obj_stock(struct obj_stock *stock)
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
{
}
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
@@ -2190,9 +2184,7 @@ static void drain_local_stock(struct work_struct *dummy)
local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock);
- drain_obj_stock(&stock->irq_obj);
- if (in_task())
- drain_obj_stock(&stock->task_obj);
+ drain_obj_stock(stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
@@ -2767,41 +2759,6 @@ static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
*/
#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
-/*
- * Most kmem_cache_alloc() calls are from user context. The irq disable/enable
- * sequence used in this case to access content from object stock is slow.
- * To optimize for user context access, there are now two object stocks for
- * task context and interrupt context access respectively.
- *
- * The task context object stock can be accessed by disabling preemption only
- * which is cheap in non-preempt kernel. The interrupt context object stock
- * can only be accessed after disabling interrupt. User context code can
- * access interrupt object stock, but not vice versa.
- */
-static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
-{
- struct memcg_stock_pcp *stock;
-
- if (likely(in_task())) {
- *pflags = 0UL;
- preempt_disable();
- stock = this_cpu_ptr(&memcg_stock);
- return &stock->task_obj;
- }
-
- local_irq_save(*pflags);
- stock = this_cpu_ptr(&memcg_stock);
- return &stock->irq_obj;
-}
-
-static inline void put_obj_stock(unsigned long flags)
-{
- if (likely(in_task()))
- preempt_enable();
- else
- local_irq_restore(flags);
-}
-
/*
* mod_objcg_mlstate() may be called with irq enabled, so
* mod_memcg_lruvec_state() should be used.
@@ -3082,10 +3039,13 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
+ struct memcg_stock_pcp *stock;
unsigned long flags;
- struct obj_stock *stock = get_obj_stock(&flags);
int *bytes;
+ local_irq_save(flags);
+ stock = this_cpu_ptr(&memcg_stock);
+
/*
* Save vmstat data in stock and skip vmstat array update unless
* accumulating over a page of vmstat data or when pgdat or idx
@@ -3136,26 +3096,29 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
if (nr)
mod_objcg_mlstate(objcg, pgdat, idx, nr);
- put_obj_stock(flags);
+ local_irq_restore(flags);
}
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{
+ struct memcg_stock_pcp *stock;
unsigned long flags;
- struct obj_stock *stock = get_obj_stock(&flags);
bool ret = false;
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes;
ret = true;
}
- put_obj_stock(flags);
+ local_irq_restore(flags);
return ret;
}
-static void drain_obj_stock(struct obj_stock *stock)
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
{
struct obj_cgroup *old = stock->cached_objcg;
@@ -3211,13 +3174,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
{
struct mem_cgroup *memcg;
- if (in_task() && stock->task_obj.cached_objcg) {
- memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg);
- if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
- return true;
- }
- if (stock->irq_obj.cached_objcg) {
- memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
+ if (stock->cached_objcg) {
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
return true;
}
@@ -3228,10 +3186,13 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
bool allow_uncharge)
{
+ struct memcg_stock_pcp *stock;
unsigned long flags;
- struct obj_stock *stock = get_obj_stock(&flags);
unsigned int nr_pages = 0;
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
if (stock->cached_objcg != objcg) { /* reset if necessary */
drain_obj_stock(stock);
obj_cgroup_get(objcg);
@@ -3247,7 +3208,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- put_obj_stock(flags);
+ local_irq_restore(flags);
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
@@ -6812,7 +6773,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
long nr_pages;
struct mem_cgroup *memcg;
struct obj_cgroup *objcg;
- bool use_objcg = folio_memcg_kmem(folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
@@ -6821,7 +6781,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
* folio memcg or objcg at this point, we have fully
* exclusive access to the folio.
*/
- if (use_objcg) {
+ if (folio_memcg_kmem(folio)) {
objcg = __folio_objcg(folio);
/*
* This get matches the put at the end of the function and
@@ -6849,7 +6809,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
nr_pages = folio_nr_pages(folio);
- if (use_objcg) {
+ if (folio_memcg_kmem(folio)) {
ug->nr_memory += nr_pages;
ug->nr_kmem += nr_pages;
--
2.35.1
* [PATCH v4 2/6] mm/memcg: Disable threshold event handlers on PREEMPT_RT
2022-02-21 18:25 [PATCH v4 0/6] mm/memcg: Address PREEMPT_RT problems instead of disabling it Sebastian Andrzej Siewior
2022-02-21 18:25 ` [PATCH v4 1/6] mm/memcg: Revert ("mm/memcg: optimize user context object stock access") Sebastian Andrzej Siewior
@ 2022-02-21 18:25 ` Sebastian Andrzej Siewior
2022-02-21 18:25 ` [PATCH v4 3/6] mm/memcg: Protect per-CPU counter by disabling preemption on PREEMPT_RT where needed Sebastian Andrzej Siewior
` (3 subsequent siblings)
5 siblings, 0 replies; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-21 18:25 UTC (permalink / raw)
To: cgroups, linux-mm
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Michal Koutný,
Peter Zijlstra, Thomas Gleixner, Vladimir Davydov, Waiman Long,
Sebastian Andrzej Siewior, Roman Gushchin, Shakeel Butt,
Michal Hocko
During the integration of PREEMPT_RT support, the code flow around
memcg_check_events() resulted in `twisted code'. Moving the code around
to avoid that would then lead to an additional local-irq-save
section within memcg_check_events(). While looking better, it adds a
local-irq-save section to a code flow which is usually within a
local-irq-off block on non-PREEMPT_RT configurations.
The threshold event handler is a deprecated memcg v1 feature. Instead of
trying to get it to work under PREEMPT_RT just disable it. There should
be no users on PREEMPT_RT, which makes it even less worthwhile to get it
working there.
Make memory.soft_limit_in_bytes and cgroup.event_control return
-EOPNOTSUPP on PREEMPT_RT: turn memcg_check_events() into a no-op and
have memcg_write_event_control() return -EOPNOTSUPP on PREEMPT_RT.
Document that the two knobs are disabled on PREEMPT_RT.
Suggested-by: Michal Hocko <mhocko@kernel.org>
Suggested-by: Michal Koutný <mkoutny@suse.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
---
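As a sketch, the guard added to both entry points has this shape
(example_v1_event_knob_write() is a made-up name; the real functions are
memcg_check_events() and the write handlers in the diff below).
IS_ENABLED() lets the compiler drop the branch on !PREEMPT_RT builds:

	static int example_v1_event_knob_write(void)
	{
		if (IS_ENABLED(CONFIG_PREEMPT_RT))
			return -EOPNOTSUPP;	/* knob documented as unavailable on RT */

		return 0;			/* regular v1 handling continues here */
	}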
Documentation/admin-guide/cgroup-v1/memory.rst | 2 ++
mm/memcontrol.c | 14 ++++++++++++--
2 files changed, 14 insertions(+), 2 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst
index faac50149a222..2cc502a75ef64 100644
--- a/Documentation/admin-guide/cgroup-v1/memory.rst
+++ b/Documentation/admin-guide/cgroup-v1/memory.rst
@@ -64,6 +64,7 @@ Brief summary of control files.
threads
cgroup.procs show list of processes
cgroup.event_control an interface for event_fd()
+ This knob is not available on CONFIG_PREEMPT_RT systems.
memory.usage_in_bytes show current usage for memory
(See 5.5 for details)
memory.memsw.usage_in_bytes show current usage for memory+Swap
@@ -75,6 +76,7 @@ Brief summary of control files.
memory.max_usage_in_bytes show max memory usage recorded
memory.memsw.max_usage_in_bytes show max memory+Swap usage recorded
memory.soft_limit_in_bytes set/show soft limit of memory usage
+ This knob is not available on CONFIG_PREEMPT_RT systems.
memory.stat show various statistics
memory.use_hierarchy set/show hierarchical account enabled
This knob is deprecated and shouldn't be
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8ab2dc75e70ec..0b5117ed2ae08 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -859,6 +859,9 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
*/
static void memcg_check_events(struct mem_cgroup *memcg, int nid)
{
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return;
+
/* threshold event is triggered in finer grain than soft limit */
if (unlikely(mem_cgroup_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
@@ -3731,8 +3734,12 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
}
break;
case RES_SOFT_LIMIT:
- memcg->soft_limit = nr_pages;
- ret = 0;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
+ ret = -EOPNOTSUPP;
+ } else {
+ memcg->soft_limit = nr_pages;
+ ret = 0;
+ }
break;
}
return ret ?: nbytes;
@@ -4708,6 +4715,9 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
char *endp;
int ret;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return -EOPNOTSUPP;
+
buf = strstrip(buf);
efd = simple_strtoul(buf, &endp, 10);
--
2.35.1
* [PATCH v4 3/6] mm/memcg: Protect per-CPU counter by disabling preemption on PREEMPT_RT where needed.
2022-02-21 18:25 [PATCH v4 0/6] mm/memcg: Address PREEMPT_RT problems instead of disabling it Sebastian Andrzej Siewior
2022-02-21 18:25 ` [PATCH v4 1/6] mm/memcg: Revert ("mm/memcg: optimize user context object stock access") Sebastian Andrzej Siewior
2022-02-21 18:25 ` [PATCH v4 2/6] mm/memcg: Disable threshold event handlers on PREEMPT_RT Sebastian Andrzej Siewior
@ 2022-02-21 18:25 ` Sebastian Andrzej Siewior
2022-02-22 5:41 ` Shakeel Butt
2022-02-21 18:25 ` [PATCH v4 4/6] mm/memcg: Opencode the inner part of obj_cgroup_uncharge_pages() in drain_obj_stock() Sebastian Andrzej Siewior
` (2 subsequent siblings)
5 siblings, 1 reply; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-21 18:25 UTC (permalink / raw)
To: cgroups, linux-mm
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Michal Koutný,
Peter Zijlstra, Thomas Gleixner, Vladimir Davydov, Waiman Long,
Sebastian Andrzej Siewior, Roman Gushchin
The per-CPU counters are modified with non-atomic operations. The
consistency is ensured by disabling interrupts for the update.
On non-PREEMPT_RT configurations this works because acquiring a
spinlock_t typed lock with the _irq() suffix disables interrupts. On
PREEMPT_RT configurations the RMW operation can be interrupted.
Another problem is that mem_cgroup_swapout() expects to be invoked with
disabled interrupts because the caller has to acquire a spinlock_t which
is acquired with disabled interrupts. Since spinlock_t never disables
interrupts on PREEMPT_RT the interrupts are never disabled at this
point.
The code is never called from in_irq() context on PREEMPT_RT, therefore
disabling preemption during the update is sufficient on PREEMPT_RT.
The sections which explicitly disable interrupts can remain on
PREEMPT_RT because the sections remain short and they don't involve
sleeping locks (memcg_check_events() is doing nothing on PREEMPT_RT).
Disable preemption during update of the per-CPU variables which do not
explicitly disable interrupts.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Roman Gushchin <guro@fb.com>
---
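For reference, a minimal sketch of the calling convention the new helpers
expect (example_update() is a made-up name; the real callers are
__mod_memcg_lruvec_state(), __count_memcg_events() and
mem_cgroup_swapout() in the diff below):

	/*
	 * Wrap a non-atomic per-CPU RMW update: on PREEMPT_RT the helper
	 * disables preemption, otherwise it asserts that the caller already
	 * runs with interrupts disabled.
	 */
	static void example_update(struct mem_cgroup *memcg, int idx, int val)
	{
		memcg_stats_lock();
		__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
		memcg_rstat_updated(memcg, val);
		memcg_stats_unlock();
	}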
mm/memcontrol.c | 49 ++++++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 48 insertions(+), 1 deletion(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0b5117ed2ae08..28174c099aa1f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -630,6 +630,35 @@ static DEFINE_SPINLOCK(stats_flush_lock);
static DEFINE_PER_CPU(unsigned int, stats_updates);
static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+/*
+ * Accessors to ensure that preemption is disabled on PREEMPT_RT because it can
+ * not rely on this as part of an acquired spinlock_t lock. These functions are
+ * never used in hardirq context on PREEMPT_RT and therefore disabling preemption
+ * is sufficient.
+ */
+static void memcg_stats_lock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+ preempt_disable();
+#else
+ VM_BUG_ON(!irqs_disabled());
+#endif
+}
+
+static void __memcg_stats_lock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+ preempt_disable();
+#endif
+}
+
+static void memcg_stats_unlock(void)
+{
+#ifdef CONFIG_PREEMPT_RT
+ preempt_enable();
+#endif
+}
+
static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
{
unsigned int x;
@@ -706,6 +735,20 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
+ /*
+ * The callers from rmap rely on disabled preemption because they never
+ * update their counter from in-interrupt context. For these two
+ * counters we check that the update is never performed from an
+ * interrupt context while other callers need to have interrupts disabled.
+ */
+ __memcg_stats_lock();
+ if (IS_ENABLED(CONFIG_DEBUG_VM)) {
+ if (idx == NR_ANON_MAPPED || idx == NR_FILE_MAPPED)
+ WARN_ON_ONCE(!in_task());
+ else
+ WARN_ON_ONCE(!irqs_disabled());
+ }
+
/* Update memcg */
__this_cpu_add(memcg->vmstats_percpu->state[idx], val);
@@ -713,6 +756,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
memcg_rstat_updated(memcg, val);
+ memcg_stats_unlock();
}
/**
@@ -795,8 +839,10 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
if (mem_cgroup_disabled())
return;
+ memcg_stats_lock();
__this_cpu_add(memcg->vmstats_percpu->events[idx], count);
memcg_rstat_updated(memcg, count);
+ memcg_stats_unlock();
}
static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
@@ -7140,8 +7186,9 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* important here to have the interrupts disabled because it is the
* only synchronisation we have for updating the per-CPU variables.
*/
- VM_BUG_ON(!irqs_disabled());
+ memcg_stats_lock();
mem_cgroup_charge_statistics(memcg, -nr_entries);
+ memcg_stats_unlock();
memcg_check_events(memcg, page_to_nid(page));
css_put(&memcg->css);
--
2.35.1
* Re: [PATCH v4 3/6] mm/memcg: Protect per-CPU counter by disabling preemption on PREEMPT_RT where needed.
2022-02-21 18:25 ` [PATCH v4 3/6] mm/memcg: Protect per-CPU counter by disabling preemption on PREEMPT_RT where needed Sebastian Andrzej Siewior
@ 2022-02-22 5:41 ` Shakeel Butt
2022-02-25 15:17 ` [PATCH] mm/memcg: Add missing counter index which are not update in interrupt Sebastian Andrzej Siewior
0 siblings, 1 reply; 17+ messages in thread
From: Shakeel Butt @ 2022-02-22 5:41 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: Cgroups, Linux MM, Andrew Morton, Johannes Weiner, Michal Hocko,
Michal Koutný,
Peter Zijlstra, Thomas Gleixner, Vladimir Davydov, Waiman Long,
Roman Gushchin
On Mon, Feb 21, 2022 at 10:26 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
[...]
> + /*
> + * The callers from rmap rely on disabled preemption because they never
> + * update their counter from in-interrupt context. For these two
> + * counters we check that the update is never performed from an
> + * interrupt context while other callers need to have interrupts disabled.
> + */
> + __memcg_stats_lock();
> + if (IS_ENABLED(CONFIG_DEBUG_VM)) {
> + if (idx == NR_ANON_MAPPED || idx == NR_FILE_MAPPED)
NR_ANON_THPS, NR_SHMEM_PMDMAPPED, NR_FILE_PMDMAPPED are missing and
the switch statement would be cleaner.
* [PATCH] mm/memcg: Add missing counter index which are not update in interrupt.
2022-02-22 5:41 ` Shakeel Butt
@ 2022-02-25 15:17 ` Sebastian Andrzej Siewior
2022-02-25 17:08 ` Shakeel Butt
0 siblings, 1 reply; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-25 15:17 UTC (permalink / raw)
To: Shakeel Butt
Cc: Cgroups, Linux MM, Andrew Morton, Johannes Weiner, Michal Hocko,
Michal Koutný,
Peter Zijlstra, Thomas Gleixner, Vladimir Davydov, Waiman Long,
Roman Gushchin
Shakeel Butt reported that I missed a few counters which are not updated
from interrupt context and for which disabling preemption is therefore fine.
Please fold into:
"Protect per-CPU counter by disabling preemption on PREEMPT_RT"
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
mm/memcontrol.c | 11 +++++++++--
1 file changed, 9 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 9c29b1a0e6bec..7883e2f2af3e8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -743,10 +743,17 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
*/
__memcg_stats_lock();
if (IS_ENABLED(CONFIG_DEBUG_VM)) {
- if (idx == NR_ANON_MAPPED || idx == NR_FILE_MAPPED)
+ switch (idx) {
+ case NR_ANON_MAPPED:
+ case NR_FILE_MAPPED:
+ case NR_ANON_THPS:
+ case NR_SHMEM_PMDMAPPED:
+ case NR_FILE_PMDMAPPED:
WARN_ON_ONCE(!in_task());
- else
+ break;
+ default:
WARN_ON_ONCE(!irqs_disabled());
+ }
}
/* Update memcg */
--
2.35.1
* Re: [PATCH] mm/memcg: Add missing counter index which are not update in interrupt.
2022-02-25 15:17 ` [PATCH] mm/memcg: Add missing counter index which are not update in interrupt Sebastian Andrzej Siewior
@ 2022-02-25 17:08 ` Shakeel Butt
2022-02-25 23:01 ` [PATCH] mm/memcg: Only perform the debug checks on !PREEMPT_RT Sebastian Andrzej Siewior
0 siblings, 1 reply; 17+ messages in thread
From: Shakeel Butt @ 2022-02-25 17:08 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: Cgroups, Linux MM, Andrew Morton, Johannes Weiner, Michal Hocko,
Michal Koutný,
Peter Zijlstra, Thomas Gleixner, Vladimir Davydov, Waiman Long,
Roman Gushchin
On Fri, Feb 25, 2022 at 7:17 AM Sebastian Andrzej Siewior
<bigeasy@linutronix.de> wrote:
>
> Shakeel Butt reported that I missed a few counters which are not updated
> from interrupt context and for which disabling preemption is therefore fine.
>
> Please fold into:
> "Protect per-CPU counter by disabling preemption on PREEMPT_RT"
>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Thanks. For the folded patch:
Reviewed-by: Shakeel Butt <shakeelb@google.com>
* [PATCH] mm/memcg: Only perform the debug checks on !PREEMPT_RT
2022-02-25 17:08 ` Shakeel Butt
@ 2022-02-25 23:01 ` Sebastian Andrzej Siewior
2022-02-26 0:05 ` Andrew Morton
0 siblings, 1 reply; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-25 23:01 UTC (permalink / raw)
To: Shakeel Butt
Cc: Cgroups, Linux MM, Andrew Morton, Johannes Weiner, Michal Hocko,
Michal Koutný,
Peter Zijlstra, Thomas Gleixner, Vladimir Davydov, Waiman Long,
Roman Gushchin
On PREEMPT_RT interrupts and preemption are always enabled. The locking
function __memcg_stats_lock() always disables preemption. The recently
added checks need to be performed only on !PREEMPT_RT where disabled
preemption and disabled interrupts are used.
Please fold into:
"Protect per-CPU counter by disabling preemption on PREEMPT_RT"
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
Andrew, if this is getting too confusing at some point, I can fold it myself
and repost the whole lot. Whatever works best for you.
mm/memcontrol.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 73832cd1e9da4..63287fd03250b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -741,7 +741,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
* interrupt context while other callers need to have interrupts disabled.
*/
__memcg_stats_lock();
- if (IS_ENABLED(CONFIG_DEBUG_VM)) {
+ if (IS_ENABLED(CONFIG_DEBUG_VM) && !IS_ENABLED(CONFIG_PREEMPT_RT)) {
switch (idx) {
case NR_ANON_MAPPED:
case NR_FILE_MAPPED:
--
2.35.1
* Re: [PATCH] mm/memcg: Only perform the debug checks on !PREEMPT_RT
2022-02-25 23:01 ` [PATCH] mm/memcg: Only perform the debug checks on !PREEMPT_RT Sebastian Andrzej Siewior
@ 2022-02-26 0:05 ` Andrew Morton
0 siblings, 0 replies; 17+ messages in thread
From: Andrew Morton @ 2022-02-26 0:05 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: Shakeel Butt, Cgroups, Linux MM, Johannes Weiner, Michal Hocko,
Michal Koutný,
Peter Zijlstra, Thomas Gleixner, Vladimir Davydov, Waiman Long,
Roman Gushchin
On Sat, 26 Feb 2022 00:01:47 +0100 Sebastian Andrzej Siewior <bigeasy@linutronix.de> wrote:
>
> Andrew, if this getting to confused at some point, I can fold it myself
> and repost the whole lot. Whatever works best for your.
I think I have it all but yes please, do send along a v5 series and
I'll double-check.
* [PATCH v4 4/6] mm/memcg: Opencode the inner part of obj_cgroup_uncharge_pages() in drain_obj_stock()
2022-02-21 18:25 [PATCH v4 0/6] mm/memcg: Address PREEMPT_RT problems instead of disabling it Sebastian Andrzej Siewior
` (2 preceding siblings ...)
2022-02-21 18:25 ` [PATCH v4 3/6] mm/memcg: Protect per-CPU counter by disabling preemption on PREEMPT_RT where needed Sebastian Andrzej Siewior
@ 2022-02-21 18:25 ` Sebastian Andrzej Siewior
2022-02-21 18:25 ` [PATCH v4 5/6] mm/memcg: Protect memcg_stock with a local_lock_t Sebastian Andrzej Siewior
2022-02-21 18:25 ` [PATCH v4 6/6] mm/memcg: Disable migration instead of preemption in drain_all_stock() Sebastian Andrzej Siewior
5 siblings, 0 replies; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-21 18:25 UTC (permalink / raw)
To: cgroups, linux-mm
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Michal Koutný,
Peter Zijlstra, Thomas Gleixner, Vladimir Davydov, Waiman Long,
Sebastian Andrzej Siewior, Shakeel Butt, Roman Gushchin,
Michal Hocko
From: Johannes Weiner <hannes@cmpxchg.org>
Provide the inner part of refill_stock() as __refill_stock() without
disabling interrupts. This eases the integration of local_lock_t where
recursive locking must be avoided.
Open code obj_cgroup_uncharge_pages() in drain_obj_stock() and use
__refill_stock(). The caller of drain_obj_stock() already disables
interrupts.
[bigeasy: Patch body around Johannes' diff ]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Roman Gushchin <guro@fb.com>
Acked-by: Michal Hocko <mhocko@suse.com>
---
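A minimal sketch of why the inner variant is needed
(drain_obj_stock_sketch() is a made-up name; the real change is inside
drain_obj_stock() below):

	/*
	 * The caller of drain_obj_stock() already disables interrupts; with
	 * the upcoming local_lock_t a nested refill_stock() via
	 * obj_cgroup_uncharge_pages() would recurse on the same lock, so the
	 * uncharge is open coded and the inner __refill_stock() is used.
	 */
	static void drain_obj_stock_sketch(struct obj_cgroup *old,
					   unsigned int nr_pages)
	{
		struct mem_cgroup *memcg = get_mem_cgroup_from_objcg(old);

		memcg_account_kmem(memcg, -nr_pages);
		__refill_stock(memcg, nr_pages);
		css_put(&memcg->css);
	}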
mm/memcontrol.c | 24 ++++++++++++++++++------
1 file changed, 18 insertions(+), 6 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 28174c099aa1f..cd3950256519d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2244,12 +2244,9 @@ static void drain_local_stock(struct work_struct *dummy)
* Cache charges(val) to local per_cpu area.
* This will be consumed by consume_stock() function, later.
*/
-static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+static void __refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
struct memcg_stock_pcp *stock;
- unsigned long flags;
-
- local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
@@ -2261,7 +2258,14 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
if (stock->nr_pages > MEMCG_CHARGE_BATCH)
drain_stock(stock);
+}
+static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __refill_stock(memcg, nr_pages);
local_irq_restore(flags);
}
@@ -3178,8 +3182,16 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
- if (nr_pages)
- obj_cgroup_uncharge_pages(old, nr_pages);
+ if (nr_pages) {
+ struct mem_cgroup *memcg;
+
+ memcg = get_mem_cgroup_from_objcg(old);
+
+ memcg_account_kmem(memcg, -nr_pages);
+ __refill_stock(memcg, nr_pages);
+
+ css_put(&memcg->css);
+ }
/*
* The leftover is flushed to the centralized per-memcg value.
--
2.35.1
* [PATCH v4 5/6] mm/memcg: Protect memcg_stock with a local_lock_t
2022-02-21 18:25 [PATCH v4 0/6] mm/memcg: Address PREEMPT_RT problems instead of disabling it Sebastian Andrzej Siewior
` (3 preceding siblings ...)
2022-02-21 18:25 ` [PATCH v4 4/6] mm/memcg: Opencode the inner part of obj_cgroup_uncharge_pages() in drain_obj_stock() Sebastian Andrzej Siewior
@ 2022-02-21 18:25 ` Sebastian Andrzej Siewior
2022-02-22 9:52 ` Michal Hocko
2022-02-21 18:25 ` [PATCH v4 6/6] mm/memcg: Disable migration instead of preemption in drain_all_stock() Sebastian Andrzej Siewior
5 siblings, 1 reply; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-21 18:25 UTC (permalink / raw)
To: cgroups, linux-mm
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Michal Koutný,
Peter Zijlstra, Thomas Gleixner, Vladimir Davydov, Waiman Long,
Sebastian Andrzej Siewior, kernel test robot
The members of the per-CPU structure memcg_stock_pcp are protected by
disabling interrupts. This is not working on PREEMPT_RT because it
creates atomic context in which actions are performed which require
preemptible context. One example is obj_cgroup_release().
The IRQ-disable sections can be replaced with local_lock_t which
preserves the explicit disabling of interrupts while keeping the code
preemptible on PREEMPT_RT.
drain_obj_stock() drops a reference on obj_cgroup which leads to an invocation
of obj_cgroup_release() if it is the last object. This in turn leads to
recursive locking of the local_lock_t. To avoid this, obj_cgroup_release() is
invoked outside of the locked section.
obj_cgroup_uncharge_pages() can be invoked with the local_lock_t acquired and
without it. This will lead later to a recursion in refill_stock(). To
avoid the locking recursion provide obj_cgroup_uncharge_pages_locked()
which uses the locked version of refill_stock().
- Replace disabling interrupts for memcg_stock with a local_lock_t.
- Let drain_obj_stock() return the old struct obj_cgroup which is passed
to obj_cgroup_put() outside of the locked section.
- Provide obj_cgroup_uncharge_pages_locked() which uses the locked
version of refill_stock() to avoid recursive locking in
drain_obj_stock().
Link: https://lkml.kernel.org/r/20220209014709.GA26885@xsang-OptiPlex-9020
Reported-by: kernel test robot <oliver.sang@intel.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
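A minimal sketch of the pattern used throughout this patch (example_flush()
is a made-up name; drain_local_stock(), mod_objcg_state() and
refill_obj_stock() in the diff follow this shape):

	/*
	 * The objcg reference handed back by drain_obj_stock() is dropped
	 * only after stock_lock has been released: obj_cgroup_put() may end
	 * up in obj_cgroup_release(), which would recurse on the same
	 * local_lock_t.
	 */
	static void example_flush(void)
	{
		struct obj_cgroup *old;
		unsigned long flags;

		local_lock_irqsave(&memcg_stock.stock_lock, flags);
		old = drain_obj_stock(this_cpu_ptr(&memcg_stock));
		local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
		if (old)
			obj_cgroup_put(old);
	}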
mm/memcontrol.c | 55 ++++++++++++++++++++++++++++++-------------------
1 file changed, 34 insertions(+), 21 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cd3950256519d..3d7ccb104374c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2128,6 +2128,7 @@ void unlock_page_memcg(struct page *page)
}
struct memcg_stock_pcp {
+ local_lock_t stock_lock;
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
@@ -2143,18 +2144,21 @@ struct memcg_stock_pcp {
unsigned long flags;
#define FLUSHING_CACHED_CHARGE 0
};
-static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
+static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
+ .stock_lock = INIT_LOCAL_LOCK(stock_lock),
+};
static DEFINE_MUTEX(percpu_charge_mutex);
#ifdef CONFIG_MEMCG_KMEM
-static void drain_obj_stock(struct memcg_stock_pcp *stock);
+static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg);
static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
#else
-static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
+static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
+ return NULL;
}
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg)
@@ -2186,7 +2190,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
if (nr_pages > MEMCG_CHARGE_BATCH)
return ret;
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
@@ -2194,7 +2198,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
ret = true;
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
@@ -2223,6 +2227,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
static void drain_local_stock(struct work_struct *dummy)
{
struct memcg_stock_pcp *stock;
+ struct obj_cgroup *old = NULL;
unsigned long flags;
/*
@@ -2230,14 +2235,16 @@ static void drain_local_stock(struct work_struct *dummy)
* drain_stock races is that we always operate on local CPU stock
* here with IRQ disabled
*/
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
- drain_obj_stock(stock);
+ old = drain_obj_stock(stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ if (old)
+ obj_cgroup_put(old);
}
/*
@@ -2264,9 +2271,9 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
{
unsigned long flags;
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
__refill_stock(memcg, nr_pages);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
}
/*
@@ -3093,10 +3100,11 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr)
{
struct memcg_stock_pcp *stock;
+ struct obj_cgroup *old = NULL;
unsigned long flags;
int *bytes;
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
/*
@@ -3105,7 +3113,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
* changes.
*/
if (stock->cached_objcg != objcg) {
- drain_obj_stock(stock);
+ old = drain_obj_stock(stock);
obj_cgroup_get(objcg);
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
@@ -3149,7 +3157,9 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
if (nr)
mod_objcg_mlstate(objcg, pgdat, idx, nr);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ if (old)
+ obj_cgroup_put(old);
}
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
@@ -3158,7 +3168,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
unsigned long flags;
bool ret = false;
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
@@ -3166,17 +3176,17 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
ret = true;
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
return ret;
}
-static void drain_obj_stock(struct memcg_stock_pcp *stock)
+static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
{
struct obj_cgroup *old = stock->cached_objcg;
if (!old)
- return;
+ return NULL;
if (stock->nr_bytes) {
unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
@@ -3226,8 +3236,8 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
stock->cached_pgdat = NULL;
}
- obj_cgroup_put(old);
stock->cached_objcg = NULL;
+ return old;
}
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
@@ -3248,14 +3258,15 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
bool allow_uncharge)
{
struct memcg_stock_pcp *stock;
+ struct obj_cgroup *old = NULL;
unsigned long flags;
unsigned int nr_pages = 0;
- local_irq_save(flags);
+ local_lock_irqsave(&memcg_stock.stock_lock, flags);
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached_objcg != objcg) { /* reset if necessary */
- drain_obj_stock(stock);
+ old = drain_obj_stock(stock);
obj_cgroup_get(objcg);
stock->cached_objcg = objcg;
stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
@@ -3269,7 +3280,9 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
stock->nr_bytes &= (PAGE_SIZE - 1);
}
- local_irq_restore(flags);
+ local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
+ if (old)
+ obj_cgroup_put(old);
if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages);
--
2.35.1
* Re: [PATCH v4 5/6] mm/memcg: Protect memcg_stock with a local_lock_t
2022-02-21 18:25 ` [PATCH v4 5/6] mm/memcg: Protect memcg_stock with a local_lock_t Sebastian Andrzej Siewior
@ 2022-02-22 9:52 ` Michal Hocko
2022-02-25 15:15 ` [PATCH] mm/memcg: Add a comment regarding the release `obj' Sebastian Andrzej Siewior
0 siblings, 1 reply; 17+ messages in thread
From: Michal Hocko @ 2022-02-22 9:52 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: cgroups, linux-mm, Andrew Morton, Johannes Weiner,
Michal Koutný,
Peter Zijlstra, Thomas Gleixner, Vladimir Davydov, Waiman Long,
kernel test robot
On Mon 21-02-22 19:25:39, Sebastian Andrzej Siewior wrote:
> The members of the per-CPU structure memcg_stock_pcp are protected by
> disabling interrupts. This is not working on PREEMPT_RT because it
> creates atomic context in which actions are performed which require
> preemptible context. One example is obj_cgroup_release().
>
> The IRQ-disable sections can be replaced with local_lock_t which
> preserves the explicit disabling of interrupts while keeping the code
> preemptible on PREEMPT_RT.
>
> drain_obj_stock() drops a reference on obj_cgroup which leads to an invocation
> of obj_cgroup_release() if it is the last object. This in turn leads to
> recursive locking of the local_lock_t. To avoid this, obj_cgroup_release() is
> invoked outside of the locked section.
>
> obj_cgroup_uncharge_pages() can be invoked with the local_lock_t acquired and
> without it. This will lead later to a recursion in refill_stock(). To
> avoid the locking recursion provide obj_cgroup_uncharge_pages_locked()
> which uses the locked version of refill_stock().
>
> - Replace disabling interrupts for memcg_stock with a local_lock_t.
>
> - Let drain_obj_stock() return the old struct obj_cgroup which is passed
> to obj_cgroup_put() outside of the locked section.
>
> - Provide obj_cgroup_uncharge_pages_locked() which uses the locked
> version of refill_stock() to avoid recursive locking in
> drain_obj_stock().
>
> Link: https://lkml.kernel.org/r/20220209014709.GA26885@xsang-OptiPlex-9020
> Reported-by: kernel test robot <oliver.sang@intel.com>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Thank you, this looks much more straightforward. I am not a super fan of
the requirement to call obj_cgroup_put outside of drain_obj_stock but
I can live with that. A comment in drain_obj_stock to call that out
explicitly would be nice (can be done as a follow up and have Andrew
fold it in if there is no new resubmission).
Acked-by: Michal Hocko <mhocko@suse.com>
> ---
> mm/memcontrol.c | 55 ++++++++++++++++++++++++++++++-------------------
> 1 file changed, 34 insertions(+), 21 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index cd3950256519d..3d7ccb104374c 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2128,6 +2128,7 @@ void unlock_page_memcg(struct page *page)
> }
>
> struct memcg_stock_pcp {
> + local_lock_t stock_lock;
> struct mem_cgroup *cached; /* this never be root cgroup */
> unsigned int nr_pages;
>
> @@ -2143,18 +2144,21 @@ struct memcg_stock_pcp {
> unsigned long flags;
> #define FLUSHING_CACHED_CHARGE 0
> };
> -static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
> +static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock) = {
> + .stock_lock = INIT_LOCAL_LOCK(stock_lock),
> +};
> static DEFINE_MUTEX(percpu_charge_mutex);
>
> #ifdef CONFIG_MEMCG_KMEM
> -static void drain_obj_stock(struct memcg_stock_pcp *stock);
> +static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock);
> static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
> struct mem_cgroup *root_memcg);
> static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
>
> #else
> -static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
> +static inline struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
> {
> + return NULL;
> }
> static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
> struct mem_cgroup *root_memcg)
> @@ -2186,7 +2190,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
> if (nr_pages > MEMCG_CHARGE_BATCH)
> return ret;
>
> - local_irq_save(flags);
> + local_lock_irqsave(&memcg_stock.stock_lock, flags);
>
> stock = this_cpu_ptr(&memcg_stock);
> if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
> @@ -2194,7 +2198,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
> ret = true;
> }
>
> - local_irq_restore(flags);
> + local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
>
> return ret;
> }
> @@ -2223,6 +2227,7 @@ static void drain_stock(struct memcg_stock_pcp *stock)
> static void drain_local_stock(struct work_struct *dummy)
> {
> struct memcg_stock_pcp *stock;
> + struct obj_cgroup *old = NULL;
> unsigned long flags;
>
> /*
> @@ -2230,14 +2235,16 @@ static void drain_local_stock(struct work_struct *dummy)
> * drain_stock races is that we always operate on local CPU stock
> * here with IRQ disabled
> */
> - local_irq_save(flags);
> + local_lock_irqsave(&memcg_stock.stock_lock, flags);
>
> stock = this_cpu_ptr(&memcg_stock);
> - drain_obj_stock(stock);
> + old = drain_obj_stock(stock);
> drain_stock(stock);
> clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
>
> - local_irq_restore(flags);
> + local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
> + if (old)
> + obj_cgroup_put(old);
> }
>
> /*
> @@ -2264,9 +2271,9 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
> {
> unsigned long flags;
>
> - local_irq_save(flags);
> + local_lock_irqsave(&memcg_stock.stock_lock, flags);
> __refill_stock(memcg, nr_pages);
> - local_irq_restore(flags);
> + local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
> }
>
> /*
> @@ -3093,10 +3100,11 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
> enum node_stat_item idx, int nr)
> {
> struct memcg_stock_pcp *stock;
> + struct obj_cgroup *old = NULL;
> unsigned long flags;
> int *bytes;
>
> - local_irq_save(flags);
> + local_lock_irqsave(&memcg_stock.stock_lock, flags);
> stock = this_cpu_ptr(&memcg_stock);
>
> /*
> @@ -3105,7 +3113,7 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
> * changes.
> */
> if (stock->cached_objcg != objcg) {
> - drain_obj_stock(stock);
> + old = drain_obj_stock(stock);
> obj_cgroup_get(objcg);
> stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
> ? atomic_xchg(&objcg->nr_charged_bytes, 0) : 0;
> @@ -3149,7 +3157,9 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
> if (nr)
> mod_objcg_mlstate(objcg, pgdat, idx, nr);
>
> - local_irq_restore(flags);
> + local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
> + if (old)
> + obj_cgroup_put(old);
> }
>
> static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
> @@ -3158,7 +3168,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
> unsigned long flags;
> bool ret = false;
>
> - local_irq_save(flags);
> + local_lock_irqsave(&memcg_stock.stock_lock, flags);
>
> stock = this_cpu_ptr(&memcg_stock);
> if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
> @@ -3166,17 +3176,17 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
> ret = true;
> }
>
> - local_irq_restore(flags);
> + local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
>
> return ret;
> }
>
> -static void drain_obj_stock(struct memcg_stock_pcp *stock)
> +static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
> {
> struct obj_cgroup *old = stock->cached_objcg;
>
> if (!old)
> - return;
> + return NULL;
>
> if (stock->nr_bytes) {
> unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
> @@ -3226,8 +3236,8 @@ static void drain_obj_stock(struct memcg_stock_pcp *stock)
> stock->cached_pgdat = NULL;
> }
>
> - obj_cgroup_put(old);
> stock->cached_objcg = NULL;
> + return old;
> }
>
> static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
> @@ -3248,14 +3258,15 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
> bool allow_uncharge)
> {
> struct memcg_stock_pcp *stock;
> + struct obj_cgroup *old = NULL;
> unsigned long flags;
> unsigned int nr_pages = 0;
>
> - local_irq_save(flags);
> + local_lock_irqsave(&memcg_stock.stock_lock, flags);
>
> stock = this_cpu_ptr(&memcg_stock);
> if (stock->cached_objcg != objcg) { /* reset if necessary */
> - drain_obj_stock(stock);
> + old = drain_obj_stock(stock);
> obj_cgroup_get(objcg);
> stock->cached_objcg = objcg;
> stock->nr_bytes = atomic_read(&objcg->nr_charged_bytes)
> @@ -3269,7 +3280,9 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
> stock->nr_bytes &= (PAGE_SIZE - 1);
> }
>
> - local_irq_restore(flags);
> + local_unlock_irqrestore(&memcg_stock.stock_lock, flags);
> + if (old)
> + obj_cgroup_put(old);
>
> if (nr_pages)
> obj_cgroup_uncharge_pages(objcg, nr_pages);
> --
> 2.35.1
--
Michal Hocko
SUSE Labs
* [PATCH] mm/memcg: Add a comment regarding the release `obj'.
2022-02-22 9:52 ` Michal Hocko
@ 2022-02-25 15:15 ` Sebastian Andrzej Siewior
0 siblings, 0 replies; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-25 15:15 UTC (permalink / raw)
To: Michal Hocko
Cc: cgroups, linux-mm, Andrew Morton, Johannes Weiner,
Michal Koutný,
Peter Zijlstra, Thomas Gleixner, Vladimir Davydov, Waiman Long,
kernel test robot
Please fold into
mm/memcg: Protect memcg_stock with a local_lock_t
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
mm/memcontrol.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 7883e2f2af3e8..19d4f9297b0c6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3245,6 +3245,10 @@ static struct obj_cgroup *drain_obj_stock(struct memcg_stock_pcp *stock)
}
stock->cached_objcg = NULL;
+ /*
+ * The `old' objects needs to be released by the caller via
+ * obj_cgroup_put() outside of memcg_stock_pcp::stock_lock.
+ */
return old;
}
--
2.35.1
* [PATCH v4 6/6] mm/memcg: Disable migration instead of preemption in drain_all_stock().
2022-02-21 18:25 [PATCH v4 0/6] mm/memcg: Address PREEMPT_RT problems instead of disabling it Sebastian Andrzej Siewior
` (4 preceding siblings ...)
2022-02-21 18:25 ` [PATCH v4 5/6] mm/memcg: Protect memcg_stock with a local_lock_t Sebastian Andrzej Siewior
@ 2022-02-21 18:25 ` Sebastian Andrzej Siewior
2022-02-22 9:56 ` Michal Hocko
5 siblings, 1 reply; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-21 18:25 UTC (permalink / raw)
To: cgroups, linux-mm
Cc: Andrew Morton, Johannes Weiner, Michal Hocko, Michal Koutný,
Peter Zijlstra, Thomas Gleixner, Vladimir Davydov, Waiman Long,
Sebastian Andrzej Siewior
Before the for-each-CPU loop, preemption is disabled so that
drain_local_stock() can be invoked directly instead of scheduling a
worker. Ensuring that drain_local_stock() completes on the local CPU is
not a correctness problem. It _could_ be that the charging path will be
forced to reclaim memory because cached charges are still waiting for
their draining.
Disabling preemption before invoking drain_local_stock() is problematic
on PREEMPT_RT due to the sleeping locks involved. To ensure that no CPU
migration happens across the for_each_online_cpu() loop it is enough to
use migrate_disable(), which disables migration and keeps the context
preemptible so that a sleeping lock can be acquired.
Use migrate_disable() instead of get_cpu() around the
for_each_online_cpu() loop.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
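A minimal sketch of the resulting loop (simplified: the real
drain_all_stock() also checks FLUSHING_CACHED_CHARGE and whether the cached
memcg is a descendant of root_memcg before draining or scheduling):

	static void drain_all_stock_sketch(void)
	{
		int cpu, curcpu;

		migrate_disable();
		/* stable because migration is disabled, preemption is not */
		curcpu = smp_processor_id();
		for_each_online_cpu(cpu) {
			struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);

			if (cpu == curcpu)
				drain_local_stock(&stock->work);
			else
				schedule_work_on(cpu, &stock->work);
		}
		migrate_enable();
	}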
mm/memcontrol.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3d7ccb104374c..9c29b1a0e6bec 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2293,7 +2293,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
* as well as workers from this path always operate on the local
* per-cpu data. CPU up doesn't touch memcg_stock at all.
*/
- curcpu = get_cpu();
+ migrate_disable();
+ curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
@@ -2316,7 +2317,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
schedule_work_on(cpu, &stock->work);
}
}
- put_cpu();
+ migrate_enable();
mutex_unlock(&percpu_charge_mutex);
}
--
2.35.1
* Re: [PATCH v4 6/6] mm/memcg: Disable migration instead of preemption in drain_all_stock().
2022-02-21 18:25 ` [PATCH v4 6/6] mm/memcg: Disable migration instead of preemption in drain_all_stock() Sebastian Andrzej Siewior
@ 2022-02-22 9:56 ` Michal Hocko
2022-02-25 20:13 ` [PATCH v5 " Sebastian Andrzej Siewior
0 siblings, 1 reply; 17+ messages in thread
From: Michal Hocko @ 2022-02-22 9:56 UTC (permalink / raw)
To: Sebastian Andrzej Siewior
Cc: cgroups, linux-mm, Andrew Morton, Johannes Weiner,
Michal Koutný,
Peter Zijlstra, Thomas Gleixner, Vladimir Davydov, Waiman Long
On Mon 21-02-22 19:25:40, Sebastian Andrzej Siewior wrote:
> Before the for-each-CPU loop, preemption is disabled so that
> drain_local_stock() can be invoked directly instead of scheduling a
> worker. Ensuring that drain_local_stock() completes on the local CPU is
> not a correctness problem. It _could_ be that the charging path will be
> forced to reclaim memory because cached charges are still waiting for
> their draining.
>
> Disabling preemption before invoking drain_local_stock() is problematic
> on PREEMPT_RT due to the sleeping locks involved. To ensure that no CPU
> migration happens across the for_each_online_cpu() loop it is enough to
> use migrate_disable(), which disables migration and keeps the context
> preemptible so that a sleeping lock can be acquired.
I would just add that a race with cpu hotplug is not a problem because
pcp data is not going away. In the worst case we just schedule draining
of an empty stock.
>
> Use migrate_disable() instead of get_cpu() around the
> for_each_online_cpu() loop.
>
> Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Michal Hocko <mhocko@suse.com>
Thanks!
> ---
> mm/memcontrol.c | 5 +++--
> 1 file changed, 3 insertions(+), 2 deletions(-)
>
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 3d7ccb104374c..9c29b1a0e6bec 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -2293,7 +2293,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
> * as well as workers from this path always operate on the local
> * per-cpu data. CPU up doesn't touch memcg_stock at all.
> */
> - curcpu = get_cpu();
> + migrate_disable();
> + curcpu = smp_processor_id();
> for_each_online_cpu(cpu) {
> struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
> struct mem_cgroup *memcg;
> @@ -2316,7 +2317,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
> schedule_work_on(cpu, &stock->work);
> }
> }
> - put_cpu();
> + migrate_enable();
> mutex_unlock(&percpu_charge_mutex);
> }
>
> --
> 2.35.1
--
Michal Hocko
SUSE Labs
* [PATCH v5 6/6] mm/memcg: Disable migration instead of preemption in drain_all_stock().
2022-02-22 9:56 ` Michal Hocko
@ 2022-02-25 20:13 ` Sebastian Andrzej Siewior
0 siblings, 0 replies; 17+ messages in thread
From: Sebastian Andrzej Siewior @ 2022-02-25 20:13 UTC (permalink / raw)
To: Michal Hocko
Cc: cgroups, linux-mm, Andrew Morton, Johannes Weiner,
Michal Koutný,
Peter Zijlstra, Thomas Gleixner, Vladimir Davydov, Waiman Long
Before the for-each-CPU loop, preemption is disabled so that
drain_local_stock() can be invoked directly instead of scheduling a
worker. Ensuring that drain_local_stock() completes on the local CPU is
not a correctness problem. It _could_ be that the charging path will be
forced to reclaim memory because cached charges are still waiting for
their draining.
Disabling preemption before invoking drain_local_stock() is problematic
on PREEMPT_RT due to the sleeping locks involved. To ensure that no CPU
migration happens across the for_each_online_cpu() loop it is enough to
use migrate_disable(), which disables migration and keeps the context
preemptible so that a sleeping lock can be acquired.
A race with CPU hotplug is not a problem because pcp data is not going away.
In the worst case we just schedule draining of an empty stock.
Use migrate_disable() instead of get_cpu() around the
for_each_online_cpu() loop.
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Michal Hocko <mhocko@suse.com>
---
v4…v5: Added Michal's notes regarding CPU hotplug.
mm/memcontrol.c | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3d7ccb104374c..9c29b1a0e6bec 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2293,7 +2293,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
* as well as workers from this path always operate on the local
* per-cpu data. CPU up doesn't touch memcg_stock at all.
*/
- curcpu = get_cpu();
+ migrate_disable();
+ curcpu = smp_processor_id();
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
@@ -2316,7 +2317,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
schedule_work_on(cpu, &stock->work);
}
}
- put_cpu();
+ migrate_enable();
mutex_unlock(&percpu_charge_mutex);
}
--
2.35.1