From: Eric Dumazet <edumazet@google.com>
To: Johannes Weiner <hannes@cmpxchg.org>
Cc: Ivan Babrou <ivan@cloudflare.com>, Linux MM <linux-mm@kvack.org>,
	 Linux Kernel Network Developers <netdev@vger.kernel.org>,
	linux-kernel <linux-kernel@vger.kernel.org>,
	 Michal Hocko <mhocko@kernel.org>,
	Roman Gushchin <roman.gushchin@linux.dev>,
	 Shakeel Butt <shakeelb@google.com>,
	Muchun Song <songmuchun@bytedance.com>,
	 Andrew Morton <akpm@linux-foundation.org>,
	"David S. Miller" <davem@davemloft.net>,
	 Hideaki YOSHIFUJI <yoshfuji@linux-ipv6.org>,
	David Ahern <dsahern@kernel.org>,
	 Jakub Kicinski <kuba@kernel.org>,
	Paolo Abeni <pabeni@redhat.com>,
	cgroups@vger.kernel.org,
	 kernel-team <kernel-team@cloudflare.com>
Subject: Re: Low TCP throughput due to vmpressure with swap enabled
Date: Tue, 6 Dec 2022 20:13:50 +0100	[thread overview]
Message-ID: <CANn89iJfx4QdVBqJ23oFJoz5DJKou=ZwVBNNXFNDJRNAqNvzwQ@mail.gmail.com> (raw)
In-Reply-To: <Y4+RPry2tfbWFdSA@cmpxchg.org>

On Tue, Dec 6, 2022 at 8:00 PM Johannes Weiner <hannes@cmpxchg.org> wrote:
>
> On Mon, Dec 05, 2022 at 04:50:46PM -0800, Ivan Babrou wrote:
> > And now I can see plenty of this:
> >
> > [  108.156707][ T5175] socket pressure[2]: 4294673429
> > [  108.157050][ T5175] socket pressure[2]: 4294673429
> > [  108.157301][ T5175] socket pressure[2]: 4294673429
> > [  108.157581][ T5175] socket pressure[2]: 4294673429
> > [  108.157874][ T5175] socket pressure[2]: 4294673429
> > [  108.158254][ T5175] socket pressure[2]: 4294673429
> >
> > I think the first result below is to blame:
> >
> > $ rg '.->socket_pressure' mm
> > mm/memcontrol.c
> > 5280: memcg->socket_pressure = jiffies;
> > 7198: memcg->socket_pressure = 0;
> > 7201: memcg->socket_pressure = 1;
> > 7211: memcg->socket_pressure = 0;
> > 7215: memcg->socket_pressure = 1;
>
> Hoo boy, that's a silly mistake indeed. Thanks for tracking it down.
>
> > While we set socket_pressure to either zero or one in
> > mem_cgroup_charge_skmem, it is still initialized to jiffies on memcg
> > creation. Zero seems like a more appropriate starting point. With that
> > change I see it working as expected with no TCP speed bumps. My
> > ebpf_exporter program also looks happy and reports zero clamps in my
> > brief testing.
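
For anyone skimming the thread, the mismatch Ivan describes boils down to
this (paraphrased from the previous version of the patch, not the exact hunks):

    memcg->socket_pressure = jiffies;   /* memcg creation                */
    ...
    memcg->socket_pressure = 0;         /* skmem charge succeeded        */
    memcg->socket_pressure = 1;         /* skmem charge failed           */
    ...
    if (memcg->socket_pressure)         /* the jiffies init is nonzero   */
            return true;                /* (4294673429 above), so any    */
                                        /* memcg or ancestor that never  */
                                        /* charged skmem reads as under  */
                                        /* pressure indefinitely         */
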
>
> Excellent, now this behavior makes sense.
>
> > I also think we should downgrade socket_pressure from "unsigned long"
> > to "bool", as it only holds zero and one now.
>
> Sounds good to me!
>
> Attaching the updated patch below. If nobody has any objections, I'll
> add a proper changelog, reported-bys, sign-off etc and send it out.
>
> ---
>  include/linux/memcontrol.h |  8 +++---
>  include/linux/vmpressure.h |  7 ++---
>  mm/memcontrol.c            | 20 +++++++++----
>  mm/vmpressure.c            | 58 ++++++--------------------------------
>  mm/vmscan.c                | 15 +---------
>  5 files changed, 30 insertions(+), 78 deletions(-)
>
> diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
> index e1644a24009c..ef1c388be5b3 100644
> --- a/include/linux/memcontrol.h
> +++ b/include/linux/memcontrol.h
> @@ -283,11 +283,11 @@ struct mem_cgroup {
>         atomic_long_t           memory_events[MEMCG_NR_MEMORY_EVENTS];
>         atomic_long_t           memory_events_local[MEMCG_NR_MEMORY_EVENTS];
>
> -       unsigned long           socket_pressure;
> +       /* Socket memory allocations have failed */
> +       bool                    socket_pressure;
>
>         /* Legacy tcp memory accounting */
>         bool                    tcpmem_active;
> -       int                     tcpmem_pressure;
>
>  #ifdef CONFIG_MEMCG_KMEM
>         int kmemcg_id;
> @@ -1701,10 +1701,10 @@ void mem_cgroup_sk_alloc(struct sock *sk);
>  void mem_cgroup_sk_free(struct sock *sk);
>  static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
>  {
> -       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure)
> +       if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->socket_pressure)

This read should be a READ_ONCE():

    && READ_ONCE(memcg->socket_pressure))

>                 return true;
>         do {
> -               if (time_before(jiffies, READ_ONCE(memcg->socket_pressure)))
> +               if (memcg->socket_pressure)

if (READ_ONCE(memcg->socket_pressure))

>                         return true;
>         } while ((memcg = parent_mem_cgroup(memcg)));
>         return false;
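
Combined, that hunk would end up looking something like this (untested
sketch, just to show where the READ_ONCE() would go):

static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg)
{
        if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
            READ_ONCE(memcg->socket_pressure))
                return true;
        do {
                /* pairs with the WRITE_ONCE()s suggested further down */
                if (READ_ONCE(memcg->socket_pressure))
                        return true;
        } while ((memcg = parent_mem_cgroup(memcg)));
        return false;
}
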
> diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
> index 6a2f51ebbfd3..20d93de37a17 100644
> --- a/include/linux/vmpressure.h
> +++ b/include/linux/vmpressure.h
> @@ -11,9 +11,6 @@
>  #include <linux/eventfd.h>
>
>  struct vmpressure {
> -       unsigned long scanned;
> -       unsigned long reclaimed;
> -
>         unsigned long tree_scanned;
>         unsigned long tree_reclaimed;
>         /* The lock is used to keep the scanned/reclaimed above in sync. */
> @@ -30,7 +27,7 @@ struct vmpressure {
>  struct mem_cgroup;
>
>  #ifdef CONFIG_MEMCG
> -extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
> +extern void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
>                        unsigned long scanned, unsigned long reclaimed);
>  extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
>
> @@ -44,7 +41,7 @@ extern int vmpressure_register_event(struct mem_cgroup *memcg,
>  extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
>                                         struct eventfd_ctx *eventfd);
>  #else
> -static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
> +static inline void vmpressure(gfp_t gfp, struct mem_cgroup *memcg,
>                               unsigned long scanned, unsigned long reclaimed) {}
>  static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
>                                    int prio) {}
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 2d8549ae1b30..0d4b9dbe775a 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5277,7 +5277,6 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
>         vmpressure_init(&memcg->vmpressure);
>         INIT_LIST_HEAD(&memcg->event_list);
>         spin_lock_init(&memcg->event_list_lock);
> -       memcg->socket_pressure = jiffies;
>  #ifdef CONFIG_MEMCG_KMEM
>         memcg->kmemcg_id = -1;
>         INIT_LIST_HEAD(&memcg->objcg_list);
> @@ -7195,10 +7194,10 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
>                 struct page_counter *fail;
>
>                 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
> -                       memcg->tcpmem_pressure = 0;

Orthogonal to your patch, but:

Maybe avoid dirtying this cache line when the value does not change, and use
READ/WRITE_ONCE()? Something like:

    if (READ_ONCE(memcg->socket_pressure))
      WRITE_ONCE(memcg->socket_pressure, false);
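
If that pattern is wanted at all of the write sites, a tiny helper (name made
up here, not in the tree) would keep it in one place:

/* Hypothetical helper: only dirty the cache line when the pressure
 * state actually changes. */
static void memcg_set_socket_pressure(struct mem_cgroup *memcg, bool pressure)
{
        if (READ_ONCE(memcg->socket_pressure) != pressure)
                WRITE_ONCE(memcg->socket_pressure, pressure);
}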


> +                       memcg->socket_pressure = false;
>                         return true;
>                 }
> -               memcg->tcpmem_pressure = 1;
> +               memcg->socket_pressure = true;

Same remark.

>                 if (gfp_mask & __GFP_NOFAIL) {
>                         page_counter_charge(&memcg->tcpmem, nr_pages);
>                         return true;
> @@ -7206,12 +7205,21 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
>                 return false;
>         }
>
> -       if (try_charge(memcg, gfp_mask, nr_pages) == 0) {
> -               mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
> -               return true;
> +       if (try_charge(memcg, gfp_mask & ~__GFP_NOFAIL, nr_pages) == 0) {
> +               memcg->socket_pressure = false;

Same remark.

> +               goto success;
> +       }
> +       memcg->socket_pressure = true;

Same remark.

> +       if (gfp_mask & __GFP_NOFAIL) {
> +               try_charge(memcg, gfp_mask, nr_pages);
> +               goto success;
>         }
>
>         return false;
> +
> +success:
> +       mod_memcg_state(memcg, MEMCG_SOCK, nr_pages);
> +       return true;
>  }
>
>  /**

