linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Leon Huang Fu <leon.huangfu@shopee.com>
To: shakeel.butt@linux.dev
Cc: akpm@linux-foundation.org, cgroups@vger.kernel.org,
	corbet@lwn.net, hannes@cmpxchg.org, inwardvessel@gmail.com,
	jack@suse.cz, joel.granados@kernel.org, kyle.meyer@hpe.com,
	lance.yang@linux.dev, laoar.shao@gmail.com,
	leon.huangfu@shopee.com, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-mm@kvack.org,
	mclapinski@google.com, mhocko@kernel.org, muchun.song@linux.dev,
	roman.gushchin@linux.dev, yosry.ahmed@linux.dev
Subject: Re: [PATCH mm-new v2] mm/memcontrol: Flush stats when write stat file
Date: Thu,  6 Nov 2025 11:30:45 +0800	[thread overview]
Message-ID: <20251106033045.41607-1-leon.huangfu@shopee.com> (raw)
In-Reply-To: <6kh6hle2xp75hrtikasequ7qvfyginz7pyttltx6pkli26iir5@oqjmglatjg22>

On Thu, Nov 6, 2025 at 9:19 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> +Yosry, JP
>
> On Wed, Nov 05, 2025 at 03:49:16PM +0800, Leon Huang Fu wrote:
> > On high-core count systems, memory cgroup statistics can become stale
> > due to per-CPU caching and deferred aggregation. Monitoring tools and
> > management applications sometimes need guaranteed up-to-date statistics
> > at specific points in time to make accurate decisions.
>
> Can you explain a bit more on your environment where you are seeing
> stale stats? More specifically, how often the management applications
> are reading the memcg stats and if these applications are reading memcg
> stats for each nodes of the cgroup tree.
>
> We force flush all the memcg stats at root level every 2 seconds but it
> seems like that is not enough for your case. I am fine with an explicit
> way for users to flush the memcg stats. In that way only users who want
> to has to pay for the flush cost.
>

Thanks for the feedback. I encountered this issue while running the LTP
memcontrol02 test case [1] on a 256-core server with the 6.6.y kernel on XFS,
where it consistently failed.

I was aware that Yosry had improved the memory statistics refresh mechanism
in "mm: memcg: subtree stats flushing and thresholds" [2], so I attempted to
backport that patchset to 6.6.y [3]. However, even on the 6.15.0-061500-generic
kernel with those improvements, the test still fails intermittently on XFS.

I've created a simplified reproducer that mirrors the LTP test behavior. The
test allocates 50 MiB of page cache and then verifies that memory.current and
memory.stat's "file" field are approximately equal (within 5% tolerance).

The failure pattern looks like:

  After alloc: memory.current=52690944, memory.stat.file=48496640, size=52428800
  Checks: current>=size=OK, file>0=OK, current~=file(5%)=FAIL

Here's the reproducer code and test script (attached below for reference).

To reproduce on XFS:
  sudo ./run.sh --xfs
  for i in {1..100}; do sudo ./run.sh --run; echo "==="; sleep 0.1; done
  sudo ./run.sh --cleanup

The test fails sporadically, typically a few times out of 100 runs, confirming
that the improved flush isn't sufficient for this workload pattern.

I agree that providing an explicit flush mechanism allows users who need
guaranteed accuracy to pay the cost only when necessary, rather than imposing
more aggressive global flushing on all users.

Thanks,
Leon

---

Links:
[1] https://github.com/linux-test-project/ltp/blob/master/testcases/kernel/controllers/memcg/memcontrol02.c
[2] https://lore.kernel.org/all/20231129032154.3710765-1-yosryahmed@google.com/
[3] https://lore.kernel.org/linux-mm/20251103075135.20254-1-leon.huangfu@shopee.com/

---

Reproducer code (pagecache_50m_demo.c):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>

static int write_str(const char *path, const char *s) {
    int fd = open(path, O_WRONLY | O_CLOEXEC);
    if (fd < 0) { perror(path); return -1; }
    ssize_t w = write(fd, s, strlen(s));
    if (w != (ssize_t)strlen(s)) { perror("write"); close(fd); return -1; }
    close(fd);
    return 0;
}

static long read_long_file(const char *path) {
    FILE *f = fopen(path, "re");
    if (!f) { perror(path); return -1; }
    long v = -1;
    if (fscanf(f, "%ld", &v) != 1) v = -1;
    fclose(f);
    return v;
}

static long read_stat_field_bytes(const char *path, const char *key) {
    FILE *f = fopen(path, "re");
    if (!f) { perror(path); return -1; }
    char *line = NULL; size_t n = 0; long val = -1;
    while (getline(&line, &n, f) > 0) {
        if (strncmp(line, key, strlen(key)) == 0) {
            char *p = line + strlen(key);
            while (*p == ' ' || *p == '\t') p++;
            val = strtoll(p, NULL, 10);
            break;
        }
    }
    free(line);
    fclose(f);
    return val;
}

static int make_memcg_child(char *cg, size_t cg_sz) {
    const char *root = "/sys/fs/cgroup";
    int n = snprintf(cg, cg_sz, "%s/memcg_demo_%d", root, getpid());
    if (n < 0 || n >= (int)cg_sz) {
        fprintf(stderr, "cg path too long\n"); return -1;
    }
    if (mkdir(cg, 0755) && errno != EEXIST) { perror("mkdir cg"); return -1; }

    // best-effort enable memory controller on parent
    char parent[PATH_MAX];
    strncpy(parent, cg, sizeof(parent));
    parent[sizeof(parent)-1] = '\0';
    char *s = strrchr(parent, '/');
    if (s && s != parent) {
        *s = '\0';
        char stc[PATH_MAX];
        n = snprintf(stc, sizeof(stc), "%s/cgroup.subtree_control", parent);
        if (n >= 0 && n < (int)sizeof(stc)) (void)write_str(stc, "+memory");
    }

    char procs[PATH_MAX];
    n = snprintf(procs, sizeof(procs), "%s/cgroup.procs", cg);
    if (n < 0 || n >= (int)sizeof(procs)) { fprintf(stderr, "path too long\n"); return -1; }
    char pidbuf[32]; snprintf(pidbuf, sizeof(pidbuf), "%d", getpid());
    if (write_str(procs, pidbuf) < 0) return -1;
    return 0;
}

/* Exact mirror of LTP alloc_pagecache() behavior */
static inline void alloc_pagecache_exact(const int fd, size_t size)
{
    char buf[BUFSIZ];
    size_t i;

    if (lseek(fd, 0, SEEK_END) == (off_t)-1) {
        perror("lseek"); exit(1);
    }
    for (i = 0; i < size; i += sizeof(buf)) {
        ssize_t w = write(fd, buf, sizeof(buf));
        if (w < 0) { perror("write"); exit(1); }
        if ((size_t)w != sizeof(buf)) { fprintf(stderr, "short write\n"); exit(1); }
    }
}

static bool approx_equal(long a, long b, double tol_frac) {
    long diff = labs(a - b);
    long lim = (long)((double)b * tol_frac);
    return diff <= lim;
}

int main(int argc, char **argv) {
    const char *mnt = (argc >= 2) ? argv[1] : ".";
    char cg[PATH_MAX];
    if (make_memcg_child(cg, sizeof(cg)) < 0) {
        fprintf(stderr, "Failed to setup memcg (need root? cgroup v2?)\n");
        return 1;
    }
    printf("Created cg %s\n", cg);

    char p_current[PATH_MAX], p_stat[PATH_MAX];
    int n = snprintf(p_current, sizeof(p_current), "%s/memory.current", cg);
    if (n < 0 || n >= (int)sizeof(p_current)) { fprintf(stderr, "path too long\n"); return 1; }
    n = snprintf(p_stat, sizeof(p_stat), "%s/memory.stat", cg);
    if (n < 0 || n >= (int)sizeof(p_stat)) { fprintf(stderr, "path too long\n"); return 1; }

    char filepath[PATH_MAX];
    n = snprintf(filepath, sizeof(filepath), "%s/tmpfile", mnt);
    if (n < 0 || n >= (int)sizeof(filepath)) { fprintf(stderr, "file path too long\n"); return 1; }

    int fd = open(filepath, O_RDWR | O_CREAT | O_TRUNC, 0600);
    if (fd < 0) { perror("open tmpfile"); return 1; }

    long current = read_long_file(p_current);
    printf("Created temp file: memory.current=%ld\n", current);

    const size_t size = 50UL * 1024 * 1024; // 50 MiB
    alloc_pagecache_exact(fd, size);

    // No fsyncs; small wait to reduce XFS timing flakiness
    //fsync(fd);
    //usleep(2200000);

    current = read_long_file(p_current);
    long file_bytes = read_stat_field_bytes(p_stat, "file");
    printf("After alloc: memory.current=%ld, memory.stat.file=%ld, size=%zu\n",
           current, file_bytes, size);

    bool ge_size = (current >= (long)size);
    bool file_pos = (file_bytes > 0);

    // Approximate LTP's values_close(..., file_to_all_error) with 5% tolerance
    double tol = 0.05;
    bool approx = approx_equal(current, file_bytes, tol);

    printf("Checks: current>=size=%s, file>0=%s, current~=file(%.0f%%)=%s\n",
           ge_size ? "OK" : "FAIL",
           file_pos ? "OK" : "FAIL",
           tol * 100,
           approx ? "OK" : "FAIL");

    close(fd);
    return (ge_size && file_pos && approx) ? 0 : 2;
}

Build: gcc -O2 -Wall pagecache_50m_demo.c -o pagecache_50m_demo

Test runner (run.sh):

#!/usr/bin/env bash
set -euo pipefail

# Config (overridable via env)
SIZE_MB="${SIZE_MB:-500}"
IMG="${IMG:-/var/tmp/pagecache_demo.img}"
MNT="${MNT:-/mnt/pagecache_demo}"
DEMO_BIN="${DEMO_BIN:-./pagecache_50m_demo}"
STATE="${STATE:-/var/tmp/pagecache_demo.state}" # stores LOOP + FS type

usage() {
  echo "Usage:"
  echo "  sudo $0 --ext4         Prepare ext4 loopback FS and mount it at \$MNT"
  echo "  sudo $0 --xfs          Prepare xfs  loopback FS and mount it at \$MNT"
  echo "  sudo $0 --btrfs        Prepare btrfs loopback FS and mount it at \$MNT"
  echo "  sudo $0 --tmpfs        Prepare tmpfs mount at \$MNT (no image/loop)"
  echo "  sudo $0 --run          Run demo on mounted \$MNT"
  echo "  sudo $0 --cleanup      Unmount and remove loop/image/state"
  echo
  echo "Env overrides:"
  echo "  SIZE_MB (default 256), IMG, MNT, DEMO_BIN, STATE"
}

need_root() {
  if [[ ${EUID:-$(id -u)} -ne 0 ]]; then
    echo "Please run as root (sudo)"
    exit 1
  fi
}

have_cmd() { command -v "$1" > /dev/null 2>&1; }

save_state() {
  local loop="${1:-}" fstype="$2"
  printf 'LOOP=%q\nFSTYPE=%q\nIMG=%q\nMNT=%q\n' "$loop" "$fstype" "$IMG" "$MNT" > "$STATE"
}

load_state() {
  if [[ ! -f "$STATE" ]]; then
    echo "State not found: $STATE. Run --ext4/--xfs/--btrfs/--tmpfs first."
    exit 1
  fi
  # shellcheck disable=SC1090
  . "$STATE"
  : "${FSTYPE:?}" "${IMG:?}" "${MNT:?}"
  # LOOP may be empty for tmpfs
}

cleanup_mount() {
  set +e
  if mountpoint -q "$MNT"; then
    umount "$MNT" || umount -l "$MNT"
  fi
  if [[ -n "${LOOP:-}" ]] && [[ -b "${LOOP:-}" ]]; then
    losetup -d "$LOOP" 2> /dev/null || true
  else
    # Detach any loop using IMG as fallback
    if [[ -f "$IMG" ]]; then
      if losetup -j "$IMG" | grep -q .; then
        losetup -j "$IMG" | awk -F: '{print $1}' | xargs -r -n1 losetup -d
      fi
    fi
  fi
  rmdir "$MNT" 2> /dev/null || true
  set -e
}

cleanup_all() {
  echo "[*] Cleaning up..."
  if [[ -f "$STATE" ]]; then
    load_state || true
  fi
  cleanup_mount
  # For tmpfs there is no image; for others remove image
  if [[ "${FSTYPE:-}" != "tmpfs" ]]; then
    rm -f "$IMG"
  fi
  rm -f "$STATE"
  rmdir /sys/fs/cgroup/memcg_demo_* || true
  echo "[*] Done."
}

make_image() {
  echo "[*] Creating sparse image: $IMG (${SIZE_MB} MiB)"
  dd if=/dev/zero of="$IMG" bs=1M count=0 seek="$SIZE_MB" status=none
}

attach_loop() {
  # stdout returns loop device path only
  losetup -fP --show "$IMG"
}

ensure_loop_ready() {
  local dev="$1"
  if [[ -z "$dev" ]]; then
    echo "Failed to get loop device for $IMG"
    exit 1
  fi
  # udev settle
  for _ in {1..10}; do
    [[ -b "$dev" ]] && return 0
    sleep 0.1
  done
  echo "Loop device is not a block device: $dev"
  exit 1
}

mkfs_ext4() {
  have_cmd mkfs.ext4 || {
    echo "mkfs.ext4 not found"
    exit 1
  }
  echo "[*] Making ext4 on $1"
  mkfs.ext4 -F -q "$1"
}

mkfs_xfs() {
  have_cmd mkfs.xfs || {
    echo "mkfs.xfs not found (install xfsprogs)"
    exit 1
  }
  echo "[*] Making xfs on $1"
  mkfs.xfs -f -q "$1"
}

mkfs_btrfs() {
  have_cmd mkfs.btrfs || {
    echo "mkfs.btrfs not found (install btrfs-progs)"
    exit 1
  }
  echo "[*] Making btrfs on $1"
  mkfs.btrfs -f -q "$1"
}

mount_fs_dev() {
  mkdir -p "$MNT"
  echo "[*] Mounting $1 at $MNT"
  mount "$1" "$MNT"
  df -h "$MNT" || true
}

prepare_fs_loop() {
  need_root
  local fstype="$1" # ext4 | xfs | btrfs

  rm -f "$STATE"
  if mountpoint -q "$MNT" || losetup -j "$IMG" | grep -q . || [[ -f "$IMG" ]]; then
    echo "[*] Previous environment detected, cleaning first..."
    cleanup_all
  fi

  make_image
  local loop
  loop="$(attach_loop)"
  ensure_loop_ready "$loop"

  case "$fstype" in
    ext4) mkfs_ext4 "$loop" ;;
    xfs) mkfs_xfs "$loop" ;;
    btrfs) mkfs_btrfs "$loop" ;;
    *)
      echo "Unknown fs: $fstype"
      exit 1
      ;;
  esac

  mount_fs_dev "$loop"
  save_state "$loop" "$fstype"
  echo "[*] Prepared $fstype at $MNT (loop=$loop)"
}

prepare_tmpfs() {
  need_root
  rm -f "$STATE"
  if mountpoint -q "$MNT"; then
    echo "[*] Unmounting previous $MNT..."
    umount "$MNT" || umount -l "$MNT"
  fi
  mkdir -p "$MNT"
  echo "[*] Mounting tmpfs at $MNT (size=${SIZE_MB}m)"
  mount -t tmpfs -o "size=${SIZE_MB}m" tmpfs "$MNT"
  df -h "$MNT" || true
  save_state "" "tmpfs"
  echo "[*] Prepared tmpfs at $MNT"
}

run_demo() {
  need_root
  load_state
  if ! mountpoint -q "$MNT"; then
    echo "Mount point not mounted: $MNT. Did you run --$FSTYPE first?"
    exit 1
  fi
  if [[ ! -x "$DEMO_BIN" ]]; then
    echo "Demo binary not found or not executable: $DEMO_BIN"
    exit 1
  fi
  echo "[*] Running demo on $MNT: $DEMO_BIN $MNT"
  "$DEMO_BIN" "$MNT"
}

case "${1:-}" in
  --ext4) prepare_fs_loop ext4 ;;
  --xfs) prepare_fs_loop xfs ;;
  --btrfs) prepare_fs_loop btrfs ;;
  --tmpfs) prepare_tmpfs ;;
  --run) run_demo ;;
  --cleanup) cleanup_all ;;
  *)
    usage
    exit 1
    ;;
esac


  reply	other threads:[~2025-11-06  3:30 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-05  7:49 Leon Huang Fu
2025-11-05  8:19 ` Michal Hocko
2025-11-05  8:39   ` Lance Yang
2025-11-05  8:51     ` Leon Huang Fu
2025-11-06  1:19 ` Shakeel Butt
2025-11-06  3:30   ` Leon Huang Fu [this message]
2025-11-06  5:35     ` JP Kobryn
2025-11-06  6:42       ` Leon Huang Fu
2025-11-06 23:55     ` Shakeel Butt
2025-11-10  6:37       ` Leon Huang Fu
2025-11-10 20:19         ` Yosry Ahmed
2025-11-06 17:02 ` JP Kobryn
2025-11-10  6:20   ` Leon Huang Fu
2025-11-10 19:24     ` JP Kobryn

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251106033045.41607-1-leon.huangfu@shopee.com \
    --to=leon.huangfu@shopee.com \
    --cc=akpm@linux-foundation.org \
    --cc=cgroups@vger.kernel.org \
    --cc=corbet@lwn.net \
    --cc=hannes@cmpxchg.org \
    --cc=inwardvessel@gmail.com \
    --cc=jack@suse.cz \
    --cc=joel.granados@kernel.org \
    --cc=kyle.meyer@hpe.com \
    --cc=lance.yang@linux.dev \
    --cc=laoar.shao@gmail.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mclapinski@google.com \
    --cc=mhocko@kernel.org \
    --cc=muchun.song@linux.dev \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeel.butt@linux.dev \
    --cc=yosry.ahmed@linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox