* [PATCH 1/3] cgroup, binfmt_elf: Add hwcap masks to the misc controller
2025-12-05 0:58 [PATCH 0/3] cgroup/misc: Add hwcap masks to the misc controller Andrei Vagin
@ 2025-12-05 0:58 ` Andrei Vagin
2025-12-05 3:40 ` Kees Cook
2025-12-05 10:10 ` Chen Ridong
2025-12-05 0:58 ` Andrei Vagin
` (3 subsequent siblings)
4 siblings, 2 replies; 11+ messages in thread
From: Andrei Vagin @ 2025-12-05 0:58 UTC (permalink / raw)
To: Kees Cook
Cc: linux-kernel, linux-fsdevel, linux-mm, cgroups, criu, Tejun Heo,
Johannes Weiner, Michal Koutný,
Vipin Sharma, Jonathan Corbet, Andrei Vagin
Add an interface to the misc cgroup controller that allows masking out
hardware capabilities (AT_HWCAP) reported to user-space processes. This
provides a mechanism to restrict the features a containerized
application can see.
The new "misc.mask" cgroup file allows users to specify masks for
AT_HWCAP, AT_HWCAP2, AT_HWCAP3, and AT_HWCAP4.
Reading "misc.mask" shows, for each mask type, the mask set on the cgroup
itself followed by the effective mask, which is the bitwise OR of the
masks of the current cgroup and all its ancestors.
Signed-off-by: Andrei Vagin <avagin@google.com>
---
fs/binfmt_elf.c | 24 +++++--
include/linux/misc_cgroup.h | 25 +++++++
kernel/cgroup/misc.c | 126 ++++++++++++++++++++++++++++++++++++
3 files changed, 171 insertions(+), 4 deletions(-)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 3eb734c192e9..59137784e81d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -47,6 +47,7 @@
#include <linux/dax.h>
#include <linux/uaccess.h>
#include <uapi/linux/rseq.h>
+#include <linux/misc_cgroup.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -182,6 +183,21 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
int ei_index;
const struct cred *cred = current_cred();
struct vm_area_struct *vma;
+ struct misc_cg *misc_cg;
+ u64 hwcap_mask[4] = {0, 0, 0, 0};
+
+ misc_cg = get_current_misc_cg();
+ misc_cg_get_mask(MISC_CG_MASK_HWCAP, misc_cg, &hwcap_mask[0]);
+#ifdef ELF_HWCAP2
+ misc_cg_get_mask(MISC_CG_MASK_HWCAP2, misc_cg, &hwcap_mask[1]);
+#endif
+#ifdef ELF_HWCAP3
+ misc_cg_get_mask(MISC_CG_MASK_HWCAP3, misc_cg, &hwcap_mask[2]);
+#endif
+#ifdef ELF_HWCAP4
+ misc_cg_get_mask(MISC_CG_MASK_HWCAP4, misc_cg, &hwcap_mask[3]);
+#endif
+ put_misc_cg(misc_cg);
/*
* In some cases (e.g. Hyper-Threading), we want to avoid L1
@@ -246,7 +262,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
*/
ARCH_DLINFO;
#endif
- NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
+ NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP & ~hwcap_mask[0]);
NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
NEW_AUX_ENT(AT_PHDR, phdr_addr);
@@ -264,13 +280,13 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
NEW_AUX_ENT(AT_SECURE, bprm->secureexec);
NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
#ifdef ELF_HWCAP2
- NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
+ NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2 & ~hwcap_mask[1]);
#endif
#ifdef ELF_HWCAP3
- NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3);
+ NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3 & ~hwcap_mask[2]);
#endif
#ifdef ELF_HWCAP4
- NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4);
+ NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4 & ~hwcap_mask[3]);
#endif
NEW_AUX_ENT(AT_EXECFN, bprm->exec);
if (k_platform) {
diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
index 0cb36a3ffc47..cff830c238fb 100644
--- a/include/linux/misc_cgroup.h
+++ b/include/linux/misc_cgroup.h
@@ -8,6 +8,8 @@
#ifndef _MISC_CGROUP_H_
#define _MISC_CGROUP_H_
+#include <linux/elf.h>
+
/**
* enum misc_res_type - Types of misc cgroup entries supported by the host.
*/
@@ -26,6 +28,20 @@ enum misc_res_type {
MISC_CG_RES_TYPES
};
+enum misc_mask_type {
+ MISC_CG_MASK_HWCAP,
+#ifdef ELF_HWCAP2
+ MISC_CG_MASK_HWCAP2,
+#endif
+#ifdef ELF_HWCAP3
+ MISC_CG_MASK_HWCAP3,
+#endif
+#ifdef ELF_HWCAP4
+ MISC_CG_MASK_HWCAP4,
+#endif
+ MISC_CG_MASK_TYPES
+};
+
struct misc_cg;
#ifdef CONFIG_CGROUP_MISC
@@ -62,12 +78,15 @@ struct misc_cg {
struct cgroup_file events_local_file;
struct misc_res res[MISC_CG_RES_TYPES];
+ u64 mask[MISC_CG_MASK_TYPES];
};
int misc_cg_set_capacity(enum misc_res_type type, u64 capacity);
int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount);
void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount);
+int misc_cg_get_mask(enum misc_mask_type type, struct misc_cg *cg, u64 *pmask);
+
/**
* css_misc() - Get misc cgroup from the css.
* @css: cgroup subsys state object.
@@ -134,5 +153,11 @@ static inline void put_misc_cg(struct misc_cg *cg)
{
}
+static inline int misc_cg_get_mask(enum misc_mask_type type, struct misc_cg *cg, u64 *pmask)
+{
+ *pmask = 0;
+ return 0;
+}
+
#endif /* CONFIG_CGROUP_MISC */
#endif /* _MISC_CGROUP_H_ */
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 6a01d91ea4cb..d1386d86060f 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -30,6 +30,19 @@ static const char *const misc_res_name[] = {
#endif
};
+static const char *const misc_mask_name[] = {
+ "AT_HWCAP",
+#ifdef ELF_HWCAP2
+ "AT_HWCAP2",
+#endif
+#ifdef ELF_HWCAP3
+ "AT_HWCAP3",
+#endif
+#ifdef ELF_HWCAP4
+ "AT_HWCAP4",
+#endif
+};
+
/* Root misc cgroup */
static struct misc_cg root_cg;
@@ -71,6 +84,11 @@ static inline bool valid_type(enum misc_res_type type)
return type >= 0 && type < MISC_CG_RES_TYPES;
}
+static inline bool valid_mask_type(enum misc_mask_type type)
+{
+ return type >= 0 && type < MISC_CG_MASK_TYPES;
+}
+
/**
* misc_cg_set_capacity() - Set the capacity of the misc cgroup res.
* @type: Type of the misc res.
@@ -391,6 +409,109 @@ static int misc_events_local_show(struct seq_file *sf, void *v)
return __misc_events_show(sf, true);
}
+/**
+ * misc_cg_get_mask() - Get the mask of the specified type.
+ * @type: The misc mask type.
+ * @cg: The misc cgroup.
+ * @pmask: Pointer to the resulting mask.
+ *
+ * This function calculates the effective mask for a given cgroup by walking up
+ * the hierarchy and ORing the masks from all parent cgroups. The final result
+ * is stored in the location pointed to by @pmask.
+ *
+ * Context: Any context.
+ * Return: 0 on success, -EINVAL if @type is invalid.
+ */
+int misc_cg_get_mask(enum misc_mask_type type, struct misc_cg *cg, u64 *pmask)
+{
+ struct misc_cg *i;
+ u64 mask = 0;
+
+ if (!(valid_mask_type(type)))
+ return -EINVAL;
+
+ for (i = cg; i; i = parent_misc(i))
+ mask |= READ_ONCE(i->mask[type]);
+
+ *pmask = mask;
+ return 0;
+}
+
+/**
+ * misc_cg_mask_show() - Show the misc cgroup masks.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_mask_show(struct seq_file *sf, void *v)
+{
+ struct misc_cg *cg = css_misc(seq_css(sf));
+ int i;
+
+ for (i = 0; i < MISC_CG_MASK_TYPES; i++) {
+ u64 rval, val = READ_ONCE(cg->mask[i]);
+
+ misc_cg_get_mask(i, cg, &rval);
+ seq_printf(sf, "%s\t%#016llx\t%#016llx\n", misc_mask_name[i], val, rval);
+ }
+
+ return 0;
+}
+
+/**
+ * misc_cg_mask_write() - Update the mask of the specified type.
+ * @of: Handler for the file.
+ * @buf: The buffer containing the user's input.
+ * @nbytes: The number of bytes in @buf.
+ * @off: The offset in the file.
+ *
+ * This function parses a user-provided string to update a mask.
+ * The expected format is "<mask_name> <value>", for example:
+ *
+ * echo "AT_HWCAP 0xf00" > misc.mask
+ *
+ * Context: Process context.
+ * Return: The number of bytes processed on success, or a negative error code
+ * on failure.
+ */
+static ssize_t misc_cg_mask_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct misc_cg *cg;
+ u64 max;
+ int ret = 0, i;
+ enum misc_mask_type type = MISC_CG_MASK_TYPES;
+ char *token;
+
+ buf = strstrip(buf);
+ token = strsep(&buf, " ");
+
+ if (!token || !buf)
+ return -EINVAL;
+
+ for (i = 0; i < MISC_CG_MASK_TYPES; i++) {
+ if (!strcmp(misc_mask_name[i], token)) {
+ type = i;
+ break;
+ }
+ }
+
+ if (type == MISC_CG_MASK_TYPES)
+ return -EINVAL;
+
+ ret = kstrtou64(buf, 0, &max);
+ if (ret)
+ return ret;
+
+ cg = css_misc(of_css(of));
+
+ WRITE_ONCE(cg->mask[type], max);
+
+ return nbytes;
+}
+
/* Misc cgroup interface files */
static struct cftype misc_cg_files[] = {
{
@@ -424,6 +545,11 @@ static struct cftype misc_cg_files[] = {
.file_offset = offsetof(struct misc_cg, events_local_file),
.seq_show = misc_events_local_show,
},
+ {
+ .name = "mask",
+ .write = misc_cg_mask_write,
+ .seq_show = misc_cg_mask_show,
+ },
{}
};
--
2.52.0.223.gf5cc29aaa4-goog
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/3] cgroup, binfmt_elf: Add hwcap masks to the misc controller
2025-12-05 0:58 ` [PATCH 1/3] cgroup, binfmt_elf: " Andrei Vagin
@ 2025-12-05 3:40 ` Kees Cook
2025-12-05 10:10 ` Chen Ridong
1 sibling, 0 replies; 11+ messages in thread
From: Kees Cook @ 2025-12-05 3:40 UTC (permalink / raw)
To: Andrei Vagin
Cc: linux-kernel, linux-fsdevel, linux-mm, cgroups, criu, Tejun Heo,
Johannes Weiner, Michal Koutný,
Vipin Sharma, Jonathan Corbet
On Fri, Dec 05, 2025 at 12:58:29AM +0000, Andrei Vagin wrote:
> Add an interface to the misc cgroup controller that allows masking out
> hardware capabilities (AT_HWCAP) reported to user-space processes. This
> provides a mechanism to restrict the features a containerized
> application can see.
>
> The new "misc.mask" cgroup file allows users to specify masks for
> AT_HWCAP, AT_HWCAP2, AT_HWCAP3, and AT_HWCAP4.
>
> The output of "misc.mask" is extended to display the effective mask,
> which is a combination of the masks from the current cgroup and all its
> ancestors.
>
> Signed-off-by: Andrei Vagin <avagin@google.com>
> ---
> fs/binfmt_elf.c | 24 +++++--
> include/linux/misc_cgroup.h | 25 +++++++
> kernel/cgroup/misc.c | 126 ++++++++++++++++++++++++++++++++++++
> 3 files changed, 171 insertions(+), 4 deletions(-)
>
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 3eb734c192e9..59137784e81d 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -47,6 +47,7 @@
> #include <linux/dax.h>
> #include <linux/uaccess.h>
> #include <uapi/linux/rseq.h>
> +#include <linux/misc_cgroup.h>
> #include <asm/param.h>
> #include <asm/page.h>
>
> @@ -182,6 +183,21 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
> int ei_index;
> const struct cred *cred = current_cred();
> struct vm_area_struct *vma;
> + struct misc_cg *misc_cg;
> + u64 hwcap_mask[4] = {0, 0, 0, 0};
> +
> + misc_cg = get_current_misc_cg();
> + misc_cg_get_mask(MISC_CG_MASK_HWCAP, misc_cg, &hwcap_mask[0]);
> +#ifdef ELF_HWCAP2
> + misc_cg_get_mask(MISC_CG_MASK_HWCAP2, misc_cg, &hwcap_mask[1]);
> +#endif
> +#ifdef ELF_HWCAP3
> + misc_cg_get_mask(MISC_CG_MASK_HWCAP3, misc_cg, &hwcap_mask[2]);
> +#endif
> +#ifdef ELF_HWCAP4
> + misc_cg_get_mask(MISC_CG_MASK_HWCAP4, misc_cg, &hwcap_mask[3]);
> +#endif
Can we avoid having the open-coded 4, 0, 1, 2, 3 where these are used?
I imagine it also doesn't need to be a 4 element array if ELF_HWCAP4
isn't defined, etc?
--
Kees Cook
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 1/3] cgroup, binfmt_elf: Add hwcap masks to the misc controller
2025-12-05 0:58 ` [PATCH 1/3] cgroup, binfmt_elf: " Andrei Vagin
2025-12-05 3:40 ` Kees Cook
@ 2025-12-05 10:10 ` Chen Ridong
1 sibling, 0 replies; 11+ messages in thread
From: Chen Ridong @ 2025-12-05 10:10 UTC (permalink / raw)
To: Andrei Vagin, Kees Cook
Cc: linux-kernel, linux-fsdevel, linux-mm, cgroups, criu, Tejun Heo,
Johannes Weiner, Michal Koutný,
Vipin Sharma, Jonathan Corbet
On 2025/12/5 8:58, Andrei Vagin wrote:
> Add an interface to the misc cgroup controller that allows masking out
> hardware capabilities (AT_HWCAP) reported to user-space processes. This
> provides a mechanism to restrict the features a containerized
> application can see.
>
> The new "misc.mask" cgroup file allows users to specify masks for
> AT_HWCAP, AT_HWCAP2, AT_HWCAP3, and AT_HWCAP4.
>
> The output of "misc.mask" is extended to display the effective mask,
> which is a combination of the masks from the current cgroup and all its
> ancestors.
>
> Signed-off-by: Andrei Vagin <avagin@google.com>
> ---
> fs/binfmt_elf.c | 24 +++++--
> include/linux/misc_cgroup.h | 25 +++++++
> kernel/cgroup/misc.c | 126 ++++++++++++++++++++++++++++++++++++
> 3 files changed, 171 insertions(+), 4 deletions(-)
>
> diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
> index 3eb734c192e9..59137784e81d 100644
> --- a/fs/binfmt_elf.c
> +++ b/fs/binfmt_elf.c
> @@ -47,6 +47,7 @@
> #include <linux/dax.h>
> #include <linux/uaccess.h>
> #include <uapi/linux/rseq.h>
> +#include <linux/misc_cgroup.h>
> #include <asm/param.h>
> #include <asm/page.h>
>
> @@ -182,6 +183,21 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
> int ei_index;
> const struct cred *cred = current_cred();
> struct vm_area_struct *vma;
> + struct misc_cg *misc_cg;
> + u64 hwcap_mask[4] = {0, 0, 0, 0};
> +
> + misc_cg = get_current_misc_cg();
> + misc_cg_get_mask(MISC_CG_MASK_HWCAP, misc_cg, &hwcap_mask[0]);
> +#ifdef ELF_HWCAP2
> + misc_cg_get_mask(MISC_CG_MASK_HWCAP2, misc_cg, &hwcap_mask[1]);
> +#endif
> +#ifdef ELF_HWCAP3
> + misc_cg_get_mask(MISC_CG_MASK_HWCAP3, misc_cg, &hwcap_mask[2]);
> +#endif
> +#ifdef ELF_HWCAP4
> + misc_cg_get_mask(MISC_CG_MASK_HWCAP4, misc_cg, &hwcap_mask[3]);
> +#endif
> + put_misc_cg(misc_cg);
>
> /*
> * In some cases (e.g. Hyper-Threading), we want to avoid L1
> @@ -246,7 +262,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
> */
> ARCH_DLINFO;
> #endif
> - NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
> + NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP & ~hwcap_mask[0]);
> NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
> NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
> NEW_AUX_ENT(AT_PHDR, phdr_addr);
> @@ -264,13 +280,13 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
> NEW_AUX_ENT(AT_SECURE, bprm->secureexec);
> NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
> #ifdef ELF_HWCAP2
> - NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
> + NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2 & ~hwcap_mask[1]);
> #endif
> #ifdef ELF_HWCAP3
> - NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3);
> + NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3 & ~hwcap_mask[2]);
> #endif
> #ifdef ELF_HWCAP4
> - NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4);
> + NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4 & ~hwcap_mask[3]);
> #endif
> NEW_AUX_ENT(AT_EXECFN, bprm->exec);
> if (k_platform) {
> diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
> index 0cb36a3ffc47..cff830c238fb 100644
> --- a/include/linux/misc_cgroup.h
> +++ b/include/linux/misc_cgroup.h
> @@ -8,6 +8,8 @@
> #ifndef _MISC_CGROUP_H_
> #define _MISC_CGROUP_H_
>
> +#include <linux/elf.h>
> +
> /**
> * enum misc_res_type - Types of misc cgroup entries supported by the host.
> */
> @@ -26,6 +28,20 @@ enum misc_res_type {
> MISC_CG_RES_TYPES
> };
>
> +enum misc_mask_type {
> + MISC_CG_MASK_HWCAP,
> +#ifdef ELF_HWCAP2
> + MISC_CG_MASK_HWCAP2,
> +#endif
> +#ifdef ELF_HWCAP3
> + MISC_CG_MASK_HWCAP3,
> +#endif
> +#ifdef ELF_HWCAP4
> + MISC_CG_MASK_HWCAP4,
> +#endif
> + MISC_CG_MASK_TYPES
> +};
> +
> struct misc_cg;
>
> #ifdef CONFIG_CGROUP_MISC
> @@ -62,12 +78,15 @@ struct misc_cg {
> struct cgroup_file events_local_file;
>
> struct misc_res res[MISC_CG_RES_TYPES];
> + u64 mask[MISC_CG_MASK_TYPES];
> };
>
> int misc_cg_set_capacity(enum misc_res_type type, u64 capacity);
> int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount);
> void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount);
>
> +int misc_cg_get_mask(enum misc_mask_type type, struct misc_cg *cg, u64 *pmask);
> +
> /**
> * css_misc() - Get misc cgroup from the css.
> * @css: cgroup subsys state object.
> @@ -134,5 +153,11 @@ static inline void put_misc_cg(struct misc_cg *cg)
> {
> }
>
> +static inline int misc_cg_get_mask(enum misc_mask_type type, struct misc_cg *cg, u64 *pmask)
> +{
> + *pmask = 0;
> + return 0;
> +}
> +
> #endif /* CONFIG_CGROUP_MISC */
> #endif /* _MISC_CGROUP_H_ */
> diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
> index 6a01d91ea4cb..d1386d86060f 100644
> --- a/kernel/cgroup/misc.c
> +++ b/kernel/cgroup/misc.c
> @@ -30,6 +30,19 @@ static const char *const misc_res_name[] = {
> #endif
> };
>
> +static const char *const misc_mask_name[] = {
> + "AT_HWCAP",
> +#ifdef ELF_HWCAP2
> + "AT_HWCAP2",
> +#endif
> +#ifdef ELF_HWCAP3
> + "AT_HWCAP3",
> +#endif
> +#ifdef ELF_HWCAP4
> + "AT_HWCAP4",
> +#endif
> +};
> +
> /* Root misc cgroup */
> static struct misc_cg root_cg;
>
> @@ -71,6 +84,11 @@ static inline bool valid_type(enum misc_res_type type)
> return type >= 0 && type < MISC_CG_RES_TYPES;
> }
>
> +static inline bool valid_mask_type(enum misc_mask_type type)
> +{
> + return type >= 0 && type < MISC_CG_MASK_TYPES;
> +}
> +
> /**
> * misc_cg_set_capacity() - Set the capacity of the misc cgroup res.
> * @type: Type of the misc res.
> @@ -391,6 +409,109 @@ static int misc_events_local_show(struct seq_file *sf, void *v)
> return __misc_events_show(sf, true);
> }
>
> +/**
> + * misc_cg_get_mask() - Get the mask of the specified type.
> + * @type: The misc mask type.
> + * @cg: The misc cgroup.
> + * @pmask: Pointer to the resulting mask.
> + *
> + * This function calculates the effective mask for a given cgroup by walking up
> + * the hierarchy and ORing the masks from all parent cgroupfs. The final result
> + * is stored in the location pointed to by @pmask.
> + *
> + * Context: Any context.
> + * Return: 0 on success, -EINVAL if @type is invalid.
> + */
> +int misc_cg_get_mask(enum misc_mask_type type, struct misc_cg *cg, u64 *pmask)
> +{
> + struct misc_cg *i;
> + u64 mask = 0;
> +
> + if (!(valid_mask_type(type)))
> + return -EINVAL;
> +
> + for (i = cg; i; i = parent_misc(i))
> + mask |= READ_ONCE(i->mask[type]);
> +
> + *pmask = mask;
> + return 0;
> +}
> +
> +/**
> + * misc_cg_mask_show() - Show the misc cgroup masks.
> + * @sf: Interface file
> + * @v: Arguments passed
> + *
> + * Context: Any context.
> + * Return: 0 to denote successful print.
> + */
> +static int misc_cg_mask_show(struct seq_file *sf, void *v)
> +{
> + struct misc_cg *cg = css_misc(seq_css(sf));
> + int i;
> +
> + for (i = 0; i < MISC_CG_MASK_TYPES; i++) {
> + u64 rval, val = READ_ONCE(cg->mask[i]);
> +
> + misc_cg_get_mask(i, cg, &rval);
> + seq_printf(sf, "%s\t%#016llx\t%#016llx\n", misc_mask_name[i], val, rval);
> + }
> +
> + return 0;
> +}
> +
I'm concerned about the performance impact of the bottom-up traversal in deeply nested cgroup
hierarchies. Could this approach introduce noticeable latency in such scenarios?
> +/**
> + * misc_cg_mask_write() - Update the mask of the specified type.
> + * @of: Handler for the file.
> + * @buf: The buffer containing the user's input.
> + * @nbytes: The number of bytes in @buf.
> + * @off: The offset in the file.
> + *
> + * This function parses a user-provided string to update a mask.
> + * The expected format is "<mask_name> <value>", for example:
> + *
> + * echo "AT_HWCAP 0xf00" > misc.mask
> + *
> + * Context: Process context.
> + * Return: The number of bytes processed on success, or a negative error code
> + * on failure.
> + */
> +static ssize_t misc_cg_mask_write(struct kernfs_open_file *of, char *buf,
> + size_t nbytes, loff_t off)
> +{
> + struct misc_cg *cg;
> + u64 max;
> + int ret = 0, i;
> + enum misc_mask_type type = MISC_CG_MASK_TYPES;
> + char *token;
> +
> + buf = strstrip(buf);
> + token = strsep(&buf, " ");
> +
> + if (!token || !buf)
> + return -EINVAL;
> +
> + for (i = 0; i < MISC_CG_MASK_TYPES; i++) {
> + if (!strcmp(misc_mask_name[i], token)) {
> + type = i;
> + break;
> + }
> + }
> +
> + if (type == MISC_CG_MASK_TYPES)
> + return -EINVAL;
> +
> + ret = kstrtou64(buf, 0, &max);
> + if (ret)
> + return ret;
> +
> + cg = css_misc(of_css(of));
> +
> + WRITE_ONCE(cg->mask[type], max);
> +
> + return nbytes;
> +}
> +
> /* Misc cgroup interface files */
> static struct cftype misc_cg_files[] = {
> {
> @@ -424,6 +545,11 @@ static struct cftype misc_cg_files[] = {
> .file_offset = offsetof(struct misc_cg, events_local_file),
> .seq_show = misc_events_local_show,
> },
> + {
> + .name = "mask",
> + .write = misc_cg_mask_write,
> + .seq_show = misc_cg_mask_show,
> + },
> {}
> };
>
--
Best regards,
Ridong
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 1/3] cgroup, binfmt_elf: Add hwcap masks to the misc controller
2025-12-05 0:58 [PATCH 0/3] cgroup/misc: Add hwcap masks to the misc controller Andrei Vagin
2025-12-05 0:58 ` [PATCH 1/3] cgroup, binfmt_elf: " Andrei Vagin
@ 2025-12-05 0:58 ` Andrei Vagin
2025-12-05 0:58 ` [PATCH 2/3] selftests/cgroup: Add a test for the misc.mask cgroup interface Andrei Vagin
` (2 subsequent siblings)
4 siblings, 0 replies; 11+ messages in thread
From: Andrei Vagin @ 2025-12-05 0:58 UTC (permalink / raw)
To: Kees Cook
Cc: linux-kernel, linux-fsdevel, linux-mm, cgroups, criu, Tejun Heo,
Johannes Weiner, Michal Koutný,
Vipin Sharma, Jonathan Corbet, Andrei Vagin
Add an interface to the misc cgroup controller that allows masking out
hardware capabilities (AT_HWCAP) reported to user-space processes. This
provides a mechanism to restrict the features a containerized
application can see.
The new "misc.mask" cgroup file allows users to specify masks for
AT_HWCAP, AT_HWCAP2, AT_HWCAP3, and AT_HWCAP4.
Reading "misc.mask" shows, for each mask type, the mask set on the cgroup
itself followed by the effective mask, which is the bitwise OR of the
masks of the current cgroup and all its ancestors.
Signed-off-by: Andrei Vagin <avagin@google.com>
---
fs/binfmt_elf.c | 24 +++++--
include/linux/misc_cgroup.h | 25 +++++++
kernel/cgroup/misc.c | 126 ++++++++++++++++++++++++++++++++++++
3 files changed, 171 insertions(+), 4 deletions(-)
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 3eb734c192e9..59137784e81d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -47,6 +47,7 @@
#include <linux/dax.h>
#include <linux/uaccess.h>
#include <uapi/linux/rseq.h>
+#include <linux/misc_cgroup.h>
#include <asm/param.h>
#include <asm/page.h>
@@ -182,6 +183,21 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
int ei_index;
const struct cred *cred = current_cred();
struct vm_area_struct *vma;
+ struct misc_cg *misc_cg;
+ u64 hwcap_mask[4] = {0, 0, 0, 0};
+
+ misc_cg = get_current_misc_cg();
+ misc_cg_get_mask(MISC_CG_MASK_HWCAP, misc_cg, &hwcap_mask[0]);
+#ifdef ELF_HWCAP2
+ misc_cg_get_mask(MISC_CG_MASK_HWCAP2, misc_cg, &hwcap_mask[1]);
+#endif
+#ifdef ELF_HWCAP3
+ misc_cg_get_mask(MISC_CG_MASK_HWCAP3, misc_cg, &hwcap_mask[2]);
+#endif
+#ifdef ELF_HWCAP4
+ misc_cg_get_mask(MISC_CG_MASK_HWCAP4, misc_cg, &hwcap_mask[3]);
+#endif
+ put_misc_cg(misc_cg);
/*
* In some cases (e.g. Hyper-Threading), we want to avoid L1
@@ -246,7 +262,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
*/
ARCH_DLINFO;
#endif
- NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
+ NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP & ~hwcap_mask[0]);
NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
NEW_AUX_ENT(AT_PHDR, phdr_addr);
@@ -264,13 +280,13 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
NEW_AUX_ENT(AT_SECURE, bprm->secureexec);
NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes);
#ifdef ELF_HWCAP2
- NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2);
+ NEW_AUX_ENT(AT_HWCAP2, ELF_HWCAP2 & ~hwcap_mask[1]);
#endif
#ifdef ELF_HWCAP3
- NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3);
+ NEW_AUX_ENT(AT_HWCAP3, ELF_HWCAP3 & ~hwcap_mask[2]);
#endif
#ifdef ELF_HWCAP4
- NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4);
+ NEW_AUX_ENT(AT_HWCAP4, ELF_HWCAP4 & ~hwcap_mask[3]);
#endif
NEW_AUX_ENT(AT_EXECFN, bprm->exec);
if (k_platform) {
diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h
index 0cb36a3ffc47..cff830c238fb 100644
--- a/include/linux/misc_cgroup.h
+++ b/include/linux/misc_cgroup.h
@@ -8,6 +8,8 @@
#ifndef _MISC_CGROUP_H_
#define _MISC_CGROUP_H_
+#include <linux/elf.h>
+
/**
* enum misc_res_type - Types of misc cgroup entries supported by the host.
*/
@@ -26,6 +28,20 @@ enum misc_res_type {
MISC_CG_RES_TYPES
};
+enum misc_mask_type {
+ MISC_CG_MASK_HWCAP,
+#ifdef ELF_HWCAP2
+ MISC_CG_MASK_HWCAP2,
+#endif
+#ifdef ELF_HWCAP3
+ MISC_CG_MASK_HWCAP3,
+#endif
+#ifdef ELF_HWCAP4
+ MISC_CG_MASK_HWCAP4,
+#endif
+ MISC_CG_MASK_TYPES
+};
+
struct misc_cg;
#ifdef CONFIG_CGROUP_MISC
@@ -62,12 +78,15 @@ struct misc_cg {
struct cgroup_file events_local_file;
struct misc_res res[MISC_CG_RES_TYPES];
+ u64 mask[MISC_CG_MASK_TYPES];
};
int misc_cg_set_capacity(enum misc_res_type type, u64 capacity);
int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg, u64 amount);
void misc_cg_uncharge(enum misc_res_type type, struct misc_cg *cg, u64 amount);
+int misc_cg_get_mask(enum misc_mask_type type, struct misc_cg *cg, u64 *pmask);
+
/**
* css_misc() - Get misc cgroup from the css.
* @css: cgroup subsys state object.
@@ -134,5 +153,11 @@ static inline void put_misc_cg(struct misc_cg *cg)
{
}
+static inline int misc_cg_get_mask(enum misc_mask_type type, struct misc_cg *cg, u64 *pmask)
+{
+ *pmask = 0;
+ return 0;
+}
+
#endif /* CONFIG_CGROUP_MISC */
#endif /* _MISC_CGROUP_H_ */
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c
index 6a01d91ea4cb..d1386d86060f 100644
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -30,6 +30,19 @@ static const char *const misc_res_name[] = {
#endif
};
+static const char *const misc_mask_name[] = {
+ "AT_HWCAP",
+#ifdef ELF_HWCAP2
+ "AT_HWCAP2",
+#endif
+#ifdef ELF_HWCAP3
+ "AT_HWCAP3",
+#endif
+#ifdef ELF_HWCAP4
+ "AT_HWCAP4",
+#endif
+};
+
/* Root misc cgroup */
static struct misc_cg root_cg;
@@ -71,6 +84,11 @@ static inline bool valid_type(enum misc_res_type type)
return type >= 0 && type < MISC_CG_RES_TYPES;
}
+static inline bool valid_mask_type(enum misc_mask_type type)
+{
+ return type >= 0 && type < MISC_CG_MASK_TYPES;
+}
+
/**
* misc_cg_set_capacity() - Set the capacity of the misc cgroup res.
* @type: Type of the misc res.
@@ -391,6 +409,109 @@ static int misc_events_local_show(struct seq_file *sf, void *v)
return __misc_events_show(sf, true);
}
+/**
+ * misc_cg_get_mask() - Get the mask of the specified type.
+ * @type: The misc mask type.
+ * @cg: The misc cgroup.
+ * @pmask: Pointer to the resulting mask.
+ *
+ * This function calculates the effective mask for a given cgroup by walking up
+ * the hierarchy and ORing the masks from all parent cgroups. The final result
+ * is stored in the location pointed to by @pmask.
+ *
+ * Context: Any context.
+ * Return: 0 on success, -EINVAL if @type is invalid.
+ */
+int misc_cg_get_mask(enum misc_mask_type type, struct misc_cg *cg, u64 *pmask)
+{
+ struct misc_cg *i;
+ u64 mask = 0;
+
+ if (!(valid_mask_type(type)))
+ return -EINVAL;
+
+ for (i = cg; i; i = parent_misc(i))
+ mask |= READ_ONCE(i->mask[type]);
+
+ *pmask = mask;
+ return 0;
+}
+
+/**
+ * misc_cg_mask_show() - Show the misc cgroup masks.
+ * @sf: Interface file
+ * @v: Arguments passed
+ *
+ * Context: Any context.
+ * Return: 0 to denote successful print.
+ */
+static int misc_cg_mask_show(struct seq_file *sf, void *v)
+{
+ struct misc_cg *cg = css_misc(seq_css(sf));
+ int i;
+
+ for (i = 0; i < MISC_CG_MASK_TYPES; i++) {
+ u64 rval, val = READ_ONCE(cg->mask[i]);
+
+ misc_cg_get_mask(i, cg, &rval);
+ seq_printf(sf, "%s\t%#016llx\t%#016llx\n", misc_mask_name[i], val, rval);
+ }
+
+ return 0;
+}
+
+/**
+ * misc_cg_mask_write() - Update the mask of the specified type.
+ * @of: Handler for the file.
+ * @buf: The buffer containing the user's input.
+ * @nbytes: The number of bytes in @buf.
+ * @off: The offset in the file.
+ *
+ * This function parses a user-provided string to update a mask.
+ * The expected format is "<mask_name> <value>", for example:
+ *
+ * echo "AT_HWCAP 0xf00" > misc.mask
+ *
+ * Context: Process context.
+ * Return: The number of bytes processed on success, or a negative error code
+ * on failure.
+ */
+static ssize_t misc_cg_mask_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct misc_cg *cg;
+ u64 max;
+ int ret = 0, i;
+ enum misc_mask_type type = MISC_CG_MASK_TYPES;
+ char *token;
+
+ buf = strstrip(buf);
+ token = strsep(&buf, " ");
+
+ if (!token || !buf)
+ return -EINVAL;
+
+ for (i = 0; i < MISC_CG_MASK_TYPES; i++) {
+ if (!strcmp(misc_mask_name[i], token)) {
+ type = i;
+ break;
+ }
+ }
+
+ if (type == MISC_CG_MASK_TYPES)
+ return -EINVAL;
+
+ ret = kstrtou64(buf, 0, &max);
+ if (ret)
+ return ret;
+
+ cg = css_misc(of_css(of));
+
+ WRITE_ONCE(cg->mask[type], max);
+
+ return nbytes;
+}
+
/* Misc cgroup interface files */
static struct cftype misc_cg_files[] = {
{
@@ -424,6 +545,11 @@ static struct cftype misc_cg_files[] = {
.file_offset = offsetof(struct misc_cg, events_local_file),
.seq_show = misc_events_local_show,
},
+ {
+ .name = "mask",
+ .write = misc_cg_mask_write,
+ .seq_show = misc_cg_mask_show,
+ },
{}
};
--
2.52.0.223.gf5cc29aaa4-goog
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 2/3] selftests/cgroup: Add a test for the misc.mask cgroup interface
2025-12-05 0:58 [PATCH 0/3] cgroup/misc: Add hwcap masks to the misc controller Andrei Vagin
2025-12-05 0:58 ` [PATCH 1/3] cgroup, binfmt_elf: " Andrei Vagin
2025-12-05 0:58 ` Andrei Vagin
@ 2025-12-05 0:58 ` Andrei Vagin
2025-12-05 0:58 ` [PATCH 3/3] Documentation: cgroup-v2: Document misc.mask interface Andrei Vagin
2025-12-05 2:52 ` [PATCH 0/3] cgroup/misc: Add hwcap masks to the misc controller Chen Ridong
4 siblings, 0 replies; 11+ messages in thread
From: Andrei Vagin @ 2025-12-05 0:58 UTC (permalink / raw)
To: Kees Cook
Cc: linux-kernel, linux-fsdevel, linux-mm, cgroups, criu, Tejun Heo,
Johannes Weiner, Michal Koutný,
Vipin Sharma, Jonathan Corbet, Andrei Vagin
Add a selftest for the misc.mask cgroup interface. The test verifies
that the misc.mask file is present and has the correct default value,
that it is possible to write a new mask to the file, and that the mask is
inherited by sub-cgroups.
Signed-off-by: Andrei Vagin <avagin@google.com>
---
tools/testing/selftests/cgroup/.gitignore | 1 +
tools/testing/selftests/cgroup/Makefile | 2 +
tools/testing/selftests/cgroup/config | 1 +
tools/testing/selftests/cgroup/test_misc.c | 118 +++++++++++++++++++++
4 files changed, 122 insertions(+)
create mode 100644 tools/testing/selftests/cgroup/test_misc.c
diff --git a/tools/testing/selftests/cgroup/.gitignore b/tools/testing/selftests/cgroup/.gitignore
index 952e4448bf07..3ced02a3634b 100644
--- a/tools/testing/selftests/cgroup/.gitignore
+++ b/tools/testing/selftests/cgroup/.gitignore
@@ -7,6 +7,7 @@ test_hugetlb_memcg
test_kill
test_kmem
test_memcontrol
+test_misc
test_pids
test_zswap
wait_inotify
diff --git a/tools/testing/selftests/cgroup/Makefile b/tools/testing/selftests/cgroup/Makefile
index e01584c2189a..6e9e92f89d8a 100644
--- a/tools/testing/selftests/cgroup/Makefile
+++ b/tools/testing/selftests/cgroup/Makefile
@@ -15,6 +15,7 @@ TEST_GEN_PROGS += test_hugetlb_memcg
TEST_GEN_PROGS += test_kill
TEST_GEN_PROGS += test_kmem
TEST_GEN_PROGS += test_memcontrol
+TEST_GEN_PROGS += test_misc
TEST_GEN_PROGS += test_pids
TEST_GEN_PROGS += test_zswap
@@ -31,5 +32,6 @@ $(OUTPUT)/test_hugetlb_memcg: $(LIBCGROUP_O)
$(OUTPUT)/test_kill: $(LIBCGROUP_O)
$(OUTPUT)/test_kmem: $(LIBCGROUP_O)
$(OUTPUT)/test_memcontrol: $(LIBCGROUP_O)
+$(OUTPUT)/test_misc: $(LIBCGROUP_O)
$(OUTPUT)/test_pids: $(LIBCGROUP_O)
$(OUTPUT)/test_zswap: $(LIBCGROUP_O)
diff --git a/tools/testing/selftests/cgroup/config b/tools/testing/selftests/cgroup/config
index 39f979690dd3..9e3d03736f5a 100644
--- a/tools/testing/selftests/cgroup/config
+++ b/tools/testing/selftests/cgroup/config
@@ -1,6 +1,7 @@
CONFIG_CGROUPS=y
CONFIG_CGROUP_CPUACCT=y
CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_MISC=y
CONFIG_CGROUP_SCHED=y
CONFIG_MEMCG=y
CONFIG_PAGE_COUNTER=y
diff --git a/tools/testing/selftests/cgroup/test_misc.c b/tools/testing/selftests/cgroup/test_misc.c
new file mode 100644
index 000000000000..50e8acb51852
--- /dev/null
+++ b/tools/testing/selftests/cgroup/test_misc.c
@@ -0,0 +1,118 @@
+// SPDX-License-Identifier: GPL-2.0
+#define _GNU_SOURCE
+
+#include <errno.h>
+#include <linux/limits.h>
+#include <signal.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "../kselftest.h"
+#include "cgroup_util.h"
+
+/*
+ * Check misc.mask local and effective AT_HWCAP values in a cgroup and its child.
+ */
+static int test_misc_mask(const char *root)
+{
+ int ret = KSFT_FAIL; /* returned unchanged unless every step below succeeds */
+ char *cg_misc, *cg_misc_sub = NULL; /* pre-set: cleanup can be reached before assignment */
+
+ cg_misc = cg_name(root, "misc_test");
+ if (!cg_misc)
+ goto cleanup;
+
+ cg_misc_sub = cg_name(root, "misc_test/sub");
+ if (!cg_misc_sub)
+ goto cleanup;
+
+ if (cg_create(cg_misc))
+ goto cleanup;
+ /* Freshly created cgroup: local and effective AT_HWCAP masks are both expected to be 0. */
+ if (cg_read_strcmp(cg_misc, "misc.mask",
+ "AT_HWCAP\t0x00000000000000\t0x00000000000000\n"))
+ goto cleanup;
+ /* Set a local mask on the parent cgroup ... */
+ if (cg_write(cg_misc, "misc.mask", "AT_HWCAP 0xf0000000000000"))
+ goto cleanup;
+ /* ... and expect the same value as both its local and its effective mask. */
+ if (cg_read_strcmp(cg_misc, "misc.mask",
+ "AT_HWCAP\t0xf0000000000000\t0xf0000000000000\n"))
+ goto cleanup;
+ /* Write "+misc" to misc_test's subtree_control before creating the child cgroup. */
+ if (cg_write(cg_misc, "cgroup.subtree_control", "+misc"))
+ goto cleanup;
+
+ if (cg_create(cg_misc_sub))
+ goto cleanup;
+ /* Child before any write: local mask 0, effective mask equal to the parent's mask. */
+ if (cg_read_strcmp(cg_misc_sub, "misc.mask",
+ "AT_HWCAP\t0x00000000000000\t0xf0000000000000\n"))
+ goto cleanup;
+ /* Set a different bit on the child only. */
+ if (cg_write(cg_misc_sub, "misc.mask", "AT_HWCAP 0x01000000000000"))
+ goto cleanup;
+ /* Local is the child's own value; effective is 0xf0... | 0x01... = 0xf1... */
+ if (cg_read_strcmp(cg_misc_sub, "misc.mask",
+ "AT_HWCAP\t0x01000000000000\t0xf1000000000000\n"))
+ goto cleanup;
+
+ ret = KSFT_PASS;
+
+cleanup: /* reached on success and from every failing step above */
+ cg_enter_current(root);
+ cg_destroy(cg_misc_sub); /* child is destroyed before its parent */
+ cg_destroy(cg_misc);
+ free(cg_misc);
+ free(cg_misc_sub);
+
+ return ret;
+}
+/* Table of test cases iterated by main(); T() pairs a function with its own name. */
+#define T(x) { x, #x } /* expands to { function, "function" } */
+struct misc_test {
+ int (*fn)(const char *root); /* test body; main() switches on its KSFT_* return value */
+ const char *name; /* label printed with the test result */
+} tests[] = {
+ T(test_misc_mask),
+};
+#undef T
+/* Entry point: skip if prerequisites are missing, else run tests[] and report each result. */
+int main(int argc, char **argv)
+{
+ char root[PATH_MAX];
+
+ ksft_print_header();
+ ksft_set_plan(ARRAY_SIZE(tests));
+ if (cg_find_unified_root(root, sizeof(root), NULL))
+ ksft_exit_skip("cgroup v2 isn't mounted\n");
+
+ /*
+ * Check that misc controller is available:
+ * misc is listed in cgroup.controllers
+ */
+ if (cg_read_strstr(root, "cgroup.controllers", "misc"))
+ ksft_exit_skip("misc controller isn't available\n");
+ /* Write "+misc" to root's subtree_control only if the lookup is non-zero; skip on failure. */
+ if (cg_read_strstr(root, "cgroup.subtree_control", "misc"))
+ if (cg_write(root, "cgroup.subtree_control", "+misc"))
+ ksft_exit_skip("Failed to set misc controller\n");
+ /* Run every entry of tests[]; any result other than PASS or SKIP is reported as a failure. */
+ for (int i = 0; i < ARRAY_SIZE(tests); i++) {
+ switch (tests[i].fn(root)) {
+ case KSFT_PASS:
+ ksft_test_result_pass("%s\n", tests[i].name);
+ break;
+ case KSFT_SKIP:
+ ksft_test_result_skip("%s\n", tests[i].name);
+ break;
+ default:
+ ksft_test_result_fail("%s\n", tests[i].name);
+ break;
+ }
+ }
+ /* NOTE(review): no return follows; presumably ksft_finished() does not return - confirm. */
+ ksft_finished();
+}
--
2.52.0.223.gf5cc29aaa4-goog
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 3/3] Documentation: cgroup-v2: Document misc.mask interface
2025-12-05 0:58 [PATCH 0/3] cgroup/misc: Add hwcap masks to the misc controller Andrei Vagin
` (2 preceding siblings ...)
2025-12-05 0:58 ` [PATCH 2/3] selftests/cgroup: Add a test for the misc.mask cgroup interface Andrei Vagin
@ 2025-12-05 0:58 ` Andrei Vagin
2025-12-05 2:52 ` [PATCH 0/3] cgroup/misc: Add hwcap masks to the misc controller Chen Ridong
4 siblings, 0 replies; 11+ messages in thread
From: Andrei Vagin @ 2025-12-05 0:58 UTC (permalink / raw)
To: Kees Cook
Cc: linux-kernel, linux-fsdevel, linux-mm, cgroups, criu, Tejun Heo,
Johannes Weiner, Michal Koutný,
Vipin Sharma, Jonathan Corbet, Andrei Vagin
Update the cgroup-v2 documentation to include details about the newly
introduced 'misc.mask' interface, and add a section on masking hwcaps to
the arm64 elf_hwcaps documentation. This interface, part of the 'misc'
cgroup controller, allows masking out hardware capabilities (AT_HWCAP,
AT_HWCAP2, AT_HWCAP3, AT_HWCAP4) reported to user-space processes within
a cgroup.
Signed-off-by: Andrei Vagin <avagin@google.com>
---
Documentation/admin-guide/cgroup-v2.rst | 25 +++++++++++++++++++++++++
Documentation/arch/arm64/elf_hwcaps.rst | 21 +++++++++++++++++++++
2 files changed, 46 insertions(+)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 4c072e85acdf..9d9d923e0d4e 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2924,6 +2924,31 @@ Miscellaneous controller provides 3 interface files. If two misc resources (res_
cgroup i.e. not hierarchical. The file modified event generated on
this file reflects only the local events.
+Miscellaneous controller provides one interface file to control masks.
+
+ misc.mask
+ A read-write flat-keyed file shown in all cgroups. It allows
+ setting/reading the masks. The file format is a series of lines, each
+ describing a mask of a specific mask type.
+
+ The file has the following format for each line::
+
+ $NAME\t$LOCAL_MASK\t$EFFECTIVE_MASK
+
+ Where $NAME is the mask type name, $LOCAL_MASK is the mask for the
+ current cgroup, and $EFFECTIVE_MASK is the effective mask for the
+ current cgroup, which is a combination of the masks from the current
+ cgroup and all its ancestors.
+
+ To set a mask, write a string in the following format to the file::
+
+ $NAME $MASK
+
+ For example, to set a mask for the mask_a type, you would write the
+ following to the file::
+
+ # echo "mask_a 0x3000" > misc.mask
+
Migration and Ownership
~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst
index a15df4956849..5526daff5d30 100644
--- a/Documentation/arch/arm64/elf_hwcaps.rst
+++ b/Documentation/arch/arm64/elf_hwcaps.rst
@@ -450,3 +450,24 @@ HWCAP3_LSFE
For interoperation with userspace, the kernel guarantees that bits 62
and 63 of AT_HWCAP will always be returned as 0.
+
+5. Masking hwcaps for a group of processes
+------------------------------------------
+
+The misc cgroup controller provides a mechanism to mask hwcaps for a specific
+workload. This can be useful for limiting the features available to a
+containerized application.
+
+To mask hwcaps, you can write a mask to the ``misc.mask`` file in the cgroup
+directory. The mask is specified per AT_HWCAP entry (AT_HWCAP, AT_HWCAP2,
+AT_HWCAP3) in the format ``<HWCAP_ENTRY_NAME> <BITMASK>``.
+
+For example, to mask ``HWCAP_FP`` and ``HWCAP_ASIMD`` (which are represented by
+bits 0 and 1 of AT_HWCAP, so a mask of 0x3) for a workload, you would write the
+mask for AT_HWCAP to the ``misc.mask`` file in the new cgroup directory::
+
+ # echo "AT_HWCAP 0x3" > /sys/fs/cgroup/misc/my-workload/misc.mask
+
+Any new processes started in this cgroup will have the specified hwcaps
+masked. You can verify this by reading the ``misc.mask`` file, which will
+show the effective mask for the cgroup.
--
2.52.0.223.gf5cc29aaa4-goog
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 0/3] cgroup/misc: Add hwcap masks to the misc controller
2025-12-05 0:58 [PATCH 0/3] cgroup/misc: Add hwcap masks to the misc controller Andrei Vagin
` (3 preceding siblings ...)
2025-12-05 0:58 ` [PATCH 3/3] Documentation: cgroup-v2: Document misc.mask interface Andrei Vagin
@ 2025-12-05 2:52 ` Chen Ridong
2025-12-05 6:39 ` Andrei Vagin
4 siblings, 1 reply; 11+ messages in thread
From: Chen Ridong @ 2025-12-05 2:52 UTC (permalink / raw)
To: Andrei Vagin, Kees Cook
Cc: linux-kernel, linux-fsdevel, linux-mm, cgroups, criu, Tejun Heo,
Johannes Weiner, Michal Koutný,
Vipin Sharma, Jonathan Corbet
On 2025/12/5 8:58, Andrei Vagin wrote:
> This patch series introduces a mechanism to mask hardware capabilities
> (AT_HWCAP) reported to user-space processes via the misc cgroup
> controller.
>
> To support C/R operations (snapshots, live migration) in heterogeneous
> clusters, we must ensure that processes utilize CPU features available
> on all potential target nodes. To solve this, we need to advertise a
> common feature set across the cluster. This patchset allows users to
> configure a mask for AT_HWCAP, AT_HWCAP2. This ensures that applications
> within a container only detect and use features guaranteed to be
> available on all potential target hosts.
>
Could you elaborate on how this mask mechanism would be used in practice?
Based on my understanding of the implementation, the parent’s mask is effectively a subset of the
child’s mask, meaning the parent does not impose any additional restrictions on its children. This
behavior appears to differ from typical cgroup controllers, where children are further constrained
by their parent’s settings. This raises the question: is the cgroup model an appropriate fit for
this functionality?
> The first patch adds the mask interface to the misc cgroup controller,
> allowing users to set masks for AT_HWCAP, AT_HWCAP2...
>
> The second patch adds a selftest to verify the functionality of the new
> interface, ensuring masks are applied and inherited correctly.
>
> The third patch updates the documentation.
>
> Cc: Kees Cook <kees@kernel.org>
> Cc: Tejun Heo <tj@kernel.org>
> Cc: Johannes Weiner <hannes@cmpxchg.org>
> Cc: "Michal Koutný" <mkoutny@suse.com>
> Cc: Vipin Sharma <vipinsh@google.com>
> Cc: Jonathan Corbet <corbet@lwn.net>
>
> Andrei Vagin (3):
> cgroup, binfmt_elf: Add hwcap masks to the misc controller
> selftests/cgroup: Add a test for the misc.mask cgroup interface
> Documentation: cgroup-v2: Document misc.mask interface
>
> Documentation/admin-guide/cgroup-v2.rst | 25 ++++
> Documentation/arch/arm64/elf_hwcaps.rst | 21 ++++
> fs/binfmt_elf.c | 24 +++-
> include/linux/misc_cgroup.h | 25 ++++
> kernel/cgroup/misc.c | 126 +++++++++++++++++++++
> tools/testing/selftests/cgroup/.gitignore | 1 +
> tools/testing/selftests/cgroup/Makefile | 2 +
> tools/testing/selftests/cgroup/config | 1 +
> tools/testing/selftests/cgroup/test_misc.c | 114 +++++++++++++++++++
> 9 files changed, 335 insertions(+), 4 deletions(-)
> create mode 100644 tools/testing/selftests/cgroup/test_misc.c
--
Best regards,
Ridong
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 0/3] cgroup/misc: Add hwcap masks to the misc controller
2025-12-05 2:52 ` [PATCH 0/3] cgroup/misc: Add hwcap masks to the misc controller Chen Ridong
@ 2025-12-05 6:39 ` Andrei Vagin
2025-12-05 10:04 ` Chen Ridong
0 siblings, 1 reply; 11+ messages in thread
From: Andrei Vagin @ 2025-12-05 6:39 UTC (permalink / raw)
To: Chen Ridong
Cc: Andrei Vagin, Kees Cook, linux-kernel, linux-fsdevel, linux-mm,
cgroups, criu, Tejun Heo, Johannes Weiner, Michal Koutný,
Vipin Sharma, Jonathan Corbet
On Thu, Dec 4, 2025 at 6:52 PM Chen Ridong <chenridong@huaweicloud.com> wrote:
>
>
>
> On 2025/12/5 8:58, Andrei Vagin wrote:
> > This patch series introduces a mechanism to mask hardware capabilities
> > (AT_HWCAP) reported to user-space processes via the misc cgroup
> > controller.
> >
> > To support C/R operations (snapshots, live migration) in heterogeneous
> > clusters, we must ensure that processes utilize CPU features available
> > on all potential target nodes. To solve this, we need to advertise a
> > common feature set across the cluster. This patchset allows users to
> > configure a mask for AT_HWCAP, AT_HWCAP2. This ensures that applications
> > within a container only detect and use features guaranteed to be
> > available on all potential target hosts.
> >
>
> Could you elaborate on how this mask mechanism would be used in practice?
>
> Based on my understanding of the implementation, the parent’s mask is effectively a subset of the
> child’s mask, meaning the parent does not impose any additional restrictions on its children. This
> behavior appears to differ from typical cgroup controllers, where children are further constrained
> by their parent’s settings. This raises the question: is the cgroup model an appropriate fit for
> this functionality?
Chen,
Thank you for the question. I think I was not clear enough in the
description.
The misc.mask file works by masking out available features; any feature
bit set in the mask will not be advertised to processes within that
cgroup. When a child cgroup is created, its effective mask is a
combination of its own mask and its parent's effective mask. This means
any feature masked by either the parent or the child will be hidden from
processes in the child cgroup.
For example:
- If a parent cgroup masks out feature A (mask=0b001), processes in it
won't see feature A.
- If we create a child cgroup under it and set its mask to hide feature
B (mask=0b010), the effective mask for processes in the child cgroup
becomes 0b011. They will see neither feature A nor B.
This ensures that a feature hidden by a parent cannot be re-enabled by a
child. A child can only impose further restrictions by masking out
additional features. I think this behaviour is well aligned with the cgroup
model.
Thanks,
Andrei
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 0/3] cgroup/misc: Add hwcap masks to the misc controller
2025-12-05 6:39 ` Andrei Vagin
@ 2025-12-05 10:04 ` Chen Ridong
2025-12-05 20:19 ` Andrei Vagin
0 siblings, 1 reply; 11+ messages in thread
From: Chen Ridong @ 2025-12-05 10:04 UTC (permalink / raw)
To: Andrei Vagin
Cc: Andrei Vagin, Kees Cook, linux-kernel, linux-fsdevel, linux-mm,
cgroups, criu, Tejun Heo, Johannes Weiner, Michal Koutný,
Vipin Sharma, Jonathan Corbet
On 2025/12/5 14:39, Andrei Vagin wrote:
> On Thu, Dec 4, 2025 at 6:52 PM Chen Ridong <chenridong@huaweicloud.com> wrote:
>>
>>
>>
>> On 2025/12/5 8:58, Andrei Vagin wrote:
>>> This patch series introduces a mechanism to mask hardware capabilities
>>> (AT_HWCAP) reported to user-space processes via the misc cgroup
>>> controller.
>>>
>>> To support C/R operations (snapshots, live migration) in heterogeneous
>>> clusters, we must ensure that processes utilize CPU features available
>>> on all potential target nodes. To solve this, we need to advertise a
>>> common feature set across the cluster. This patchset allows users to
>>> configure a mask for AT_HWCAP, AT_HWCAP2. This ensures that applications
>>> within a container only detect and use features guaranteed to be
>>> available on all potential target hosts.
>>>
>>
>> Could you elaborate on how this mask mechanism would be used in practice?
>>
>> Based on my understanding of the implementation, the parent’s mask is effectively a subset of the
>> child’s mask, meaning the parent does not impose any additional restrictions on its children. This
>> behavior appears to differ from typical cgroup controllers, where children are further constrained
>> by their parent’s settings. This raises the question: is the cgroup model an appropriate fit for
>> this functionality?
>
> Chen,
>
> Thank you for the question. I think I was not clear enough in the
> description.
>
> The misc.mask file works by masking out available features; any feature
> bit set in the mask will not be advertised to processes within that
> cgroup. When a child cgroup is created, its effective mask is a
> combination of its own mask and its parent's effective mask. This means
> any feature masked by either the parent or the child will be hidden from
> processes in the child cgroup.
>
> For example:
> - If a parent cgroup masks out feature A (mask=0b001), processes in it
> won't see feature A.
> - If we create a child cgroup under it and set its mask to hide feature
> B (mask=0b010), the effective mask for processes in the child cgroup
> becomes 0b011. They will see neither feature A nor B.
>
Let me ask some basic questions:
When is the misc.mask typically set? Is it only configured before starting a container (e.g., before
docker run), or can it be adjusted dynamically while processes are already running?
I'm concerned about a potential scenario: If a child process initially has access to a CPU feature,
but then its parent cgroup masks that feature out, could the child process remain unaware of this
change?
Specifically, if a process has already cached or relied on a CPU capability before the mask was
applied, would it continue to assume it has that capability, leading to potential issues if it
attempts to use instructions that are now masked out?
Does such a scenario exist in practice?
> This ensures that a feature hidden by a parent cannot be re-enabled by a
> child. A child can only impose further restrictions by masking out
> additional features. I think this behaviour is well aligned with the cgroup
> model.
>
> Thanks,
> Andrei
--
Best regards,
Ridong
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 0/3] cgroup/misc: Add hwcap masks to the misc controller
2025-12-05 10:04 ` Chen Ridong
@ 2025-12-05 20:19 ` Andrei Vagin
0 siblings, 0 replies; 11+ messages in thread
From: Andrei Vagin @ 2025-12-05 20:19 UTC (permalink / raw)
To: Chen Ridong
Cc: Andrei Vagin, Kees Cook, linux-kernel, linux-fsdevel, linux-mm,
cgroups, criu, Tejun Heo, Johannes Weiner, Michal Koutný,
Vipin Sharma, Jonathan Corbet
On Fri, Dec 5, 2025 at 2:04 AM Chen Ridong <chenridong@huaweicloud.com> wrote:
>
>
>
> On 2025/12/5 14:39, Andrei Vagin wrote:
> > On Thu, Dec 4, 2025 at 6:52 PM Chen Ridong <chenridong@huaweicloud.com> wrote:
> >>
> >>
> >>
> >> On 2025/12/5 8:58, Andrei Vagin wrote:
> >>> This patch series introduces a mechanism to mask hardware capabilities
> >>> (AT_HWCAP) reported to user-space processes via the misc cgroup
> >>> controller.
> >>>
> >>> To support C/R operations (snapshots, live migration) in heterogeneous
> >>> clusters, we must ensure that processes utilize CPU features available
> >>> on all potential target nodes. To solve this, we need to advertise a
> >>> common feature set across the cluster. This patchset allows users to
> >>> configure a mask for AT_HWCAP, AT_HWCAP2. This ensures that applications
> >>> within a container only detect and use features guaranteed to be
> >>> available on all potential target hosts.
> >>>
> >>
> >> Could you elaborate on how this mask mechanism would be used in practice?
> >>
> >> Based on my understanding of the implementation, the parent’s mask is effectively a subset of the
> >> child’s mask, meaning the parent does not impose any additional restrictions on its children. This
> >> behavior appears to differ from typical cgroup controllers, where children are further constrained
> >> by their parent’s settings. This raises the question: is the cgroup model an appropriate fit for
> >> this functionality?
> >
> > Chen,
> >
> > Thank you for the question. I think I was not clear enough in the
> > description.
> >
> > The misc.mask file works by masking out available features; any feature
> > bit set in the mask will not be advertised to processes within that
> > cgroup. When a child cgroup is created, its effective mask is a
> > combination of its own mask and its parent's effective mask. This means
> > any feature masked by either the parent or the child will be hidden from
> > processes in the child cgroup.
> >
> > For example:
> > - If a parent cgroup masks out feature A (mask=0b001), processes in it
> > won't see feature A.
> > - If we create a child cgroup under it and set its mask to hide feature
> > B (mask=0b010), the effective mask for processes in the child cgroup
> > becomes 0b011. They will see neither feature A nor B.
> >
> Let me ask some basic questions:
>
> When is the misc.mask typically set? Is it only configured before starting a container (e.g., before
> docker run), or can it be adjusted dynamically while processes are already running?
If we are talking about C/R use cases, it should be configured when
the container is started. It can be adjusted dynamically, but all changes
will affect only new processes. The auxiliary vectors are set on execve.
>
> I'm concerned about a potential scenario: If a child process initially has access to a CPU feature,
> but then its parent cgroup masks that feature out, could the child process remain unaware of this
> change?
>
> Specifically, if a process has already cached or relied on a CPU capability before the mask was
> applied, would it continue to assume it has that capability, leading to potential issues if it
> attempts to use instructions that are now masked out?
I wouldn't classify this behavior as an issue; it's designed to function
this way. It's important to understand that this isn't enforcement, but
rather information for processes regarding which features are
"guaranteed" to them. A process can choose to utilize unexposed
features at its own risk, potentially encountering problems after
migration to a different host.
Thanks,
Andrei
^ permalink raw reply [flat|nested] 11+ messages in thread