From: Sourav Panda <souravpanda@google.com>
To: mathieu.desnoyers@efficios.com, willy@infradead.org,
david@redhat.com, pasha.tatashin@soleen.com,
rientjes@google.com, akpm@linux-foundation.org,
linux-mm@kvack.org, linux-kernel@vger.kernel.org,
weixugc@google.com, gthelen@google.com, souravpanda@google.com,
surenb@google.com
Subject: [RFC PATCH 6/6] mm: syscall alternative for SELECTIVE_KSM
Date: Fri, 21 Mar 2025 17:37:29 +0000 [thread overview]
Message-ID: <20250321173729.3175898-7-souravpanda@google.com> (raw)
In-Reply-To: <20250321173729.3175898-1-souravpanda@google.com>
Partition can be created or opened using:
int ksm_fd = ksm_open(ksm_name, flags);
ksm_name specifies the ksm partition to be created or opened.
flags:
O_CREAT
Create the ksm partition object if it does not exist.
O_EXCL
If O_CREAT was also specified, and a ksm partition object
with the given name already exists, return an error.
Trigger the merge using:
ksm_merge(ksm_fd, pid, start_addr, size);
Limitation: The syscalls are only wired up for x86-64 (syscall_64.tbl).
Signed-off-by: Sourav Panda <souravpanda@google.com>
---
arch/x86/entry/syscalls/syscall_64.tbl | 3 +-
include/linux/ksm.h | 4 +
mm/ksm.c | 156 ++++++++++++++++++++++++-
3 files changed, 161 insertions(+), 2 deletions(-)
diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl
index 5eb708bff1c7..352d747dbe33 100644
--- a/arch/x86/entry/syscalls/syscall_64.tbl
+++ b/arch/x86/entry/syscalls/syscall_64.tbl
@@ -390,7 +390,8 @@
464 common getxattrat sys_getxattrat
465 common listxattrat sys_listxattrat
466 common removexattrat sys_removexattrat
-
+467 common ksm_open sys_ksm_open
+468 common ksm_merge sys_ksm_merge
#
# Due to a historical design error, certain syscalls are numbered differently
# in x32 as compared to native x86_64. These syscalls have numbers 512-547.
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index d73095b5cd96..a94c89403c29 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -14,6 +14,10 @@
#include <linux/rmap.h>
#include <linux/sched.h>
+#include <linux/anon_inodes.h>
+#include <linux/syscalls.h>
+#define MAX_KSM_NAME_LEN 128 /* size in bytes of a partition name buffer, terminating NUL included */
+
#ifdef CONFIG_KSM
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags);
diff --git a/mm/ksm.c b/mm/ksm.c
index fd7626d5d8c9..71558120b034 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -147,7 +147,8 @@ struct ksm_scan {
static struct kobject *ksm_base_kobj;
struct partition_kobj {
- struct kobject *kobj;
+ struct kobject *kobj; /* Not required for the syscall interface */
+ char name[MAX_KSM_NAME_LEN];
struct list_head list;
struct rb_root *root_stable_tree;
struct rb_root *root_unstable_tree;
@@ -166,6 +167,106 @@ static struct partition_kobj *find_partition_by_kobj(struct kobject *kobj)
return NULL;
}
+/*
+ * find_ksm_partition - look up a KSM partition by name.
+ * @partition_name: NUL-terminated name to search for; not modified.
+ *
+ * Walks partition_list and returns the first entry whose ->name compares
+ * equal to @partition_name, or NULL when no entry matches.
+ *
+ * The walk itself takes no lock.
+ * NOTE(review): list updates elsewhere in this file are done under
+ * ksm_partition_lock, so callers presumably need to hold it across the
+ * lookup and any use of the returned pointer - confirm against all callers.
+ */
+static struct partition_kobj *find_ksm_partition(const char *partition_name)
+{
+	struct partition_kobj *partition;
+
+	list_for_each_entry(partition, &partition_list, list) {
+		if (!strcmp(partition->name, partition_name))
+			return partition;
+	}
+
+	return NULL;
+}
+
+static DEFINE_MUTEX(ksm_partition_lock); /* serializes partition_list updates made through the syscall interface */
+
+/*
+ * ksm_release - ->release handler of a partition file.
+ * @inode: unused.
+ * @file: file whose ->private_data holds the partition.
+ *
+ * Unlinks the partition from partition_list under ksm_partition_lock and
+ * frees it together with its rb-root array.
+ *
+ * NOTE(review): there is no reference count on the partition. If more than
+ * one file refers to the same partition (ksm_open() on an existing name
+ * hands the same pointer to a new file), the second release would unlink
+ * and free it again. Nodes still linked in the trees are not torn down
+ * here either. Both need a lifetime scheme outside this function.
+ *
+ * Return: always 0.
+ */
+static int ksm_release(struct inode *inode, struct file *file)
+{
+	struct partition_kobj *ksm = file->private_data;
+
+	mutex_lock(&ksm_partition_lock);
+	list_del(&ksm->list);
+	mutex_unlock(&ksm_partition_lock);
+
+	/*
+	 * ksm_create_partition() allocates the stable and unstable roots as
+	 * one array based at root_stable_tree; root_unstable_tree points into
+	 * that same allocation, so a single kfree() releases both.
+	 * NOTE(review): assumes the partition came from ksm_create_partition();
+	 * confirm no other allocation scheme can reach this path.
+	 */
+	kfree(ksm->root_stable_tree);
+	kfree(ksm);
+	return 0;
+}
+
+static const struct file_operations ksm_fops = { /* file operations of the "ksm_partition" anon-inode file */
+ .release = ksm_release, /* unlinks the partition from partition_list and frees it */
+};
+
+/*
+ * ksm_create_partition - allocate a named partition and link it in.
+ * @ksm_name: NUL-terminated partition name; truncated to fit ->name.
+ *
+ * Allocates the partition and one zeroed array of 2 * nr_node_ids rb roots:
+ * the first half serves as the stable trees, the second half as the
+ * unstable trees. The new partition is added to partition_list.
+ *
+ * The list is modified without taking a lock here; the visible caller
+ * holds ksm_partition_lock around this call.
+ *
+ * Return: the new partition, or NULL if an allocation failed.
+ */
+static struct partition_kobj *ksm_create_partition(char *ksm_name)
+{
+	struct partition_kobj *partition;
+	struct rb_root *tree_root;
+
+	partition = kzalloc(sizeof(*partition), GFP_KERNEL);
+	if (!partition)
+		return NULL;
+
+	tree_root = kcalloc(nr_node_ids + nr_node_ids, sizeof(*tree_root),
+			    GFP_KERNEL);
+	if (!tree_root) {
+		/* Do not leak the partition when the root array cannot be had. */
+		kfree(partition);
+		return NULL;
+	}
+
+	partition->root_stable_tree = tree_root;
+	partition->root_unstable_tree = tree_root + nr_node_ids;
+
+	/*
+	 * strscpy() always NUL-terminates the destination; strncpy() would
+	 * leave ->name unterminated for a source of MAX_KSM_NAME_LEN or more
+	 * characters, and ->name is later fed to strcmp().
+	 */
+	strscpy(partition->name, ksm_name, sizeof(partition->name));
+
+	list_add(&partition->list, &partition_list);
+
+	return partition;
+}
+
+/*
+ * ksm_partition_fd - wrap a partition in an anonymous-inode file descriptor.
+ * @partition: partition stored as the new file's ->private_data.
+ *
+ * anon_inode_getfd() reserves the descriptor before it creates the file and
+ * simply returns the descriptor slot if file creation fails. Creating the
+ * file first and calling fput() when no descriptor is available would run
+ * ksm_release() and thereby unlink and free @partition, even though the
+ * caller (or another open file) may still reference it.
+ *
+ * On failure @partition is left untouched and stays on partition_list.
+ *
+ * Return: a new file descriptor opened O_RDWR, or a negative errno.
+ */
+static int ksm_partition_fd(struct partition_kobj *partition)
+{
+	return anon_inode_getfd("ksm_partition", &ksm_fops, partition, O_RDWR);
+}
+
+/*
+ * ksm_open - create or open a named KSM partition and return an fd for it.
+ * @ksm_name: user pointer to the NUL-terminated partition name.
+ * @flags: O_CREAT creates the partition if it does not exist; O_EXCL
+ *         together with O_CREAT fails if it already exists.
+ *
+ * Return: a file descriptor on success, otherwise a negative errno:
+ *   -EFAULT        @ksm_name could not be read,
+ *   -ENAMETOOLONG  the name does not fit in MAX_KSM_NAME_LEN bytes
+ *                  including the terminating NUL,
+ *   -EEXIST        O_CREAT | O_EXCL given and the partition exists,
+ *   -ENOENT        the partition does not exist and O_CREAT is not set,
+ *   -ENOMEM        the partition could not be allocated,
+ *   or the error returned by ksm_partition_fd().
+ *
+ * NOTE(review): unknown bits in @flags are silently ignored; a new syscall
+ * would normally reject them with -EINVAL so they can be given a meaning
+ * later. Opening an existing partition gives a second file that shares the
+ * same partition object, and ksm_release() frees it on the first close -
+ * the partition needs a reference count.
+ */
+SYSCALL_DEFINE2(ksm_open, const char __user *, ksm_name, int, flags)
+{
+	char name[MAX_KSM_NAME_LEN];
+	struct partition_kobj *partition;
+	long len;
+	int ret;
+
+	len = strncpy_from_user(name, ksm_name, sizeof(name));
+	if (len < 0)
+		return len;
+	/*
+	 * A return value equal to the buffer size means no NUL was copied;
+	 * using the buffer as a string would read past its end.
+	 */
+	if (len == (long)sizeof(name))
+		return -ENAMETOOLONG;
+
+	/*
+	 * Look up and create under one lock hold so that two concurrent
+	 * O_CREAT callers cannot both miss the lookup and add duplicates,
+	 * and so the partition cannot be unlinked between lookup and use.
+	 */
+	mutex_lock(&ksm_partition_lock);
+
+	partition = find_ksm_partition(name);
+	if (partition) {
+		/* O_EXCL only has a meaning in combination with O_CREAT. */
+		if ((flags & O_CREAT) && (flags & O_EXCL)) {
+			ret = -EEXIST;
+			goto out_unlock;
+		}
+	} else if (flags & O_CREAT) {
+		partition = ksm_create_partition(name);
+		if (!partition) {
+			ret = -ENOMEM;
+			goto out_unlock;
+		}
+	} else {
+		ret = -ENOENT;
+		goto out_unlock;
+	}
+
+	ret = ksm_partition_fd(partition);
+
+out_unlock:
+	mutex_unlock(&ksm_partition_lock);
+	return ret;
+}
+
/**
* struct ksm_stable_node - node of the stable rbtree
* @node: rb node of this ksm page in the stable tree
@@ -4324,6 +4425,59 @@ static int __init ksm_thread_sysfs_init(void)
}
#endif /* CONFIG_SELECTIVE_KSM */
+/*
+ * ksm_merge - synchronously merge a range of a task's memory in a partition.
+ * @ksm_fd: file descriptor obtained from ksm_open().
+ * @pid: target task, looked up in the caller's pid namespace.
+ * @start: start address of the range in the target's address space.
+ * @size: length of the range in bytes; must be non-zero.
+ *
+ * Return: 0 on success, otherwise a negative errno:
+ *   -EBADF   @ksm_fd is not an open file descriptor,
+ *   -EINVAL  @ksm_fd is not a KSM partition file, the range is empty or
+ *            wraps around, or the task has no mm,
+ *   -ESRCH   no task with @pid was found.
+ *
+ * NOTE(review): no permission check is made on the target task, so any
+ * caller can trigger merging in any process it can name by pid. Comparable
+ * interfaces gate this with a ptrace-style access check - confirm and add.
+ */
+SYSCALL_DEFINE4(ksm_merge, int, ksm_fd, pid_t, pid, unsigned long, start, size_t, size)
+{
+	unsigned long end = start + size;
+	struct partition_kobj *partition;
+	struct task_struct *task;
+	struct mm_struct *mm;
+	struct file *file;
+	int ret;
+
+	file = fget(ksm_fd);
+	if (!file)
+		return -EBADF;
+
+	/*
+	 * Only files created by ksm_open() carry a partition in
+	 * ->private_data. Any other descriptor (socket, eventfd, ...) stores
+	 * an unrelated object there and must not be interpreted as one.
+	 */
+	if (file->f_op != &ksm_fops) {
+		ret = -EINVAL;
+		goto out_fput;
+	}
+
+	partition = file->private_data;
+	if (!partition) {
+		ret = -EINVAL;
+		goto out_fput;
+	}
+
+	/* Catches size == 0 as well as start + size wrapping around. */
+	if (start >= end) {
+		ret = -EINVAL;
+		goto out_fput;
+	}
+
+	/* Pin the task under RCU, then drop RCU before anything else. */
+	rcu_read_lock();
+	task = find_task_by_vpid(pid);
+	if (task)
+		get_task_struct(task);
+	rcu_read_unlock();
+
+	if (!task) {
+		ret = -ESRCH;
+		goto out_fput;
+	}
+
+	mm = get_task_mm(task);
+	put_task_struct(task);
+	if (!mm) {
+		ret = -EINVAL;
+		goto out_fput;
+	}
+
+	mutex_lock(&ksm_thread_mutex);
+	wait_while_offlining();
+	ksm_sync_merge(mm, start, end, partition);
+	mutex_unlock(&ksm_thread_mutex);
+
+	mmput(mm);
+	ret = 0;
+
+out_fput:
+	fput(file);
+	return ret;
+}
+
static int __init ksm_init(void)
{
int err;
--
2.49.0.395.g12beb8f557-goog
prev parent reply other threads:[~2025-03-21 17:37 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-03-21 17:37 [RFC PATCH 0/6] Selective KSM: Synchronous and Partitioned Merging Sourav Panda
2025-03-21 17:37 ` [RFC PATCH 1/6] mm: introduce SELECTIVE_KSM KConfig Sourav Panda
2025-03-21 17:37 ` [RFC PATCH 2/6] mm: make Selective KSM synchronous Sourav Panda
2025-03-21 17:37 ` [RFC PATCH 3/6] mm: make Selective KSM partitioned Sourav Panda
2025-03-21 17:37 ` [RFC PATCH 4/6] mm: create dedicated trees for SELECTIVE KSM partitions Sourav Panda
2025-03-21 17:37 ` [RFC PATCH 5/6] mm: trigger unmerge and remove SELECTIVE KSM partition Sourav Panda
2025-03-21 17:37 ` Sourav Panda [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20250321173729.3175898-7-souravpanda@google.com \
--to=souravpanda@google.com \
--cc=akpm@linux-foundation.org \
--cc=david@redhat.com \
--cc=gthelen@google.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=mathieu.desnoyers@efficios.com \
--cc=pasha.tatashin@soleen.com \
--cc=rientjes@google.com \
--cc=surenb@google.com \
--cc=weixugc@google.com \
--cc=willy@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox