linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Bo Li <libo.gcs85@bytedance.com>
To: tglx@linutronix.de, mingo@redhat.com, bp@alien8.de,
	dave.hansen@linux.intel.com, x86@kernel.org, luto@kernel.org,
	kees@kernel.org, akpm@linux-foundation.org, david@redhat.com,
	juri.lelli@redhat.com, vincent.guittot@linaro.org,
	peterz@infradead.org
Cc: dietmar.eggemann@arm.com, hpa@zytor.com, acme@kernel.org,
	namhyung@kernel.org, mark.rutland@arm.com,
	alexander.shishkin@linux.intel.com, jolsa@kernel.org,
	irogers@google.com, adrian.hunter@intel.com,
	kan.liang@linux.intel.com, viro@zeniv.linux.org.uk,
	brauner@kernel.org, jack@suse.cz, lorenzo.stoakes@oracle.com,
	Liam.Howlett@oracle.com, vbabka@suse.cz, rppt@kernel.org,
	surenb@google.com, mhocko@suse.com, rostedt@goodmis.org,
	bsegall@google.com, mgorman@suse.de, vschneid@redhat.com,
	jannh@google.com, pfalcato@suse.de, riel@surriel.com,
	harry.yoo@oracle.com, linux-kernel@vger.kernel.org,
	linux-perf-users@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	linux-mm@kvack.org, duanxiongchun@bytedance.com,
	yinhongbo@bytedance.com, dengliang.1214@bytedance.com,
	xieyongji@bytedance.com, chaiwen.cc@bytedance.com,
	songmuchun@bytedance.com, yuanzhu@bytedance.com,
	chengguozhu@bytedance.com, sunjiadong.lff@bytedance.com,
	Bo Li <libo.gcs85@bytedance.com>
Subject: [RFC v2 09/35] RPAL: enable address space sharing
Date: Fri, 30 May 2025 17:27:37 +0800	[thread overview]
Message-ID: <2b5378f3686fd2831468e65c49609fbb19072b43.1748594840.git.libo.gcs85@bytedance.com> (raw)
In-Reply-To: <cover.1748594840.git.libo.gcs85@bytedance.com>

RPAL's memory sharing is implemented by copying p4d entries, which requires
implementing corresponding interfaces. Meanwhile, copying p4d entries can
cause the process's page table to contain p4d entries that do not belong to
it, and RPAL needs to resolve compatibility issues with other subsystems
caused by this.

This patch implements the rpal_map_service() interface to complete the
mutual copying of p4d entries between two RPAL services. For the copied p4d
entries, RPAL adds a _PAGE_RPAL_IGN flag to them. This flag makes
p4d_none() return true and p4d_present() return false, ensuring that these
p4d entries are invisible to other kernel subsystems. The protection of p4d
entries is guaranteed by the memory balloon, which ensures that the address
space corresponding to the p4d entries is not used by the current service.

Signed-off-by: Bo Li <libo.gcs85@bytedance.com>
---
 arch/x86/include/asm/pgtable.h       |  25 ++++
 arch/x86/include/asm/pgtable_types.h |  11 ++
 arch/x86/rpal/internal.h             |   2 +
 arch/x86/rpal/mm.c                   | 175 +++++++++++++++++++++++++++
 4 files changed, 213 insertions(+)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 5ddba366d3b4..54351bfe4e47 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1137,12 +1137,37 @@ static inline int pud_bad(pud_t pud)
 #if CONFIG_PGTABLE_LEVELS > 3
 static inline int p4d_none(p4d_t p4d)
 {
+#if IS_ENABLED(CONFIG_RPAL)
+	p4dval_t p4dv = native_p4d_val(p4d);
+
+	/*
+	 * RPAL shares address space by copying p4d entries; other
+	 * kernel code must not manipulate a copied p4d. Thus, make
+	 * p4d_none() return true for entries marked _PAGE_RPAL_IGN
+	 * so that generic page table code treats them as empty.
+	 */
+	return (p4dv & _PAGE_RPAL_IGN) ||
+	       ((p4dv & ~(_PAGE_KNL_ERRATUM_MASK)) == 0);
+#else
 	return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
+#endif
 }
 
 static inline int p4d_present(p4d_t p4d)
 {
+#if IS_ENABLED(CONFIG_RPAL)
+	p4dval_t p4df = p4d_flags(p4d);
+
+	/*
+	 * RPAL shares address space by copying p4d entries; other
+	 * kernel code must not manipulate a copied p4d. Thus, make
+	 * p4d_present() return 0 for entries marked _PAGE_RPAL_IGN
+	 * so that generic page table code skips them.
+	 */
+	return ((p4df & (_PAGE_PRESENT | _PAGE_RPAL_IGN)) == _PAGE_PRESENT);
+#else
 	return p4d_flags(p4d) & _PAGE_PRESENT;
+#endif
 }
 
 static inline pud_t *p4d_pgtable(p4d_t p4d)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index b74ec5c3643b..781b0f5bc359 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -35,6 +35,13 @@
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_KERNEL_4K	_PAGE_BIT_SOFTW3 /* page must not be converted to large */
 #define _PAGE_BIT_DEVMAP	_PAGE_BIT_SOFTW4
+/*
+ * _PAGE_BIT_SOFTW1 is also used by _PAGE_BIT_SPECIAL, but the
+ * two do not conflict: _PAGE_BIT_RPAL_IGN is only used at the
+ * p4d/pud level, while _PAGE_BIT_SPECIAL is only used at the
+ * pte level.
+ */
+#define _PAGE_BIT_RPAL_IGN	_PAGE_BIT_SOFTW1
 
 #ifdef CONFIG_X86_64
 #define _PAGE_BIT_SAVED_DIRTY	_PAGE_BIT_SOFTW5 /* Saved Dirty bit (leaf) */
@@ -95,6 +102,10 @@
 #define _PAGE_SOFT_DIRTY	(_AT(pteval_t, 0))
 #endif
 
+#if IS_ENABLED(CONFIG_RPAL)
+#define _PAGE_RPAL_IGN	(_AT(pteval_t, 1) << _PAGE_BIT_RPAL_IGN)
+#endif
+
 /*
  * Tracking soft dirty bit when a page goes to a swap is tricky.
  * We need a bit which can be stored in pte _and_ not conflict
diff --git a/arch/x86/rpal/internal.h b/arch/x86/rpal/internal.h
index 3559c9c6e868..65f2cf4baf8f 100644
--- a/arch/x86/rpal/internal.h
+++ b/arch/x86/rpal/internal.h
@@ -34,6 +34,8 @@ static inline void rpal_put_shared_page(struct rpal_shared_page *rsp)
 int rpal_mmap(struct file *filp, struct vm_area_struct *vma);
 struct rpal_shared_page *rpal_find_shared_page(struct rpal_service *rs,
 					       unsigned long addr);
+int rpal_map_service(struct rpal_service *tgt);
+void rpal_unmap_service(struct rpal_service *tgt);
 
 /* thread.c */
 int rpal_register_sender(unsigned long addr);
diff --git a/arch/x86/rpal/mm.c b/arch/x86/rpal/mm.c
index 8a738c502d1d..f1003baae001 100644
--- a/arch/x86/rpal/mm.c
+++ b/arch/x86/rpal/mm.c
@@ -215,3 +215,178 @@ void rpal_exit_mmap(struct mm_struct *mm)
 		rpal_put_service(rs);
 	}
 }
+
+/*
+ * Install src_p4d, marked _PAGE_RPAL_IGN, at addr in dst_mm. An RPAL
+ * process's user address space is 512G, the size of one p4d, so we
+ * assume the p4d entry never changes after the process is created.
+ */
+static int mm_link_p4d(struct mm_struct *dst_mm, p4d_t src_p4d,
+		       unsigned long addr)
+{
+	spinlock_t *dst_ptl = &dst_mm->page_table_lock;
+	unsigned long flags;
+	pgd_t *dst_pgdp;
+	p4d_t p4d, *dst_p4dp;
+	p4dval_t p4dv;
+	int ret = 0;
+
+	BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
+
+	mmap_write_lock(dst_mm);
+	spin_lock_irqsave(dst_ptl, flags);
+	dst_pgdp = pgd_offset(dst_mm, addr);
+	/*
+	 * dst_pgd must exist, otherwise we would need to allocate a pgd
+	 * entry and free it again when src_p4d is freed. This should
+	 * be supported in the future.
+	 */
+	if (unlikely(pgd_none_or_clear_bad(dst_pgdp))) {
+		rpal_err("cannot find pgd entry for addr 0x%016lx\n", addr);
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	dst_p4dp = p4d_offset(dst_pgdp, addr);
+	if (unlikely(!p4d_none_or_clear_bad(dst_p4dp))) {
+		rpal_err("p4d is previously mapped\n");
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	p4dv = p4d_val(src_p4d);
+
+	/*
+	 * Since RPAL copies p4d entries to share address space,
+	 * it is important that other processes do not manipulate
+	 * the copied p4d. Mark the copied p4d so that p4d_present()
+	 * and p4d_none() ignore it.
+	 */
+	p4dv |= _PAGE_RPAL_IGN;
+	/* With PTI, _PAGE_NX is stripped from the installed entry. */
+	if (boot_cpu_has(X86_FEATURE_PTI))
+		p4d = native_make_p4d((~_PAGE_NX) & p4dv);
+	else
+		p4d = native_make_p4d(p4dv);
+
+	set_p4d(dst_p4dp, p4d);
+	spin_unlock_irqrestore(dst_ptl, flags);
+	mmap_write_unlock(dst_mm);
+
+	return 0;
+unlock:
+	spin_unlock_irqrestore(dst_ptl, flags);
+	mmap_write_unlock(dst_mm);
+	return ret;
+}
+
+static void mm_unlink_p4d(struct mm_struct *mm, unsigned long addr)
+{
+	spinlock_t *ptl = &mm->page_table_lock;
+	unsigned long flags;
+	pgd_t *pgdp;
+	p4d_t *p4dp;
+
+	mmap_write_lock(mm);
+	spin_lock_irqsave(ptl, flags);
+	pgdp = pgd_offset(mm, addr);
+	p4dp = p4d_offset(pgdp, addr);
+	p4d_clear(p4dp);
+	spin_unlock_irqrestore(ptl, flags);
+	mmap_write_unlock(mm);
+	/* The p4d entry for addr is cleared; now flush mm's TLB entries. */
+	flush_tlb_mm(mm);
+}
+
+static int get_mm_p4d(struct mm_struct *mm, unsigned long addr, p4d_t *srcp)
+{
+	spinlock_t *ptl;
+	unsigned long flags;
+	pgd_t *pgdp;
+	p4d_t *p4dp;
+	int ret = 0;
+
+	ptl = &mm->page_table_lock;
+	spin_lock_irqsave(ptl, flags);
+	pgdp = pgd_offset(mm, addr);
+	if (pgd_none(*pgdp)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	/* Snapshot the entry into *srcp only if it is neither none nor bad. */
+	p4dp = p4d_offset(pgdp, addr);
+	if (p4d_none(*p4dp) || p4d_bad(*p4dp)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	*srcp = *p4dp;
+
+out:
+	spin_unlock_irqrestore(ptl, flags);
+
+	return ret;
+}
+
+int rpal_map_service(struct rpal_service *tgt)
+{
+	struct rpal_service *cur = rpal_current_service();
+	struct mm_struct *cur_mm, *tgt_mm;
+	unsigned long cur_addr, tgt_addr;
+	p4d_t cur_p4d, tgt_p4d;
+	int ret = 0;
+
+	cur_mm = current->mm;
+	tgt_mm = tgt->mm;
+	if (!mmget_not_zero(tgt_mm)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	cur_addr = rpal_get_base(cur);
+	tgt_addr = rpal_get_base(tgt);
+	/* Take a copy of each side's p4d entry before linking anything. */
+	ret = get_mm_p4d(tgt_mm, tgt_addr, &tgt_p4d);
+	if (ret)
+		goto put_tgt;
+
+	ret = get_mm_p4d(cur_mm, cur_addr, &cur_p4d);
+	if (ret)
+		goto put_tgt;
+	/* Link tgt's p4d entry into the current mm ... */
+	ret = mm_link_p4d(cur_mm, tgt_p4d, tgt_addr);
+	if (ret)
+		goto put_tgt;
+	/* ... and vice versa; on failure, undo the first link. */
+	ret = mm_link_p4d(tgt_mm, cur_p4d, cur_addr);
+	if (ret) {
+		mm_unlink_p4d(cur_mm, tgt_addr);
+		goto put_tgt;
+	}
+
+put_tgt:
+	mmput(tgt_mm);
+out:
+	return ret;
+}
+
+void rpal_unmap_service(struct rpal_service *tgt)
+{
+	struct rpal_service *cur = rpal_current_service();
+	struct mm_struct *cur_mm, *tgt_mm;
+	unsigned long cur_addr, tgt_addr;
+
+	cur_mm = current->mm;
+	tgt_mm = tgt->mm;
+	/* Use the same bases that rpal_map_service() linked at. */
+	cur_addr = rpal_get_base(cur);
+	tgt_addr = rpal_get_base(tgt);
+	/* Unlink our p4d from tgt only if tgt's mm still has users. */
+	if (mmget_not_zero(tgt_mm)) {
+		mm_unlink_p4d(tgt_mm, cur_addr);
+		mmput(tgt_mm);
+	} else {
+		/* tgt's mm has no users left; presumably tgt has exited */
+		pr_debug("rpal: [%d] cannot find target mm\n", current->pid);
+	}
+	mm_unlink_p4d(cur_mm, tgt_addr);
+}
-- 
2.20.1



  parent reply	other threads:[~2025-05-30  9:30 UTC|newest]

Thread overview: 46+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-05-30  9:27 [RFC v2 00/35] optimize cost of inter-process communication Bo Li
2025-05-30  9:27 ` [RFC v2 01/35] Kbuild: rpal support Bo Li
2025-05-30  9:27 ` [RFC v2 02/35] RPAL: add struct rpal_service Bo Li
2025-05-30  9:27 ` [RFC v2 03/35] RPAL: add service registration interface Bo Li
2025-05-30  9:27 ` [RFC v2 04/35] RPAL: add member to task_struct and mm_struct Bo Li
2025-05-30  9:27 ` [RFC v2 05/35] RPAL: enable virtual address space partitions Bo Li
2025-05-30  9:27 ` [RFC v2 06/35] RPAL: add user interface Bo Li
2025-05-30  9:27 ` [RFC v2 07/35] RPAL: enable shared page mmap Bo Li
2025-05-30  9:27 ` [RFC v2 08/35] RPAL: enable sender/receiver registration Bo Li
2025-05-30  9:27 ` Bo Li [this message]
2025-05-30  9:27 ` [RFC v2 10/35] RPAL: allow service enable/disable Bo Li
2025-05-30  9:27 ` [RFC v2 11/35] RPAL: add service request/release Bo Li
2025-05-30  9:27 ` [RFC v2 12/35] RPAL: enable service disable notification Bo Li
2025-05-30  9:27 ` [RFC v2 13/35] RPAL: add tlb flushing support Bo Li
2025-05-30  9:27 ` [RFC v2 14/35] RPAL: enable page fault handling Bo Li
2025-05-30 13:59   ` Dave Hansen
2025-05-30  9:27 ` [RFC v2 15/35] RPAL: add sender/receiver state Bo Li
2025-05-30  9:27 ` [RFC v2 16/35] RPAL: add cpu lock interface Bo Li
2025-05-30  9:27 ` [RFC v2 17/35] RPAL: add a mapping between fsbase and tasks Bo Li
2025-05-30  9:27 ` [RFC v2 18/35] sched: pick a specified task Bo Li
2025-05-30  9:27 ` [RFC v2 19/35] RPAL: add lazy switch main logic Bo Li
2025-05-30  9:27 ` [RFC v2 20/35] RPAL: add rpal_ret_from_lazy_switch Bo Li
2025-05-30  9:27 ` [RFC v2 21/35] RPAL: add kernel entry handling for lazy switch Bo Li
2025-05-30  9:27 ` [RFC v2 22/35] RPAL: rebuild receiver state Bo Li
2025-05-30  9:27 ` [RFC v2 23/35] RPAL: resume cpumask when fork Bo Li
2025-05-30  9:27 ` [RFC v2 24/35] RPAL: critical section optimization Bo Li
2025-05-30  9:27 ` [RFC v2 25/35] RPAL: add MPK initialization and interface Bo Li
2025-05-30  9:27 ` [RFC v2 26/35] RPAL: enable MPK support Bo Li
2025-05-30 17:03   ` Dave Hansen
2025-05-30  9:27 ` [RFC v2 27/35] RPAL: add epoll support Bo Li
2025-05-30  9:27 ` [RFC v2 28/35] RPAL: add rpal_uds_fdmap() support Bo Li
2025-05-30  9:27 ` [RFC v2 29/35] RPAL: fix race condition in pkru update Bo Li
2025-05-30  9:27 ` [RFC v2 30/35] RPAL: fix pkru setup when fork Bo Li
2025-05-30  9:27 ` [RFC v2 31/35] RPAL: add receiver waker Bo Li
2025-05-30  9:28 ` [RFC v2 32/35] RPAL: fix unknown nmi on AMD CPU Bo Li
2025-05-30  9:28 ` [RFC v2 33/35] RPAL: enable time slice correction Bo Li
2025-05-30  9:28 ` [RFC v2 34/35] RPAL: enable fast epoll wait Bo Li
2025-05-30  9:28 ` [RFC v2 35/35] samples/rpal: add RPAL samples Bo Li
2025-05-30  9:33 ` [RFC v2 00/35] optimize cost of inter-process communication Lorenzo Stoakes
2025-06-03  8:22   ` Bo Li
2025-06-03  9:22     ` Lorenzo Stoakes
2025-05-30  9:41 ` Pedro Falcato
2025-05-30  9:56 ` David Hildenbrand
2025-05-30 22:42 ` Andrew Morton
2025-05-31  7:16 ` Ingo Molnar
2025-06-03 17:49 ` H. Peter Anvin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=2b5378f3686fd2831468e65c49609fbb19072b43.1748594840.git.libo.gcs85@bytedance.com \
    --to=libo.gcs85@bytedance.com \
    --cc=Liam.Howlett@oracle.com \
    --cc=acme@kernel.org \
    --cc=adrian.hunter@intel.com \
    --cc=akpm@linux-foundation.org \
    --cc=alexander.shishkin@linux.intel.com \
    --cc=bp@alien8.de \
    --cc=brauner@kernel.org \
    --cc=bsegall@google.com \
    --cc=chaiwen.cc@bytedance.com \
    --cc=chengguozhu@bytedance.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=david@redhat.com \
    --cc=dengliang.1214@bytedance.com \
    --cc=dietmar.eggemann@arm.com \
    --cc=duanxiongchun@bytedance.com \
    --cc=harry.yoo@oracle.com \
    --cc=hpa@zytor.com \
    --cc=irogers@google.com \
    --cc=jack@suse.cz \
    --cc=jannh@google.com \
    --cc=jolsa@kernel.org \
    --cc=juri.lelli@redhat.com \
    --cc=kan.liang@linux.intel.com \
    --cc=kees@kernel.org \
    --cc=linux-fsdevel@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-perf-users@vger.kernel.org \
    --cc=lorenzo.stoakes@oracle.com \
    --cc=luto@kernel.org \
    --cc=mark.rutland@arm.com \
    --cc=mgorman@suse.de \
    --cc=mhocko@suse.com \
    --cc=mingo@redhat.com \
    --cc=namhyung@kernel.org \
    --cc=peterz@infradead.org \
    --cc=pfalcato@suse.de \
    --cc=riel@surriel.com \
    --cc=rostedt@goodmis.org \
    --cc=rppt@kernel.org \
    --cc=songmuchun@bytedance.com \
    --cc=sunjiadong.lff@bytedance.com \
    --cc=surenb@google.com \
    --cc=tglx@linutronix.de \
    --cc=vbabka@suse.cz \
    --cc=vincent.guittot@linaro.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=vschneid@redhat.com \
    --cc=x86@kernel.org \
    --cc=xieyongji@bytedance.com \
    --cc=yinhongbo@bytedance.com \
    --cc=yuanzhu@bytedance.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox