From: Shivank Garg <shivankg@amd.com>
To: <akpm@linux-foundation.org>, <david@redhat.com>
Cc: <ziy@nvidia.com>, <willy@infradead.org>,
<matthew.brost@intel.com>, <joshua.hahnjy@gmail.com>,
<rakie.kim@sk.com>, <byungchul@sk.com>, <gourry@gourry.net>,
<ying.huang@linux.alibaba.com>, <apopple@nvidia.com>,
<lorenzo.stoakes@oracle.com>, <Liam.Howlett@oracle.com>,
<vbabka@suse.cz>, <rppt@kernel.org>, <surenb@google.com>,
<mhocko@suse.com>, <vkoul@kernel.org>, <lucas.demarchi@intel.com>,
<rdunlap@infradead.org>, <jgg@ziepe.ca>, <kuba@kernel.org>,
<justonli@chromium.org>, <ivecera@redhat.com>,
<dave.jiang@intel.com>, <Jonathan.Cameron@huawei.com>,
<dan.j.williams@intel.com>, <rientjes@google.com>,
<Raghavendra.KodsaraThimmappa@amd.com>, <bharata@amd.com>,
<shivankg@amd.com>, <alirad.malek@zptcorp.com>,
<yiannis@zptcorp.com>, <weixugc@google.com>,
<linux-kernel@vger.kernel.org>, <linux-mm@kvack.org>
Subject: [RFC V3 7/9] dcbm: add dma core batch migrator for batch page offloading
Date: Tue, 23 Sep 2025 17:47:42 +0000
Message-ID: <20250923174752.35701-8-shivankg@amd.com>
In-Reply-To: <20250923174752.35701-1-shivankg@amd.com>
The DCBM (DMA core batch migrator) driver provides a generic
DMAEngine-based interface for end-to-end testing of the batch page
migration offload feature. It requests DMA memcpy channels, spreads the
folio copies of a migration batch across them, and falls back to the
CPU copy path on any error.

Enable DCBM offload:

  echo 1 > /sys/kernel/dcbm/offloading
  echo NR_DMA_CHAN_TO_USE > /sys/kernel/dcbm/nr_dma_chan

Disable DCBM offload:

  echo 0 > /sys/kernel/dcbm/offloading
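
One way to exercise the offloaded path end to end (not part of this
patch, only an illustrative sketch) is to enable the knobs above and
then trigger page migration from userspace, for example via
move_pages(2). The program below assumes libnuma headers (numaif.h,
link with -lnuma) and at least two online NUMA nodes; the page count
and target node are arbitrary:

  /* Illustrative test sketch only -- not part of this patch. */
  #include <numaif.h>
  #include <stdio.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/mman.h>

  #define NR_PAGES    512   /* arbitrary batch size for the test */
  #define TARGET_NODE 1     /* assumed second NUMA node */

  int main(void)
  {
          long page_sz = sysconf(_SC_PAGESIZE);
          void *pages[NR_PAGES];
          int nodes[NR_PAGES], status[NR_PAGES];
          char *buf;
          int i;

          buf = mmap(NULL, NR_PAGES * page_sz, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          if (buf == MAP_FAILED)
                  return 1;
          /* Fault the pages in so there is something to migrate. */
          memset(buf, 0xab, NR_PAGES * page_sz);

          for (i = 0; i < NR_PAGES; i++) {
                  pages[i] = buf + (long)i * page_sz;
                  nodes[i] = TARGET_NODE;
          }

          /*
           * move_pages(2) drives the kernel migration path; with
           * /sys/kernel/dcbm/offloading set to 1, the folio copies are
           * expected to go through the DMA engine rather than the CPU.
           */
          if (move_pages(0, NR_PAGES, pages, nodes, status, MPOL_MF_MOVE))
                  perror("move_pages");

          for (i = 0; i < NR_PAGES; i++)
                  if (status[i] != TARGET_NODE)
                          fprintf(stderr, "page %d not migrated: %d\n",
                                  i, status[i]);

          munmap(buf, NR_PAGES * page_sz);
          return 0;
  }

If DMA setup or submission fails, the driver logs "DCBM: Falling back
to CPU copy" and completes the migration with the CPU path, so the test
still finishes; it just does not exercise the DMA engine.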
Signed-off-by: Shivank Garg <shivankg@amd.com>
---
drivers/migoffcopy/Kconfig | 8 +
drivers/migoffcopy/Makefile | 1 +
drivers/migoffcopy/dcbm/Makefile | 1 +
drivers/migoffcopy/dcbm/dcbm.c | 415 +++++++++++++++++++++++++++++++
4 files changed, 425 insertions(+)
create mode 100644 drivers/migoffcopy/dcbm/Makefile
create mode 100644 drivers/migoffcopy/dcbm/dcbm.c
diff --git a/drivers/migoffcopy/Kconfig b/drivers/migoffcopy/Kconfig
index e73698af3e72..c1b2eff7650d 100644
--- a/drivers/migoffcopy/Kconfig
+++ b/drivers/migoffcopy/Kconfig
@@ -6,4 +6,12 @@ config MTCOPY_CPU
Interface MT COPY CPU driver for batch page migration
offloading. Say Y if you want to try offloading with
MultiThreaded CPU copy APIs.
+config DCBM_DMA
+ bool "DMA Core Batch Migrator"
+ depends on OFFC_MIGRATION && DMA_ENGINE
+ default n
+ help
+ Interface DMA driver for batch page migration offloading.
+ Say Y if you want to try offloading with the DMAEngine
+ API based driver.
diff --git a/drivers/migoffcopy/Makefile b/drivers/migoffcopy/Makefile
index 0a3c356d67e6..dedc86ff54c1 100644
--- a/drivers/migoffcopy/Makefile
+++ b/drivers/migoffcopy/Makefile
@@ -1 +1,2 @@
obj-$(CONFIG_MTCOPY_CPU) += mtcopy/
+obj-$(CONFIG_DCBM_DMA) += dcbm/
diff --git a/drivers/migoffcopy/dcbm/Makefile b/drivers/migoffcopy/dcbm/Makefile
new file mode 100644
index 000000000000..56ba47cce0f1
--- /dev/null
+++ b/drivers/migoffcopy/dcbm/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_DCBM_DMA) += dcbm.o
diff --git a/drivers/migoffcopy/dcbm/dcbm.c b/drivers/migoffcopy/dcbm/dcbm.c
new file mode 100644
index 000000000000..87a58c0c3b9b
--- /dev/null
+++ b/drivers/migoffcopy/dcbm/dcbm.c
@@ -0,0 +1,415 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *
+ * DMA batch-offloading interface driver
+ *
+ * Copyright (C) 2024-25 Advanced Micro Devices, Inc.
+ */
+
+#include <linux/module.h>
+#include <linux/dma-mapping.h>
+#include <linux/dmaengine.h>
+#include <linux/migrate.h>
+#include <linux/migrate_offc.h>
+
+#define MAX_DMA_CHANNELS 16
+
+static bool offloading_enabled;
+static unsigned int nr_dma_channels = 1;
+static DEFINE_MUTEX(dcbm_mutex);
+
+struct dma_work {
+ struct dma_chan *chan;
+ struct completion done;
+ atomic_t pending;
+ struct sg_table *src_sgt;
+ struct sg_table *dst_sgt;
+ bool mapped;
+};
+
+static void dma_completion_callback(void *data)
+{
+ struct dma_work *work = data;
+
+ if (atomic_dec_and_test(&work->pending))
+ complete(&work->done);
+}
+
+static int setup_sg_tables(struct dma_work *work, struct list_head **src_pos,
+ struct list_head **dst_pos, int nr)
+{
+ struct scatterlist *sg_src, *sg_dst;
+ struct device *dev;
+ int i, ret = -ENOMEM;
+
+ work->src_sgt = kmalloc(sizeof(*work->src_sgt), GFP_KERNEL);
+ if (!work->src_sgt)
+ return -ENOMEM;
+ work->dst_sgt = kmalloc(sizeof(*work->dst_sgt), GFP_KERNEL);
+ if (!work->dst_sgt)
+ goto err_free_src;
+
+ ret = sg_alloc_table(work->src_sgt, nr, GFP_KERNEL);
+ if (ret)
+ goto err_free_dst;
+ ret = sg_alloc_table(work->dst_sgt, nr, GFP_KERNEL);
+ if (ret)
+ goto err_free_src_table;
+
+ sg_src = work->src_sgt->sgl;
+ sg_dst = work->dst_sgt->sgl;
+ for (i = 0; i < nr; i++) {
+ struct folio *src = list_entry(*src_pos, struct folio, lru);
+ struct folio *dst = list_entry(*dst_pos, struct folio, lru);
+
+ sg_set_folio(sg_src, src, folio_size(src), 0);
+ sg_set_folio(sg_dst, dst, folio_size(dst), 0);
+
+ *src_pos = (*src_pos)->next;
+ *dst_pos = (*dst_pos)->next;
+
+ if (i < nr - 1) {
+ sg_src = sg_next(sg_src);
+ sg_dst = sg_next(sg_dst);
+ }
+ }
+
+ dev = dmaengine_get_dma_device(work->chan);
+ if (!dev) {
+ ret = -ENODEV;
+ goto err_free_dst_table;
+ }
+ ret = dma_map_sgtable(dev, work->src_sgt, DMA_TO_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING);
+ if (ret)
+ goto err_free_dst_table;
+ ret = dma_map_sgtable(dev, work->dst_sgt, DMA_FROM_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING);
+ if (ret)
+ goto err_unmap_src;
+ /* Verify mapping produced same number of entries */
+ if (work->src_sgt->nents != work->dst_sgt->nents) {
+ pr_err("Mismatched SG entries after mapping: src=%d dst=%d\n",
+ work->src_sgt->nents, work->dst_sgt->nents);
+ ret = -EINVAL;
+ goto err_unmap_dst;
+ }
+ work->mapped = true;
+ return 0;
+
+err_unmap_dst:
+ dma_unmap_sgtable(dev, work->dst_sgt, DMA_FROM_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING);
+err_unmap_src:
+ dma_unmap_sgtable(dev, work->src_sgt, DMA_TO_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING);
+err_free_dst_table:
+ sg_free_table(work->dst_sgt);
+err_free_src_table:
+ sg_free_table(work->src_sgt);
+err_free_dst:
+ kfree(work->dst_sgt);
+ work->dst_sgt = NULL;
+err_free_src:
+ kfree(work->src_sgt);
+ work->src_sgt = NULL;
+ pr_err("DCBM: Failed to setup SG tables\n");
+ return ret;
+}
+
+static void cleanup_dma_work(struct dma_work *works, int actual_channels)
+{
+ struct device *dev;
+ int i;
+
+ if (!works)
+ return;
+
+ for (i = 0; i < actual_channels; i++) {
+ if (!works[i].chan)
+ continue;
+
+ dev = dmaengine_get_dma_device(works[i].chan);
+
+ /* Terminate any pending transfers */
+ if (atomic_read(&works[i].pending) > 0)
+ dmaengine_terminate_sync(works[i].chan);
+
+ if (dev && works[i].mapped) {
+ if (works[i].src_sgt) {
+ dma_unmap_sgtable(dev, works[i].src_sgt, DMA_TO_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING);
+ sg_free_table(works[i].src_sgt);
+ kfree(works[i].src_sgt);
+ }
+ if (works[i].dst_sgt) {
+ dma_unmap_sgtable(dev, works[i].dst_sgt, DMA_FROM_DEVICE,
+ DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_NO_KERNEL_MAPPING);
+ sg_free_table(works[i].dst_sgt);
+ kfree(works[i].dst_sgt);
+ }
+ }
+ dma_release_channel(works[i].chan);
+ }
+ kfree(works);
+}
+
+static int submit_dma_transfers(struct dma_work *work)
+{
+ struct scatterlist *sg_src, *sg_dst;
+ struct dma_async_tx_descriptor *tx;
+ unsigned long flags = DMA_CTRL_ACK;
+ dma_cookie_t cookie;
+ int i;
+
+ /* Use a single completion callback for the entire batch */
+ atomic_set(&work->pending, 1);
+
+ sg_src = work->src_sgt->sgl;
+ sg_dst = work->dst_sgt->sgl;
+ /* Iterate over DMA-mapped entries */
+ for_each_sgtable_dma_sg(work->src_sgt, sg_src, i) {
+ /* Only interrupt on the last transfer */
+ if (i == work->src_sgt->nents - 1)
+ flags |= DMA_PREP_INTERRUPT;
+
+ tx = dmaengine_prep_dma_memcpy(work->chan,
+ sg_dma_address(sg_dst),
+ sg_dma_address(sg_src),
+ sg_dma_len(sg_src),
+ flags);
+ if (!tx) {
+ atomic_set(&work->pending, 0);
+ return -EIO;
+ }
+
+ /* Only set callback on last transfer */
+ if (i == work->src_sgt->nents - 1) {
+ tx->callback = dma_completion_callback;
+ tx->callback_param = work;
+ }
+
+ cookie = dmaengine_submit(tx);
+ if (dma_submit_error(cookie)) {
+ atomic_set(&work->pending, 0);
+ return -EIO;
+ }
+ sg_dst = sg_next(sg_dst);
+ }
+ return 0;
+}
+
+/**
+ * folios_copy_dma - Copy folios using DMA engine
+ * @dst_list: Destination folio list
+ * @src_list: Source folio list
+ * @nr_folios: Number of folios to copy
+ *
+ * Return: 0 always; falls back to CPU copy on any error.
+ */
+static int folios_copy_dma(struct list_head *dst_list,
+ struct list_head *src_list,
+ unsigned int nr_folios)
+{
+ struct dma_work *works;
+ struct list_head *src_pos = src_list->next;
+ struct list_head *dst_pos = dst_list->next;
+ int i, folios_per_chan, ret = 0;
+ dma_cap_mask_t mask;
+ int actual_channels = 0;
+ int max_channels;
+
+ max_channels = min3(nr_dma_channels, nr_folios, MAX_DMA_CHANNELS);
+
+ works = kcalloc(max_channels, sizeof(*works), GFP_KERNEL);
+ if (!works)
+ goto fallback;
+
+ dma_cap_zero(mask);
+ dma_cap_set(DMA_MEMCPY, mask);
+
+ for (i = 0; i < max_channels; i++) {
+ works[actual_channels].chan = dma_request_chan_by_mask(&mask);
+ if (IS_ERR(works[actual_channels].chan))
+ break;
+ init_completion(&works[actual_channels].done);
+ actual_channels++;
+ }
+
+ if (actual_channels == 0) {
+ kfree(works);
+ goto fallback;
+ }
+
+ for (i = 0; i < actual_channels; i++) {
+ folios_per_chan = nr_folios * (i + 1) / actual_channels -
+ (nr_folios * i) / actual_channels;
+ if (folios_per_chan == 0)
+ continue;
+
+ ret = setup_sg_tables(&works[i], &src_pos, &dst_pos, folios_per_chan);
+ if (ret)
+ goto cleanup;
+ }
+
+ for (i = 0; i < actual_channels; i++) {
+ ret = submit_dma_transfers(&works[i]);
+ if (ret) {
+ dev_err(dmaengine_get_dma_device(works[i].chan),
+ "Failed to submit transfers for channel %d\n", i);
+ goto cleanup;
+ }
+ }
+
+ for (i = 0; i < actual_channels; i++) {
+ if (atomic_read(&works[i].pending) > 0)
+ dma_async_issue_pending(works[i].chan);
+ }
+
+ for (i = 0; i < actual_channels; i++) {
+ if (atomic_read(&works[i].pending) > 0) {
+ if (!wait_for_completion_timeout(&works[i].done, msecs_to_jiffies(10000))) {
+ dev_err(dmaengine_get_dma_device(works[i].chan),
+ "DMA timeout on channel %d\n", i);
+ ret = -ETIMEDOUT;
+ goto cleanup;
+ }
+ }
+ }
+
+cleanup:
+ cleanup_dma_work(works, actual_channels);
+ if (ret)
+ goto fallback;
+ return 0;
+fallback:
+ /* Fall back to CPU copy */
+ pr_err("DCBM: Falling back to CPU copy\n");
+ folios_mc_copy(dst_list, src_list, nr_folios);
+ return 0;
+}
+
+static struct migrator dma_migrator = {
+ .name = "DCBM",
+ .migrate_offc = folios_copy_dma,
+ .owner = THIS_MODULE,
+};
+
+static ssize_t offloading_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", offloading_enabled);
+}
+
+static ssize_t offloading_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ bool enable;
+ int ret;
+
+ ret = kstrtobool(buf, &enable);
+ if (ret)
+ return ret;
+
+ mutex_lock(&dcbm_mutex);
+
+ if (enable == offloading_enabled) {
+ pr_err("migration offloading is already %s\n",
+ enable ? "ON" : "OFF");
+ goto out;
+ }
+
+ if (enable) {
+ start_offloading(&dma_migrator);
+ offloading_enabled = true;
+ pr_info("migration offloading is now ON\n");
+ } else {
+ stop_offloading();
+ offloading_enabled = false;
+ pr_info("migration offloading is now OFF\n");
+ }
+out:
+ mutex_unlock(&dcbm_mutex);
+ return count;
+}
+
+static ssize_t nr_dma_chan_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", nr_dma_channels);
+}
+
+static ssize_t nr_dma_chan_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ unsigned int val;
+ int ret;
+
+ ret = kstrtouint(buf, 0, &val);
+ if (ret)
+ return ret;
+
+ if (val < 1 || val > MAX_DMA_CHANNELS)
+ return -EINVAL;
+
+ mutex_lock(&dcbm_mutex);
+ nr_dma_channels = val;
+ mutex_unlock(&dcbm_mutex);
+
+ return count;
+}
+
+static struct kobj_attribute offloading_attr = __ATTR_RW(offloading);
+static struct kobj_attribute nr_dma_chan_attr = __ATTR_RW(nr_dma_chan);
+
+static struct attribute *dcbm_attrs[] = {
+ &offloading_attr.attr,
+ &nr_dma_chan_attr.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(dcbm);
+
+static struct kobject *dcbm_kobj;
+
+static int __init dcbm_init(void)
+{
+ int ret;
+
+ dcbm_kobj = kobject_create_and_add("dcbm", kernel_kobj);
+ if (!dcbm_kobj)
+ return -ENOMEM;
+
+ ret = sysfs_create_groups(dcbm_kobj, dcbm_groups);
+ if (ret) {
+ kobject_put(dcbm_kobj);
+ return ret;
+ }
+
+ pr_info("DMA Core Batch Migrator initialized\n");
+ return 0;
+}
+
+static void __exit dcbm_exit(void)
+{
+ /* Ensure offloading is stopped before module unload */
+ mutex_lock(&dcbm_mutex);
+ if (offloading_enabled) {
+ stop_offloading();
+ offloading_enabled = false;
+ }
+ mutex_unlock(&dcbm_mutex);
+
+ sysfs_remove_groups(dcbm_kobj, dcbm_groups);
+ kobject_put(dcbm_kobj);
+
+ pr_info("DMA Core Batch Migrator unloaded\n");
+}
+
+module_init(dcbm_init);
+module_exit(dcbm_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Shivank Garg");
+MODULE_DESCRIPTION("DMA Core Batch Migrator");
--
2.43.0