From: Leon Romanovsky <leon@kernel.org>
To: Marek Szyprowski <m.szyprowski@samsung.com>,
Jens Axboe <axboe@kernel.dk>, Christoph Hellwig <hch@lst.de>,
Keith Busch <kbusch@kernel.org>
Cc: "Leon Romanovsky" <leonro@nvidia.com>, "Jake Edge" <jake@lwn.net>,
"Jonathan Corbet" <corbet@lwn.net>,
"Jason Gunthorpe" <jgg@ziepe.ca>,
"Zhu Yanjun" <zyjzyj2000@gmail.com>,
"Robin Murphy" <robin.murphy@arm.com>,
"Joerg Roedel" <joro@8bytes.org>, "Will Deacon" <will@kernel.org>,
"Sagi Grimberg" <sagi@grimberg.me>,
"Bjorn Helgaas" <bhelgaas@google.com>,
"Logan Gunthorpe" <logang@deltatee.com>,
"Yishai Hadas" <yishaih@nvidia.com>,
"Shameer Kolothum" <shameerali.kolothum.thodi@huawei.com>,
"Kevin Tian" <kevin.tian@intel.com>,
"Alex Williamson" <alex.williamson@redhat.com>,
"Jérôme Glisse" <jglisse@redhat.com>,
"Andrew Morton" <akpm@linux-foundation.org>,
linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-block@vger.kernel.org, linux-rdma@vger.kernel.org,
iommu@lists.linux.dev, linux-nvme@lists.infradead.org,
linux-pci@vger.kernel.org, kvm@vger.kernel.org,
linux-mm@kvack.org, "Niklas Schnelle" <schnelle@linux.ibm.com>,
"Chuck Lever" <chuck.lever@oracle.com>,
"Luis Chamberlain" <mcgrof@kernel.org>,
"Matthew Wilcox" <willy@infradead.org>,
"Dan Williams" <dan.j.williams@intel.com>,
"Kanchan Joshi" <joshi.k@samsung.com>,
"Chaitanya Kulkarni" <kch@nvidia.com>,
"Jason Gunthorpe" <jgg@nvidia.com>
Subject: [PATCH v10 16/24] vfio/mlx5: Rewrite create mkey flow to allow better code reuse
Date: Mon, 28 Apr 2025 12:22:22 +0300 [thread overview]
Message-ID: <e273704d508374a24f109885656809aae9449462.1745831017.git.leon@kernel.org> (raw)
In-Reply-To: <cover.1745831017.git.leon@kernel.org>
From: Leon Romanovsky <leonro@nvidia.com>
Change the creation of mkey to be performed in multiple steps:
data allocation, DMA setup and actual call to HW to create that mkey.
In this new flow, the whole input to the MKEY command is saved, which
eliminates the need to keep a separate array of DMA addresses for the
receive list and, in future patches, for the send list too.
In addition to reducing memory usage and eliminating unnecessary data
movement when setting the MKEY input, this prepares the code for future reuse.
Tested-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Jason Gunthorpe <jgg@nvidia.com>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
drivers/vfio/pci/mlx5/cmd.c | 157 ++++++++++++++++++++----------------
drivers/vfio/pci/mlx5/cmd.h | 4 +-
2 files changed, 91 insertions(+), 70 deletions(-)
diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c
index 377dee7765fb..84dc3bc128c6 100644
--- a/drivers/vfio/pci/mlx5/cmd.c
+++ b/drivers/vfio/pci/mlx5/cmd.c
@@ -313,39 +313,21 @@ static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
return ret;
}
-static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
- struct mlx5_vhca_data_buffer *buf,
- struct mlx5_vhca_recv_buf *recv_buf,
- u32 *mkey)
+static u32 *alloc_mkey_in(u32 npages, u32 pdn)
{
- size_t npages = buf ? buf->npages : recv_buf->npages;
- int err = 0, inlen;
- __be64 *mtt;
+ int inlen;
void *mkc;
u32 *in;
inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
- sizeof(*mtt) * round_up(npages, 2);
+ sizeof(__be64) * round_up(npages, 2);
- in = kvzalloc(inlen, GFP_KERNEL);
+ in = kvzalloc(inlen, GFP_KERNEL_ACCOUNT);
if (!in)
- return -ENOMEM;
+ return NULL;
MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
DIV_ROUND_UP(npages, 2));
- mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
-
- if (buf) {
- struct sg_dma_page_iter dma_iter;
-
- for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
- *mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
- } else {
- int i;
-
- for (i = 0; i < npages; i++)
- *mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
- }
mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
@@ -359,9 +341,30 @@ static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
- err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
- kvfree(in);
- return err;
+
+ return in;
+}
+
+static int create_mkey(struct mlx5_core_dev *mdev, u32 npages,
+ struct mlx5_vhca_data_buffer *buf, u32 *mkey_in,
+ u32 *mkey)
+{
+ __be64 *mtt;
+ int inlen;
+
+ mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
+ if (buf) {
+ struct sg_dma_page_iter dma_iter;
+
+ for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
+ *mtt++ = cpu_to_be64(
+ sg_page_iter_dma_address(&dma_iter));
+ }
+
+ inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
+ sizeof(__be64) * round_up(npages, 2);
+
+ return mlx5_core_create_mkey(mdev, mkey, mkey_in, inlen);
}
static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
@@ -374,20 +377,28 @@ static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
if (mvdev->mdev_detach)
return -ENOTCONN;
- if (buf->dmaed || !buf->npages)
+ if (buf->mkey_in || !buf->npages)
return -EINVAL;
ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
if (ret)
return ret;
- ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
- if (ret)
+ buf->mkey_in = alloc_mkey_in(buf->npages, buf->migf->pdn);
+ if (!buf->mkey_in) {
+ ret = -ENOMEM;
goto err;
+ }
- buf->dmaed = true;
+ ret = create_mkey(mdev, buf->npages, buf, buf->mkey_in, &buf->mkey);
+ if (ret)
+ goto err_create_mkey;
return 0;
+
+err_create_mkey:
+ kvfree(buf->mkey_in);
+ buf->mkey_in = NULL;
err:
dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
return ret;
@@ -401,8 +412,9 @@ void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
lockdep_assert_held(&migf->mvdev->state_mutex);
WARN_ON(migf->mvdev->mdev_detach);
- if (buf->dmaed) {
+ if (buf->mkey_in) {
mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
+ kvfree(buf->mkey_in);
dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
buf->dma_dir, 0);
}
@@ -783,7 +795,7 @@ int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
if (mvdev->mdev_detach)
return -ENOTCONN;
- if (!buf->dmaed) {
+ if (!buf->mkey_in) {
err = mlx5vf_dma_data_buffer(buf);
if (err)
return err;
@@ -1384,56 +1396,54 @@ static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
kvfree(recv_buf->page_list);
return -ENOMEM;
}
+static void unregister_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
+ u32 *mkey_in)
+{
+ dma_addr_t addr;
+ __be64 *mtt;
+ int i;
+
+ mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
+ for (i = npages - 1; i >= 0; i--) {
+ addr = be64_to_cpu(mtt[i]);
+ dma_unmap_single(mdev->device, addr, PAGE_SIZE,
+ DMA_FROM_DEVICE);
+ }
+}
-static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
- struct mlx5_vhca_recv_buf *recv_buf)
+static int register_dma_pages(struct mlx5_core_dev *mdev, u32 npages,
+ struct page **page_list, u32 *mkey_in)
{
- int i, j;
+ dma_addr_t addr;
+ __be64 *mtt;
+ int i;
- recv_buf->dma_addrs = kvcalloc(recv_buf->npages,
- sizeof(*recv_buf->dma_addrs),
- GFP_KERNEL_ACCOUNT);
- if (!recv_buf->dma_addrs)
- return -ENOMEM;
+ mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, mkey_in, klm_pas_mtt);
- for (i = 0; i < recv_buf->npages; i++) {
- recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
- recv_buf->page_list[i],
- 0, PAGE_SIZE,
- DMA_FROM_DEVICE);
- if (dma_mapping_error(mdev->device, recv_buf->dma_addrs[i]))
+ for (i = 0; i < npages; i++) {
+ addr = dma_map_page(mdev->device, page_list[i], 0, PAGE_SIZE,
+ DMA_FROM_DEVICE);
+ if (dma_mapping_error(mdev->device, addr))
goto error;
+
+ *mtt++ = cpu_to_be64(addr);
}
+
return 0;
error:
- for (j = 0; j < i; j++)
- dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
- PAGE_SIZE, DMA_FROM_DEVICE);
-
- kvfree(recv_buf->dma_addrs);
+ unregister_dma_pages(mdev, i, mkey_in);
return -ENOMEM;
}
-static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
- struct mlx5_vhca_recv_buf *recv_buf)
-{
- int i;
-
- for (i = 0; i < recv_buf->npages; i++)
- dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
- PAGE_SIZE, DMA_FROM_DEVICE);
-
- kvfree(recv_buf->dma_addrs);
-}
-
static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
struct mlx5_vhca_qp *qp)
{
struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
mlx5_core_destroy_mkey(mdev, recv_buf->mkey);
- unregister_dma_recv_pages(mdev, recv_buf);
+ unregister_dma_pages(mdev, recv_buf->npages, recv_buf->mkey_in);
+ kvfree(recv_buf->mkey_in);
free_recv_pages(&qp->recv_buf);
}
@@ -1449,18 +1459,29 @@ static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
if (err < 0)
return err;
- err = register_dma_recv_pages(mdev, recv_buf);
- if (err)
+ recv_buf->mkey_in = alloc_mkey_in(npages, pdn);
+ if (!recv_buf->mkey_in) {
+ err = -ENOMEM;
goto end;
+ }
+
+ err = register_dma_pages(mdev, npages, recv_buf->page_list,
+ recv_buf->mkey_in);
+ if (err)
+ goto err_register_dma;
- err = _create_mkey(mdev, pdn, NULL, recv_buf, &recv_buf->mkey);
+ err = create_mkey(mdev, npages, NULL, recv_buf->mkey_in,
+ &recv_buf->mkey);
if (err)
goto err_create_mkey;
return 0;
err_create_mkey:
- unregister_dma_recv_pages(mdev, recv_buf);
+ unregister_dma_pages(mdev, npages, recv_buf->mkey_in);
+err_register_dma:
+ kvfree(recv_buf->mkey_in);
+ recv_buf->mkey_in = NULL;
end:
free_recv_pages(recv_buf);
return err;
diff --git a/drivers/vfio/pci/mlx5/cmd.h b/drivers/vfio/pci/mlx5/cmd.h
index 7d4a833b6900..25dd6ff54591 100644
--- a/drivers/vfio/pci/mlx5/cmd.h
+++ b/drivers/vfio/pci/mlx5/cmd.h
@@ -58,8 +58,8 @@ struct mlx5_vhca_data_buffer {
u64 length;
u32 npages;
u32 mkey;
+ u32 *mkey_in;
enum dma_data_direction dma_dir;
- u8 dmaed:1;
u8 stop_copy_chunk_num;
struct list_head buf_elm;
struct mlx5_vf_migration_file *migf;
@@ -133,8 +133,8 @@ struct mlx5_vhca_cq {
struct mlx5_vhca_recv_buf {
u32 npages;
struct page **page_list;
- dma_addr_t *dma_addrs;
u32 next_rq_offset;
+ u32 *mkey_in;
u32 mkey;
};
--
2.49.0
next prev parent reply other threads:[~2025-04-28 9:23 UTC|newest]
Thread overview: 41+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-04-28 9:22 [PATCH v10 00/24] Provide a new two step DMA mapping API Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 01/24] PCI/P2PDMA: Refactor the p2pdma mapping helpers Leon Romanovsky
2025-04-29 2:08 ` Baolu Lu
2025-04-28 9:22 ` [PATCH v10 02/24] dma-mapping: move the PCI P2PDMA mapping helpers to pci-p2pdma.h Leon Romanovsky
2025-04-29 2:09 ` Baolu Lu
2025-04-28 9:22 ` [PATCH v10 03/24] iommu: generalize the batched sync after map interface Leon Romanovsky
2025-04-29 2:19 ` Baolu Lu
2025-04-29 6:09 ` Leon Romanovsky
2025-04-29 11:53 ` Jason Gunthorpe
2025-04-28 9:22 ` [PATCH v10 04/24] iommu: add kernel-doc for iommu_unmap_fast Leon Romanovsky
2025-04-29 2:37 ` Baolu Lu
2025-04-28 9:22 ` [PATCH v10 05/24] dma-mapping: Provide an interface to allow allocate IOVA Leon Romanovsky
2025-04-29 3:10 ` Baolu Lu
2025-04-29 5:46 ` Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 06/24] iommu/dma: Factor out a iommu_dma_map_swiotlb helper Leon Romanovsky
2025-04-29 4:58 ` Baolu Lu
2025-04-29 5:53 ` Leon Romanovsky
2025-04-29 5:58 ` Baolu Lu
2025-04-29 6:18 ` Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 07/24] dma-mapping: Implement link/unlink ranges API Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 08/24] dma-mapping: add a dma_need_unmap helper Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 09/24] docs: core-api: document the IOVA-based API Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 10/24] mm/hmm: let users to tag specific PFN with DMA mapped bit Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 11/24] mm/hmm: provide generic DMA managing logic Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 12/24] RDMA/umem: Store ODP access mask information in PFN Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 13/24] RDMA/core: Convert UMEM ODP DMA mapping to caching IOVA and page linkage Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 14/24] RDMA/umem: Separate implicit ODP initialization from explicit ODP Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 15/24] vfio/mlx5: Explicitly use number of pages instead of allocated length Leon Romanovsky
2025-04-28 9:22 ` Leon Romanovsky [this message]
2025-04-28 9:22 ` [PATCH v10 17/24] vfio/mlx5: Enable the DMA link API Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 18/24] block: share more code for bio addition helper Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 19/24] block: don't merge different kinds of P2P transfers in a single bio Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 20/24] blk-mq: add scatterlist-less DMA mapping helpers Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 21/24] nvme-pci: remove struct nvme_descriptor Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 22/24] nvme-pci: use a better encoding for small prp pool allocations Leon Romanovsky
2025-04-28 9:22 ` [PATCH v10 23/24] nvme-pci: convert to blk_rq_dma_map Leon Romanovsky
2025-04-28 16:46 ` Keith Busch
2025-04-28 17:22 ` Leon Romanovsky
2025-04-28 17:30 ` Keith Busch
2025-04-28 9:22 ` [PATCH v10 24/24] nvme-pci: store aborted state in flags variable Leon Romanovsky
2025-05-12 10:07 ` (subset) [PATCH v10 00/24] Provide a new two step DMA mapping API Leon Romanovsky
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=e273704d508374a24f109885656809aae9449462.1745831017.git.leon@kernel.org \
--to=leon@kernel.org \
--cc=akpm@linux-foundation.org \
--cc=alex.williamson@redhat.com \
--cc=axboe@kernel.dk \
--cc=bhelgaas@google.com \
--cc=chuck.lever@oracle.com \
--cc=corbet@lwn.net \
--cc=dan.j.williams@intel.com \
--cc=hch@lst.de \
--cc=iommu@lists.linux.dev \
--cc=jake@lwn.net \
--cc=jgg@nvidia.com \
--cc=jgg@ziepe.ca \
--cc=jglisse@redhat.com \
--cc=joro@8bytes.org \
--cc=joshi.k@samsung.com \
--cc=kbusch@kernel.org \
--cc=kch@nvidia.com \
--cc=kevin.tian@intel.com \
--cc=kvm@vger.kernel.org \
--cc=leonro@nvidia.com \
--cc=linux-block@vger.kernel.org \
--cc=linux-doc@vger.kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=linux-nvme@lists.infradead.org \
--cc=linux-pci@vger.kernel.org \
--cc=linux-rdma@vger.kernel.org \
--cc=logang@deltatee.com \
--cc=m.szyprowski@samsung.com \
--cc=mcgrof@kernel.org \
--cc=robin.murphy@arm.com \
--cc=sagi@grimberg.me \
--cc=schnelle@linux.ibm.com \
--cc=shameerali.kolothum.thodi@huawei.com \
--cc=will@kernel.org \
--cc=willy@infradead.org \
--cc=yishaih@nvidia.com \
--cc=zyjzyj2000@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox