From: Leon Romanovsky <leon@kernel.org>
To: Jens Axboe <axboe@kernel.dk>, Jason Gunthorpe <jgg@ziepe.ca>,
Robin Murphy <robin.murphy@arm.com>,
Joerg Roedel <joro@8bytes.org>, Will Deacon <will@kernel.org>,
Christoph Hellwig <hch@lst.de>, Sagi Grimberg <sagi@grimberg.me>
Cc: "Leon Romanovsky" <leonro@nvidia.com>,
"Keith Busch" <kbusch@kernel.org>,
"Bjorn Helgaas" <bhelgaas@google.com>,
"Logan Gunthorpe" <logang@deltatee.com>,
"Yishai Hadas" <yishaih@nvidia.com>,
"Shameer Kolothum" <shameerali.kolothum.thodi@huawei.com>,
"Kevin Tian" <kevin.tian@intel.com>,
"Alex Williamson" <alex.williamson@redhat.com>,
"Marek Szyprowski" <m.szyprowski@samsung.com>,
"Jérôme Glisse" <jglisse@redhat.com>,
"Andrew Morton" <akpm@linux-foundation.org>,
"Jonathan Corbet" <corbet@lwn.net>,
linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
linux-block@vger.kernel.org, linux-rdma@vger.kernel.org,
iommu@lists.linux.dev, linux-nvme@lists.infradead.org,
linux-pci@vger.kernel.org, kvm@vger.kernel.org,
linux-mm@kvack.org
Subject: [PATCH 11/18] mm/hmm: provide generic DMA managing logic
Date: Sun, 27 Oct 2024 16:21:11 +0200
Message-ID: <505c3956e0101f3e4f180e67319ff33c789f83b5.1730037276.git.leon@kernel.org>
In-Reply-To: <cover.1730037276.git.leon@kernel.org>
From: Leon Romanovsky <leonro@nvidia.com>
HMM callers use a PFN list to populate the range while calling
hmm_range_fault(), and the conversion from PFN to DMA address is then
done by the callers with the help of a second, DMA-address list. On any
modern platform that extra list is wasteful, and with the right logic
it can be avoided entirely.

Provide generic logic to manage these lists and an interface to
map/unmap PFNs to DMA addresses, without requiring the callers to be
experts in the DMA core API.
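
For illustration only (this is not code from any caller in this series;
"dev", "nr_pages" and "range" are whatever the driver already has, and
error paths, locking and the mmu_interval_notifier retry loop are
abbreviated), the intended flow looks roughly like:

	struct pci_p2pdma_map_state p2pdma_state = {};
	struct hmm_dma_map map;
	dma_addr_t dma;
	size_t i;
	int ret;

	/* One PFN slot and, when needed, one DMA slot per PAGE_SIZE entry */
	ret = hmm_dma_map_alloc(dev, &map, nr_pages, PAGE_SIZE);
	if (ret)
		return ret;

	/* range.start/end/notifier are set up as before; the PFN list is
	 * now the one owned by the map */
	range.hmm_pfns = map.pfn_list;
	ret = hmm_range_fault(&range);
	if (ret)
		goto out_free;

	for (i = 0; i < nr_pages; i++) {
		dma = hmm_dma_map_pfn(dev, &map, i, &p2pdma_state);
		if (dma == DMA_MAPPING_ERROR)
			goto out_unmap;
		/* program 'dma' into the device page tables */
	}

	...

	/* on invalidation or teardown */
	for (i = 0; i < nr_pages; i++)
		hmm_dma_unmap_pfn(dev, &map, i);
	hmm_dma_map_free(dev, &map);
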
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
include/linux/hmm-dma.h | 32 +++++++
include/linux/hmm.h | 2 +
mm/hmm.c | 196 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 230 insertions(+)
create mode 100644 include/linux/hmm-dma.h
diff --git a/include/linux/hmm-dma.h b/include/linux/hmm-dma.h
new file mode 100644
index 000000000000..f6ce2a00d74d
--- /dev/null
+++ b/include/linux/hmm-dma.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* Copyright (c) 2024 NVIDIA Corporation & Affiliates */
+#ifndef LINUX_HMM_DMA_H
+#define LINUX_HMM_DMA_H
+
+#include <linux/dma-mapping.h>
+
+struct dma_iova_state;
+struct pci_p2pdma_map_state;
+
+/*
+ * struct hmm_dma_map - array of PFNs and DMA addresses
+ *
+ * @state: DMA IOVA state
+ * @pfn_list: array of PFNs
+ * @dma_list: array of DMA addresses
+ * @dma_entry_size: size of each DMA entry in the array
+ */
+struct hmm_dma_map {
+ struct dma_iova_state state;
+ unsigned long *pfn_list;
+ dma_addr_t *dma_list;
+ size_t dma_entry_size;
+};
+
+int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
+ size_t nr_entries, size_t dma_entry_size);
+void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map);
+dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
+ size_t idx, struct pci_p2pdma_map_state *p2pdma_state);
+bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx);
+#endif /* LINUX_HMM_DMA_H */
diff --git a/include/linux/hmm.h b/include/linux/hmm.h
index 5dd655f6766b..62980ca8f3c5 100644
--- a/include/linux/hmm.h
+++ b/include/linux/hmm.h
@@ -23,6 +23,7 @@ struct mmu_interval_notifier;
* HMM_PFN_WRITE - if the page memory can be written to (requires HMM_PFN_VALID)
* HMM_PFN_ERROR - accessing the pfn is impossible and the device should
* fail. ie poisoned memory, special pages, no vma, etc
+ * HMM_PFN_P2PDMA_BUS - Bus mapped P2P transfer
* HMM_PFN_DMA_MAPPED - Flag preserved on input-to-output transformation
* to mark that page is already DMA mapped
*
@@ -40,6 +41,7 @@ enum hmm_pfn_flags {
HMM_PFN_ERROR = 1UL << (BITS_PER_LONG - 3),
/* Sticky flag, carried from Input to Output */
+ HMM_PFN_P2PDMA_BUS = 1UL << (BITS_PER_LONG - 6),
HMM_PFN_DMA_MAPPED = 1UL << (BITS_PER_LONG - 7),
HMM_PFN_ORDER_SHIFT = (BITS_PER_LONG - 8),
diff --git a/mm/hmm.c b/mm/hmm.c
index 2a0c34d7cb2b..85cd6f20303c 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -10,6 +10,7 @@
*/
#include <linux/pagewalk.h>
#include <linux/hmm.h>
+#include <linux/hmm-dma.h>
#include <linux/init.h>
#include <linux/rmap.h>
#include <linux/swap.h>
@@ -23,6 +24,7 @@
#include <linux/sched/mm.h>
#include <linux/jump_label.h>
#include <linux/dma-mapping.h>
+#include <linux/pci-p2pdma.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
@@ -615,3 +617,197 @@ int hmm_range_fault(struct hmm_range *range)
return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
+
+/**
+ * hmm_dma_map_alloc - Allocate HMM map structure
+ * @dev: device to allocate structure for
+ * @map: HMM map to allocate
+ * @nr_entries: number of entries in the map
+ * @dma_entry_size: size of the DMA entry in the map
+ *
+ * Allocate the HMM map structure and all the lists it contains.
+ * Return 0 on success, -ENOMEM on failure.
+ */
+int hmm_dma_map_alloc(struct device *dev, struct hmm_dma_map *map,
+ size_t nr_entries, size_t dma_entry_size)
+{
+ bool dma_need_sync = false;
+ bool use_iova;
+
+ if (!(nr_entries * PAGE_SIZE / dma_entry_size))
+ return -EINVAL;
+
+ /*
+ * The HMM API violates our normal DMA buffer ownership rules and can't
+ * transfer buffer ownership. The dma_addressing_limited() check is a
+ * best approximation to ensure no swiotlb buffering happens.
+ */
+#ifdef CONFIG_DMA_NEED_SYNC
+ dma_need_sync = !dev->dma_skip_sync;
+#endif /* CONFIG_DMA_NEED_SYNC */
+ if (dma_need_sync || dma_addressing_limited(dev))
+ return -EOPNOTSUPP;
+
+ map->dma_entry_size = dma_entry_size;
+ map->pfn_list =
+ kvcalloc(nr_entries, sizeof(*map->pfn_list), GFP_KERNEL);
+ if (!map->pfn_list)
+ return -ENOMEM;
+
+ use_iova = dma_iova_try_alloc(dev, &map->state, 0,
+ nr_entries * PAGE_SIZE);
+ if (!use_iova && dma_need_unmap(dev)) {
+ map->dma_list = kvcalloc(nr_entries, sizeof(*map->dma_list),
+ GFP_KERNEL);
+ if (!map->dma_list)
+ goto err_dma;
+ }
+ return 0;
+
+err_dma:
+ kvfree(map->pfn_list);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_alloc);
+
+/**
+ * hmm_dma_map_free - Free the HMM map structure
+ * @dev: device to free structure from
+ * @map: HMM map containing the various lists and state
+ *
+ * Free the HMM map structure and all the lists it contains.
+ */
+void hmm_dma_map_free(struct device *dev, struct hmm_dma_map *map)
+{
+ if (dma_use_iova(&map->state))
+ dma_iova_free(dev, &map->state);
+ kvfree(map->pfn_list);
+ kvfree(map->dma_list);
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_free);
+
+/**
+ * hmm_dma_map_pfn - Map a physical HMM page to a DMA address
+ * @dev: Device to map the page for
+ * @map: HMM map
+ * @idx: Index into the PFN and DMA address arrays
+ * @p2pdma_state: PCI P2P state
+ *
+ * Map the page backing the PFN at @idx to a DMA address. When the map uses
+ * an IOVA range allocated by dma_iova_try_alloc(), the page is linked at
+ * offset idx * dma_entry_size within that range; otherwise it is mapped
+ * with dma_map_page(). P2PDMA pages reachable by bus address are mapped
+ * directly and marked HMM_PFN_P2PDMA_BUS. If the PFN is already marked
+ * HMM_PFN_DMA_MAPPED, the existing mapping is reused where possible.
+ *
+ * Returns the DMA address on success or DMA_MAPPING_ERROR on failure.
+ */
+dma_addr_t hmm_dma_map_pfn(struct device *dev, struct hmm_dma_map *map,
+ size_t idx, struct pci_p2pdma_map_state *p2pdma_state)
+{
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+ struct page *page = hmm_pfn_to_page(pfns[idx]);
+ phys_addr_t paddr = hmm_pfn_to_phys(pfns[idx]);
+ size_t offset = idx * map->dma_entry_size;
+ dma_addr_t dma_addr;
+ int ret;
+
+ if ((pfns[idx] & HMM_PFN_DMA_MAPPED) &&
+ !(pfns[idx] & HMM_PFN_P2PDMA_BUS)) {
+ /*
+ * We are in this flow when there is a need to resync flags,
+ * for example when the page was already linked in a prefetch call
+ * with the READ flag and now we need to add the WRITE flag.
+ *
+ * This page was already programmed to HW and we don't want/need
+ * to unlink and link it again just to resync flags.
+ */
+ if (dma_use_iova(state))
+ return state->addr + offset;
+
+ /*
+ * Without dma_need_unmap, the dma_addrs array is NULL, thus we
+ * need to regenerate the address below even if there already
+ * was a mapping. But !dma_need_unmap implies that the
+ * mapping is stateless, so this is fine.
+ */
+ if (dma_need_unmap(dev))
+ return dma_addrs[idx];
+
+ /* Continue to remapping */
+ }
+
+ switch (pci_p2pdma_state(p2pdma_state, dev, page)) {
+ case PCI_P2PDMA_MAP_THRU_HOST_BRIDGE:
+ case PCI_P2PDMA_MAP_NONE:
+ break;
+ case PCI_P2PDMA_MAP_BUS_ADDR:
+ dma_addr = pci_p2pdma_bus_addr_map(p2pdma_state, paddr);
+ pfns[idx] |= HMM_PFN_P2PDMA_BUS;
+ goto done;
+ default:
+ return DMA_MAPPING_ERROR;
+ }
+
+ if (dma_use_iova(state)) {
+ ret = dma_iova_link(dev, state, paddr, offset,
+ map->dma_entry_size, DMA_BIDIRECTIONAL, 0);
+ ret = dma_iova_sync(dev, state, offset, map->dma_entry_size,
+ ret);
+ if (ret)
+ return DMA_MAPPING_ERROR;
+
+ dma_addr = state->addr + offset;
+ } else {
+ if (WARN_ON_ONCE(dma_need_unmap(dev) && !dma_addrs))
+ return DMA_MAPPING_ERROR;
+
+ dma_addr = dma_map_page(dev, page, 0, map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+ if (dma_mapping_error(dev, dma_addr))
+ return DMA_MAPPING_ERROR;
+
+ if (dma_need_unmap(dev))
+ dma_addrs[idx] = dma_addr;
+ }
+
+done:
+ pfns[idx] |= HMM_PFN_DMA_MAPPED;
+ return dma_addr;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_map_pfn);
+
+/**
+ * hmm_dma_unmap_pfn - Unmap a physical HMM page from DMA address
+ * @dev: Device to unmap the page from
+ * @map: HMM map
+ * @idx: Index of the PFN to unmap
+ *
+ * Returns true if the PFN was mapped and has been unmapped, false otherwise.
+ */
+bool hmm_dma_unmap_pfn(struct device *dev, struct hmm_dma_map *map, size_t idx)
+{
+ struct dma_iova_state *state = &map->state;
+ dma_addr_t *dma_addrs = map->dma_list;
+ unsigned long *pfns = map->pfn_list;
+
+#define HMM_PFN_VALID_DMA (HMM_PFN_VALID | HMM_PFN_DMA_MAPPED)
+ if ((pfns[idx] & HMM_PFN_VALID_DMA) != HMM_PFN_VALID_DMA)
+ return false;
+#undef HMM_PFN_VALID_DMA
+
+ if (pfns[idx] & HMM_PFN_P2PDMA_BUS)
+ ; /* no need to unmap bus address P2P mappings */
+ else if (dma_use_iova(state))
+ dma_iova_unlink(dev, state, idx * map->dma_entry_size,
+ map->dma_entry_size, DMA_BIDIRECTIONAL, 0);
+ else if (dma_need_unmap(dev))
+ dma_unmap_page(dev, dma_addrs[idx], map->dma_entry_size,
+ DMA_BIDIRECTIONAL);
+
+ pfns[idx] &= ~(HMM_PFN_DMA_MAPPED | HMM_PFN_P2PDMA_BUS);
+ return true;
+}
+EXPORT_SYMBOL_GPL(hmm_dma_unmap_pfn);
--
2.46.2