linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Leon Romanovsky <leon@kernel.org>
To: Jens Axboe <axboe@kernel.dk>, Jason Gunthorpe <jgg@ziepe.ca>,
	Robin Murphy <robin.murphy@arm.com>,
	Joerg Roedel <joro@8bytes.org>, Will Deacon <will@kernel.org>,
	Christoph Hellwig <hch@lst.de>, Sagi Grimberg <sagi@grimberg.me>
Cc: "Keith Busch" <kbusch@kernel.org>,
	"Bjorn Helgaas" <bhelgaas@google.com>,
	"Logan Gunthorpe" <logang@deltatee.com>,
	"Yishai Hadas" <yishaih@nvidia.com>,
	"Shameer Kolothum" <shameerali.kolothum.thodi@huawei.com>,
	"Kevin Tian" <kevin.tian@intel.com>,
	"Alex Williamson" <alex.williamson@redhat.com>,
	"Marek Szyprowski" <m.szyprowski@samsung.com>,
	"Jérôme Glisse" <jglisse@redhat.com>,
	"Andrew Morton" <akpm@linux-foundation.org>,
	"Jonathan Corbet" <corbet@lwn.net>,
	linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-block@vger.kernel.org, linux-rdma@vger.kernel.org,
	iommu@lists.linux.dev, linux-nvme@lists.infradead.org,
	linux-pci@vger.kernel.org, kvm@vger.kernel.org,
	linux-mm@kvack.org
Subject: [RFC PATCH 2/7] block: don't merge different kinds of P2P transfers in a single bio
Date: Sun, 27 Oct 2024 16:21:55 +0200	[thread overview]
Message-ID: <34d44537a65aba6ede215a8ad882aeee028b423a.1730037261.git.leon@kernel.org> (raw)
In-Reply-To: <cover.1730037261.git.leon@kernel.org>

From: Christoph Hellwig <hch@lst.de>

To avoid the dma mapping helpers having to check every segment for
its P2P status, ensure that bios either contain P2P transfers or non-P2P
transfers, and that a P2P bio only contains ranges from a single device.

This means we do the page zone access in the bio add path where it should
still be page hot, and will only have to do the fairly expensive P2P topology
lookup once per bio down in the dma mapping path, and only for already
marked bios.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
---
 block/bio.c               | 36 +++++++++++++++++++++++++++++-------
 block/blk-map.c           | 32 ++++++++++++++++++++++++--------
 include/linux/blk_types.h |  2 ++
 3 files changed, 55 insertions(+), 15 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 2d3bc8bfb071..943a6d78cb3e 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -928,8 +928,6 @@ static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
 		return false;
 	if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
 		return false;
-	if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
-		return false;
 
 	*same_page = ((vec_end_addr & PAGE_MASK) == ((page_addr + off) &
 		     PAGE_MASK));
@@ -993,6 +991,14 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 	if (bio->bi_vcnt > 0) {
 		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
 
+		/*
+		 * When doing ZONE_DEVICE-based P2P transfers, all pages in a
+		 * bio must be P2P pages from the same device.
+		 */
+		if ((bio->bi_opf & REQ_P2PDMA) &&
+		    !zone_device_pages_have_same_pgmap(bv->bv_page, page))
+			return 0;
+
 		if (bvec_try_merge_hw_page(q, bv, page, len, offset,
 				same_page)) {
 			bio->bi_iter.bi_size += len;
@@ -1009,6 +1015,9 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
 		 */
 		if (bvec_gap_to_prev(&q->limits, bv, offset))
 			return 0;
+	} else {
+		if (is_pci_p2pdma_page(page))
+			bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
 	}
 
 	bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset);
@@ -1133,11 +1142,24 @@ static int bio_add_page_int(struct bio *bio, struct page *page,
 	if (bio->bi_iter.bi_size > UINT_MAX - len)
 		return 0;
 
-	if (bio->bi_vcnt > 0 &&
-	    bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
-				page, len, offset, same_page)) {
-		bio->bi_iter.bi_size += len;
-		return len;
+	if (bio->bi_vcnt > 0) {
+		struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+		/*
+		 * When doing ZONE_DEVICE-based P2P transfers, all pages in a
+		 * bio must be P2P pages from the same device.
+		 */
+		if ((bio->bi_opf & REQ_P2PDMA) &&
+		    !zone_device_pages_have_same_pgmap(bv->bv_page, page))
+			return 0;
+
+		if (bvec_try_merge_page(bv, page, len, offset, same_page)) {
+			bio->bi_iter.bi_size += len;
+			return len;
+		}
+	} else {
+		if (is_pci_p2pdma_page(page))
+			bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
 	}
 
 	if (bio->bi_vcnt >= bio->bi_max_vecs)
diff --git a/block/blk-map.c b/block/blk-map.c
index 0e1167b23934..03192b1ca6ea 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -568,6 +568,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
 	const struct queue_limits *lim = &q->limits;
 	unsigned int nsegs = 0, bytes = 0;
 	struct bio *bio;
+	int error;
 	size_t i;
 
 	if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q))
@@ -588,15 +589,30 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
 	for (i = 0; i < nr_segs; i++) {
 		struct bio_vec *bv = &bvecs[i];
 
-		/*
-		 * If the queue doesn't support SG gaps and adding this
-		 * offset would create a gap, fallback to copy.
-		 */
-		if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) {
-			blk_mq_map_bio_put(bio);
-			return -EREMOTEIO;
+		error = -EREMOTEIO;
+		if (bvprvp) {
+			/*
+			 * If the queue doesn't support SG gaps and adding this
+			 * offset would create a gap, fallback to copy.
+			 */
+			if (bvec_gap_to_prev(lim, bvprvp, bv->bv_offset))
+				goto put_bio;
+
+			/*
+			 * When doing ZONE_DEVICE-based P2P transfers, all pages
+			 * in a bio must be P2P pages, and from the same device.
+			 */
+			if ((bio->bi_opf & REQ_P2PDMA) &&
+			    zone_device_pages_have_same_pgmap(bvprvp->bv_page,
+					bv->bv_page))
+				goto put_bio;
+		} else {
+			if (is_pci_p2pdma_page(bv->bv_page))
+				bio->bi_opf |= REQ_P2PDMA | REQ_NOMERGE;
 		}
+
 		/* check full condition */
+		error = -EINVAL;
 		if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len)
 			goto put_bio;
 		if (bytes + bv->bv_len > nr_iter)
@@ -611,7 +627,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
 	return 0;
 put_bio:
 	blk_mq_map_bio_put(bio);
-	return -EINVAL;
+	return error;
 }
 
 /**
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index dce7615c35e7..94cf146e8ce6 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -378,6 +378,7 @@ enum req_flag_bits {
 	__REQ_DRV,		/* for driver use */
 	__REQ_FS_PRIVATE,	/* for file system (submitter) use */
 	__REQ_ATOMIC,		/* for atomic write operations */
+	__REQ_P2PDMA,		/* contains P2P DMA pages */
 	/*
 	 * Command specific flags, keep last:
 	 */
@@ -410,6 +411,7 @@ enum req_flag_bits {
 #define REQ_DRV		(__force blk_opf_t)(1ULL << __REQ_DRV)
 #define REQ_FS_PRIVATE	(__force blk_opf_t)(1ULL << __REQ_FS_PRIVATE)
 #define REQ_ATOMIC	(__force blk_opf_t)(1ULL << __REQ_ATOMIC)
+#define REQ_P2PDMA	(__force blk_opf_t)(1ULL << __REQ_P2PDMA)
 
 #define REQ_NOUNMAP	(__force blk_opf_t)(1ULL << __REQ_NOUNMAP)
 
-- 
2.46.2



  parent reply	other threads:[~2024-10-27 14:22 UTC|newest]

Thread overview: 17+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-10-27 14:21 [RFC PATCH 0/7] Block and NVMe PCI use of new DMA mapping API Leon Romanovsky
2024-10-27 14:21 ` [RFC PATCH 1/7] block: share more code for bio addition helpers Leon Romanovsky
2024-10-31 20:55   ` Bart Van Assche
2024-11-04  8:55     ` Christoph Hellwig
2024-10-27 14:21 ` Leon Romanovsky [this message]
2024-10-28 18:27   ` [RFC PATCH 2/7] block: don't merge different kinds of P2P transfers in a single bio Logan Gunthorpe
2024-10-31 20:58   ` Bart Van Assche
2024-11-01  6:11     ` Leon Romanovsky
2024-11-04  8:55     ` Christoph Hellwig
2024-11-02  7:39   ` Zhu Yanjun
2024-11-03 15:19     ` Leon Romanovsky
2024-11-04  8:56     ` Christoph Hellwig
2024-10-27 14:21 ` [RFC PATCH 3/7] blk-mq: add a dma mapping iterator Leon Romanovsky
2024-10-27 14:21 ` [RFC PATCH 4/7] blk-mq: add scatterlist-less DMA mapping helpers Leon Romanovsky
2024-10-27 14:21 ` [RFC PATCH 5/7] nvme-pci: remove struct nvme_descriptor Leon Romanovsky
2024-10-27 14:21 ` [RFC PATCH 6/7] nvme-pci: use a better encoding for small prp pool allocations Leon Romanovsky
2024-10-27 14:22 ` [RFC PATCH 7/7] nvme-pci: convert to blk_rq_dma_map Leon Romanovsky

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=34d44537a65aba6ede215a8ad882aeee028b423a.1730037261.git.leon@kernel.org \
    --to=leon@kernel.org \
    --cc=akpm@linux-foundation.org \
    --cc=alex.williamson@redhat.com \
    --cc=axboe@kernel.dk \
    --cc=bhelgaas@google.com \
    --cc=corbet@lwn.net \
    --cc=hch@lst.de \
    --cc=iommu@lists.linux.dev \
    --cc=jgg@ziepe.ca \
    --cc=jglisse@redhat.com \
    --cc=joro@8bytes.org \
    --cc=kbusch@kernel.org \
    --cc=kevin.tian@intel.com \
    --cc=kvm@vger.kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=linux-pci@vger.kernel.org \
    --cc=linux-rdma@vger.kernel.org \
    --cc=logang@deltatee.com \
    --cc=m.szyprowski@samsung.com \
    --cc=robin.murphy@arm.com \
    --cc=sagi@grimberg.me \
    --cc=shameerali.kolothum.thodi@huawei.com \
    --cc=will@kernel.org \
    --cc=yishaih@nvidia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox