[PATCH v6 1/3] dmapool: add NUMA affinity support
From: Caleb Sander Mateos @ 2025-04-26 2:06 UTC
To: Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg, Andrew Morton
Cc: Kanchan Joshi, linux-nvme, linux-mm, linux-kernel, Caleb Sander Mateos
From: Keith Busch <kbusch@kernel.org>
Introduce dma_pool_create_node(), like dma_pool_create() but taking an
additional NUMA node argument. Allocate struct dma_pool on the desired
node, and store the node on dma_pool for allocating struct dma_page.
Make dma_pool_create() an alias for dma_pool_create_node() with node set
to NUMA_NO_NODE.
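For illustration only (not part of this patch), a driver wanting node-affine
pool metadata might call the new API as follows; the pool name and block
geometry here are hypothetical:

	struct dma_pool *pool;

	/* Allocate the pool's bookkeeping on the device's local node. */
	pool = dma_pool_create_node("example pool", &pdev->dev,
				    256 /* size */, 256 /* align */,
				    0 /* boundary: none */,
				    dev_to_node(&pdev->dev));
	if (!pool)
		return -ENOMEM;

Callers that don't care about locality can keep using dma_pool_create(),
which now forwards to dma_pool_create_node() with NUMA_NO_NODE.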
Signed-off-by: Keith Busch <kbusch@kernel.org>
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
---
include/linux/dmapool.h | 17 +++++++++++++----
mm/dmapool.c | 16 ++++++++++------
2 files changed, 23 insertions(+), 10 deletions(-)
diff --git a/include/linux/dmapool.h b/include/linux/dmapool.h
index f632ecfb4238..bbf1833a24f7 100644
--- a/include/linux/dmapool.h
+++ b/include/linux/dmapool.h
@@ -9,19 +9,20 @@
*/
#ifndef LINUX_DMAPOOL_H
#define LINUX_DMAPOOL_H
+#include <linux/nodemask_types.h>
#include <linux/scatterlist.h>
#include <asm/io.h>
struct device;
#ifdef CONFIG_HAS_DMA
-struct dma_pool *dma_pool_create(const char *name, struct device *dev,
- size_t size, size_t align, size_t allocation);
+struct dma_pool *dma_pool_create_node(const char *name, struct device *dev,
+ size_t size, size_t align, size_t boundary, int node);
void dma_pool_destroy(struct dma_pool *pool);
void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
dma_addr_t *handle);
@@ -33,12 +34,13 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t addr);
struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
size_t size, size_t align, size_t allocation);
void dmam_pool_destroy(struct dma_pool *pool);
#else /* !CONFIG_HAS_DMA */
-static inline struct dma_pool *dma_pool_create(const char *name,
- struct device *dev, size_t size, size_t align, size_t allocation)
+static inline struct dma_pool *dma_pool_create_node(const char *name,
+ struct device *dev, size_t size, size_t align, size_t boundary,
+ int node)
{ return NULL; }
static inline void dma_pool_destroy(struct dma_pool *pool) { }
static inline void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags,
dma_addr_t *handle) { return NULL; }
static inline void dma_pool_free(struct dma_pool *pool, void *vaddr,
@@ -47,10 +49,17 @@ static inline struct dma_pool *dmam_pool_create(const char *name,
struct device *dev, size_t size, size_t align, size_t allocation)
{ return NULL; }
static inline void dmam_pool_destroy(struct dma_pool *pool) { }
#endif /* !CONFIG_HAS_DMA */
+static inline struct dma_pool *dma_pool_create(const char *name,
+ struct device *dev, size_t size, size_t align, size_t boundary)
+{
+ return dma_pool_create_node(name, dev, size, align, boundary,
+ NUMA_NO_NODE);
+}
+
static inline void *dma_pool_zalloc(struct dma_pool *pool, gfp_t mem_flags,
dma_addr_t *handle)
{
return dma_pool_alloc(pool, mem_flags | __GFP_ZERO, handle);
}
diff --git a/mm/dmapool.c b/mm/dmapool.c
index f0bfc6c490f4..4de531542814 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -54,10 +54,11 @@ struct dma_pool { /* the pool */
size_t nr_pages;
struct device *dev;
unsigned int size;
unsigned int allocation;
unsigned int boundary;
+ int node;
char name[32];
struct list_head pools;
};
struct dma_page { /* cacheable header for 'allocation' bytes */
@@ -197,16 +198,17 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
pool->next_block = block;
}
/**
- * dma_pool_create - Creates a pool of consistent memory blocks, for dma.
+ * dma_pool_create_node - Creates a pool of consistent memory blocks, for dma.
* @name: name of pool, for diagnostics
* @dev: device that will be doing the DMA
* @size: size of the blocks in this pool.
* @align: alignment requirement for blocks; must be a power of two
* @boundary: returned blocks won't cross this power of two boundary
+ * @node: optional NUMA node to allocate structs 'dma_pool' and 'dma_page' on
* Context: not in_interrupt()
*
* Given one of these pools, dma_pool_alloc()
* may be used to allocate memory. Such memory will all have "consistent"
* DMA mappings, accessible by the device and its driver without using
@@ -219,12 +221,13 @@ static void pool_block_push(struct dma_pool *pool, struct dma_block *block,
* boundaries of 4KBytes.
*
* Return: a dma allocation pool with the requested characteristics, or
* %NULL if one can't be created.
*/
-struct dma_pool *dma_pool_create(const char *name, struct device *dev,
- size_t size, size_t align, size_t boundary)
+struct dma_pool *dma_pool_create_node(const char *name, struct device *dev,
+ size_t size, size_t align, size_t boundary,
+ int node)
{
struct dma_pool *retval;
size_t allocation;
bool empty;
@@ -249,11 +252,11 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
else if ((boundary < size) || (boundary & (boundary - 1)))
return NULL;
boundary = min(boundary, allocation);
- retval = kzalloc(sizeof(*retval), GFP_KERNEL);
+ retval = kzalloc_node(sizeof(*retval), GFP_KERNEL, node);
if (!retval)
return retval;
strscpy(retval->name, name, sizeof(retval->name));
@@ -262,10 +265,11 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
INIT_LIST_HEAD(&retval->page_list);
spin_lock_init(&retval->lock);
retval->size = size;
retval->boundary = boundary;
retval->allocation = allocation;
+ retval->node = node;
INIT_LIST_HEAD(&retval->pools);
/*
* pools_lock ensures that the ->dma_pools list does not get corrupted.
* pools_reg_lock ensures that there is not a race between
@@ -293,11 +297,11 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
}
}
mutex_unlock(&pools_reg_lock);
return retval;
}
-EXPORT_SYMBOL(dma_pool_create);
+EXPORT_SYMBOL(dma_pool_create_node);
static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
{
unsigned int next_boundary = pool->boundary, offset = 0;
struct dma_block *block, *first = NULL, *last = NULL;
@@ -333,11 +337,11 @@ static void pool_initialise_page(struct dma_pool *pool, struct dma_page *page)
static struct dma_page *pool_alloc_page(struct dma_pool *pool, gfp_t mem_flags)
{
struct dma_page *page;
- page = kmalloc(sizeof(*page), mem_flags);
+ page = kmalloc_node(sizeof(*page), mem_flags, pool->node);
if (!page)
return NULL;
page->vaddr = dma_alloc_coherent(pool->dev, pool->allocation,
&page->dma, mem_flags);
--
2.45.2

[PATCH v6 3/3] nvme/pci: make PRP list DMA pools per-NUMA-node
From: Caleb Sander Mateos @ 2025-04-26 2:06 UTC
To: Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg, Andrew Morton
Cc: Kanchan Joshi, linux-nvme, linux-mm, linux-kernel, Caleb Sander Mateos
NVMe commands with over 8 KB of discontiguous data allocate PRP list
pages from the per-nvme_device dma_pool prp_page_pool or prp_small_pool.
Each call to dma_pool_alloc() and dma_pool_free() takes the per-dma_pool
spinlock. These device-global spinlocks are a significant source of
contention when many CPUs are submitting to the same NVMe devices. On a
workload issuing 32 KB reads from 16 CPUs (8 hypertwin pairs) across 2
NUMA nodes to 23 NVMe devices, we observed 2.4% of CPU time spent in
_raw_spin_lock_irqsave called from dma_pool_alloc and dma_pool_free.
Ideally, the dma_pools would be per-hctx to minimize contention. But
that could impose considerable resource costs in a system with many NVMe
devices and CPUs.
As a compromise, allocate per-NUMA-node PRP list DMA pools. Map each
nvme_queue to the set of DMA pools corresponding to its device and its
hctx's NUMA node. This reduces the _raw_spin_lock_irqsave overhead by
about half, to 1.2%. Preventing the sharing of PRP list pages across
NUMA nodes also makes them cheaper to initialize.
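In condensed form (see nvme_setup_prp_pools() and nvme_init_hctx() in the
diff below; error handling elided here), each nvme_dev tail-allocates one
pool pair per possible NUMA node, and each queue caches the pair for its
hctx's node by value, so the I/O path pays no extra indirection:

	struct nvme_prp_dma_pools {
		struct dma_pool *large;	/* PRP list pages */
		struct dma_pool *small;	/* 256-byte lists */
	};

	/* in nvme_init_hctx(): pools are created lazily per (device, node) */
	nvmeq->prp_pools = *nvme_setup_prp_pools(dev, hctx->numa_node);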
Link: https://lore.kernel.org/linux-nvme/CADUfDZqa=OOTtTTznXRDmBQo1WrFcDw1hBA7XwM7hzJ-hpckcA@mail.gmail.com/T/#u
Signed-off-by: Caleb Sander Mateos <csander@purestorage.com>
Reviewed-by: Kanchan Joshi <joshi.k@samsung.com>
Reviewed-by: Sagi Grimberg <sagi@grimberg.me>
Reviewed-by: Keith Busch <kbusch@kernel.org>
Reviewed-by: Jens Axboe <axboe@kernel.dk>
---
drivers/nvme/host/pci.c | 144 +++++++++++++++++++++++-----------------
1 file changed, 84 insertions(+), 60 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 642890ddada5..2c554bb7f984 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -16,10 +16,11 @@
#include <linux/kstrtox.h>
#include <linux/memremap.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
+#include <linux/nodemask.h>
#include <linux/once.h>
#include <linux/pci.h>
#include <linux/suspend.h>
#include <linux/t10-pi.h>
#include <linux/types.h>
@@ -110,21 +111,24 @@ struct nvme_queue;
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
static void nvme_delete_io_queues(struct nvme_dev *dev);
static void nvme_update_attrs(struct nvme_dev *dev);
+struct nvme_prp_dma_pools {
+ struct dma_pool *large;
+ struct dma_pool *small;
+};
+
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
*/
struct nvme_dev {
struct nvme_queue *queues;
struct blk_mq_tag_set tagset;
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
struct device *dev;
- struct dma_pool *prp_page_pool;
- struct dma_pool *prp_small_pool;
unsigned online_queues;
unsigned max_qid;
unsigned io_queues[HCTX_MAX_TYPES];
unsigned int num_vecs;
u32 q_depth;
@@ -160,10 +164,11 @@ struct nvme_dev {
struct nvme_host_mem_buf_desc *host_mem_descs;
void **host_mem_desc_bufs;
unsigned int nr_allocated_queues;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
+ struct nvme_prp_dma_pools prp_pools[];
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
{
return param_set_uint_minmax(val, kp, NVME_PCI_MIN_QUEUE_SIZE,
@@ -189,10 +194,11 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
* An NVM Express queue. Each device has at least two (one for admin
* commands and one for I/O commands).
*/
struct nvme_queue {
struct nvme_dev *dev;
+ struct nvme_prp_dma_pools prp_pools;
spinlock_t sq_lock;
void *sq_cmds;
/* only used for poll queues: */
spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
struct nvme_completion *cqes;
@@ -395,18 +401,67 @@ static int nvme_pci_npages_prp(void)
unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE;
unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE);
return DIV_ROUND_UP(8 * nprps, NVME_CTRL_PAGE_SIZE - 8);
}
+static struct nvme_prp_dma_pools *
+nvme_setup_prp_pools(struct nvme_dev *dev, unsigned numa_node)
+{
+ struct nvme_prp_dma_pools *prp_pools = &dev->prp_pools[numa_node];
+ size_t small_align = 256;
+
+ if (prp_pools->small)
+ return prp_pools; /* already initialized */
+
+ prp_pools->large = dma_pool_create_node("prp list page", dev->dev,
+ NVME_CTRL_PAGE_SIZE,
+ NVME_CTRL_PAGE_SIZE, 0,
+ numa_node);
+ if (!prp_pools->large)
+ return ERR_PTR(-ENOMEM);
+
+ if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
+ small_align = 512;
+
+ /* Optimisation for I/Os between 4k and 128k */
+ prp_pools->small = dma_pool_create_node("prp list 256", dev->dev,
+ 256, small_align, 0, numa_node);
+ if (!prp_pools->small) {
+ dma_pool_destroy(prp_pools->large);
+ prp_pools->large = NULL;
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return prp_pools;
+}
+
+static void nvme_release_prp_pools(struct nvme_dev *dev)
+{
+ unsigned i;
+
+ for (i = 0; i < nr_node_ids; i++) {
+ struct nvme_prp_dma_pools *prp_pools = &dev->prp_pools[i];
+
+ dma_pool_destroy(prp_pools->large);
+ dma_pool_destroy(prp_pools->small);
+ }
+}
+
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned qid)
{
struct nvme_dev *dev = to_nvme_dev(data);
struct nvme_queue *nvmeq = &dev->queues[qid];
+ struct nvme_prp_dma_pools *prp_pools;
struct blk_mq_tags *tags;
tags = qid ? dev->tagset.tags[qid - 1] : dev->admin_tagset.tags[0];
WARN_ON(tags != hctx->tags);
+ prp_pools = nvme_setup_prp_pools(dev, hctx->numa_node);
+ if (IS_ERR(prp_pools))
+ return PTR_ERR(prp_pools);
+
+ nvmeq->prp_pools = *prp_pools;
hctx->driver_data = nvmeq;
return 0;
}
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
@@ -536,27 +591,28 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
if (!sgl_threshold || avg_seg_size < sgl_threshold)
return nvme_req(req)->flags & NVME_REQ_USERCMD;
return true;
}
-static void nvme_free_prps(struct nvme_dev *dev, struct request *req)
+static void nvme_free_prps(struct nvme_queue *nvmeq, struct request *req)
{
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
dma_addr_t dma_addr = iod->first_dma;
int i;
for (i = 0; i < iod->nr_allocations; i++) {
__le64 *prp_list = iod->list[i].prp_list;
dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
- dma_pool_free(dev->prp_page_pool, prp_list, dma_addr);
+ dma_pool_free(nvmeq->prp_pools.large, prp_list, dma_addr);
dma_addr = next_dma_addr;
}
}
-static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
+static void nvme_unmap_data(struct nvme_dev *dev, struct nvme_queue *nvmeq,
+ struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
if (iod->dma_len) {
dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
@@ -567,17 +623,17 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
WARN_ON_ONCE(!iod->sgt.nents);
dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0);
if (iod->nr_allocations == 0)
- dma_pool_free(dev->prp_small_pool, iod->list[0].sg_list,
+ dma_pool_free(nvmeq->prp_pools.small, iod->list[0].sg_list,
iod->first_dma);
else if (iod->nr_allocations == 1)
- dma_pool_free(dev->prp_page_pool, iod->list[0].sg_list,
+ dma_pool_free(nvmeq->prp_pools.large, iod->list[0].sg_list,
iod->first_dma);
else
- nvme_free_prps(dev, req);
+ nvme_free_prps(nvmeq, req);
mempool_free(iod->sgt.sgl, dev->iod_mempool);
}
static void nvme_print_sgl(struct scatterlist *sgl, int nents)
{
@@ -591,11 +647,11 @@ static void nvme_print_sgl(struct scatterlist *sgl, int nents)
i, &phys, sg->offset, sg->length, &sg_dma_address(sg),
sg_dma_len(sg));
}
}
-static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
+static blk_status_t nvme_pci_setup_prps(struct nvme_queue *nvmeq,
struct request *req, struct nvme_rw_command *cmnd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct dma_pool *pool;
int length = blk_rq_payload_bytes(req);
@@ -627,14 +683,14 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
goto done;
}
nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
if (nprps <= (256 / 8)) {
- pool = dev->prp_small_pool;
+ pool = nvmeq->prp_pools.small;
iod->nr_allocations = 0;
} else {
- pool = dev->prp_page_pool;
+ pool = nvmeq->prp_pools.large;
iod->nr_allocations = 1;
}
prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma);
if (!prp_list) {
@@ -672,11 +728,11 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev,
done:
cmnd->dptr.prp1 = cpu_to_le64(sg_dma_address(iod->sgt.sgl));
cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
return BLK_STS_OK;
free_prps:
- nvme_free_prps(dev, req);
+ nvme_free_prps(nvmeq, req);
return BLK_STS_RESOURCE;
bad_sgl:
WARN(DO_ONCE(nvme_print_sgl, iod->sgt.sgl, iod->sgt.nents),
"Invalid SGL for payload:%d nents:%d\n",
blk_rq_payload_bytes(req), iod->sgt.nents);
@@ -697,11 +753,11 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
sge->addr = cpu_to_le64(dma_addr);
sge->length = cpu_to_le32(entries * sizeof(*sge));
sge->type = NVME_SGL_FMT_LAST_SEG_DESC << 4;
}
-static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
+static blk_status_t nvme_pci_setup_sgls(struct nvme_queue *nvmeq,
struct request *req, struct nvme_rw_command *cmd)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct dma_pool *pool;
struct nvme_sgl_desc *sg_list;
@@ -717,14 +773,14 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev,
nvme_pci_sgl_set_data(&cmd->dptr.sgl, sg);
return BLK_STS_OK;
}
if (entries <= (256 / sizeof(struct nvme_sgl_desc))) {
- pool = dev->prp_small_pool;
+ pool = nvmeq->prp_pools.small;
iod->nr_allocations = 0;
} else {
- pool = dev->prp_page_pool;
+ pool = nvmeq->prp_pools.large;
iod->nr_allocations = 1;
}
sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
if (!sg_list) {
@@ -784,16 +840,16 @@ static blk_status_t nvme_setup_sgl_simple(struct nvme_dev *dev,
}
static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
struct nvme_command *cmnd)
{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
blk_status_t ret = BLK_STS_RESOURCE;
int rc;
if (blk_rq_nr_phys_segments(req) == 1) {
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct bio_vec bv = req_bvec(req);
if (!is_pci_p2pdma_page(bv.bv_page)) {
if (!nvme_pci_metadata_use_sgls(dev, req) &&
(bv.bv_offset & (NVME_CTRL_PAGE_SIZE - 1)) +
@@ -824,13 +880,13 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
ret = BLK_STS_TARGET;
goto out_free_sg;
}
if (nvme_pci_use_sgls(dev, req, iod->sgt.nents))
- ret = nvme_pci_setup_sgls(dev, req, &cmnd->rw);
+ ret = nvme_pci_setup_sgls(nvmeq, req, &cmnd->rw);
else
- ret = nvme_pci_setup_prps(dev, req, &cmnd->rw);
+ ret = nvme_pci_setup_prps(nvmeq, req, &cmnd->rw);
if (ret != BLK_STS_OK)
goto out_unmap_sg;
return BLK_STS_OK;
out_unmap_sg:
@@ -841,10 +897,11 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
}
static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
struct request *req)
{
+ struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_rw_command *cmnd = &iod->cmd.rw;
struct nvme_sgl_desc *sg_list;
struct scatterlist *sgl, *sg;
unsigned int entries;
@@ -864,11 +921,11 @@ static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req),
DMA_ATTR_NO_WARN);
if (rc)
goto out_free_sg;
- sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma);
+ sg_list = dma_pool_alloc(nvmeq->prp_pools.small, GFP_ATOMIC, &sgl_dma);
if (!sg_list)
goto out_unmap_sg;
entries = iod->meta_sgt.nents;
iod->meta_list.sg_list = sg_list;
@@ -946,11 +1003,11 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
nvme_start_request(req);
return BLK_STS_OK;
out_unmap_data:
if (blk_rq_nr_phys_segments(req))
- nvme_unmap_data(dev, req);
+ nvme_unmap_data(dev, req->mq_hctx->driver_data, req);
out_free_cmd:
nvme_cleanup_cmd(req);
return ret;
}
@@ -1036,10 +1093,11 @@ static void nvme_queue_rqs(struct rq_list *rqlist)
nvme_submit_cmds(nvmeq, &submit_list);
*rqlist = requeue_list;
}
static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
+ struct nvme_queue *nvmeq,
struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
if (!iod->meta_sgt.nents) {
@@ -1047,11 +1105,11 @@ static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
rq_integrity_vec(req).bv_len,
rq_dma_dir(req));
return;
}
- dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list,
+ dma_pool_free(nvmeq->prp_pools.small, iod->meta_list.sg_list,
iod->meta_dma);
dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
}
@@ -1059,14 +1117,14 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_dev *dev = nvmeq->dev;
if (blk_integrity_rq(req))
- nvme_unmap_metadata(dev, req);
+ nvme_unmap_metadata(dev, nvmeq, req);
if (blk_rq_nr_phys_segments(req))
- nvme_unmap_data(dev, req);
+ nvme_unmap_data(dev, nvmeq, req);
}
static void nvme_pci_complete_rq(struct request *req)
{
nvme_pci_unmap_rq(req);
@@ -2839,39 +2897,10 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
return -EBUSY;
nvme_dev_disable(dev, shutdown);
return 0;
}
-static int nvme_setup_prp_pools(struct nvme_dev *dev)
-{
- size_t small_align = 256;
-
- dev->prp_page_pool = dma_pool_create("prp list page", dev->dev,
- NVME_CTRL_PAGE_SIZE,
- NVME_CTRL_PAGE_SIZE, 0);
- if (!dev->prp_page_pool)
- return -ENOMEM;
-
- if (dev->ctrl.quirks & NVME_QUIRK_DMAPOOL_ALIGN_512)
- small_align = 512;
-
- /* Optimisation for I/Os between 4k and 128k */
- dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev,
- 256, small_align, 0);
- if (!dev->prp_small_pool) {
- dma_pool_destroy(dev->prp_page_pool);
- return -ENOMEM;
- }
- return 0;
-}
-
-static void nvme_release_prp_pools(struct nvme_dev *dev)
-{
- dma_pool_destroy(dev->prp_page_pool);
- dma_pool_destroy(dev->prp_small_pool);
-}
-
static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;
@@ -3182,11 +3211,12 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
unsigned long quirks = id->driver_data;
int node = dev_to_node(&pdev->dev);
struct nvme_dev *dev;
int ret = -ENOMEM;
- dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
+ dev = kzalloc_node(sizeof(*dev) + nr_node_ids * sizeof(*dev->prp_pools),
+ GFP_KERNEL, node);
if (!dev)
return ERR_PTR(-ENOMEM);
INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
mutex_init(&dev->shutdown_lock);
@@ -3257,17 +3287,13 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
result = nvme_dev_map(dev);
if (result)
goto out_uninit_ctrl;
- result = nvme_setup_prp_pools(dev);
- if (result)
- goto out_dev_unmap;
-
result = nvme_pci_alloc_iod_mempool(dev);
if (result)
- goto out_release_prp_pools;
+ goto out_dev_unmap;
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
result = nvme_pci_enable(dev);
if (result)
@@ -3339,12 +3365,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
out_release_iod_mempool:
mempool_destroy(dev->iod_mempool);
mempool_destroy(dev->iod_meta_mempool);
-out_release_prp_pools:
- nvme_release_prp_pools(dev);
out_dev_unmap:
nvme_dev_unmap(dev);
out_uninit_ctrl:
nvme_uninit_ctrl(&dev->ctrl);
out_put_ctrl:
--
2.45.2