* [PATCH v14 01/26] crypto: iaa - Reorganize the iaa_crypto driver code.
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This patch merely reorganizes the code in iaa_crypto_main.c so that
the functions are consolidated into logically related sub-sections of
code, without requiring forward declarations.
This is expected to make the code more maintainable and to make it
easier to replace functional layers and/or add new features.
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
drivers/crypto/intel/iaa/iaa_crypto_main.c | 677 +++++++++++----------
1 file changed, 350 insertions(+), 327 deletions(-)
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index d0058757b000..e21d5fe9004c 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -24,6 +24,10 @@
#define IAA_ALG_PRIORITY 300
+/**************************************
+ * Driver internal global variables.
+ **************************************/
+
/* number of iaa instances probed */
static unsigned int nr_iaa;
static unsigned int nr_cpus;
@@ -36,54 +40,6 @@ static unsigned int cpus_per_iaa;
/* Per-cpu lookup table for balanced wqs */
static struct wq_table_entry __percpu *wq_table;
-static struct idxd_wq *wq_table_next_wq(int cpu)
-{
- struct wq_table_entry *entry = per_cpu_ptr(wq_table, cpu);
-
- if (++entry->cur_wq >= entry->n_wqs)
- entry->cur_wq = 0;
-
- if (!entry->wqs[entry->cur_wq])
- return NULL;
-
- pr_debug("%s: returning wq at idx %d (iaa wq %d.%d) from cpu %d\n", __func__,
- entry->cur_wq, entry->wqs[entry->cur_wq]->idxd->id,
- entry->wqs[entry->cur_wq]->id, cpu);
-
- return entry->wqs[entry->cur_wq];
-}
-
-static void wq_table_add(int cpu, struct idxd_wq *wq)
-{
- struct wq_table_entry *entry = per_cpu_ptr(wq_table, cpu);
-
- if (WARN_ON(entry->n_wqs == entry->max_wqs))
- return;
-
- entry->wqs[entry->n_wqs++] = wq;
-
- pr_debug("%s: added iaa wq %d.%d to idx %d of cpu %d\n", __func__,
- entry->wqs[entry->n_wqs - 1]->idxd->id,
- entry->wqs[entry->n_wqs - 1]->id, entry->n_wqs - 1, cpu);
-}
-
-static void wq_table_free_entry(int cpu)
-{
- struct wq_table_entry *entry = per_cpu_ptr(wq_table, cpu);
-
- kfree(entry->wqs);
- memset(entry, 0, sizeof(*entry));
-}
-
-static void wq_table_clear_entry(int cpu)
-{
- struct wq_table_entry *entry = per_cpu_ptr(wq_table, cpu);
-
- entry->n_wqs = 0;
- entry->cur_wq = 0;
- memset(entry->wqs, 0, entry->max_wqs * sizeof(struct idxd_wq *));
-}
-
LIST_HEAD(iaa_devices);
DEFINE_MUTEX(iaa_devices_lock);
@@ -91,36 +47,11 @@ DEFINE_MUTEX(iaa_devices_lock);
static bool iaa_crypto_enabled;
static bool iaa_crypto_registered;
+static struct iaa_compression_mode *iaa_compression_modes[IAA_COMP_MODES_MAX];
+
/* Verify results of IAA compress or not */
static bool iaa_verify_compress = true;
-static ssize_t verify_compress_show(struct device_driver *driver, char *buf)
-{
- return sprintf(buf, "%d\n", iaa_verify_compress);
-}
-
-static ssize_t verify_compress_store(struct device_driver *driver,
- const char *buf, size_t count)
-{
- int ret = -EBUSY;
-
- mutex_lock(&iaa_devices_lock);
-
- if (iaa_crypto_enabled)
- goto out;
-
- ret = kstrtobool(buf, &iaa_verify_compress);
- if (ret)
- goto out;
-
- ret = count;
-out:
- mutex_unlock(&iaa_devices_lock);
-
- return ret;
-}
-static DRIVER_ATTR_RW(verify_compress);
-
/*
* The iaa crypto driver supports three 'sync' methods determining how
* compressions and decompressions are performed:
@@ -155,6 +86,37 @@ static bool async_mode;
/* Use interrupts */
static bool use_irq;
+/**************************************************
+ * Driver attributes along with get/set functions.
+ **************************************************/
+
+static ssize_t verify_compress_show(struct device_driver *driver, char *buf)
+{
+ return sprintf(buf, "%d\n", iaa_verify_compress);
+}
+
+static ssize_t verify_compress_store(struct device_driver *driver,
+ const char *buf, size_t count)
+{
+ int ret = -EBUSY;
+
+ mutex_lock(&iaa_devices_lock);
+
+ if (iaa_crypto_enabled)
+ goto out;
+
+ ret = kstrtobool(buf, &iaa_verify_compress);
+ if (ret)
+ goto out;
+
+ ret = count;
+out:
+ mutex_unlock(&iaa_devices_lock);
+
+ return ret;
+}
+static DRIVER_ATTR_RW(verify_compress);
+
/**
* set_iaa_sync_mode - Set IAA sync mode
* @name: The name of the sync mode
@@ -217,7 +179,9 @@ static ssize_t sync_mode_store(struct device_driver *driver,
}
static DRIVER_ATTR_RW(sync_mode);
-static struct iaa_compression_mode *iaa_compression_modes[IAA_COMP_MODES_MAX];
+/****************************
+ * Driver compression modes.
+ ****************************/
static int find_empty_iaa_compression_mode(void)
{
@@ -409,11 +373,6 @@ static void free_device_compression_mode(struct iaa_device *iaa_device,
IDXD_OP_FLAG_WR_SRC2_AECS_COMP | \
IDXD_OP_FLAG_AECS_RW_TGLS)
-static int check_completion(struct device *dev,
- struct iax_completion_record *comp,
- bool compress,
- bool only_once);
-
static int init_device_compression_mode(struct iaa_device *iaa_device,
struct iaa_compression_mode *mode,
int idx, struct idxd_wq *wq)
@@ -500,6 +459,11 @@ static void remove_device_compression_modes(struct iaa_device *iaa_device)
}
}
+/***********************************************************
+ * Functions for use in crypto probe and remove interfaces:
+ * allocate/init/query/deallocate devices/wqs.
+ ***********************************************************/
+
static struct iaa_device *iaa_device_alloc(void)
{
struct iaa_device *iaa_device;
@@ -513,18 +477,6 @@ static struct iaa_device *iaa_device_alloc(void)
return iaa_device;
}
-static bool iaa_has_wq(struct iaa_device *iaa_device, struct idxd_wq *wq)
-{
- struct iaa_wq *iaa_wq;
-
- list_for_each_entry(iaa_wq, &iaa_device->wqs, list) {
- if (iaa_wq->wq == wq)
- return true;
- }
-
- return false;
-}
-
static struct iaa_device *add_iaa_device(struct idxd_device *idxd)
{
struct iaa_device *iaa_device;
@@ -560,6 +512,27 @@ static void del_iaa_device(struct iaa_device *iaa_device)
nr_iaa--;
}
+static void free_iaa_device(struct iaa_device *iaa_device)
+{
+ if (!iaa_device)
+ return;
+
+ remove_device_compression_modes(iaa_device);
+ kfree(iaa_device);
+}
+
+static bool iaa_has_wq(struct iaa_device *iaa_device, struct idxd_wq *wq)
+{
+ struct iaa_wq *iaa_wq;
+
+ list_for_each_entry(iaa_wq, &iaa_device->wqs, list) {
+ if (iaa_wq->wq == wq)
+ return true;
+ }
+
+ return false;
+}
+
static int add_iaa_wq(struct iaa_device *iaa_device, struct idxd_wq *wq,
struct iaa_wq **new_wq)
{
@@ -612,23 +585,23 @@ static void del_iaa_wq(struct iaa_device *iaa_device, struct idxd_wq *wq)
}
}
-static void clear_wq_table(void)
+static void remove_iaa_wq(struct idxd_wq *wq)
{
- int cpu;
-
- for (cpu = 0; cpu < nr_cpus; cpu++)
- wq_table_clear_entry(cpu);
-
- pr_debug("cleared wq table\n");
-}
+ struct iaa_device *iaa_device;
-static void free_iaa_device(struct iaa_device *iaa_device)
-{
- if (!iaa_device)
- return;
+ list_for_each_entry(iaa_device, &iaa_devices, list) {
+ if (iaa_has_wq(iaa_device, wq)) {
+ del_iaa_wq(iaa_device, wq);
+ break;
+ }
+ }
- remove_device_compression_modes(iaa_device);
- kfree(iaa_device);
+ if (nr_iaa) {
+ cpus_per_iaa = (nr_nodes * nr_cpus_per_node) / nr_iaa;
+ if (!cpus_per_iaa)
+ cpus_per_iaa = 1;
+ } else
+ cpus_per_iaa = 1;
}
static void __free_iaa_wq(struct iaa_wq *iaa_wq)
@@ -655,6 +628,75 @@ static void free_iaa_wq(struct iaa_wq *iaa_wq)
idxd_wq_set_private(wq, NULL);
}
+static int save_iaa_wq(struct idxd_wq *wq)
+{
+ struct iaa_device *iaa_device, *found = NULL;
+ struct idxd_device *idxd;
+ struct pci_dev *pdev;
+ struct device *dev;
+ int ret = 0;
+
+ list_for_each_entry(iaa_device, &iaa_devices, list) {
+ if (iaa_device->idxd == wq->idxd) {
+ idxd = iaa_device->idxd;
+ pdev = idxd->pdev;
+ dev = &pdev->dev;
+ /*
+ * Check to see that we don't already have this wq.
+ * Shouldn't happen but we don't control probing.
+ */
+ if (iaa_has_wq(iaa_device, wq)) {
+ dev_dbg(dev, "same wq probed multiple times for iaa_device %p\n",
+ iaa_device);
+ goto out;
+ }
+
+ found = iaa_device;
+
+ ret = add_iaa_wq(iaa_device, wq, NULL);
+ if (ret)
+ goto out;
+
+ break;
+ }
+ }
+
+ if (!found) {
+ struct iaa_device *new_device;
+ struct iaa_wq *new_wq;
+
+ new_device = add_iaa_device(wq->idxd);
+ if (!new_device) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = add_iaa_wq(new_device, wq, &new_wq);
+ if (ret) {
+ del_iaa_device(new_device);
+ free_iaa_device(new_device);
+ goto out;
+ }
+
+ ret = init_iaa_device(new_device, new_wq);
+ if (ret) {
+ del_iaa_wq(new_device, new_wq->wq);
+ del_iaa_device(new_device);
+ free_iaa_wq(new_wq);
+ goto out;
+ }
+ }
+
+ if (WARN_ON(nr_iaa == 0))
+ return -EINVAL;
+
+ cpus_per_iaa = (nr_nodes * nr_cpus_per_node) / nr_iaa;
+ if (!cpus_per_iaa)
+ cpus_per_iaa = 1;
+out:
+ return ret;
+}
+
static int iaa_wq_get(struct idxd_wq *wq)
{
struct idxd_device *idxd = wq->idxd;
@@ -702,6 +744,37 @@ static int iaa_wq_put(struct idxd_wq *wq)
return ret;
}
+/***************************************************************
+ * Mapping IAA devices and wqs to cores with per-cpu wq_tables.
+ ***************************************************************/
+
+static void wq_table_free_entry(int cpu)
+{
+ struct wq_table_entry *entry = per_cpu_ptr(wq_table, cpu);
+
+ kfree(entry->wqs);
+ memset(entry, 0, sizeof(*entry));
+}
+
+static void wq_table_clear_entry(int cpu)
+{
+ struct wq_table_entry *entry = per_cpu_ptr(wq_table, cpu);
+
+ entry->n_wqs = 0;
+ entry->cur_wq = 0;
+ memset(entry->wqs, 0, entry->max_wqs * sizeof(struct idxd_wq *));
+}
+
+static void clear_wq_table(void)
+{
+ int cpu;
+
+ for (cpu = 0; cpu < nr_cpus; cpu++)
+ wq_table_clear_entry(cpu);
+
+ pr_debug("cleared wq table\n");
+}
+
static void free_wq_table(void)
{
int cpu;
@@ -739,92 +812,18 @@ static int alloc_wq_table(int max_wqs)
return 0;
}
-static int save_iaa_wq(struct idxd_wq *wq)
+static void wq_table_add(int cpu, struct idxd_wq *wq)
{
- struct iaa_device *iaa_device, *found = NULL;
- struct idxd_device *idxd;
- struct pci_dev *pdev;
- struct device *dev;
- int ret = 0;
-
- list_for_each_entry(iaa_device, &iaa_devices, list) {
- if (iaa_device->idxd == wq->idxd) {
- idxd = iaa_device->idxd;
- pdev = idxd->pdev;
- dev = &pdev->dev;
- /*
- * Check to see that we don't already have this wq.
- * Shouldn't happen but we don't control probing.
- */
- if (iaa_has_wq(iaa_device, wq)) {
- dev_dbg(dev, "same wq probed multiple times for iaa_device %p\n",
- iaa_device);
- goto out;
- }
-
- found = iaa_device;
-
- ret = add_iaa_wq(iaa_device, wq, NULL);
- if (ret)
- goto out;
-
- break;
- }
- }
-
- if (!found) {
- struct iaa_device *new_device;
- struct iaa_wq *new_wq;
-
- new_device = add_iaa_device(wq->idxd);
- if (!new_device) {
- ret = -ENOMEM;
- goto out;
- }
-
- ret = add_iaa_wq(new_device, wq, &new_wq);
- if (ret) {
- del_iaa_device(new_device);
- free_iaa_device(new_device);
- goto out;
- }
-
- ret = init_iaa_device(new_device, new_wq);
- if (ret) {
- del_iaa_wq(new_device, new_wq->wq);
- del_iaa_device(new_device);
- free_iaa_wq(new_wq);
- goto out;
- }
- }
-
- if (WARN_ON(nr_iaa == 0))
- return -EINVAL;
-
- cpus_per_iaa = (nr_nodes * nr_cpus_per_node) / nr_iaa;
- if (!cpus_per_iaa)
- cpus_per_iaa = 1;
-out:
- return ret;
-}
+ struct wq_table_entry *entry = per_cpu_ptr(wq_table, cpu);
-static void remove_iaa_wq(struct idxd_wq *wq)
-{
- struct iaa_device *iaa_device;
+ if (WARN_ON(entry->n_wqs == entry->max_wqs))
+ return;
- list_for_each_entry(iaa_device, &iaa_devices, list) {
- if (iaa_has_wq(iaa_device, wq)) {
- del_iaa_wq(iaa_device, wq);
- break;
- }
- }
+ entry->wqs[entry->n_wqs++] = wq;
- if (nr_iaa) {
- cpus_per_iaa = (nr_nodes * nr_cpus_per_node) / nr_iaa;
- if (!cpus_per_iaa)
- cpus_per_iaa = 1;
- } else
- cpus_per_iaa = 1;
+ pr_debug("%s: added iaa wq %d.%d to idx %d of cpu %d\n", __func__,
+ entry->wqs[entry->n_wqs - 1]->idxd->id,
+ entry->wqs[entry->n_wqs - 1]->id, entry->n_wqs - 1, cpu);
}
static int wq_table_add_wqs(int iaa, int cpu)
@@ -930,6 +929,44 @@ static void rebalance_wq_table(void)
pr_debug("could not add any wqs for iaa %d to cpu %d!\n", iaa, cpu);
}
+/***************************************************************
+ * Assign work-queues for driver ops using per-cpu wq_tables.
+ ***************************************************************/
+
+static struct idxd_wq *wq_table_next_wq(int cpu)
+{
+ struct wq_table_entry *entry = per_cpu_ptr(wq_table, cpu);
+
+ if (++entry->cur_wq >= entry->n_wqs)
+ entry->cur_wq = 0;
+
+ if (!entry->wqs[entry->cur_wq])
+ return NULL;
+
+ pr_debug("%s: returning wq at idx %d (iaa wq %d.%d) from cpu %d\n", __func__,
+ entry->cur_wq, entry->wqs[entry->cur_wq]->idxd->id,
+ entry->wqs[entry->cur_wq]->id, cpu);
+
+ return entry->wqs[entry->cur_wq];
+}
+
+/*************************************************
+ * Core iaa_crypto compress/decompress functions.
+ *************************************************/
+
+static int deflate_generic_decompress(struct acomp_req *req)
+{
+ ACOMP_FBREQ_ON_STACK(fbreq, req);
+ int ret;
+
+ ret = crypto_acomp_decompress(fbreq);
+ req->dlen = fbreq->dlen;
+
+ update_total_sw_decomp_calls();
+
+ return ret;
+}
+
static inline int check_completion(struct device *dev,
struct iax_completion_record *comp,
bool compress,
@@ -990,27 +1027,132 @@ static inline int check_completion(struct device *dev,
return ret;
}
-static int deflate_generic_decompress(struct acomp_req *req)
+static int iaa_remap_for_verify(struct device *dev, struct iaa_wq *iaa_wq,
+ struct acomp_req *req,
+ dma_addr_t *src_addr, dma_addr_t *dst_addr)
{
- ACOMP_FBREQ_ON_STACK(fbreq, req);
- int ret;
+ int ret = 0;
+ int nr_sgs;
- ret = crypto_acomp_decompress(fbreq);
- req->dlen = fbreq->dlen;
+ dma_unmap_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE);
+ dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE);
- update_total_sw_decomp_calls();
+ nr_sgs = dma_map_sg(dev, req->src, sg_nents(req->src), DMA_FROM_DEVICE);
+ if (nr_sgs <= 0 || nr_sgs > 1) {
+ dev_dbg(dev, "verify: couldn't map src sg for iaa device %d,"
+ " wq %d: ret=%d\n", iaa_wq->iaa_device->idxd->id,
+ iaa_wq->wq->id, ret);
+ ret = -EIO;
+ goto out;
+ }
+ *src_addr = sg_dma_address(req->src);
+ dev_dbg(dev, "verify: dma_map_sg, src_addr %llx, nr_sgs %d, req->src %p,"
+ " req->slen %d, sg_dma_len(sg) %d\n", *src_addr, nr_sgs,
+ req->src, req->slen, sg_dma_len(req->src));
+ nr_sgs = dma_map_sg(dev, req->dst, sg_nents(req->dst), DMA_TO_DEVICE);
+ if (nr_sgs <= 0 || nr_sgs > 1) {
+ dev_dbg(dev, "verify: couldn't map dst sg for iaa device %d,"
+ " wq %d: ret=%d\n", iaa_wq->iaa_device->idxd->id,
+ iaa_wq->wq->id, ret);
+ ret = -EIO;
+ dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_FROM_DEVICE);
+ goto out;
+ }
+ *dst_addr = sg_dma_address(req->dst);
+ dev_dbg(dev, "verify: dma_map_sg, dst_addr %llx, nr_sgs %d, req->dst %p,"
+ " req->dlen %d, sg_dma_len(sg) %d\n", *dst_addr, nr_sgs,
+ req->dst, req->dlen, sg_dma_len(req->dst));
+out:
return ret;
}
-static int iaa_remap_for_verify(struct device *dev, struct iaa_wq *iaa_wq,
- struct acomp_req *req,
- dma_addr_t *src_addr, dma_addr_t *dst_addr);
-
static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
struct idxd_wq *wq,
dma_addr_t src_addr, unsigned int slen,
- dma_addr_t dst_addr, unsigned int *dlen);
+ dma_addr_t dst_addr, unsigned int *dlen)
+{
+ struct iaa_device_compression_mode *active_compression_mode;
+ struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
+ u32 *compression_crc = acomp_request_ctx(req);
+ struct iaa_device *iaa_device;
+ struct idxd_desc *idxd_desc;
+ struct iax_hw_desc *desc;
+ struct idxd_device *idxd;
+ struct iaa_wq *iaa_wq;
+ struct pci_dev *pdev;
+ struct device *dev;
+ int ret = 0;
+
+ iaa_wq = idxd_wq_get_private(wq);
+ iaa_device = iaa_wq->iaa_device;
+ idxd = iaa_device->idxd;
+ pdev = idxd->pdev;
+ dev = &pdev->dev;
+
+ active_compression_mode = get_iaa_device_compression_mode(iaa_device, ctx->mode);
+
+ idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK);
+ if (IS_ERR(idxd_desc)) {
+ dev_dbg(dev, "idxd descriptor allocation failed\n");
+ dev_dbg(dev, "iaa compress failed: ret=%ld\n",
+ PTR_ERR(idxd_desc));
+ return PTR_ERR(idxd_desc);
+ }
+ desc = idxd_desc->iax_hw;
+
+ /* Verify (optional) - decompress and check crc, suppress dest write */
+
+ desc->flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CC;
+ desc->opcode = IAX_OPCODE_DECOMPRESS;
+ desc->decompr_flags = IAA_DECOMP_FLAGS | IAA_DECOMP_SUPPRESS_OUTPUT;
+ desc->priv = 0;
+
+ desc->src1_addr = (u64)dst_addr;
+ desc->src1_size = *dlen;
+ desc->dst_addr = (u64)src_addr;
+ desc->max_dst_size = slen;
+ desc->completion_addr = idxd_desc->compl_dma;
+
+ dev_dbg(dev, "(verify) compression mode %s,"
+ " desc->src1_addr %llx, desc->src1_size %d,"
+ " desc->dst_addr %llx, desc->max_dst_size %d,"
+ " desc->src2_addr %llx, desc->src2_size %d\n",
+ active_compression_mode->name,
+ desc->src1_addr, desc->src1_size, desc->dst_addr,
+ desc->max_dst_size, desc->src2_addr, desc->src2_size);
+
+ ret = idxd_submit_desc(wq, idxd_desc);
+ if (ret) {
+ dev_dbg(dev, "submit_desc (verify) failed ret=%d\n", ret);
+ goto err;
+ }
+
+ ret = check_completion(dev, idxd_desc->iax_completion, false, false);
+ if (ret) {
+ dev_dbg(dev, "(verify) check_completion failed ret=%d\n", ret);
+ goto err;
+ }
+
+ if (*compression_crc != idxd_desc->iax_completion->crc) {
+ ret = -EINVAL;
+ dev_dbg(dev, "(verify) iaa comp/decomp crc mismatch:"
+ " comp=0x%x, decomp=0x%x\n", *compression_crc,
+ idxd_desc->iax_completion->crc);
+ print_hex_dump(KERN_INFO, "cmp-rec: ", DUMP_PREFIX_OFFSET,
+ 8, 1, idxd_desc->iax_completion, 64, 0);
+ goto err;
+ }
+
+ idxd_free_desc(wq, idxd_desc);
+out:
+ return ret;
+err:
+ idxd_free_desc(wq, idxd_desc);
+ dev_dbg(dev, "iaa compress failed: ret=%d\n", ret);
+
+ goto out;
+}
static void iaa_desc_complete(struct idxd_desc *idxd_desc,
enum idxd_complete_type comp_type,
@@ -1226,133 +1368,6 @@ static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
goto out;
}
-static int iaa_remap_for_verify(struct device *dev, struct iaa_wq *iaa_wq,
- struct acomp_req *req,
- dma_addr_t *src_addr, dma_addr_t *dst_addr)
-{
- int ret = 0;
- int nr_sgs;
-
- dma_unmap_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE);
- dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE);
-
- nr_sgs = dma_map_sg(dev, req->src, sg_nents(req->src), DMA_FROM_DEVICE);
- if (nr_sgs <= 0 || nr_sgs > 1) {
- dev_dbg(dev, "verify: couldn't map src sg for iaa device %d,"
- " wq %d: ret=%d\n", iaa_wq->iaa_device->idxd->id,
- iaa_wq->wq->id, ret);
- ret = -EIO;
- goto out;
- }
- *src_addr = sg_dma_address(req->src);
- dev_dbg(dev, "verify: dma_map_sg, src_addr %llx, nr_sgs %d, req->src %p,"
- " req->slen %d, sg_dma_len(sg) %d\n", *src_addr, nr_sgs,
- req->src, req->slen, sg_dma_len(req->src));
-
- nr_sgs = dma_map_sg(dev, req->dst, sg_nents(req->dst), DMA_TO_DEVICE);
- if (nr_sgs <= 0 || nr_sgs > 1) {
- dev_dbg(dev, "verify: couldn't map dst sg for iaa device %d,"
- " wq %d: ret=%d\n", iaa_wq->iaa_device->idxd->id,
- iaa_wq->wq->id, ret);
- ret = -EIO;
- dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_FROM_DEVICE);
- goto out;
- }
- *dst_addr = sg_dma_address(req->dst);
- dev_dbg(dev, "verify: dma_map_sg, dst_addr %llx, nr_sgs %d, req->dst %p,"
- " req->dlen %d, sg_dma_len(sg) %d\n", *dst_addr, nr_sgs,
- req->dst, req->dlen, sg_dma_len(req->dst));
-out:
- return ret;
-}
-
-static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
- struct idxd_wq *wq,
- dma_addr_t src_addr, unsigned int slen,
- dma_addr_t dst_addr, unsigned int *dlen)
-{
- struct iaa_device_compression_mode *active_compression_mode;
- struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
- u32 *compression_crc = acomp_request_ctx(req);
- struct iaa_device *iaa_device;
- struct idxd_desc *idxd_desc;
- struct iax_hw_desc *desc;
- struct idxd_device *idxd;
- struct iaa_wq *iaa_wq;
- struct pci_dev *pdev;
- struct device *dev;
- int ret = 0;
-
- iaa_wq = idxd_wq_get_private(wq);
- iaa_device = iaa_wq->iaa_device;
- idxd = iaa_device->idxd;
- pdev = idxd->pdev;
- dev = &pdev->dev;
-
- active_compression_mode = get_iaa_device_compression_mode(iaa_device, ctx->mode);
-
- idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK);
- if (IS_ERR(idxd_desc)) {
- dev_dbg(dev, "idxd descriptor allocation failed\n");
- dev_dbg(dev, "iaa compress failed: ret=%ld\n",
- PTR_ERR(idxd_desc));
- return PTR_ERR(idxd_desc);
- }
- desc = idxd_desc->iax_hw;
-
- /* Verify (optional) - decompress and check crc, suppress dest write */
-
- desc->flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CC;
- desc->opcode = IAX_OPCODE_DECOMPRESS;
- desc->decompr_flags = IAA_DECOMP_FLAGS | IAA_DECOMP_SUPPRESS_OUTPUT;
- desc->priv = 0;
-
- desc->src1_addr = (u64)dst_addr;
- desc->src1_size = *dlen;
- desc->dst_addr = (u64)src_addr;
- desc->max_dst_size = slen;
- desc->completion_addr = idxd_desc->compl_dma;
-
- dev_dbg(dev, "(verify) compression mode %s,"
- " desc->src1_addr %llx, desc->src1_size %d,"
- " desc->dst_addr %llx, desc->max_dst_size %d,"
- " desc->src2_addr %llx, desc->src2_size %d\n",
- active_compression_mode->name,
- desc->src1_addr, desc->src1_size, desc->dst_addr,
- desc->max_dst_size, desc->src2_addr, desc->src2_size);
-
- ret = idxd_submit_desc(wq, idxd_desc);
- if (ret) {
- dev_dbg(dev, "submit_desc (verify) failed ret=%d\n", ret);
- goto err;
- }
-
- ret = check_completion(dev, idxd_desc->iax_completion, false, false);
- if (ret) {
- dev_dbg(dev, "(verify) check_completion failed ret=%d\n", ret);
- goto err;
- }
-
- if (*compression_crc != idxd_desc->iax_completion->crc) {
- ret = -EINVAL;
- dev_dbg(dev, "(verify) iaa comp/decomp crc mismatch:"
- " comp=0x%x, decomp=0x%x\n", *compression_crc,
- idxd_desc->iax_completion->crc);
- print_hex_dump(KERN_INFO, "cmp-rec: ", DUMP_PREFIX_OFFSET,
- 8, 1, idxd_desc->iax_completion, 64, 0);
- goto err;
- }
-
- idxd_free_desc(wq, idxd_desc);
-out:
- return ret;
-err:
- idxd_free_desc(wq, idxd_desc);
- dev_dbg(dev, "iaa compress failed: ret=%d\n", ret);
-
- goto out;
-}
-
static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
struct idxd_wq *wq,
dma_addr_t src_addr, unsigned int slen,
@@ -1662,6 +1677,10 @@ static void compression_ctx_init(struct iaa_compression_ctx *ctx)
ctx->use_irq = use_irq;
}
+/*********************************************
+ * Interfaces to crypto_alg and crypto_acomp.
+ *********************************************/
+
static int iaa_comp_init_fixed(struct crypto_acomp *acomp_tfm)
{
struct crypto_tfm *tfm = crypto_acomp_tfm(acomp_tfm);
@@ -1864,6 +1883,10 @@ static struct idxd_device_driver iaa_crypto_driver = {
.desc_complete = iaa_desc_complete,
};
+/********************
+ * Module init/exit.
+ ********************/
+
static int __init iaa_crypto_init_module(void)
{
int ret = 0;
--
2.27.0
* [PATCH v14 02/26] crypto: iaa - Replace sprintf with sysfs_emit in sysfs show functions
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
Replace sprintf() with sysfs_emit() in verify_compress_show() and
sync_mode_show(). sysfs_emit() is preferred for formatting sysfs output
as it provides better bounds checking. No functional changes.
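For reference, a minimal sketch of the resulting pattern (the attribute
and flag names here are hypothetical, for illustration only): sysfs_emit()
knows sysfs hands it a PAGE_SIZE buffer and bounds the formatted output
accordingly, whereas sprintf() relies on the caller never overflowing it.

	static bool example_flag;	/* hypothetical driver state */

	static ssize_t example_show(struct device_driver *driver, char *buf)
	{
		/* sysfs_emit() clamps output to the PAGE_SIZE sysfs buffer. */
		return sysfs_emit(buf, "%d\n", example_flag);
	}
	static DRIVER_ATTR_RO(example);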
Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
Acked-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
drivers/crypto/intel/iaa/iaa_crypto_main.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index e21d5fe9004c..8057e8d1571a 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -5,6 +5,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
+#include <linux/sysfs.h>
#include <linux/device.h>
#include <linux/iommu.h>
#include <uapi/linux/idxd.h>
@@ -92,7 +93,7 @@ static bool use_irq;
static ssize_t verify_compress_show(struct device_driver *driver, char *buf)
{
- return sprintf(buf, "%d\n", iaa_verify_compress);
+ return sysfs_emit(buf, "%d\n", iaa_verify_compress);
}
static ssize_t verify_compress_store(struct device_driver *driver,
@@ -150,11 +151,11 @@ static ssize_t sync_mode_show(struct device_driver *driver, char *buf)
int ret = 0;
if (!async_mode && !use_irq)
- ret = sprintf(buf, "%s\n", "sync");
+ ret = sysfs_emit(buf, "%s\n", "sync");
else if (async_mode && !use_irq)
- ret = sprintf(buf, "%s\n", "async");
+ ret = sysfs_emit(buf, "%s\n", "async");
else if (async_mode && use_irq)
- ret = sprintf(buf, "%s\n", "async_irq");
+ ret = sysfs_emit(buf, "%s\n", "async_irq");
return ret;
}
--
2.27.0
* Re: [PATCH v14 02/26] crypto: iaa - Replace sprintf with sysfs_emit in sysfs show functions
From: Herbert Xu @ 2026-02-06 10:47 UTC (permalink / raw)
To: Kanchana P Sridhar
Cc: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, davem, clabbe, ardb,
ebiggers, surenb, kristen.c.accardi, vinicius.gomes,
giovanni.cabiddu, wajdi.k.feghali
On Sat, Jan 24, 2026 at 07:35:13PM -0800, Kanchana P Sridhar wrote:
> Replace sprintf() with sysfs_emit() in verify_compress_show() and
> sync_mode_show(). sysfs_emit() is preferred to format sysfs output as it
> provides better bounds checking. No functional changes.
>
> Signed-off-by: Thorsten Blum <thorsten.blum@linux.dev>
> Acked-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
> ---
> drivers/crypto/intel/iaa/iaa_crypto_main.c | 9 +++++----
> 1 file changed, 5 insertions(+), 4 deletions(-)
This patch has already been applied to cryptodev. Please rebase
your patches.
Thanks,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* [PATCH v14 03/26] crypto: iaa - New architecture for IAA device WQ [de]comp usage & core mapping.
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This patch re-architects the iaa_crypto driver in three main aspects, to
make it more robust, stable, generic and functionally versatile enough to
support zswap users on platforms with different numbers of cores/IAAs
running workloads with different swap characteristics, and, most
importantly, to improve performance.
Summary of latency improvement for large folio compression:
===========================================================
When measured in zswap using a simple madvise workload, where 64K
folios are stored using IAA batch compressions, this is how the
per-page compress latency changes just by setting the
"distribute_comps" driver parameter to "1":
--------------------------------------------------------------
zswap compressor: deflate-iaa
64K Folios: zswap_store() latency normalized to per-page
--------------------------------------------------------------
p50 (ns) p99 (ns)
--------------------------------------------------------------
Sequential store 3,503 3,695
Batch compress, distribute_comps=0 1,356 1,384
Batch compress, distribute_comps=1 706 763
--------------------------------------------------------------
The rearchitecting aspects are:
A) Map IAA devices/wqs to cores based on packages instead of NUMA.
B) The WQ rebalancing algorithm that is invoked as WQs are
discovered/deleted has been made very general and flexible so that
the user can control exactly how IAA WQs are used, for optimizing
performance.
C) Additionally, the "iaa_crypto_enabled" driver global has been
modified to be an atomic, and is now used to synchronize
dynamic/asynchronous WQ discovery/deletion with the fundamental
routines comp_wq_table_next_wq() and decomp_wq_table_next_wq() that
are queried by compress/decompress job submissions.
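As a rough sketch of (C), not the exact driver code but consistent with
the @iaa_crypto_enabled comment block added in the diff below, the
per-CPU lookup routines test the atomic before touching the wq tables,
so that asynchronous probe/remove can disable lookups by clearing it:

	static struct idxd_wq *decomp_wq_table_next_wq(int cpu)
	{
		struct wq_table_entry *entry;

		/* Probe/remove paths clear the atomic while rebalancing wq tables. */
		if (!atomic_read(&iaa_crypto_enabled))
			return NULL;

		entry = per_cpu_ptr(cpu_decomp_wqs, cpu);
		if (!entry->n_wqs)
			return NULL;

		if (++entry->cur_wq >= entry->n_wqs)
			entry->cur_wq = 0;

		return entry->wqs[entry->cur_wq];
	}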
Description/motivation for (A):
===============================
This patch modifies the algorithm for mapping available IAA devices and
WQs to cores based on packages instead of NUMA nodes. This leads to a
more realistic mapping of IAA devices as compression/decompression
resources for a package, rather than for a NUMA node. This also resolves
problems that were observed during internal validation on Intel Granite
Rapids platforms with many more NUMA nodes than packages: for such
cases, the earlier NUMA based allocation caused some IAAs to be
over-subscribed and some to not be utilized at all.
As a result of this change from NUMA to packages, some of the core
functions used by the iaa_crypto driver's "probe" and "remove" API
have been re-written. The new infrastructure maintains a static mapping
of wqs per IAA device, in the "struct iaa_device" itself. The earlier
implementation would allocate memory per-cpu for this data, which never
changes once the IAA devices/wqs have been initialized.
Two main outcomes from this new iaa_crypto driver infrastructure are:
1) Resolves "task blocked for more than x seconds" errors observed during
internal validation on Intel systems with the earlier NUMA node based
mappings, which was root-caused to the non-optimal IAA-to-core mappings
described earlier.
2) Results in a NUM_THREADS factor reduction in memory footprint cost of
initializing IAA devices/wqs, due to eliminating the per-cpu copies of
each IAA device's wqs. On a 384-core Intel Granite Rapids server with
8 IAA devices, this saves 140MiB.
An auxiliary change included in this patch is that the driver's "nr_iaa",
"nr_iaa_per_package" and "cpus_per_iaa" global variables are made
atomic, because iaa_crypto_probe() and iaa_crypto_remove() change the
values of these variables asynchronously and concurrently as wqs get
added/deleted and rebalance_wq_table() is called. This change allows the
rebalance_wq_table() code to see consistent values of the number of IAA
devices.
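In code terms, the package-based mapping reduces to simple arithmetic.
The following condensed sketch mirrors the cpu_to_iaa() helper added
later in this patch:

	static int cpu_to_iaa_sketch(int cpu)
	{
		int package_id = topology_logical_package_id(cpu);
		int base_iaa = package_id * atomic_read(&nr_iaa_per_package);
		int iaa = base_iaa +
			  (cpu % nr_cpus_per_package) / atomic_read(&cpus_per_iaa);

		/*
		 * Clamp to the last IAA if the arithmetic overshoots. The real
		 * helper also returns -1 when no IAAs have been probed yet.
		 */
		return min(iaa, atomic_read(&nr_iaa) - 1);
	}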
Description/motivation for (B):
===============================
This builds upon the package-based driver infrastructure, to provide
more flexibility in using particular WQs for compress-only or
decompress-only jobs. It also introduces the notion of using all the IAA
devices on a package as resources that are shared by all cores on the
package: this significantly improves batching (to be added in subsequent
patches) latency and compress/decompress throughput. New sysfs driver
parameters make these features configurable.
Two main concepts are introduced as part of the rebalancing changes:
1) An IAA WQ can be dedicated to specific ops; this determines the WQ
"type" to which the iaa_crypto driver submits compress/decompress jobs:
- compress only
- decompress only
- generic, i.e., for both compresses and decompresses
The WQ type is decided based on the number of WQs configured for a
given IAA device, and the new "g_comp_wqs_per_iaa" driver parameter.
2) An IAA WQ can be mapped to cores using either of the following
balancing techniques:
a) Shared by all cores on a package. The iaa_crypto driver will
dispatch compress/decompress jobs to all WQs of the same type,
across all IAA devices on the package:
- IAA compress jobs will be distributed to all same-package IAA
compress-only/generic WQs.
- IAA decompress jobs will be distributed to all same-package IAA
decompress-only/generic WQs.
b) Handles compress/decompress jobs only from its "mapped cores", i.e.,
the cores assigned to that IAA by evenly dividing the cores on a
package among the IAAs on that package.
Server setups that are moderately to highly contended can benefit from
(2.a). When the mix of workloads running on a system needs high compress
throughput and has relatively little decompress activity, (2.b) might
be more optimal for decompressions.
These approaches can be accomplished with the following new iaa_crypto
driver parameters. These parameters are global settings and will apply
to all IAAs on a package, interpreted in the context of the number of
WQs configured per IAA device.
g_comp_wqs_per_iaa:
===================
Number of compress-only WQs. The default is 1, but is applicable only
if the device has more than 1 WQ. If the device has exactly 1 WQ
configured, "g_comp_wqs_per_iaa" is a don't care.
If the IAA device has more than "g_comp_wqs_per_iaa" WQs configured,
the last "g_comp_wqs_per_iaa" number of WQs will be considered as
"compress only". The remaining WQs will be considered as
"decompress only".
If the device has less than or equal to "g_comp_wqs_per_iaa" WQs, all
the device's WQs will be considered "generic", i.e., the driver will
submit compress and decompress jobs to all the WQs configured for the
device.
For example, if an IAA "X" has 2 WQs, this will set up 1 decompress WQ
and 1 compress WQ:
echo 1 > /sys/bus/dsa/drivers/crypto/g_comp_wqs_per_iaa
wqX.0: decompress jobs only.
wqX.1: compress jobs only.
This setting would typically benefit workloads that see a high
level of compress and decompress activity.
If an IAA has 1 WQ, that WQ will be considered "generic": the driver
will submit compress and decompress jobs to the same WQ (this is
independent of the "g_comp_wqs_per_iaa" setting):
wqX.0: compress and decompress jobs.
This would typically benefit workloads that see significant cold
memory being reclaimed, and consequently, high swapout and low swapin
activity.
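The classification rule above can be summarized with a small helper
answering whether a given WQ is compress-only; the helper name is
hypothetical and for illustration only, but the rule matches the
description:

	/* Hypothetical helper: is the wq at @wq_idx a compress-only wq? */
	static bool wq_is_comp_only(struct iaa_device *iaa_device, int wq_idx)
	{
		/* <= g_comp_wqs_per_iaa WQs configured: all WQs are "generic". */
		if (iaa_device->n_wq <= g_comp_wqs_per_iaa)
			return false;

		/* Otherwise the last g_comp_wqs_per_iaa WQs are compress-only. */
		return wq_idx >= iaa_device->n_wq - g_comp_wqs_per_iaa;
	}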
distribute_comps:
=================
Distribute compressions to all IAAs on package (default is Y).
Assuming the WQ type has been established as
compress-only/decompress-only/generic, this setting will determine if
the driver will distribute compress jobs to all IAAs on a package
(default behavior) or not.
If this is turned off, the driver will dispatch compress jobs to a
given IAA "compression enabled" WQ only from cores that are mapped to
that IAA using an algorithm that evenly distributes IAAs per package
to cores per package. For example, on a Sapphire Rapids server with
56 physical cores and 4 IAAs per package, with Hyperthreading, 28
logical cores will be assigned to each IAA. With the
"distribute_comps" driver parameter turned off, a core's compress jobs
will be sent only to its assigned IAA device.
Enabling "distribute_comps" would typically benefit workloads in
terms of batch compress latency and throughput.
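A rough sketch of how this choice plays out when the per-CPU compress
tables are rebalanced (the helper name is hypothetical;
pkg_global_comp_wqs and cpu_comp_wqs are the per-package and per-CPU
tables added by this patch). With distribute_comps on, every CPU on a
package sees all of the package's compress-capable WQs, so the usual
round-robin walk spreads jobs across all package IAAs:

	static void sketch_fill_cpu_comp_wqs(int cpu)
	{
		struct wq_table_entry *entry = per_cpu_ptr(cpu_comp_wqs, cpu);
		struct wq_table_entry *pkg_wqs =
			pkg_global_comp_wqs[topology_logical_package_id(cpu)];
		unsigned int i;

		entry->n_wqs = 0;
		entry->cur_wq = 0;

		if (iaa_distribute_comps) {
			/* Share every compress-capable WQ on this CPU's package. */
			for (i = 0; i < pkg_wqs->n_wqs && i < entry->max_wqs; i++)
				entry->wqs[i] = pkg_wqs->wqs[i];
			entry->n_wqs = i;
			return;
		}

		/*
		 * Otherwise only the compress WQs of the one IAA this CPU maps
		 * to (via cpu_to_iaa()) would be copied into the entry here.
		 */
	}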
distribute_decomps:
===================
Distribute decompressions to all IAAs on package (default is N).
Assuming the WQ type has been established as
compress-only/decompress-only/generic, this setting will determine if
the driver will distribute decompress jobs to all IAAs on a package
or not (default behavior).
We recommend leaving this parameter at its default setting of "N".
Enabling "distribute_decomps = Y" can be evaluated for workloads that
are sensitive to p99 decompress latency, and see a high level of
compress and decompress activity (e.g., warm memory reclaim/swapin).
Recommended settings for best compress/decompress latency, throughput
and hence memory savings for a moderately contended server, are:
2 WQs per IAA
g_comp_wqs_per_iaa = 1 (separate WQ for comps/decomps per IAA)
distribute_decomps = N
distribute_comps = Y
For systems that have one IAA device, the distribute_[de]comps settings
will be a no-op. Even for such systems, as long as considerable swapout
and swapin activity is expected, we recommend setting up 2 WQs
for the IAA, one each for compressions/decompressions. If swapouts are
significantly more than swapins, 1 WQ would be a better configuration,
as mentioned earlier.
Examples:
=========
For a Sapphire Rapids server with 2 packages, 56 cores per package and
4 IAAs per package, where each IAA has 2 WQs and these settings are in
effect:
echo 1 > /sys/bus/dsa/drivers/crypto/g_comp_wqs_per_iaa
echo 1 > /sys/bus/dsa/drivers/crypto/distribute_comps
echo 0 > /sys/bus/dsa/drivers/crypto/distribute_decomps
wqX.0: decompress jobs only.
wqX.1: compress jobs only.
Compress jobs from all cores on package-0 will be distributed in
round-robin manner to [iax1, iax3, iax5, iax7]'s wqX.1, to maximize
compression throughput/latency/memory savings:
wq1.1
wq3.1
wq5.1
wq7.1
Likewise, compress jobs from all cores on package-1 will be
distributed in round-robin manner to [iax9, iax11, iax13, iax15]'s
wqX.1, to maximize compression throughput/latency/memory savings for
workloads running on package-1:
wq9.1
wq11.1
wq13.1
wq15.1
Decompress jobs will be submitted from mapped logical cores only, as
follows:
package-0:
CPU 0-13,112-125 14-27,126-139 28-41,140-153 42-55,154-167
IAA: iax1 iax3 iax5 iax7
WQ: wq1.0 wq3.0 wq5.0 wq7.0
package-1:
CPU 56-69,168-181 70-83,182-195 84-97,196-209 98-111,210-223
IAA: iax9 iax11 iax13 iax15
WQ: wq9.0 wq11.0 wq13.0 wq15.0
IAA WQs can be configured using higher level scripts as described in
Documentation/driver-api/crypto/iaa/iaa-crypto.rst. This documentation
has been updated for the above new sysfs parameters.
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
.../driver-api/crypto/iaa/iaa-crypto.rst | 136 +++
drivers/crypto/intel/iaa/iaa_crypto.h | 18 +-
drivers/crypto/intel/iaa/iaa_crypto_main.c | 905 ++++++++++++++----
3 files changed, 884 insertions(+), 175 deletions(-)
diff --git a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
index f815d4fd8372..0ff4ec603b43 100644
--- a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
+++ b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
@@ -290,6 +290,142 @@ The available attributes are:
'sync' mode. This is to ensure correct iaa_crypto behavior until true
async polling without interrupts is enabled in iaa_crypto.
+ - g_comp_wqs_per_iaa
+
+ Number of compress-only WQs. The default is 1, but is applicable only
+ if the device has more than 1 WQ. If the device has exactly 1 WQ
+ configured, "g_comp_wqs_per_iaa" is a don't care.
+
+ If the IAA device has more than "g_comp_wqs_per_iaa" WQs configured,
+ the last "g_comp_wqs_per_iaa" number of WQs will be considered as
+ "compress only". The remaining WQs will be considered as "decomp only".
+
+ If the device has less than or equal to "g_comp_wqs_per_iaa" WQs, all
+ the device's WQs will be considered "generic", i.e., the driver will
+ submit compress and decompress jobs to all the WQs configured for the
+ device.
+
+ For e.g., if an IAA "X" has 2 WQs, this will set up 1 decompress WQ and
+ 1 compress WQ::
+
+ echo 1 > /sys/bus/dsa/drivers/crypto/g_comp_wqs_per_iaa
+
+ wqX.0: decompress jobs only.
+ wqX.1: compress jobs only.
+
+ This setting would typically benefit workloads that see a high
+ level of compress and decompress activity.
+
+ If an IAA has 1 WQ, that WQ will be considered "generic": the driver
+ will submit compress and decompress jobs to the same WQ (this is
+ independent of the "g_comp_wqs_per_iaa" setting):
+
+ wqX.0: compress and decompress jobs.
+
+ This would typically benefit workloads that see significant cold
+ memory being reclaimed, and consequently, high swapout and low swapin
+ activity.
+
+ - distribute_comps
+
+ Distribute compressions to all IAAs on package (default is Y).
+
+ Assuming the WQ type has been established as
+ compress-only/decompress-only/generic, this setting will determine if
+ the driver will distribute compress jobs to all IAAs on a package
+ (default behavior) or not.
+
+ If this is turned off, the driver will dispatch compress jobs to a
+ given IAA "compression enabled" WQ only from cores that are mapped to
+ that IAA using an algorithm that evenly distributes IAAs per package
+ to cores per package. For e.g., on a Sapphire Rapids server with
+ 56-physical-cores and 4 IAAs per package, with Hyperthreading, 28
+ logical cores will be assigned to each IAA. With the
+ "distribute_comps" driver parameter turned off, the driver will send
+ compress jobs only to its assigned IAA device.
+
+ Enabling "distribute_comps" would typically benefit workloads in
+ terms of batch compress latency and throughput.
+
+ - distribute_decomps
+
+ Distribute decompressions to all IAAs on package (default is N).
+
+ Assuming the WQ type has been established as
+ compress-only/decompress-only/generic, this setting will determine if
+ the driver will distribute decompress jobs to all IAAs on a package
+ or not (default behavior).
+
+ Enabling "distribute_decomps" would typically benefit workloads that
+ see a high level of compress and decompress activity, especially
+ p99 decompress latency.
+
+ Recommended settings for best compress/decompress latency, throughput
+ and hence memory savings for a moderately contended server that
+ has more than 1 IAA device enabled on a given package:
+
+ 2 WQs per IAA
+ g_comp_wqs_per_iaa = 1 (separate WQ for comps/decomps per IAA)
+ distribute_decomps = Y
+ distribute_comps = Y
+
+ For a system that has only 1 IAA device enabled on a given package,
+ the recommended settings are:
+
+ 1 WQ per IAA
+ g_comp_wqs_per_iaa = 0 (same WQ for comps/decomps)
+ distribute_decomps = N
+ distribute_comps = N
+
+ Examples:
+
+ For a Sapphire Rapids server with 2 packages, 56 cores and 4 IAAs per
+ package, each IAA has 2 WQs, and these settings are in effect::
+
+ echo 1 > /sys/bus/dsa/drivers/crypto/g_comp_wqs_per_iaa
+ echo 1 > /sys/bus/dsa/drivers/crypto/distribute_comps
+ echo 0 > /sys/bus/dsa/drivers/crypto/distribute_decomps
+
+ This enables the following behavior:
+
+ wqX.0: decompress jobs only.
+ wqX.1: compress jobs only.
+
+ Compress jobs from all cores on package-0 will be distributed in
+ round-robin manner to [iax1, iax3, iax5, iax7]'s wqX.1, to maximize
+ compression throughput/latency/memory savings:
+
+ wq1.1
+ wq3.1
+ wq5.1
+ wq7.1
+
+ Likewise, compress jobs from all cores on package-1 will be
+ distributed in round-robin manner to [iax9, iax11, iax13, iax15]'s
+ wqX.1, to maximize compression throughput/latency/memory savings for
+ workloads running on package-1:
+
+ wq9.1
+ wq11.1
+ wq13.1
+ wq15.1
+
+ Decompress jobs will be submitted from mapped logical cores only, as
+ follows:
+
+ package-0:
+
+ CPU 0-13,112-125 14-27,126-139 28-41,140-153 42-55,154-167
+ IAA: iax1 iax3 iax5 iax7
+ WQ: wq1.0 wq3.0 wq5.0 wq7.0
+
+ package-1:
+
+ CPU 56-69,168-181 70-83,182-195 84-97,196-209 98-111,210-223
+ IAA: iax9 iax11 iax13 iax15
+ WQ: wq9.0 wq11.0 wq13.0 wq15.0
+
+
.. _iaa_default_config:
IAA Default Configuration
diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/iaa/iaa_crypto.h
index 56985e395263..549ac98a9366 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto.h
+++ b/drivers/crypto/intel/iaa/iaa_crypto.h
@@ -46,6 +46,7 @@ struct iaa_wq {
struct idxd_wq *wq;
int ref;
bool remove;
+ bool mapped;
struct iaa_device *iaa_device;
@@ -63,6 +64,13 @@ struct iaa_device_compression_mode {
dma_addr_t aecs_comp_table_dma_addr;
};
+struct wq_table_entry {
+ struct idxd_wq **wqs;
+ unsigned int max_wqs;
+ unsigned int n_wqs;
+ unsigned int cur_wq;
+};
+
/* Representation of IAA device with wqs, populated by probe */
struct iaa_device {
struct list_head list;
@@ -73,19 +81,15 @@ struct iaa_device {
int n_wq;
struct list_head wqs;
+ struct wq_table_entry *generic_wq_table;
+ struct wq_table_entry *comp_wq_table;
+
atomic64_t comp_calls;
atomic64_t comp_bytes;
atomic64_t decomp_calls;
atomic64_t decomp_bytes;
};
-struct wq_table_entry {
- struct idxd_wq **wqs;
- int max_wqs;
- int n_wqs;
- int cur_wq;
-};
-
#define IAA_AECS_ALIGN 32
/*
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index 8057e8d1571a..85944ff212e5 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -24,32 +24,86 @@
#define pr_fmt(fmt) "idxd: " IDXD_SUBDRIVER_NAME ": " fmt
#define IAA_ALG_PRIORITY 300
+#define MAX_PKG_IAA 8
+#define MAX_IAA_WQ 8
/**************************************
* Driver internal global variables.
**************************************/
/* number of iaa instances probed */
-static unsigned int nr_iaa;
+static atomic_t nr_iaa = ATOMIC_INIT(0);
static unsigned int nr_cpus;
-static unsigned int nr_nodes;
-static unsigned int nr_cpus_per_node;
+static unsigned int nr_packages;
+static unsigned int nr_cpus_per_package;
+static atomic_t nr_iaa_per_package = ATOMIC_INIT(0);
/* Number of physical cpus sharing each iaa instance */
-static unsigned int cpus_per_iaa;
+static atomic_t cpus_per_iaa = ATOMIC_INIT(0);
-/* Per-cpu lookup table for balanced wqs */
-static struct wq_table_entry __percpu *wq_table;
+/* Per-cpu lookup table for decomp wqs. */
+static struct wq_table_entry __percpu *cpu_decomp_wqs;
+
+/* Per-cpu lookup table for comp wqs. */
+static struct wq_table_entry __percpu *cpu_comp_wqs;
+
+/* All decomp wqs from IAAs on a package. */
+static struct wq_table_entry **pkg_global_decomp_wqs;
+/* All comp wqs from IAAs on a package. */
+static struct wq_table_entry **pkg_global_comp_wqs;
LIST_HEAD(iaa_devices);
DEFINE_MUTEX(iaa_devices_lock);
-/* If enabled, IAA hw crypto algos are registered, unavailable otherwise */
-static bool iaa_crypto_enabled;
+/*
+ * If enabled, IAA hw crypto algos are registered, unavailable otherwise:
+ *
+ * We use the atomic @iaa_crypto_enabled to know if the per-CPU
+ * compress/decompress wq tables have been setup successfully.
+ * Since @iaa_crypto_enabled is atomic, the core functions that
+ * return a wq for compression/decompression, namely,
+ * comp_wq_table_next_wq() and decomp_wq_table_next_wq() will
+ * test this atomic before proceeding to query the per-cpu wq tables.
+ *
+ * These events will set @iaa_crypto_enabled to 1:
+ * - Successful rebalance_wq_table() after individual wq addition/removal.
+ *
+ * These events will set @iaa_crypto_enabled to 0:
+ * - Error during rebalance_wq_table() after individual wq addition/removal.
+ * - check_completion() timeouts.
+ * - @nr_iaa is 0.
+ * - module cleanup.
+ */
+static atomic_t iaa_crypto_enabled = ATOMIC_INIT(0);
+
+/*
+ * First wq probed, to use until @iaa_crypto_enabled is 1:
+ *
+ * The first wq probed will be entered in the per-CPU comp/decomp wq tables
+ * until the IAA compression modes are registered. This is done to facilitate
+ * the compress/decompress calls from the crypto testmgr resulting from
+ * calling crypto_register_acomp().
+ *
+ * With the new dynamic package-level rebalancing of WQs being
+ * discovered asynchronously and concurrently with tests
+ * triggered from device registration, this is needed to
+ * determine when it is safe for the rebalancing of decomp/comp
+ * WQs to de-allocate the per-package WQs and re-allocate them
+ * based on the latest number of IAA devices and WQs.
+ */
+static struct idxd_wq *first_wq_found;
+DEFINE_MUTEX(first_wq_found_lock);
+
static bool iaa_crypto_registered;
static struct iaa_compression_mode *iaa_compression_modes[IAA_COMP_MODES_MAX];
+/* Distribute decompressions across all IAAs on the package. */
+static bool iaa_distribute_decomps;
+
+/* Distribute compressions across all IAAs on the package. */
+static bool iaa_distribute_comps = true;
+
/* Verify results of IAA compress or not */
static bool iaa_verify_compress = true;
@@ -87,6 +141,9 @@ static bool async_mode;
/* Use interrupts */
static bool use_irq;
+/* Number of compress-only wqs per iaa */
+static unsigned int g_comp_wqs_per_iaa = 1;
+
/**************************************************
* Driver attributes along with get/set functions.
**************************************************/
@@ -103,7 +160,7 @@ static ssize_t verify_compress_store(struct device_driver *driver,
mutex_lock(&iaa_devices_lock);
- if (iaa_crypto_enabled)
+ if (atomic_read(&iaa_crypto_enabled))
goto out;
ret = kstrtobool(buf, &iaa_verify_compress);
@@ -167,7 +224,7 @@ static ssize_t sync_mode_store(struct device_driver *driver,
mutex_lock(&iaa_devices_lock);
- if (iaa_crypto_enabled)
+ if (atomic_read(&iaa_crypto_enabled))
goto out;
ret = set_iaa_sync_mode(buf);
@@ -180,6 +237,87 @@ static ssize_t sync_mode_store(struct device_driver *driver,
}
static DRIVER_ATTR_RW(sync_mode);
+static ssize_t g_comp_wqs_per_iaa_show(struct device_driver *driver, char *buf)
+{
+ return sysfs_emit(buf, "%u\n", g_comp_wqs_per_iaa);
+}
+
+static ssize_t g_comp_wqs_per_iaa_store(struct device_driver *driver,
+ const char *buf, size_t count)
+{
+ int ret = -EBUSY;
+
+ mutex_lock(&iaa_devices_lock);
+
+ if (atomic_read(&iaa_crypto_enabled))
+ goto out;
+
+ ret = kstrtouint(buf, 10, &g_comp_wqs_per_iaa);
+ if (ret)
+ goto out;
+
+ ret = count;
+out:
+ mutex_unlock(&iaa_devices_lock);
+
+ return ret;
+}
+static DRIVER_ATTR_RW(g_comp_wqs_per_iaa);
+
+static ssize_t distribute_decomps_show(struct device_driver *driver, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", iaa_distribute_decomps);
+}
+
+static ssize_t distribute_decomps_store(struct device_driver *driver,
+ const char *buf, size_t count)
+{
+ int ret = -EBUSY;
+
+ mutex_lock(&iaa_devices_lock);
+
+ if (atomic_read(&iaa_crypto_enabled))
+ goto out;
+
+ ret = kstrtobool(buf, &iaa_distribute_decomps);
+ if (ret)
+ goto out;
+
+ ret = count;
+out:
+ mutex_unlock(&iaa_devices_lock);
+
+ return ret;
+}
+static DRIVER_ATTR_RW(distribute_decomps);
+
+static ssize_t distribute_comps_show(struct device_driver *driver, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", iaa_distribute_comps);
+}
+
+static ssize_t distribute_comps_store(struct device_driver *driver,
+ const char *buf, size_t count)
+{
+ int ret = -EBUSY;
+
+ mutex_lock(&iaa_devices_lock);
+
+ if (atomic_read(&iaa_crypto_enabled))
+ goto out;
+
+ ret = kstrtobool(buf, &iaa_distribute_comps);
+ if (ret)
+ goto out;
+
+ ret = count;
+out:
+ mutex_unlock(&iaa_devices_lock);
+
+ return ret;
+}
+static DRIVER_ATTR_RW(distribute_comps);
+
/****************************
* Driver compression modes.
****************************/
@@ -465,32 +603,81 @@ static void remove_device_compression_modes(struct iaa_device *iaa_device)
* allocate/init/query/deallocate devices/wqs.
***********************************************************/
-static struct iaa_device *iaa_device_alloc(void)
+static struct iaa_device *iaa_device_alloc(struct idxd_device *idxd)
{
struct iaa_device *iaa_device;
+ struct wq_table_entry *wqt;
iaa_device = kzalloc(sizeof(*iaa_device), GFP_KERNEL);
if (!iaa_device)
- return NULL;
+ goto err;
+
+ iaa_device->idxd = idxd;
+
+ /* IAA device's generic/decomp wqs. */
+ iaa_device->generic_wq_table = kzalloc(sizeof(struct wq_table_entry), GFP_KERNEL);
+ if (!iaa_device->generic_wq_table)
+ goto err;
+
+ wqt = iaa_device->generic_wq_table;
+
+ wqt->wqs = kcalloc(iaa_device->idxd->max_wqs, sizeof(struct idxd_wq *), GFP_KERNEL);
+ if (!wqt->wqs)
+ goto err;
+
+ wqt->max_wqs = iaa_device->idxd->max_wqs;
+ wqt->n_wqs = 0;
+
+ /*
+ * IAA device's comp wqs (optional). If the device has more than
+ * "g_comp_wqs_per_iaa" WQs configured, the last "g_comp_wqs_per_iaa"
+ * number of WQs will be considered as "comp only". The remaining
+ * WQs will be considered as "decomp only".
+ * If the device has <= "g_comp_wqs_per_iaa" WQs, all the
+ * device's WQs will be considered "generic", i.e., cores can submit
+ * comp and decomp jobs to all the WQs configured for the device.
+ */
+ iaa_device->comp_wq_table = kzalloc(sizeof(struct wq_table_entry), GFP_KERNEL);
+ if (!iaa_device->comp_wq_table)
+ goto err;
+
+ wqt = iaa_device->comp_wq_table;
+
+ wqt->wqs = kcalloc(iaa_device->idxd->max_wqs, sizeof(struct idxd_wq *), GFP_KERNEL);
+ if (!wqt->wqs)
+ goto err;
+
+ wqt->max_wqs = iaa_device->idxd->max_wqs;
+ wqt->n_wqs = 0;
INIT_LIST_HEAD(&iaa_device->wqs);
return iaa_device;
+
+err:
+ if (iaa_device) {
+ if (iaa_device->generic_wq_table) {
+ kfree(iaa_device->generic_wq_table->wqs);
+ kfree(iaa_device->generic_wq_table);
+ }
+ kfree(iaa_device->comp_wq_table);
+ kfree(iaa_device);
+ }
+
+ return NULL;
}
static struct iaa_device *add_iaa_device(struct idxd_device *idxd)
{
struct iaa_device *iaa_device;
- iaa_device = iaa_device_alloc();
+ iaa_device = iaa_device_alloc(idxd);
if (!iaa_device)
return NULL;
- iaa_device->idxd = idxd;
-
list_add_tail(&iaa_device->list, &iaa_devices);
- nr_iaa++;
+ atomic_inc(&nr_iaa);
return iaa_device;
}
@@ -510,7 +697,7 @@ static void del_iaa_device(struct iaa_device *iaa_device)
{
list_del(&iaa_device->list);
- nr_iaa--;
+ atomic_dec(&nr_iaa);
}
static void free_iaa_device(struct iaa_device *iaa_device)
@@ -519,6 +706,17 @@ static void free_iaa_device(struct iaa_device *iaa_device)
return;
remove_device_compression_modes(iaa_device);
+
+ if (iaa_device->generic_wq_table) {
+ kfree(iaa_device->generic_wq_table->wqs);
+ kfree(iaa_device->generic_wq_table);
+ }
+
+ if (iaa_device->comp_wq_table) {
+ kfree(iaa_device->comp_wq_table->wqs);
+ kfree(iaa_device->comp_wq_table);
+ }
+
kfree(iaa_device);
}
@@ -568,16 +766,16 @@ static void del_iaa_wq(struct iaa_device *iaa_device, struct idxd_wq *wq)
struct idxd_device *idxd = iaa_device->idxd;
struct pci_dev *pdev = idxd->pdev;
struct device *dev = &pdev->dev;
- struct iaa_wq *iaa_wq;
+ struct iaa_wq *iaa_wq, *next_iaa_wq;
- list_for_each_entry(iaa_wq, &iaa_device->wqs, list) {
+ list_for_each_entry_safe(iaa_wq, next_iaa_wq, &iaa_device->wqs, list) {
if (iaa_wq->wq == wq) {
list_del(&iaa_wq->list);
iaa_device->n_wq--;
dev_dbg(dev, "removed wq %d from iaa_device %d, n_wq %d, nr_iaa %d\n",
wq->id, iaa_device->idxd->id,
- iaa_device->n_wq, nr_iaa);
+ iaa_device->n_wq, atomic_read(&nr_iaa));
if (iaa_device->n_wq == 0)
del_iaa_device(iaa_device);
@@ -588,21 +786,30 @@ static void del_iaa_wq(struct iaa_device *iaa_device, struct idxd_wq *wq)
static void remove_iaa_wq(struct idxd_wq *wq)
{
- struct iaa_device *iaa_device;
+ struct iaa_device *iaa_device, *next_iaa_device;
+ unsigned int num_pkg_iaa = 0;
- list_for_each_entry(iaa_device, &iaa_devices, list) {
+ list_for_each_entry_safe(iaa_device, next_iaa_device, &iaa_devices, list) {
if (iaa_has_wq(iaa_device, wq)) {
del_iaa_wq(iaa_device, wq);
break;
}
}
- if (nr_iaa) {
- cpus_per_iaa = (nr_nodes * nr_cpus_per_node) / nr_iaa;
- if (!cpus_per_iaa)
- cpus_per_iaa = 1;
- } else
- cpus_per_iaa = 1;
+ if (atomic_read(&nr_iaa)) {
+ atomic_set(&cpus_per_iaa, (nr_packages * nr_cpus_per_package) / atomic_read(&nr_iaa));
+ if (!atomic_read(&cpus_per_iaa))
+ atomic_set(&cpus_per_iaa, 1);
+
+ num_pkg_iaa = atomic_read(&nr_iaa) / nr_packages;
+ if (!num_pkg_iaa)
+ num_pkg_iaa = 1;
+ } else {
+ atomic_set(&cpus_per_iaa, 1);
+ num_pkg_iaa = 1;
+ }
+
+ atomic_set(&nr_iaa_per_package, num_pkg_iaa);
}
static void __free_iaa_wq(struct iaa_wq *iaa_wq)
@@ -636,6 +843,7 @@ static int save_iaa_wq(struct idxd_wq *wq)
struct pci_dev *pdev;
struct device *dev;
int ret = 0;
+ unsigned int num_pkg_iaa = 0;
list_for_each_entry(iaa_device, &iaa_devices, list) {
if (iaa_device->idxd == wq->idxd) {
@@ -688,12 +896,19 @@ static int save_iaa_wq(struct idxd_wq *wq)
}
}
- if (WARN_ON(nr_iaa == 0))
+ if (WARN_ON(atomic_read(&nr_iaa) == 0))
return -EINVAL;
- cpus_per_iaa = (nr_nodes * nr_cpus_per_node) / nr_iaa;
- if (!cpus_per_iaa)
- cpus_per_iaa = 1;
+ atomic_set(&cpus_per_iaa, (nr_packages * nr_cpus_per_package) / atomic_read(&nr_iaa));
+ if (!atomic_read(&cpus_per_iaa))
+ atomic_set(&cpus_per_iaa, 1);
+
+ num_pkg_iaa = atomic_read(&nr_iaa) / nr_packages;
+ if (!num_pkg_iaa)
+ num_pkg_iaa = 1;
+
+ atomic_set(&nr_iaa_per_package, num_pkg_iaa);
+
out:
return ret;
}
@@ -749,105 +964,290 @@ static int iaa_wq_put(struct idxd_wq *wq)
* Mapping IAA devices and wqs to cores with per-cpu wq_tables.
***************************************************************/
-static void wq_table_free_entry(int cpu)
+/*
+ * Given a cpu, find the closest IAA instance.
+ */
+static inline int cpu_to_iaa(int cpu)
{
- struct wq_table_entry *entry = per_cpu_ptr(wq_table, cpu);
+ int package_id, base_iaa, iaa = 0;
- kfree(entry->wqs);
- memset(entry, 0, sizeof(*entry));
+ if (!nr_packages || !atomic_read(&nr_iaa_per_package) || !atomic_read(&nr_iaa))
+ return -1;
+
+ package_id = topology_logical_package_id(cpu);
+ base_iaa = package_id * atomic_read(&nr_iaa_per_package);
+ iaa = base_iaa + ((cpu % nr_cpus_per_package) / atomic_read(&cpus_per_iaa));
+
+ pr_debug("cpu = %d, package_id = %d, base_iaa = %d, iaa = %d",
+ cpu, package_id, base_iaa, iaa);
+
+ if (iaa >= 0 && iaa < atomic_read(&nr_iaa))
+ return iaa;
+
+ return (atomic_read(&nr_iaa) - 1);
}
-static void wq_table_clear_entry(int cpu)
+static void free_wq_tables(void)
{
- struct wq_table_entry *entry = per_cpu_ptr(wq_table, cpu);
+ if (cpu_decomp_wqs) {
+ free_percpu(cpu_decomp_wqs);
+ cpu_decomp_wqs = NULL;
+ }
- entry->n_wqs = 0;
- entry->cur_wq = 0;
- memset(entry->wqs, 0, entry->max_wqs * sizeof(struct idxd_wq *));
+ if (cpu_comp_wqs) {
+ free_percpu(cpu_comp_wqs);
+ cpu_comp_wqs = NULL;
+ }
+
+ pr_debug("freed comp/decomp wq tables\n");
}
-static void clear_wq_table(void)
+static void pkg_global_wqs_dealloc(void)
{
- int cpu;
+ int i;
- for (cpu = 0; cpu < nr_cpus; cpu++)
- wq_table_clear_entry(cpu);
+ if (pkg_global_decomp_wqs) {
+ for (i = 0; i < nr_packages; ++i) {
+ kfree(pkg_global_decomp_wqs[i]->wqs);
+ kfree(pkg_global_decomp_wqs[i]);
+ }
+ kfree(pkg_global_decomp_wqs);
+ pkg_global_decomp_wqs = NULL;
+ }
- pr_debug("cleared wq table\n");
+ if (pkg_global_comp_wqs) {
+ for (i = 0; i < nr_packages; ++i) {
+ kfree(pkg_global_comp_wqs[i]->wqs);
+ kfree(pkg_global_comp_wqs[i]);
+ }
+ kfree(pkg_global_comp_wqs);
+ pkg_global_comp_wqs = NULL;
+ }
}
-static void free_wq_table(void)
+static bool pkg_global_wqs_alloc(void)
{
- int cpu;
+ int i;
+
+ pkg_global_decomp_wqs = kcalloc(nr_packages, sizeof(*pkg_global_decomp_wqs), GFP_KERNEL);
+ if (!pkg_global_decomp_wqs)
+ return false;
+
+ for (i = 0; i < nr_packages; ++i) {
+ pkg_global_decomp_wqs[i] = kzalloc(sizeof(struct wq_table_entry), GFP_KERNEL);
+ if (!pkg_global_decomp_wqs[i])
+ goto err;
+
+ pkg_global_decomp_wqs[i]->wqs = kcalloc(MAX_PKG_IAA * MAX_IAA_WQ, sizeof(struct idxd_wq *), GFP_KERNEL);
+ if (!pkg_global_decomp_wqs[i]->wqs)
+ goto err;
+
+ pkg_global_decomp_wqs[i]->max_wqs = MAX_PKG_IAA * MAX_IAA_WQ;
+ }
+
+ pkg_global_comp_wqs = kcalloc(nr_packages, sizeof(*pkg_global_comp_wqs), GFP_KERNEL);
+ if (!pkg_global_comp_wqs)
+ goto err;
- for (cpu = 0; cpu < nr_cpus; cpu++)
- wq_table_free_entry(cpu);
+ for (i = 0; i < nr_packages; ++i) {
+ pkg_global_comp_wqs[i] = kzalloc(sizeof(struct wq_table_entry), GFP_KERNEL);
+ if (!pkg_global_comp_wqs[i])
+ goto err;
+
+ pkg_global_comp_wqs[i]->wqs = kcalloc(MAX_PKG_IAA * MAX_IAA_WQ, sizeof(struct idxd_wq *), GFP_KERNEL);
+ if (!pkg_global_comp_wqs[i]->wqs)
+ goto err;
+
+ pkg_global_comp_wqs[i]->max_wqs = MAX_PKG_IAA * MAX_IAA_WQ;
+ }
- free_percpu(wq_table);
+ return true;
- pr_debug("freed wq table\n");
+err:
+ pkg_global_wqs_dealloc();
+ return false;
}
static int alloc_wq_table(int max_wqs)
{
- struct wq_table_entry *entry;
- int cpu;
-
- wq_table = alloc_percpu(struct wq_table_entry);
- if (!wq_table)
+ cpu_decomp_wqs = alloc_percpu_gfp(struct wq_table_entry, GFP_KERNEL | __GFP_ZERO);
+ if (!cpu_decomp_wqs)
return -ENOMEM;
- for (cpu = 0; cpu < nr_cpus; cpu++) {
- entry = per_cpu_ptr(wq_table, cpu);
- entry->wqs = kcalloc(max_wqs, sizeof(*entry->wqs), GFP_KERNEL);
- if (!entry->wqs) {
- free_wq_table();
- return -ENOMEM;
- }
+ cpu_comp_wqs = alloc_percpu_gfp(struct wq_table_entry, GFP_KERNEL | __GFP_ZERO);
+ if (!cpu_comp_wqs)
+ goto err;
- entry->max_wqs = max_wqs;
- }
+ if (!pkg_global_wqs_alloc())
+ goto err;
pr_debug("initialized wq table\n");
return 0;
+
+err:
+ free_wq_tables();
+ return -ENOMEM;
+}
+
+/*
+ * The caller should have established that device_iaa_wqs is not empty,
+ * i.e., every IAA device in "iaa_devices" has at least one WQ.
+ */
+static void add_device_wqs_to_wq_table(struct wq_table_entry *dst_wq_table,
+ struct wq_table_entry *device_wq_table)
+{
+ int i;
+
+ for (i = 0; i < device_wq_table->n_wqs; ++i)
+ dst_wq_table->wqs[dst_wq_table->n_wqs++] = device_wq_table->wqs[i];
}
-static void wq_table_add(int cpu, struct idxd_wq *wq)
+static bool reinit_pkg_global_wqs(bool comp)
{
- struct wq_table_entry *entry = per_cpu_ptr(wq_table, cpu);
+ int cur_iaa = 0, pkg = 0;
+ struct iaa_device *iaa_device;
+ struct wq_table_entry **pkg_wqs = comp ? pkg_global_comp_wqs : pkg_global_decomp_wqs;
+
+ for (pkg = 0; pkg < nr_packages; ++pkg)
+ pkg_wqs[pkg]->n_wqs = 0;
+
+ pkg = 0;
+
+one_iaa_special_case:
+ /* Re-initialize per-package wqs. */
+ list_for_each_entry(iaa_device, &iaa_devices, list) {
+ struct wq_table_entry *device_wq_table = comp ?
+ ((iaa_device->comp_wq_table->n_wqs > 0) ?
+ iaa_device->comp_wq_table : iaa_device->generic_wq_table) :
+ iaa_device->generic_wq_table;
+
+ if (pkg_wqs[pkg]->n_wqs + device_wq_table->n_wqs > pkg_wqs[pkg]->max_wqs) {
+ pkg_wqs[pkg]->wqs = krealloc(pkg_wqs[pkg]->wqs,
+ ksize(pkg_wqs[pkg]->wqs) +
+ max((MAX_PKG_IAA * MAX_IAA_WQ), iaa_device->n_wq) * sizeof(struct idxd_wq *),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!pkg_wqs[pkg]->wqs)
+ return false;
+
+ pkg_wqs[pkg]->max_wqs = ksize(pkg_wqs[pkg]->wqs)/sizeof(struct idxd_wq *);
+ }
+
+ add_device_wqs_to_wq_table(pkg_wqs[pkg], device_wq_table);
+
+ pr_debug("pkg_global_%s_wqs[%d] has %u n_wqs %u max_wqs",
+ (comp ? "comp" : "decomp"), pkg, pkg_wqs[pkg]->n_wqs, pkg_wqs[pkg]->max_wqs);
+
+ if (++cur_iaa == atomic_read(&nr_iaa_per_package)) {
+ if (++pkg == nr_packages)
+ break;
+ cur_iaa = 0;
+ if (atomic_read(&nr_iaa) == 1)
+ goto one_iaa_special_case;
+ }
+ }
+
+ return true;
+}
+
+static void create_cpu_wq_table(int cpu, struct wq_table_entry *wq_table, bool comp)
+{
+ struct wq_table_entry *entry = comp ?
+ per_cpu_ptr(cpu_comp_wqs, cpu) :
+ per_cpu_ptr(cpu_decomp_wqs, cpu);
+
+ if (!atomic_read(&iaa_crypto_enabled)) {
+ mutex_lock(&first_wq_found_lock);
+
+ if (WARN_ON(!first_wq_found && !wq_table->n_wqs)) {
+ mutex_unlock(&first_wq_found_lock);
+ return;
+ }
+
+ if (!first_wq_found)
+ first_wq_found = wq_table->wqs[0];
- if (WARN_ON(entry->n_wqs == entry->max_wqs))
+ mutex_unlock(&first_wq_found_lock);
+
+ entry->wqs = &first_wq_found;
+ entry->max_wqs = 1;
+ entry->n_wqs = 1;
+ entry->cur_wq = 0;
+ pr_debug("%s: cpu %d: added %u first_wq_found for %s wqs up to wq %d.%d\n", __func__,
+ cpu, entry->n_wqs, comp ? "comp":"decomp",
+ entry->wqs[entry->n_wqs - 1]->idxd->id,
+ entry->wqs[entry->n_wqs - 1]->id);
+ return;
+ }
+
+ entry->wqs = wq_table->wqs;
+ entry->max_wqs = wq_table->max_wqs;
+ entry->n_wqs = wq_table->n_wqs;
+ entry->cur_wq = 0;
+
+ if (entry->n_wqs)
+ pr_debug("%s: cpu %d: added %u iaa %s wqs up to wq %d.%d: entry->max_wqs = %u\n", __func__,
+ cpu, entry->n_wqs, comp ? "comp":"decomp",
+ entry->wqs[entry->n_wqs - 1]->idxd->id, entry->wqs[entry->n_wqs - 1]->id,
+ entry->max_wqs);
+}
+
+static void set_cpu_wq_table_start_wq(int cpu, bool comp)
+{
+ struct wq_table_entry *entry = comp ?
+ per_cpu_ptr(cpu_comp_wqs, cpu) :
+ per_cpu_ptr(cpu_decomp_wqs, cpu);
+ unsigned int num_pkg_iaa = atomic_read(&nr_iaa_per_package);
+
+ if (!num_pkg_iaa)
return;
- entry->wqs[entry->n_wqs++] = wq;
+ int start_wq = (entry->n_wqs / num_pkg_iaa) * (cpu_to_iaa(cpu) % num_pkg_iaa);
+
+ if ((start_wq >= 0) && (start_wq < entry->n_wqs))
+ entry->cur_wq = start_wq;
+}
+
+static void create_cpu_wq_table_from_pkg_wqs(bool comp)
+{
+ int cpu;
- pr_debug("%s: added iaa wq %d.%d to idx %d of cpu %d\n", __func__,
- entry->wqs[entry->n_wqs - 1]->idxd->id,
- entry->wqs[entry->n_wqs - 1]->id, entry->n_wqs - 1, cpu);
+ /*
+ * All CPU on the same package share the same "package global"
+ * [de]comp_wqs.
+ */
+ for (cpu = 0; cpu < nr_cpus; cpu += nr_cpus_per_package) {
+ int package_id = topology_logical_package_id(cpu);
+ struct wq_table_entry *pkg_wq_table = comp ?
+ ((pkg_global_comp_wqs[package_id]->n_wqs > 0) ?
+ pkg_global_comp_wqs[package_id] : pkg_global_decomp_wqs[package_id])
+ : pkg_global_decomp_wqs[package_id];
+ int pkg_cpu;
+
+ for (pkg_cpu = cpu; pkg_cpu < cpu + nr_cpus_per_package; ++pkg_cpu) {
+ /* Initialize decomp/comp wq_table for CPU. */
+ create_cpu_wq_table(pkg_cpu, pkg_wq_table, comp);
+ /* Stagger the starting WQ in the package WQ table, for each CPU. */
+ set_cpu_wq_table_start_wq(pkg_cpu, comp);
+ }
+ }
}
-static int wq_table_add_wqs(int iaa, int cpu)
+static int add_mapped_device_wq_table_for_cpu(int iaa, int cpu, bool comp)
{
struct iaa_device *iaa_device, *found_device = NULL;
- int ret = 0, cur_iaa = 0, n_wqs_added = 0;
- struct idxd_device *idxd;
- struct iaa_wq *iaa_wq;
- struct pci_dev *pdev;
- struct device *dev;
+ struct wq_table_entry *device_wq_table;
+ int ret = 0, cur_iaa = 0;
list_for_each_entry(iaa_device, &iaa_devices, list) {
- idxd = iaa_device->idxd;
- pdev = idxd->pdev;
- dev = &pdev->dev;
-
if (cur_iaa != iaa) {
cur_iaa++;
continue;
}
found_device = iaa_device;
- dev_dbg(dev, "getting wq from iaa_device %d, cur_iaa %d\n",
+ dev_dbg(&found_device->idxd->pdev->dev,
+ "getting wq from iaa_device %d, cur_iaa %d\n",
found_device->idxd->id, cur_iaa);
break;
}
@@ -862,93 +1262,219 @@ static int wq_table_add_wqs(int iaa, int cpu)
}
cur_iaa = 0;
- idxd = found_device->idxd;
- pdev = idxd->pdev;
- dev = &pdev->dev;
- dev_dbg(dev, "getting wq from only iaa_device %d, cur_iaa %d\n",
+ dev_dbg(&found_device->idxd->pdev->dev,
+ "getting wq from only iaa_device %d, cur_iaa %d\n",
found_device->idxd->id, cur_iaa);
}
- list_for_each_entry(iaa_wq, &found_device->wqs, list) {
- wq_table_add(cpu, iaa_wq->wq);
- pr_debug("rebalance: added wq for cpu=%d: iaa wq %d.%d\n",
- cpu, iaa_wq->wq->idxd->id, iaa_wq->wq->id);
- n_wqs_added++;
+ device_wq_table = comp ?
+ ((found_device->comp_wq_table->n_wqs > 0) ?
+ found_device->comp_wq_table : found_device->generic_wq_table) :
+ found_device->generic_wq_table;
+
+ create_cpu_wq_table(cpu, device_wq_table, comp);
+
+out:
+ return ret;
+}
+
+static void create_cpu_wq_table_from_mapped_device(bool comp)
+{
+ int cpu, iaa;
+
+ for_each_possible_cpu(cpu) {
+ iaa = cpu_to_iaa(cpu);
+ pr_debug("rebalance: cpu=%d iaa=%d\n", cpu, iaa);
+
+ if (WARN_ON(iaa == -1)) {
+ pr_debug("rebalance (cpu_to_iaa(%d)) failed!\n", cpu);
+ return;
+ }
+
+ if (WARN_ON(add_mapped_device_wq_table_for_cpu(iaa, cpu, comp))) {
+ pr_debug("could not add any wqs of iaa %d to cpu %d!\n", iaa, cpu);
+ return;
+ }
+ }
+}
+
+static int map_iaa_device_wqs(struct iaa_device *iaa_device)
+{
+ struct wq_table_entry *generic, *for_comps;
+ int ret = 0, n_wqs_added = 0;
+ struct iaa_wq *iaa_wq;
+
+ generic = iaa_device->generic_wq_table;
+ for_comps = iaa_device->comp_wq_table;
+
+ list_for_each_entry(iaa_wq, &iaa_device->wqs, list) {
+ if (iaa_wq->mapped && ++n_wqs_added)
+ continue;
+
+ pr_debug("iaa_device %p: processing wq %d.%d\n", iaa_device, iaa_device->idxd->id, iaa_wq->wq->id);
+
+ if ((!n_wqs_added || ((n_wqs_added + g_comp_wqs_per_iaa) < iaa_device->n_wq)) &&
+ (generic->n_wqs < generic->max_wqs)) {
+
+ generic->wqs[generic->n_wqs++] = iaa_wq->wq;
+ pr_debug("iaa_device %p: added decomp wq %d.%d\n", iaa_device, iaa_device->idxd->id, iaa_wq->wq->id);
+ } else {
+ if (WARN_ON(for_comps->n_wqs == for_comps->max_wqs))
+ break;
+
+ for_comps->wqs[for_comps->n_wqs++] = iaa_wq->wq;
+ pr_debug("iaa_device %p: added comp wq %d.%d\n", iaa_device, iaa_device->idxd->id, iaa_wq->wq->id);
+ }
+
+ iaa_wq->mapped = true;
+ ++n_wqs_added;
}
- if (!n_wqs_added) {
- pr_debug("couldn't find any iaa wqs!\n");
+ if (!n_wqs_added && !iaa_device->n_wq) {
+ pr_debug("iaa_device %d: couldn't find any iaa wqs!\n", iaa_device->idxd->id);
ret = -EINVAL;
- goto out;
}
-out:
+
return ret;
}
+static void map_iaa_devices(void)
+{
+ struct iaa_device *iaa_device;
+
+ list_for_each_entry(iaa_device, &iaa_devices, list) {
+ WARN_ON(map_iaa_device_wqs(iaa_device));
+ }
+}
+
/*
- * Rebalance the wq table so that given a cpu, it's easy to find the
- * closest IAA instance. The idea is to try to choose the most
- * appropriate IAA instance for a caller and spread available
- * workqueues around to clients.
+ * Rebalance the per-cpu wq table based on available IAA devices/WQs.
+ * Three driver parameters control how this algorithm works:
+ *
+ * - g_comp_wqs_per_iaa:
+ *
+ * If multiple WQs are configured for a given device, this setting determines
+ * the number of WQs to be used as "compress only" WQs. The remaining WQs will
+ * be used as "decompress only WQs".
+ * Note that the comp WQ can be the same as the decomp WQ, for e.g., if
+ * g_comp_wqs_per_iaa is 0 (regardless of the # of available WQs per device), or,
+ * if there is only 1 WQ configured for a device (regardless of
+ * g_comp_wqs_per_iaa).
+ *
+ * - distribute_decomps, distribute_comps:
+ *
+ * If this is enabled, all [de]comp WQs found from the IAA devices on a
+ * package, will be aggregated into pkg_global_[de]comp_wqs, then assigned to
+ * each CPU on the package.
+ *
+ * Note:
+ * -----
+ * rebalance_wq_table() will return true if it was able to successfully
+ * configure comp/decomp wqs for all CPUs, without changing the
+ * @iaa_crypto_enabled atomic. The caller can re-enable the use of the wq
+ * tables after rebalance_wq_table() returns true, by setting the
+ * @iaa_crypto_enabled atomic to 1.
+ * In case of any errors, the @iaa_crypto_enabled atomic will be set to 0,
+ * and rebalance_wq_table() will return false.
*/
-static void rebalance_wq_table(void)
+static bool rebalance_wq_table(void)
{
- const struct cpumask *node_cpus;
- int node_cpu, node, cpu, iaa = 0;
+ int cpu;
- if (nr_iaa == 0)
- return;
+ if (atomic_read(&nr_iaa) == 0)
+ goto err;
- pr_debug("rebalance: nr_nodes=%d, nr_cpus %d, nr_iaa %d, cpus_per_iaa %d\n",
- nr_nodes, nr_cpus, nr_iaa, cpus_per_iaa);
+ map_iaa_devices();
- clear_wq_table();
+ pr_info("rebalance: nr_packages=%d, nr_cpus %d, nr_iaa %d, nr_iaa_per_package %d, cpus_per_iaa %d\n",
+ nr_packages, nr_cpus, atomic_read(&nr_iaa),
+ atomic_read(&nr_iaa_per_package), atomic_read(&cpus_per_iaa));
- if (nr_iaa == 1) {
- for_each_possible_cpu(cpu) {
- if (WARN_ON(wq_table_add_wqs(0, cpu)))
- goto err;
- }
+ if (iaa_distribute_decomps) {
+ /* Each CPU uses all IAA devices on package for decomps. */
+ if (!reinit_pkg_global_wqs(false))
+ goto err;
+ create_cpu_wq_table_from_pkg_wqs(false);
+ } else {
+ /*
+ * Each CPU uses the decomp WQ on the mapped IAA device using
+ * a balanced mapping of cores to IAA.
+ */
+ create_cpu_wq_table_from_mapped_device(false);
+ }
- return;
+ if (iaa_distribute_comps) {
+ /* Each CPU uses all IAA devices on package for comps. */
+ if (!reinit_pkg_global_wqs(true))
+ goto err;
+ create_cpu_wq_table_from_pkg_wqs(true);
+ } else {
+ /*
+ * Each CPU uses the comp WQ on the mapped IAA device using
+ * a balanced mapping of cores to IAA.
+ */
+ create_cpu_wq_table_from_mapped_device(true);
}
- for_each_node_with_cpus(node) {
- cpu = 0;
- node_cpus = cpumask_of_node(node);
+ /* Verify that each cpu has comp and decomp wqs.*/
+ for_each_possible_cpu(cpu) {
+ struct wq_table_entry *entry = per_cpu_ptr(cpu_decomp_wqs, cpu);
- for_each_cpu(node_cpu, node_cpus) {
- iaa = cpu / cpus_per_iaa;
- if (WARN_ON(wq_table_add_wqs(iaa, node_cpu)))
- goto err;
- cpu++;
+ if (!entry->wqs || !entry->n_wqs) {
+ pr_err("%s: cpu %d does not have decomp_wqs", __func__, cpu);
+ goto err;
+ }
+
+ entry = per_cpu_ptr(cpu_comp_wqs, cpu);
+ if (!entry->wqs || !entry->n_wqs) {
+ pr_err("%s: cpu %d does not have comp_wqs", __func__, cpu);
+ goto err;
}
}
- return;
+ pr_debug("Finished rebalance decomp/comp wqs.");
+ return true;
+
err:
- pr_debug("could not add any wqs for iaa %d to cpu %d!\n", iaa, cpu);
+ atomic_set(&iaa_crypto_enabled, 0);
+ pr_debug("Error during rebalance decomp/comp wqs.");
+ return false;
}
/***************************************************************
* Assign work-queues for driver ops using per-cpu wq_tables.
***************************************************************/
-static struct idxd_wq *wq_table_next_wq(int cpu)
+static struct idxd_wq *decomp_wq_table_next_wq(int cpu)
{
- struct wq_table_entry *entry = per_cpu_ptr(wq_table, cpu);
+ struct wq_table_entry *entry = per_cpu_ptr(cpu_decomp_wqs, cpu);
+ struct idxd_wq *wq;
+
+ if (!atomic_read(&iaa_crypto_enabled))
+ return NULL;
+
+ wq = entry->wqs[entry->cur_wq];
- if (++entry->cur_wq >= entry->n_wqs)
+ if (++entry->cur_wq == entry->n_wqs)
entry->cur_wq = 0;
- if (!entry->wqs[entry->cur_wq])
+ return wq;
+}
+
+static struct idxd_wq *comp_wq_table_next_wq(int cpu)
+{
+ struct wq_table_entry *entry = per_cpu_ptr(cpu_comp_wqs, cpu);
+ struct idxd_wq *wq;
+
+ if (!atomic_read(&iaa_crypto_enabled))
return NULL;
- pr_debug("%s: returning wq at idx %d (iaa wq %d.%d) from cpu %d\n", __func__,
- entry->cur_wq, entry->wqs[entry->cur_wq]->idxd->id,
- entry->wqs[entry->cur_wq]->id, cpu);
+ wq = entry->wqs[entry->cur_wq];
- return entry->wqs[entry->cur_wq];
+ if (++entry->cur_wq == entry->n_wqs)
+ entry->cur_wq = 0;
+
+ return wq;
}
/*************************************************
@@ -986,7 +1512,7 @@ static inline int check_completion(struct device *dev,
dev_err(dev, "%s completion timed out - "
"assuming broken hw, iaa_crypto now DISABLED\n",
op_str);
- iaa_crypto_enabled = false;
+ atomic_set(&iaa_crypto_enabled, 0);
ret = -ETIMEDOUT;
goto out;
}
@@ -1502,18 +2028,13 @@ static int iaa_comp_acompress(struct acomp_req *req)
compression_ctx = crypto_tfm_ctx(tfm);
- if (!iaa_crypto_enabled) {
- pr_debug("iaa_crypto disabled, not compressing\n");
- return -ENODEV;
- }
-
if (!req->src || !req->slen) {
pr_debug("invalid src, not compressing\n");
return -EINVAL;
}
cpu = get_cpu();
- wq = wq_table_next_wq(cpu);
+ wq = comp_wq_table_next_wq(cpu);
put_cpu();
if (!wq) {
pr_debug("no wq configured for cpu=%d\n", cpu);
@@ -1600,18 +2121,13 @@ static int iaa_comp_adecompress(struct acomp_req *req)
struct device *dev;
struct idxd_wq *wq;
- if (!iaa_crypto_enabled) {
- pr_debug("iaa_crypto disabled, not decompressing\n");
- return -ENODEV;
- }
-
if (!req->src || !req->slen) {
pr_debug("invalid src, not decompressing\n");
return -EINVAL;
}
cpu = get_cpu();
- wq = wq_table_next_wq(cpu);
+ wq = decomp_wq_table_next_wq(cpu);
put_cpu();
if (!wq) {
pr_debug("no wq configured for cpu=%d\n", cpu);
@@ -1726,6 +2242,8 @@ static int iaa_register_compression_device(void)
static int iaa_unregister_compression_device(void)
{
+ atomic_set(&iaa_crypto_enabled, 0);
+
if (iaa_crypto_registered)
crypto_unregister_acomp(&iaa_acomp_fixed_deflate);
@@ -1747,10 +2265,13 @@ static int iaa_crypto_probe(struct idxd_dev *idxd_dev)
if (data->type != IDXD_TYPE_IAX)
return -ENODEV;
+ mutex_lock(&iaa_devices_lock);
+
mutex_lock(&wq->wq_lock);
if (idxd_wq_get_private(wq)) {
mutex_unlock(&wq->wq_lock);
+ mutex_unlock(&iaa_devices_lock);
return -EBUSY;
}
@@ -1772,8 +2293,6 @@ static int iaa_crypto_probe(struct idxd_dev *idxd_dev)
goto err;
}
- mutex_lock(&iaa_devices_lock);
-
if (list_empty(&iaa_devices)) {
ret = alloc_wq_table(wq->idxd->max_wqs);
if (ret)
@@ -1785,24 +2304,33 @@ static int iaa_crypto_probe(struct idxd_dev *idxd_dev)
if (ret)
goto err_save;
- rebalance_wq_table();
+ if (!rebalance_wq_table()) {
+ dev_dbg(dev, "%s: IAA rebalancing device wq tables failed\n", __func__);
+ goto err_register;
+ }
+ atomic_set(&iaa_crypto_enabled, 1);
if (first_wq) {
- iaa_crypto_enabled = true;
ret = iaa_register_compression_device();
if (ret != 0) {
- iaa_crypto_enabled = false;
dev_dbg(dev, "IAA compression device registration failed\n");
goto err_register;
}
+
+ if (!rebalance_wq_table()) {
+ dev_dbg(dev, "%s: Rerun after registration: IAA rebalancing device wq tables failed\n", __func__);
+ goto err_register;
+ }
+ atomic_set(&iaa_crypto_enabled, 1);
+
try_module_get(THIS_MODULE);
pr_info("iaa_crypto now ENABLED\n");
}
- mutex_unlock(&iaa_devices_lock);
out:
mutex_unlock(&wq->wq_lock);
+ mutex_unlock(&iaa_devices_lock);
return ret;
@@ -1811,9 +2339,8 @@ static int iaa_crypto_probe(struct idxd_dev *idxd_dev)
free_iaa_wq(idxd_wq_get_private(wq));
err_save:
if (first_wq)
- free_wq_table();
+ free_wq_tables();
err_alloc:
- mutex_unlock(&iaa_devices_lock);
idxd_drv_disable_wq(wq);
err:
wq->type = IDXD_WQT_NONE;
@@ -1828,13 +2355,17 @@ static void iaa_crypto_remove(struct idxd_dev *idxd_dev)
struct iaa_wq *iaa_wq;
bool free = false;
+ atomic_set(&iaa_crypto_enabled, 0);
idxd_wq_quiesce(wq);
- mutex_lock(&wq->wq_lock);
mutex_lock(&iaa_devices_lock);
+ mutex_lock(&wq->wq_lock);
remove_iaa_wq(wq);
+ if (!rebalance_wq_table())
+ pr_debug("%s: IAA rebalancing device wq tables failed\n", __func__);
+
spin_lock(&idxd->dev_lock);
iaa_wq = idxd_wq_get_private(wq);
if (!iaa_wq) {
@@ -1857,18 +2388,24 @@ static void iaa_crypto_remove(struct idxd_dev *idxd_dev)
}
idxd_drv_disable_wq(wq);
- rebalance_wq_table();
- if (nr_iaa == 0) {
- iaa_crypto_enabled = false;
- free_wq_table();
+ if (atomic_read(&nr_iaa) == 0) {
+ atomic_set(&iaa_crypto_enabled, 0);
+ pkg_global_wqs_dealloc();
+ free_wq_tables();
+ WARN_ON(!list_empty(&iaa_devices));
+ INIT_LIST_HEAD(&iaa_devices);
module_put(THIS_MODULE);
pr_info("iaa_crypto now DISABLED\n");
+ } else if (rebalance_wq_table()) {
+ atomic_set(&iaa_crypto_enabled, 1);
+ } else {
+ pr_debug("%s: IAA re-rebalancing device wq tables failed\n", __func__);
}
out:
- mutex_unlock(&iaa_devices_lock);
mutex_unlock(&wq->wq_lock);
+ mutex_unlock(&iaa_devices_lock);
}
static enum idxd_dev_type dev_types[] = {
@@ -1891,16 +2428,12 @@ static struct idxd_device_driver iaa_crypto_driver = {
static int __init iaa_crypto_init_module(void)
{
int ret = 0;
- int node;
+
+ INIT_LIST_HEAD(&iaa_devices);
nr_cpus = num_possible_cpus();
- for_each_node_with_cpus(node)
- nr_nodes++;
- if (!nr_nodes) {
- pr_err("IAA couldn't find any nodes with cpus\n");
- return -ENODEV;
- }
- nr_cpus_per_node = nr_cpus / nr_nodes;
+ nr_cpus_per_package = topology_num_cores_per_package();
+ nr_packages = topology_max_packages();
ret = iaa_aecs_init_fixed();
if (ret < 0) {
@@ -1914,6 +2447,27 @@ static int __init iaa_crypto_init_module(void)
goto err_driver_reg;
}
+ ret = driver_create_file(&iaa_crypto_driver.drv,
+ &driver_attr_g_comp_wqs_per_iaa);
+ if (ret) {
+ pr_debug("IAA g_comp_wqs_per_iaa attr creation failed\n");
+ goto err_g_comp_wqs_per_iaa_attr_create;
+ }
+
+ ret = driver_create_file(&iaa_crypto_driver.drv,
+ &driver_attr_distribute_decomps);
+ if (ret) {
+ pr_debug("IAA distribute_decomps attr creation failed\n");
+ goto err_distribute_decomps_attr_create;
+ }
+
+ ret = driver_create_file(&iaa_crypto_driver.drv,
+ &driver_attr_distribute_comps);
+ if (ret) {
+ pr_debug("IAA distribute_comps attr creation failed\n");
+ goto err_distribute_comps_attr_create;
+ }
+
ret = driver_create_file(&iaa_crypto_driver.drv,
&driver_attr_verify_compress);
if (ret) {
@@ -1939,6 +2493,15 @@ static int __init iaa_crypto_init_module(void)
driver_remove_file(&iaa_crypto_driver.drv,
&driver_attr_verify_compress);
err_verify_attr_create:
+ driver_remove_file(&iaa_crypto_driver.drv,
+ &driver_attr_distribute_comps);
+err_distribute_comps_attr_create:
+ driver_remove_file(&iaa_crypto_driver.drv,
+ &driver_attr_distribute_decomps);
+err_distribute_decomps_attr_create:
+ driver_remove_file(&iaa_crypto_driver.drv,
+ &driver_attr_g_comp_wqs_per_iaa);
+err_g_comp_wqs_per_iaa_attr_create:
idxd_driver_unregister(&iaa_crypto_driver);
err_driver_reg:
iaa_aecs_cleanup_fixed();
@@ -1957,6 +2520,12 @@ static void __exit iaa_crypto_cleanup_module(void)
&driver_attr_sync_mode);
driver_remove_file(&iaa_crypto_driver.drv,
&driver_attr_verify_compress);
+ driver_remove_file(&iaa_crypto_driver.drv,
+ &driver_attr_distribute_comps);
+ driver_remove_file(&iaa_crypto_driver.drv,
+ &driver_attr_distribute_decomps);
+ driver_remove_file(&iaa_crypto_driver.drv,
+ &driver_attr_g_comp_wqs_per_iaa);
idxd_driver_unregister(&iaa_crypto_driver);
iaa_aecs_cleanup_fixed();
--
2.27.0
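
As a worked example of the cpu_to_iaa() mapping added in the hunks above
(the topology numbers here are illustrative assumptions, not taken from
the patch): assume 2 packages, 56 CPUs per package, and 2 IAA devices per
package (4 total), with CPUs numbered contiguously per package. Then:

    cpus_per_iaa       = (nr_packages * nr_cpus_per_package) / nr_iaa
                       = (2 * 56) / 4 = 28
    nr_iaa_per_package = nr_iaa / nr_packages = 4 / 2 = 2

    For cpu 60 (on package 1):
    base_iaa = package_id * nr_iaa_per_package = 1 * 2 = 2
    iaa      = base_iaa + ((cpu % nr_cpus_per_package) / cpus_per_iaa)
             = 2 + ((60 % 56) / 28) = 2 + 0 = 2

i.e., cpu 60 is served by IAA instance 2, the first device on its package.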
* [PATCH v14 04/26] crypto: iaa - Simplify, consistency of function parameters, minor stats bug fix.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (2 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 03/26] crypto: iaa - New architecture for IAA device WQ [de]comp usage & core mapping Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 05/26] crypto: iaa - Descriptor allocation timeouts with mitigations Kanchana P Sridhar
` (22 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This patch further simplifies the code in some places and makes it more
consistent and readable:
1) Change the iaa_compress_verify() @dlen parameter to be a value instead
of a pointer, because @dlen's value is only read, not modified, by this
procedure (see the prototype sketch after this list).
2) Simplify the success/error return paths in iaa_compress(),
iaa_decompress() and iaa_compress_verify().
3) Delete dev_dbg() statements to make the code more readable.
4) Change the return value for descriptor allocation failures to
-ENODEV, for better maintainability.
5) Fix a minor statistics bug in iaa_decompress(), where decomp_bytes
was updated even in case of errors.
6) Change some dev_dbg() statements related to compress verify errors
to pr_err().
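For reference, the prototype change in 1), as it appears in the diff
below, is:

    /* before: @dlen passed by pointer, although it is only read */
    static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
                                   struct idxd_wq *wq,
                                   dma_addr_t src_addr, unsigned int slen,
                                   dma_addr_t dst_addr, unsigned int *dlen);

    /* after: @dlen passed by value */
    static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
                                   struct idxd_wq *wq,
                                   dma_addr_t src_addr, unsigned int slen,
                                   dma_addr_t dst_addr, unsigned int dlen);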
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
drivers/crypto/intel/iaa/iaa_crypto_main.c | 114 +++++----------------
1 file changed, 26 insertions(+), 88 deletions(-)
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index 85944ff212e5..bbc72254982c 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -1597,7 +1597,7 @@ static int iaa_remap_for_verify(struct device *dev, struct iaa_wq *iaa_wq,
static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
struct idxd_wq *wq,
dma_addr_t src_addr, unsigned int slen,
- dma_addr_t dst_addr, unsigned int *dlen)
+ dma_addr_t dst_addr, unsigned int dlen)
{
struct iaa_device_compression_mode *active_compression_mode;
struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
@@ -1621,10 +1621,8 @@ static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK);
if (IS_ERR(idxd_desc)) {
- dev_dbg(dev, "idxd descriptor allocation failed\n");
- dev_dbg(dev, "iaa compress failed: ret=%ld\n",
- PTR_ERR(idxd_desc));
- return PTR_ERR(idxd_desc);
+ dev_dbg(dev, "iaa compress_verify failed: idxd descriptor allocation failure: ret=%ld\n", PTR_ERR(idxd_desc));
+ return -ENODEV;
}
desc = idxd_desc->iax_hw;
@@ -1636,19 +1634,11 @@ static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
desc->priv = 0;
desc->src1_addr = (u64)dst_addr;
- desc->src1_size = *dlen;
+ desc->src1_size = dlen;
desc->dst_addr = (u64)src_addr;
desc->max_dst_size = slen;
desc->completion_addr = idxd_desc->compl_dma;
- dev_dbg(dev, "(verify) compression mode %s,"
- " desc->src1_addr %llx, desc->src1_size %d,"
- " desc->dst_addr %llx, desc->max_dst_size %d,"
- " desc->src2_addr %llx, desc->src2_size %d\n",
- active_compression_mode->name,
- desc->src1_addr, desc->src1_size, desc->dst_addr,
- desc->max_dst_size, desc->src2_addr, desc->src2_size);
-
ret = idxd_submit_desc(wq, idxd_desc);
if (ret) {
dev_dbg(dev, "submit_desc (verify) failed ret=%d\n", ret);
@@ -1671,14 +1661,10 @@ static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
goto err;
}
- idxd_free_desc(wq, idxd_desc);
-out:
- return ret;
err:
idxd_free_desc(wq, idxd_desc);
- dev_dbg(dev, "iaa compress failed: ret=%d\n", ret);
- goto out;
+ return ret;
}
static void iaa_desc_complete(struct idxd_desc *idxd_desc,
@@ -1752,15 +1738,15 @@ static void iaa_desc_complete(struct idxd_desc *idxd_desc,
ret = iaa_remap_for_verify(dev, iaa_wq, ctx->req, &src_addr, &dst_addr);
if (ret) {
- dev_dbg(dev, "%s: compress verify remap failed ret=%d\n", __func__, ret);
+ pr_err("%s: compress verify remap failed ret=%d\n", __func__, ret);
err = -EIO;
goto out;
}
ret = iaa_compress_verify(ctx->tfm, ctx->req, iaa_wq->wq, src_addr,
- ctx->req->slen, dst_addr, &ctx->req->dlen);
+ ctx->req->slen, dst_addr, ctx->req->dlen);
if (ret) {
- dev_dbg(dev, "%s: compress verify failed ret=%d\n", __func__, ret);
+ pr_err("%s: compress verify failed ret=%d\n", __func__, ret);
err = -EIO;
}
@@ -1774,7 +1760,8 @@ static void iaa_desc_complete(struct idxd_desc *idxd_desc,
dma_unmap_sg(dev, ctx->req->src, sg_nents(ctx->req->src), DMA_TO_DEVICE);
out:
if (ret != 0)
- dev_dbg(dev, "asynchronous compress failed ret=%d\n", ret);
+ dev_dbg(dev, "asynchronous %s failed ret=%d\n",
+ ctx->compress ? "compress":"decompress", ret);
if (ctx->req->base.complete)
acomp_request_complete(ctx->req, err);
@@ -1784,7 +1771,7 @@ static void iaa_desc_complete(struct idxd_desc *idxd_desc,
iaa_wq_put(idxd_desc->wq);
}
-static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
+static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
struct idxd_wq *wq,
dma_addr_t src_addr, unsigned int slen,
dma_addr_t dst_addr, unsigned int *dlen)
@@ -1811,9 +1798,9 @@ static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK);
if (IS_ERR(idxd_desc)) {
- dev_dbg(dev, "idxd descriptor allocation failed\n");
- dev_dbg(dev, "iaa compress failed: ret=%ld\n", PTR_ERR(idxd_desc));
- return PTR_ERR(idxd_desc);
+ dev_dbg(dev, "iaa compress failed: idxd descriptor allocation failure: ret=%ld\n",
+ PTR_ERR(idxd_desc));
+ return -ENODEV;
}
desc = idxd_desc->iax_hw;
@@ -1839,21 +1826,8 @@ static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
idxd_desc->crypto.src_addr = src_addr;
idxd_desc->crypto.dst_addr = dst_addr;
idxd_desc->crypto.compress = true;
-
- dev_dbg(dev, "%s use_async_irq: compression mode %s,"
- " src_addr %llx, dst_addr %llx\n", __func__,
- active_compression_mode->name,
- src_addr, dst_addr);
}
- dev_dbg(dev, "%s: compression mode %s,"
- " desc->src1_addr %llx, desc->src1_size %d,"
- " desc->dst_addr %llx, desc->max_dst_size %d,"
- " desc->src2_addr %llx, desc->src2_size %d\n", __func__,
- active_compression_mode->name,
- desc->src1_addr, desc->src1_size, desc->dst_addr,
- desc->max_dst_size, desc->src2_addr, desc->src2_size);
-
ret = idxd_submit_desc(wq, idxd_desc);
if (ret) {
dev_dbg(dev, "submit_desc failed ret=%d\n", ret);
@@ -1866,7 +1840,6 @@ static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
if (ctx->async_mode) {
ret = -EINPROGRESS;
- dev_dbg(dev, "%s: returning -EINPROGRESS\n", __func__);
goto out;
}
@@ -1884,15 +1857,10 @@ static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
*compression_crc = idxd_desc->iax_completion->crc;
- if (!ctx->async_mode)
- idxd_free_desc(wq, idxd_desc);
-out:
- return ret;
err:
idxd_free_desc(wq, idxd_desc);
- dev_dbg(dev, "iaa compress failed: ret=%d\n", ret);
-
- goto out;
+out:
+ return ret;
}
static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
@@ -1921,10 +1889,10 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK);
if (IS_ERR(idxd_desc)) {
- dev_dbg(dev, "idxd descriptor allocation failed\n");
- dev_dbg(dev, "iaa decompress failed: ret=%ld\n",
+ ret = -ENODEV;
+ dev_dbg(dev, "%s: idxd descriptor allocation failed: ret=%ld\n", __func__,
PTR_ERR(idxd_desc));
- return PTR_ERR(idxd_desc);
+ return ret;
}
desc = idxd_desc->iax_hw;
@@ -1948,21 +1916,8 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
idxd_desc->crypto.src_addr = src_addr;
idxd_desc->crypto.dst_addr = dst_addr;
idxd_desc->crypto.compress = false;
-
- dev_dbg(dev, "%s: use_async_irq compression mode %s,"
- " src_addr %llx, dst_addr %llx\n", __func__,
- active_compression_mode->name,
- src_addr, dst_addr);
}
- dev_dbg(dev, "%s: decompression mode %s,"
- " desc->src1_addr %llx, desc->src1_size %d,"
- " desc->dst_addr %llx, desc->max_dst_size %d,"
- " desc->src2_addr %llx, desc->src2_size %d\n", __func__,
- active_compression_mode->name,
- desc->src1_addr, desc->src1_size, desc->dst_addr,
- desc->max_dst_size, desc->src2_addr, desc->src2_size);
-
ret = idxd_submit_desc(wq, idxd_desc);
if (ret) {
dev_dbg(dev, "submit_desc failed ret=%d\n", ret);
@@ -1975,7 +1930,6 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
if (ctx->async_mode) {
ret = -EINPROGRESS;
- dev_dbg(dev, "%s: returning -EINPROGRESS\n", __func__);
goto out;
}
@@ -1997,23 +1951,19 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
}
} else {
req->dlen = idxd_desc->iax_completion->output_size;
+
+ /* Update stats */
+ update_total_decomp_bytes_in(slen);
+ update_wq_decomp_bytes(wq, slen);
}
*dlen = req->dlen;
- if (!ctx->async_mode)
+err:
+ if (idxd_desc)
idxd_free_desc(wq, idxd_desc);
-
- /* Update stats */
- update_total_decomp_bytes_in(slen);
- update_wq_decomp_bytes(wq, slen);
out:
return ret;
-err:
- idxd_free_desc(wq, idxd_desc);
- dev_dbg(dev, "iaa decompress failed: ret=%d\n", ret);
-
- goto out;
}
static int iaa_comp_acompress(struct acomp_req *req)
@@ -2060,9 +2010,6 @@ static int iaa_comp_acompress(struct acomp_req *req)
goto out;
}
src_addr = sg_dma_address(req->src);
- dev_dbg(dev, "dma_map_sg, src_addr %llx, nr_sgs %d, req->src %p,"
- " req->slen %d, sg_dma_len(sg) %d\n", src_addr, nr_sgs,
- req->src, req->slen, sg_dma_len(req->src));
nr_sgs = dma_map_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE);
if (nr_sgs <= 0 || nr_sgs > 1) {
@@ -2073,9 +2020,6 @@ static int iaa_comp_acompress(struct acomp_req *req)
goto err_map_dst;
}
dst_addr = sg_dma_address(req->dst);
- dev_dbg(dev, "dma_map_sg, dst_addr %llx, nr_sgs %d, req->dst %p,"
- " req->dlen %d, sg_dma_len(sg) %d\n", dst_addr, nr_sgs,
- req->dst, req->dlen, sg_dma_len(req->dst));
ret = iaa_compress(tfm, req, wq, src_addr, req->slen, dst_addr,
&req->dlen);
@@ -2090,7 +2034,7 @@ static int iaa_comp_acompress(struct acomp_req *req)
}
ret = iaa_compress_verify(tfm, req, wq, src_addr, req->slen,
- dst_addr, &req->dlen);
+ dst_addr, req->dlen);
if (ret)
dev_dbg(dev, "asynchronous compress verification failed ret=%d\n", ret);
@@ -2153,9 +2097,6 @@ static int iaa_comp_adecompress(struct acomp_req *req)
goto out;
}
src_addr = sg_dma_address(req->src);
- dev_dbg(dev, "dma_map_sg, src_addr %llx, nr_sgs %d, req->src %p,"
- " req->slen %d, sg_dma_len(sg) %d\n", src_addr, nr_sgs,
- req->src, req->slen, sg_dma_len(req->src));
nr_sgs = dma_map_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE);
if (nr_sgs <= 0 || nr_sgs > 1) {
@@ -2166,9 +2107,6 @@ static int iaa_comp_adecompress(struct acomp_req *req)
goto err_map_dst;
}
dst_addr = sg_dma_address(req->dst);
- dev_dbg(dev, "dma_map_sg, dst_addr %llx, nr_sgs %d, req->dst %p,"
- " req->dlen %d, sg_dma_len(sg) %d\n", dst_addr, nr_sgs,
- req->dst, req->dlen, sg_dma_len(req->dst));
ret = iaa_decompress(tfm, req, wq, src_addr, req->slen,
dst_addr, &req->dlen);
--
2.27.0
* [PATCH v14 05/26] crypto: iaa - Descriptor allocation timeouts with mitigations.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (3 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 04/26] crypto: iaa - Simplify, consistency of function parameters, minor stats bug fix Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 06/26] crypto: iaa - iaa_wq uses percpu_refs for get/put reference counting Kanchana P Sridhar
` (21 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This patch modifies the descriptor allocation from blocking to
non-blocking with bounded retries or "timeouts".
This is necessary to prevent task blocked errors in high contention
scenarios, for instance, when the platform has only 1 IAA device
enabled. With 1 IAA device enabled per package on a dual-package
Sapphire Rapids with 56 cores/package, there are 112 logical cores
mapped to this single IAA device. In this scenario, the task blocked
errors can occur because idxd_alloc_desc() is called with
IDXD_OP_BLOCK. With batching, multiple descriptors will need to be
allocated per batch. Any process that succeeds in allocating descriptors
can cause contention for all other processes that share the same
sbitmap_queue. Under IDXD_OP_BLOCK, this causes
compress/decompress jobs to stall in stress test scenarios
(e.g. zswap_store() of 2M folios).
In order to make the iaa_crypto driver be more fail-safe, this commit
implements the following:
1) Change compress/decompress descriptor allocations to be non-blocking
with retries ("timeouts").
2) Return a compression error to zswap if descriptor allocation exhausts
its retries during compress ops. zswap_store() will return an error and
the folio gets stored in the backing swap device.
3) Fall back to software decompress if descriptor allocation exhausts
its retries during decompress ops.
With these fixes, there are no task blocked errors seen under stress
testing conditions, and no performance degradation observed.
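For illustration, the bounded, non-blocking allocation pattern in the
compress path is sketched below (simplified from the iaa_compress() hunk
in this patch; the decompress path uses ctx->alloc_decomp_desc_timeout
and falls back to deflate_generic_decompress() instead of failing):

    struct idxd_desc *idxd_desc = ERR_PTR(-EAGAIN);
    u16 alloc_desc_retries = 0;

    while ((idxd_desc == ERR_PTR(-EAGAIN)) &&
           (alloc_desc_retries++ < ctx->alloc_comp_desc_timeout)) {
            idxd_desc = idxd_alloc_desc(wq, IDXD_OP_NONBLOCK);
            cpu_relax();
    }

    if (IS_ERR(idxd_desc))
            return -ENODEV;  /* compress: report the error to the caller */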
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
drivers/crypto/intel/iaa/iaa_crypto.h | 5 ++
drivers/crypto/intel/iaa/iaa_crypto_main.c | 58 +++++++++++++++-------
2 files changed, 44 insertions(+), 19 deletions(-)
diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/iaa/iaa_crypto.h
index 549ac98a9366..cc76a047b54a 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto.h
+++ b/drivers/crypto/intel/iaa/iaa_crypto.h
@@ -21,6 +21,9 @@
#define IAA_COMPLETION_TIMEOUT 1000000
+#define IAA_ALLOC_DESC_COMP_TIMEOUT 1000
+#define IAA_ALLOC_DESC_DECOMP_TIMEOUT 500
+
#define IAA_ANALYTICS_ERROR 0x0a
#define IAA_ERROR_DECOMP_BUF_OVERFLOW 0x0b
#define IAA_ERROR_COMP_BUF_OVERFLOW 0x19
@@ -141,6 +144,8 @@ enum iaa_mode {
struct iaa_compression_ctx {
enum iaa_mode mode;
+ u16 alloc_comp_desc_timeout;
+ u16 alloc_decomp_desc_timeout;
bool verify_compress;
bool async_mode;
bool use_irq;
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index bbc72254982c..3466414f926a 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -1603,7 +1603,8 @@ static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
u32 *compression_crc = acomp_request_ctx(req);
struct iaa_device *iaa_device;
- struct idxd_desc *idxd_desc;
+ struct idxd_desc *idxd_desc = ERR_PTR(-EAGAIN);
+ u16 alloc_desc_retries = 0;
struct iax_hw_desc *desc;
struct idxd_device *idxd;
struct iaa_wq *iaa_wq;
@@ -1619,7 +1620,11 @@ static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
active_compression_mode = get_iaa_device_compression_mode(iaa_device, ctx->mode);
- idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK);
+ while ((idxd_desc == ERR_PTR(-EAGAIN)) && (alloc_desc_retries++ < ctx->alloc_decomp_desc_timeout)) {
+ idxd_desc = idxd_alloc_desc(wq, IDXD_OP_NONBLOCK);
+ cpu_relax();
+ }
+
if (IS_ERR(idxd_desc)) {
dev_dbg(dev, "iaa compress_verify failed: idxd descriptor allocation failure: ret=%ld\n", PTR_ERR(idxd_desc));
return -ENODEV;
@@ -1780,7 +1785,8 @@ static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
u32 *compression_crc = acomp_request_ctx(req);
struct iaa_device *iaa_device;
- struct idxd_desc *idxd_desc;
+ struct idxd_desc *idxd_desc = ERR_PTR(-EAGAIN);
+ u16 alloc_desc_retries = 0;
struct iax_hw_desc *desc;
struct idxd_device *idxd;
struct iaa_wq *iaa_wq;
@@ -1796,7 +1802,11 @@ static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
active_compression_mode = get_iaa_device_compression_mode(iaa_device, ctx->mode);
- idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK);
+ while ((idxd_desc == ERR_PTR(-EAGAIN)) && (alloc_desc_retries++ < ctx->alloc_comp_desc_timeout)) {
+ idxd_desc = idxd_alloc_desc(wq, IDXD_OP_NONBLOCK);
+ cpu_relax();
+ }
+
if (IS_ERR(idxd_desc)) {
dev_dbg(dev, "iaa compress failed: idxd descriptor allocation failure: ret=%ld\n",
PTR_ERR(idxd_desc));
@@ -1871,7 +1881,8 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
struct iaa_device_compression_mode *active_compression_mode;
struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
struct iaa_device *iaa_device;
- struct idxd_desc *idxd_desc;
+ struct idxd_desc *idxd_desc = ERR_PTR(-EAGAIN);
+ u16 alloc_desc_retries = 0;
struct iax_hw_desc *desc;
struct idxd_device *idxd;
struct iaa_wq *iaa_wq;
@@ -1887,12 +1898,17 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
active_compression_mode = get_iaa_device_compression_mode(iaa_device, ctx->mode);
- idxd_desc = idxd_alloc_desc(wq, IDXD_OP_BLOCK);
+ while ((idxd_desc == ERR_PTR(-EAGAIN)) && (alloc_desc_retries++ < ctx->alloc_decomp_desc_timeout)) {
+ idxd_desc = idxd_alloc_desc(wq, IDXD_OP_NONBLOCK);
+ cpu_relax();
+ }
+
if (IS_ERR(idxd_desc)) {
ret = -ENODEV;
dev_dbg(dev, "%s: idxd descriptor allocation failed: ret=%ld\n", __func__,
PTR_ERR(idxd_desc));
- return ret;
+ idxd_desc = NULL;
+ goto fallback_software_decomp;
}
desc = idxd_desc->iax_hw;
@@ -1921,7 +1937,7 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
ret = idxd_submit_desc(wq, idxd_desc);
if (ret) {
dev_dbg(dev, "submit_desc failed ret=%d\n", ret);
- goto err;
+ goto fallback_software_decomp;
}
/* Update stats */
@@ -1934,19 +1950,21 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
}
ret = check_completion(dev, idxd_desc->iax_completion, false, false);
+
+fallback_software_decomp:
if (ret) {
- dev_dbg(dev, "%s: check_completion failed ret=%d\n", __func__, ret);
- if (idxd_desc->iax_completion->status == IAA_ANALYTICS_ERROR) {
+ dev_dbg(dev, "%s: desc allocation/submission/check_completion failed ret=%d\n", __func__, ret);
+ if (idxd_desc && idxd_desc->iax_completion->status == IAA_ANALYTICS_ERROR) {
pr_warn("%s: falling back to deflate-generic decompress, "
"analytics error code %x\n", __func__,
idxd_desc->iax_completion->error_code);
- ret = deflate_generic_decompress(req);
- if (ret) {
- dev_dbg(dev, "%s: deflate-generic failed ret=%d\n",
- __func__, ret);
- goto err;
- }
- } else {
+ }
+
+ ret = deflate_generic_decompress(req);
+
+ if (ret) {
+ pr_err("%s: iaa decompress failed: deflate-generic fallback error ret=%d\n",
+ __func__, ret);
goto err;
}
} else {
@@ -2127,6 +2145,8 @@ static int iaa_comp_adecompress(struct acomp_req *req)
static void compression_ctx_init(struct iaa_compression_ctx *ctx)
{
+ ctx->alloc_comp_desc_timeout = IAA_ALLOC_DESC_COMP_TIMEOUT;
+ ctx->alloc_decomp_desc_timeout = IAA_ALLOC_DESC_DECOMP_TIMEOUT;
ctx->verify_compress = iaa_verify_compress;
ctx->async_mode = async_mode;
ctx->use_irq = use_irq;
@@ -2141,10 +2161,10 @@ static int iaa_comp_init_fixed(struct crypto_acomp *acomp_tfm)
struct crypto_tfm *tfm = crypto_acomp_tfm(acomp_tfm);
struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
- compression_ctx_init(ctx);
-
ctx->mode = IAA_MODE_FIXED;
+ compression_ctx_init(ctx);
+
return 0;
}
--
2.27.0
* [PATCH v14 06/26] crypto: iaa - iaa_wq uses percpu_refs for get/put reference counting.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (4 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 05/26] crypto: iaa - Descriptor allocation timeouts with mitigations Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 07/26] crypto: iaa - Simplify the code flow in iaa_compress() and iaa_decompress() Kanchana P Sridhar
` (20 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This patch modifies the reference counting on "struct iaa_wq" to be a
percpu_ref in atomic mode, instead of an "int refcount" combined with
the "idxd->dev_lock" spin_lock currently used as a synchronization
mechanism to achieve get/put semantics.
This enables a more lightweight, cleaner and effective refcount
implementation for the iaa_wq that prevents race conditions and
significantly reduces the latency of batch compress/decompress jobs
submitted to the IAA accelerator.
For a single-threaded madvise-based workload with the Silesia.tar
dataset, these are the before/after batch compression latencies for a
compress batch of 8 pages:
==================================
            p50 (ns)     p99 (ns)
==================================
before       5,576        5,992
after        5,472        5,848
Change        -104         -144
==================================
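The percpu_ref lifecycle adopted by this patch can be summarized as
follows (sketch assembled from the hunks below):

    /* setup, in add_iaa_wq(): */
    percpu_ref_init(&iaa_wq->ref, __iaa_wq_release,
                    PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);

    /* per-request fast path: */
    if (!percpu_ref_tryget(&iaa_wq->ref))
            return -ENODEV;
    /* ... submit and complete the descriptor ... */
    percpu_ref_put(&iaa_wq->ref);

    /* teardown, in iaa_crypto_remove(): */
    percpu_ref_kill(&iaa_wq->ref);   /* drop the initial reference */
    while (!iaa_wq->free)            /* __iaa_wq_release() sets ->free */
            cpu_relax();
    __free_iaa_wq(iaa_wq);           /* checks the ref is zero, calls percpu_ref_exit() */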
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
drivers/crypto/intel/iaa/iaa_crypto.h | 4 +-
drivers/crypto/intel/iaa/iaa_crypto_main.c | 119 +++++++--------------
2 files changed, 41 insertions(+), 82 deletions(-)
diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/iaa/iaa_crypto.h
index cc76a047b54a..9611f2518f42 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto.h
+++ b/drivers/crypto/intel/iaa/iaa_crypto.h
@@ -47,8 +47,8 @@ struct iaa_wq {
struct list_head list;
struct idxd_wq *wq;
- int ref;
- bool remove;
+ struct percpu_ref ref;
+ bool free;
bool mapped;
struct iaa_device *iaa_device;
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index 3466414f926a..01d7150dbbd8 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -702,7 +702,7 @@ static void del_iaa_device(struct iaa_device *iaa_device)
static void free_iaa_device(struct iaa_device *iaa_device)
{
- if (!iaa_device)
+ if (!iaa_device || iaa_device->n_wq)
return;
remove_device_compression_modes(iaa_device);
@@ -732,6 +732,13 @@ static bool iaa_has_wq(struct iaa_device *iaa_device, struct idxd_wq *wq)
return false;
}
+static void __iaa_wq_release(struct percpu_ref *ref)
+{
+ struct iaa_wq *iaa_wq = container_of(ref, typeof(*iaa_wq), ref);
+
+ iaa_wq->free = true;
+}
+
static int add_iaa_wq(struct iaa_device *iaa_device, struct idxd_wq *wq,
struct iaa_wq **new_wq)
{
@@ -739,11 +746,20 @@ static int add_iaa_wq(struct iaa_device *iaa_device, struct idxd_wq *wq,
struct pci_dev *pdev = idxd->pdev;
struct device *dev = &pdev->dev;
struct iaa_wq *iaa_wq;
+ int ret;
iaa_wq = kzalloc(sizeof(*iaa_wq), GFP_KERNEL);
if (!iaa_wq)
return -ENOMEM;
+ ret = percpu_ref_init(&iaa_wq->ref, __iaa_wq_release,
+ PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
+
+ if (ret) {
+ kfree(iaa_wq);
+ return -ENOMEM;
+ }
+
iaa_wq->wq = wq;
iaa_wq->iaa_device = iaa_device;
idxd_wq_set_private(wq, iaa_wq);
@@ -819,6 +835,9 @@ static void __free_iaa_wq(struct iaa_wq *iaa_wq)
if (!iaa_wq)
return;
+ WARN_ON(!percpu_ref_is_zero(&iaa_wq->ref));
+ percpu_ref_exit(&iaa_wq->ref);
+
iaa_device = iaa_wq->iaa_device;
if (iaa_device->n_wq == 0)
free_iaa_device(iaa_wq->iaa_device);
@@ -913,53 +932,6 @@ static int save_iaa_wq(struct idxd_wq *wq)
return ret;
}
-static int iaa_wq_get(struct idxd_wq *wq)
-{
- struct idxd_device *idxd = wq->idxd;
- struct iaa_wq *iaa_wq;
- int ret = 0;
-
- spin_lock(&idxd->dev_lock);
- iaa_wq = idxd_wq_get_private(wq);
- if (iaa_wq && !iaa_wq->remove) {
- iaa_wq->ref++;
- idxd_wq_get(wq);
- } else {
- ret = -ENODEV;
- }
- spin_unlock(&idxd->dev_lock);
-
- return ret;
-}
-
-static int iaa_wq_put(struct idxd_wq *wq)
-{
- struct idxd_device *idxd = wq->idxd;
- struct iaa_wq *iaa_wq;
- bool free = false;
- int ret = 0;
-
- spin_lock(&idxd->dev_lock);
- iaa_wq = idxd_wq_get_private(wq);
- if (iaa_wq) {
- iaa_wq->ref--;
- if (iaa_wq->ref == 0 && iaa_wq->remove) {
- idxd_wq_set_private(wq, NULL);
- free = true;
- }
- idxd_wq_put(wq);
- } else {
- ret = -ENODEV;
- }
- spin_unlock(&idxd->dev_lock);
- if (free) {
- __free_iaa_wq(iaa_wq);
- kfree(iaa_wq);
- }
-
- return ret;
-}
-
/***************************************************************
* Mapping IAA devices and wqs to cores with per-cpu wq_tables.
***************************************************************/
@@ -1773,7 +1745,7 @@ static void iaa_desc_complete(struct idxd_desc *idxd_desc,
if (free_desc)
idxd_free_desc(idxd_desc->wq, idxd_desc);
- iaa_wq_put(idxd_desc->wq);
+ percpu_ref_put(&iaa_wq->ref);
}
static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
@@ -2004,19 +1976,13 @@ static int iaa_comp_acompress(struct acomp_req *req)
cpu = get_cpu();
wq = comp_wq_table_next_wq(cpu);
put_cpu();
- if (!wq) {
- pr_debug("no wq configured for cpu=%d\n", cpu);
- return -ENODEV;
- }
- ret = iaa_wq_get(wq);
- if (ret) {
+ iaa_wq = wq ? idxd_wq_get_private(wq) : NULL;
+ if (unlikely(!iaa_wq || !percpu_ref_tryget(&iaa_wq->ref))) {
pr_debug("no wq available for cpu=%d\n", cpu);
return -ENODEV;
}
- iaa_wq = idxd_wq_get_private(wq);
-
dev = &wq->idxd->pdev->dev;
nr_sgs = dma_map_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE);
@@ -2069,7 +2035,7 @@ static int iaa_comp_acompress(struct acomp_req *req)
err_map_dst:
dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE);
out:
- iaa_wq_put(wq);
+ percpu_ref_put(&iaa_wq->ref);
return ret;
}
@@ -2091,19 +2057,13 @@ static int iaa_comp_adecompress(struct acomp_req *req)
cpu = get_cpu();
wq = decomp_wq_table_next_wq(cpu);
put_cpu();
- if (!wq) {
- pr_debug("no wq configured for cpu=%d\n", cpu);
- return -ENODEV;
- }
- ret = iaa_wq_get(wq);
- if (ret) {
+ iaa_wq = wq ? idxd_wq_get_private(wq) : NULL;
+ if (unlikely(!iaa_wq || !percpu_ref_tryget(&iaa_wq->ref))) {
pr_debug("no wq available for cpu=%d\n", cpu);
- return -ENODEV;
+ return deflate_generic_decompress(req);
}
- iaa_wq = idxd_wq_get_private(wq);
-
dev = &wq->idxd->pdev->dev;
nr_sgs = dma_map_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE);
@@ -2138,7 +2098,7 @@ static int iaa_comp_adecompress(struct acomp_req *req)
err_map_dst:
dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE);
out:
- iaa_wq_put(wq);
+ percpu_ref_put(&iaa_wq->ref);
return ret;
}
@@ -2311,7 +2271,6 @@ static void iaa_crypto_remove(struct idxd_dev *idxd_dev)
struct idxd_wq *wq = idxd_dev_to_wq(idxd_dev);
struct idxd_device *idxd = wq->idxd;
struct iaa_wq *iaa_wq;
- bool free = false;
atomic_set(&iaa_crypto_enabled, 0);
idxd_wq_quiesce(wq);
@@ -2332,18 +2291,18 @@ static void iaa_crypto_remove(struct idxd_dev *idxd_dev)
goto out;
}
- if (iaa_wq->ref) {
- iaa_wq->remove = true;
- } else {
- wq = iaa_wq->wq;
- idxd_wq_set_private(wq, NULL);
- free = true;
- }
+ /* Drop the initial reference. */
+ percpu_ref_kill(&iaa_wq->ref);
+
+ while (!iaa_wq->free)
+ cpu_relax();
+
+ __free_iaa_wq(iaa_wq);
+
+ idxd_wq_set_private(wq, NULL);
spin_unlock(&idxd->dev_lock);
- if (free) {
- __free_iaa_wq(iaa_wq);
- kfree(iaa_wq);
- }
+
+ kfree(iaa_wq);
idxd_drv_disable_wq(wq);
--
2.27.0
* [PATCH v14 07/26] crypto: iaa - Simplify the code flow in iaa_compress() and iaa_decompress().
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (5 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 06/26] crypto: iaa - iaa_wq uses percpu_refs for get/put reference counting Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 08/26] crypto: iaa - Refactor hardware descriptor setup into separate procedures Kanchana P Sridhar
` (19 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This commit simplifies and streamlines the logic in the core
iaa_compress() and iaa_decompress() routines and eliminates unnecessary
branches.
This makes it easier to add improvements such as asynchronous
submissions and polling for job completions, which are essential for
batching with hardware parallelism.
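The resulting control flow in iaa_compress() (and analogously in
iaa_decompress()) is roughly the following (sketch only; stats updates
and descriptor setup elided):

    if (likely(!ctx->use_irq)) {
            ret = idxd_submit_desc(wq, idxd_desc);
            if (ret)
                    goto out;

            if (ctx->async_mode)
                    return -EINPROGRESS;    /* caller polls for completion */

            ret = check_completion(dev, idxd_desc->iax_completion, true, false);
            if (!ret)
                    *dlen = idxd_desc->iax_completion->output_size;
    } else {
            /* IRQ mode: stash the request context in the descriptor,
             * submit, and return -EINPROGRESS; iaa_desc_complete()
             * finishes the op from the interrupt path.
             */
            desc->flags |= IDXD_OP_FLAG_RCI;
            ret = idxd_submit_desc(wq, idxd_desc);
            if (!ret)
                    return -EINPROGRESS;
    }
    out:
            idxd_free_desc(wq, idxd_desc);
            return ret;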
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
drivers/crypto/intel/iaa/iaa_crypto_main.c | 114 ++++++++++++---------
1 file changed, 67 insertions(+), 47 deletions(-)
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index 01d7150dbbd8..a727496d5791 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -1800,7 +1800,34 @@ static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
desc->src2_size = sizeof(struct aecs_comp_table_record);
desc->completion_addr = idxd_desc->compl_dma;
- if (ctx->use_irq) {
+ if (likely(!ctx->use_irq)) {
+ ret = idxd_submit_desc(wq, idxd_desc);
+ if (ret) {
+ dev_dbg(dev, "submit_desc failed ret=%d\n", ret);
+ goto out;
+ }
+
+ /* Update stats */
+ update_total_comp_calls();
+ update_wq_comp_calls(wq);
+
+ if (ctx->async_mode)
+ return -EINPROGRESS;
+
+ ret = check_completion(dev, idxd_desc->iax_completion, true, false);
+ if (ret) {
+ dev_dbg(dev, "check_completion failed ret=%d\n", ret);
+ goto out;
+ }
+
+ *dlen = idxd_desc->iax_completion->output_size;
+
+ /* Update stats */
+ update_total_comp_bytes_out(*dlen);
+ update_wq_comp_bytes(wq, *dlen);
+
+ *compression_crc = idxd_desc->iax_completion->crc;
+ } else {
desc->flags |= IDXD_OP_FLAG_RCI;
idxd_desc->crypto.req = req;
@@ -1808,40 +1835,23 @@ static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
idxd_desc->crypto.src_addr = src_addr;
idxd_desc->crypto.dst_addr = dst_addr;
idxd_desc->crypto.compress = true;
- }
-
- ret = idxd_submit_desc(wq, idxd_desc);
- if (ret) {
- dev_dbg(dev, "submit_desc failed ret=%d\n", ret);
- goto err;
- }
- /* Update stats */
- update_total_comp_calls();
- update_wq_comp_calls(wq);
+ ret = idxd_submit_desc(wq, idxd_desc);
+ if (ret) {
+ dev_dbg(dev, "submit_desc failed ret=%d\n", ret);
+ goto out;
+ }
- if (ctx->async_mode) {
- ret = -EINPROGRESS;
- goto out;
- }
+ /* Update stats */
+ update_total_comp_calls();
+ update_wq_comp_calls(wq);
- ret = check_completion(dev, idxd_desc->iax_completion, true, false);
- if (ret) {
- dev_dbg(dev, "check_completion failed ret=%d\n", ret);
- goto err;
+ return -EINPROGRESS;
}
- *dlen = idxd_desc->iax_completion->output_size;
-
- /* Update stats */
- update_total_comp_bytes_out(*dlen);
- update_wq_comp_bytes(wq, *dlen);
-
- *compression_crc = idxd_desc->iax_completion->crc;
-
-err:
- idxd_free_desc(wq, idxd_desc);
out:
+ idxd_free_desc(wq, idxd_desc);
+
return ret;
}
@@ -1896,7 +1906,22 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
desc->src1_size = slen;
desc->completion_addr = idxd_desc->compl_dma;
- if (ctx->use_irq) {
+ if (likely(!ctx->use_irq)) {
+ ret = idxd_submit_desc(wq, idxd_desc);
+ if (ret) {
+ dev_dbg(dev, "submit_desc failed ret=%d\n", ret);
+ goto fallback_software_decomp;
+ }
+
+ /* Update stats */
+ update_total_decomp_calls();
+ update_wq_decomp_calls(wq);
+
+ if (ctx->async_mode)
+ return -EINPROGRESS;
+
+ ret = check_completion(dev, idxd_desc->iax_completion, false, false);
+ } else {
desc->flags |= IDXD_OP_FLAG_RCI;
idxd_desc->crypto.req = req;
@@ -1904,25 +1929,20 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
idxd_desc->crypto.src_addr = src_addr;
idxd_desc->crypto.dst_addr = dst_addr;
idxd_desc->crypto.compress = false;
- }
- ret = idxd_submit_desc(wq, idxd_desc);
- if (ret) {
- dev_dbg(dev, "submit_desc failed ret=%d\n", ret);
- goto fallback_software_decomp;
- }
+ ret = idxd_submit_desc(wq, idxd_desc);
+ if (ret) {
+ dev_dbg(dev, "submit_desc failed ret=%d\n", ret);
+ goto fallback_software_decomp;
+ }
- /* Update stats */
- update_total_decomp_calls();
- update_wq_decomp_calls(wq);
+ /* Update stats */
+ update_total_decomp_calls();
+ update_wq_decomp_calls(wq);
- if (ctx->async_mode) {
- ret = -EINPROGRESS;
- goto out;
+ return -EINPROGRESS;
}
- ret = check_completion(dev, idxd_desc->iax_completion, false, false);
-
fallback_software_decomp:
if (ret) {
dev_dbg(dev, "%s: desc allocation/submission/check_completion failed ret=%d\n", __func__, ret);
@@ -1937,7 +1957,7 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
if (ret) {
pr_err("%s: iaa decompress failed: deflate-generic fallback error ret=%d\n",
__func__, ret);
- goto err;
+ goto out;
}
} else {
req->dlen = idxd_desc->iax_completion->output_size;
@@ -1949,10 +1969,10 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
*dlen = req->dlen;
-err:
+out:
if (idxd_desc)
idxd_free_desc(wq, idxd_desc);
-out:
+
return ret;
}
--
2.27.0
^ permalink raw reply	[flat|nested] 48+ messages in thread
* [PATCH v14 08/26] crypto: iaa - Refactor hardware descriptor setup into separate procedures.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (6 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 07/26] crypto: iaa - Simplify the code flow in iaa_compress() and iaa_decompress() Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 09/26] crypto: iaa - Simplified, efficient job submissions for non-irq mode Kanchana P Sridhar
` (18 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This patch refactors the code that sets up the "struct iax_hw_desc" for
compress/decompress ops into distinct procedures, to make the code more
readable.
Also, get_iaa_device_compression_mode() is deleted; the compression mode
is now accessed directly from the iaa_device in the calling procedures.
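For reference, with the helpers in place the compress call site reduces to
the following (as in the diff below; the decompress call site is analogous):

	desc = iaa_setup_compress_hw_desc(idxd_desc, src_addr, slen, dst_addr, *dlen,
					  ctx->mode,
					  iaa_device->compression_modes[ctx->mode]);

so the per-op code no longer needs a local active_compression_mode variable.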
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
drivers/crypto/intel/iaa/iaa_crypto_main.c | 99 ++++++++++++----------
1 file changed, 56 insertions(+), 43 deletions(-)
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index a727496d5791..f0e9eb52eec4 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -484,12 +484,6 @@ int add_iaa_compression_mode(const char *name,
}
EXPORT_SYMBOL_GPL(add_iaa_compression_mode);
-static struct iaa_device_compression_mode *
-get_iaa_device_compression_mode(struct iaa_device *iaa_device, int idx)
-{
- return iaa_device->compression_modes[idx];
-}
-
static void free_device_compression_mode(struct iaa_device *iaa_device,
struct iaa_device_compression_mode *device_mode)
{
@@ -1571,7 +1565,6 @@ static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
dma_addr_t src_addr, unsigned int slen,
dma_addr_t dst_addr, unsigned int dlen)
{
- struct iaa_device_compression_mode *active_compression_mode;
struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
u32 *compression_crc = acomp_request_ctx(req);
struct iaa_device *iaa_device;
@@ -1590,8 +1583,6 @@ static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
pdev = idxd->pdev;
dev = &pdev->dev;
- active_compression_mode = get_iaa_device_compression_mode(iaa_device, ctx->mode);
-
while ((idxd_desc == ERR_PTR(-EAGAIN)) && (alloc_desc_retries++ < ctx->alloc_decomp_desc_timeout)) {
idxd_desc = idxd_alloc_desc(wq, IDXD_OP_NONBLOCK);
cpu_relax();
@@ -1667,8 +1658,7 @@ static void iaa_desc_complete(struct idxd_desc *idxd_desc,
pdev = idxd->pdev;
dev = &pdev->dev;
- active_compression_mode = get_iaa_device_compression_mode(iaa_device,
- compression_ctx->mode);
+ active_compression_mode = iaa_device->compression_modes[compression_ctx->mode];
dev_dbg(dev, "%s: compression mode %s,"
" ctx->src_addr %llx, ctx->dst_addr %llx\n", __func__,
active_compression_mode->name,
@@ -1748,12 +1738,63 @@ static void iaa_desc_complete(struct idxd_desc *idxd_desc,
percpu_ref_put(&iaa_wq->ref);
}
+static struct iax_hw_desc *
+iaa_setup_compress_hw_desc(struct idxd_desc *idxd_desc,
+ dma_addr_t src_addr,
+ unsigned int slen,
+ dma_addr_t dst_addr,
+ unsigned int dlen,
+ enum iaa_mode mode,
+ struct iaa_device_compression_mode *active_compression_mode)
+{
+ struct iax_hw_desc *desc = idxd_desc->iax_hw;
+
+ desc->flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CC;
+ desc->opcode = IAX_OPCODE_COMPRESS;
+ desc->compr_flags = IAA_COMP_FLAGS;
+ desc->priv = 0;
+
+ desc->src1_addr = (u64)src_addr;
+ desc->src1_size = slen;
+ desc->dst_addr = (u64)dst_addr;
+ desc->max_dst_size = dlen;
+ desc->flags |= IDXD_OP_FLAG_RD_SRC2_AECS;
+ desc->src2_addr = active_compression_mode->aecs_comp_table_dma_addr;
+ desc->src2_size = sizeof(struct aecs_comp_table_record);
+ desc->completion_addr = idxd_desc->compl_dma;
+
+ return desc;
+}
+
+static struct iax_hw_desc *
+iaa_setup_decompress_hw_desc(struct idxd_desc *idxd_desc,
+ dma_addr_t src_addr,
+ unsigned int slen,
+ dma_addr_t dst_addr,
+ unsigned int dlen)
+{
+ struct iax_hw_desc *desc = idxd_desc->iax_hw;
+
+ desc->flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CC;
+ desc->opcode = IAX_OPCODE_DECOMPRESS;
+ desc->max_dst_size = PAGE_SIZE;
+ desc->decompr_flags = IAA_DECOMP_FLAGS;
+ desc->priv = 0;
+
+ desc->src1_addr = (u64)src_addr;
+ desc->dst_addr = (u64)dst_addr;
+ desc->max_dst_size = dlen;
+ desc->src1_size = slen;
+ desc->completion_addr = idxd_desc->compl_dma;
+
+ return desc;
+}
+
static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
struct idxd_wq *wq,
dma_addr_t src_addr, unsigned int slen,
dma_addr_t dst_addr, unsigned int *dlen)
{
- struct iaa_device_compression_mode *active_compression_mode;
struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
u32 *compression_crc = acomp_request_ctx(req);
struct iaa_device *iaa_device;
@@ -1772,8 +1813,6 @@ static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
pdev = idxd->pdev;
dev = &pdev->dev;
- active_compression_mode = get_iaa_device_compression_mode(iaa_device, ctx->mode);
-
while ((idxd_desc == ERR_PTR(-EAGAIN)) && (alloc_desc_retries++ < ctx->alloc_comp_desc_timeout)) {
idxd_desc = idxd_alloc_desc(wq, IDXD_OP_NONBLOCK);
cpu_relax();
@@ -1784,21 +1823,9 @@ static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
PTR_ERR(idxd_desc));
return -ENODEV;
}
- desc = idxd_desc->iax_hw;
- desc->flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR |
- IDXD_OP_FLAG_RD_SRC2_AECS | IDXD_OP_FLAG_CC;
- desc->opcode = IAX_OPCODE_COMPRESS;
- desc->compr_flags = IAA_COMP_FLAGS;
- desc->priv = 0;
-
- desc->src1_addr = (u64)src_addr;
- desc->src1_size = slen;
- desc->dst_addr = (u64)dst_addr;
- desc->max_dst_size = *dlen;
- desc->src2_addr = active_compression_mode->aecs_comp_table_dma_addr;
- desc->src2_size = sizeof(struct aecs_comp_table_record);
- desc->completion_addr = idxd_desc->compl_dma;
+ desc = iaa_setup_compress_hw_desc(idxd_desc, src_addr, slen, dst_addr, *dlen,
+ ctx->mode, iaa_device->compression_modes[ctx->mode]);
if (likely(!ctx->use_irq)) {
ret = idxd_submit_desc(wq, idxd_desc);
@@ -1860,7 +1887,6 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
dma_addr_t src_addr, unsigned int slen,
dma_addr_t dst_addr, unsigned int *dlen)
{
- struct iaa_device_compression_mode *active_compression_mode;
struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
struct iaa_device *iaa_device;
struct idxd_desc *idxd_desc = ERR_PTR(-EAGAIN);
@@ -1878,8 +1904,6 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
pdev = idxd->pdev;
dev = &pdev->dev;
- active_compression_mode = get_iaa_device_compression_mode(iaa_device, ctx->mode);
-
while ((idxd_desc == ERR_PTR(-EAGAIN)) && (alloc_desc_retries++ < ctx->alloc_decomp_desc_timeout)) {
idxd_desc = idxd_alloc_desc(wq, IDXD_OP_NONBLOCK);
cpu_relax();
@@ -1892,19 +1916,8 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
idxd_desc = NULL;
goto fallback_software_decomp;
}
- desc = idxd_desc->iax_hw;
- desc->flags = IDXD_OP_FLAG_CRAV | IDXD_OP_FLAG_RCR | IDXD_OP_FLAG_CC;
- desc->opcode = IAX_OPCODE_DECOMPRESS;
- desc->max_dst_size = PAGE_SIZE;
- desc->decompr_flags = IAA_DECOMP_FLAGS;
- desc->priv = 0;
-
- desc->src1_addr = (u64)src_addr;
- desc->dst_addr = (u64)dst_addr;
- desc->max_dst_size = *dlen;
- desc->src1_size = slen;
- desc->completion_addr = idxd_desc->compl_dma;
+ desc = iaa_setup_decompress_hw_desc(idxd_desc, src_addr, slen, dst_addr, *dlen);
if (likely(!ctx->use_irq)) {
ret = idxd_submit_desc(wq, idxd_desc);
--
2.27.0
^ permalink raw reply	[flat|nested] 48+ messages in thread
* [PATCH v14 09/26] crypto: iaa - Simplified, efficient job submissions for non-irq mode.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (7 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 08/26] crypto: iaa - Refactor hardware descriptor setup into separate procedures Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 10/26] crypto: iaa - Deprecate exporting add/remove IAA compression modes Kanchana P Sridhar
` (17 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This patch adds a new procedure, iaa_submit_desc_movdir64b(), that
directly calls movdir64b. The core iaa_crypto routines that submit
compress and decompress jobs now invoke iaa_submit_desc_movdir64b() in
non-irq driver modes, instead of idxd_submit_desc().
idxd_submit_desc() is called only in irq mode.
This improves latency for the most common iaa_crypto usage in zswap
(i.e., async non-irq) by eliminating redundant computation that would
otherwise be incurred in idxd_submit_desc():
For a single-threaded madvise-based workload with the Silesia.tar
dataset, these are the before/after batch compression latencies for a
compress batch of 8 pages:
==================================
p50 (ns) p99 (ns)
==================================
before 5,568 6,056
after 5,472 5,848
Change -96 -208
==================================
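For reference, a minimal sketch of the non-irq submission path after this
change (names as in the diff below; abridged):

	if (likely(!ctx->use_irq)) {
		/* raw 64-byte descriptor write to the wq portal */
		iaa_submit_desc_movdir64b(wq, idxd_desc);
		/* update stats, then either return -EINPROGRESS (async mode)
		 * or poll check_completion()
		 */
	} else {
		/* irq mode still goes through the generic path */
		ret = idxd_submit_desc(wq, idxd_desc);
	}

iaa_submit_desc_movdir64b() is a thin wrapper around wmb() followed by
iosubmit_cmds512() on the wq portal, bypassing the extra bookkeeping done by
idxd_submit_desc().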
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
drivers/crypto/intel/iaa/iaa_crypto_main.c | 30 ++++++++++++++--------
1 file changed, 20 insertions(+), 10 deletions(-)
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index f0e9eb52eec4..4b275cc09404 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -1790,6 +1790,24 @@ iaa_setup_decompress_hw_desc(struct idxd_desc *idxd_desc,
return desc;
}
+/*
+ * Call this for non-irq, non-enqcmds job submissions.
+ */
+static __always_inline void iaa_submit_desc_movdir64b(struct idxd_wq *wq,
+ struct idxd_desc *desc)
+{
+ void __iomem *portal = idxd_wq_portal_addr(wq);
+
+ /*
+ * The wmb() flushes writes to coherent DMA data before
+ * possibly triggering a DMA read. The wmb() is necessary
+ * even on UP because the recipient is a device.
+ */
+ wmb();
+
+ iosubmit_cmds512(portal, desc->hw, 1);
+}
+
static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
struct idxd_wq *wq,
dma_addr_t src_addr, unsigned int slen,
@@ -1828,11 +1846,7 @@ static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
ctx->mode, iaa_device->compression_modes[ctx->mode]);
if (likely(!ctx->use_irq)) {
- ret = idxd_submit_desc(wq, idxd_desc);
- if (ret) {
- dev_dbg(dev, "submit_desc failed ret=%d\n", ret);
- goto out;
- }
+ iaa_submit_desc_movdir64b(wq, idxd_desc);
/* Update stats */
update_total_comp_calls();
@@ -1920,11 +1934,7 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
desc = iaa_setup_decompress_hw_desc(idxd_desc, src_addr, slen, dst_addr, *dlen);
if (likely(!ctx->use_irq)) {
- ret = idxd_submit_desc(wq, idxd_desc);
- if (ret) {
- dev_dbg(dev, "submit_desc failed ret=%d\n", ret);
- goto fallback_software_decomp;
- }
+ iaa_submit_desc_movdir64b(wq, idxd_desc);
/* Update stats */
update_total_decomp_calls();
--
2.27.0
^ permalink raw reply	[flat|nested] 48+ messages in thread
* [PATCH v14 10/26] crypto: iaa - Deprecate exporting add/remove IAA compression modes.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (8 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 09/26] crypto: iaa - Simplified, efficient job submissions for non-irq mode Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 11/26] crypto: iaa - Expect a single scatterlist for a [de]compress request's src/dst Kanchana P Sridhar
` (16 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
There is no use case right now for kernel users to dynamically
add/remove IAA compression modes; hence this commit deletes the symbol
exports of add_iaa_compression_mode() and remove_iaa_compression_mode().
The only supported usage model of IAA compression modes is for the code
to be statically linked during the iaa_crypto module build,
e.g. iaa_crypto_comp_fixed.c, and for available modes to be registered
when the first IAA device wq is probed.
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
drivers/crypto/intel/iaa/iaa_crypto_main.c | 6 ------
1 file changed, 6 deletions(-)
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index 4b275cc09404..1b44c0524692 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -368,10 +368,6 @@ static void free_iaa_compression_mode(struct iaa_compression_mode *mode)
* These tables are typically generated and captured using statistics
* collected from running actual compress/decompress workloads.
*
- * A module or other kernel code can add and remove compression modes
- * with a given name using the exported @add_iaa_compression_mode()
- * and @remove_iaa_compression_mode functions.
- *
* When a new compression mode is added, the tables are saved in a
* global compression mode list. When IAA devices are added, a
* per-IAA device dma mapping is created for each IAA device, for each
@@ -405,7 +401,6 @@ void remove_iaa_compression_mode(const char *name)
out:
mutex_unlock(&iaa_devices_lock);
}
-EXPORT_SYMBOL_GPL(remove_iaa_compression_mode);
/**
* add_iaa_compression_mode - Add an IAA compression mode
@@ -482,7 +477,6 @@ int add_iaa_compression_mode(const char *name,
free_iaa_compression_mode(mode);
goto out;
}
-EXPORT_SYMBOL_GPL(add_iaa_compression_mode);
static void free_device_compression_mode(struct iaa_device *iaa_device,
struct iaa_device_compression_mode *device_mode)
--
2.27.0
^ permalink raw reply	[flat|nested] 48+ messages in thread
* [PATCH v14 11/26] crypto: iaa - Expect a single scatterlist for a [de]compress request's src/dst.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (9 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 10/26] crypto: iaa - Deprecate exporting add/remove IAA compression modes Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 12/26] crypto: iaa - Rearchitect iaa_crypto to have clean interfaces with crypto_acomp Kanchana P Sridhar
` (15 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
The calls to dma_map_sg() were passing sg_nents() for the @nents
parameter, then erroring out if the returned @nr_sgs was greater than
one. Furthermore, there are no use cases for iaa_crypto that require
multiple SG entries to be mapped for DMA at once.
Moreover, as per Herbert's direction in [1] for the batching API from
higher mm layers to interface with crypto using SG lists, batching
within iaa_crypto will rely on there being exactly one SG list per
"unit" of [de]compression in a batch, where the component SG lists are
obtained by breaking down the @req->src and @req->dst.
Given all of the above, this patch simplifies the design by expecting
only 1 @nents in req->src and req->dst, which aligns with current and
batching use cases that will be developed in subsequent patches.
This alleviates the latency penalty of calling sg_nents() per
[de]compress op submitted to the hardware.
Some unlikely() annotations are added to conditionals in the core
[de]compress routines to further improve latency per op.
[1]: https://lore.kernel.org/all/aJ7Fk6RpNc815Ivd@gondor.apana.org.au/T/#m99aea2ce3d284e6c5a3253061d97b08c4752a798
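To illustrate the expected calling convention, here is a hedged sketch of a
caller (zswap-like; the names page, dst_buf and req are illustrative, not
part of this patch) passing exactly one SG entry per [de]compress unit:

	struct scatterlist input, output;

	sg_init_table(&input, 1);
	sg_set_page(&input, page, PAGE_SIZE, 0);
	sg_init_one(&output, dst_buf, PAGE_SIZE);

	acomp_request_set_params(req, &input, &output, PAGE_SIZE, PAGE_SIZE);

With this convention the driver can simply call
dma_map_sg(dev, req->src, 1, DMA_TO_DEVICE) and treat anything other than
exactly one mapped entry as an error.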
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
drivers/crypto/intel/iaa/iaa_crypto_main.c | 54 +++++++++++-----------
1 file changed, 27 insertions(+), 27 deletions(-)
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index 1b44c0524692..aafa8d4afcf4 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -1521,11 +1521,11 @@ static int iaa_remap_for_verify(struct device *dev, struct iaa_wq *iaa_wq,
int ret = 0;
int nr_sgs;
- dma_unmap_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE);
- dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE);
+ dma_unmap_sg(dev, req->dst, 1, DMA_FROM_DEVICE);
+ dma_unmap_sg(dev, req->src, 1, DMA_TO_DEVICE);
- nr_sgs = dma_map_sg(dev, req->src, sg_nents(req->src), DMA_FROM_DEVICE);
- if (nr_sgs <= 0 || nr_sgs > 1) {
+ nr_sgs = dma_map_sg(dev, req->src, 1, DMA_FROM_DEVICE);
+ if (unlikely(nr_sgs <= 0 || nr_sgs > 1)) {
dev_dbg(dev, "verify: couldn't map src sg for iaa device %d,"
" wq %d: ret=%d\n", iaa_wq->iaa_device->idxd->id,
iaa_wq->wq->id, ret);
@@ -1537,13 +1537,13 @@ static int iaa_remap_for_verify(struct device *dev, struct iaa_wq *iaa_wq,
" req->slen %d, sg_dma_len(sg) %d\n", *src_addr, nr_sgs,
req->src, req->slen, sg_dma_len(req->src));
- nr_sgs = dma_map_sg(dev, req->dst, sg_nents(req->dst), DMA_TO_DEVICE);
- if (nr_sgs <= 0 || nr_sgs > 1) {
+ nr_sgs = dma_map_sg(dev, req->dst, 1, DMA_TO_DEVICE);
+ if (unlikely(nr_sgs <= 0 || nr_sgs > 1)) {
dev_dbg(dev, "verify: couldn't map dst sg for iaa device %d,"
" wq %d: ret=%d\n", iaa_wq->iaa_device->idxd->id,
iaa_wq->wq->id, ret);
ret = -EIO;
- dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_FROM_DEVICE);
+ dma_unmap_sg(dev, req->src, 1, DMA_FROM_DEVICE);
goto out;
}
*dst_addr = sg_dma_address(req->dst);
@@ -1711,14 +1711,14 @@ static void iaa_desc_complete(struct idxd_desc *idxd_desc,
err = -EIO;
}
- dma_unmap_sg(dev, ctx->req->dst, sg_nents(ctx->req->dst), DMA_TO_DEVICE);
- dma_unmap_sg(dev, ctx->req->src, sg_nents(ctx->req->src), DMA_FROM_DEVICE);
+ dma_unmap_sg(dev, ctx->req->dst, 1, DMA_TO_DEVICE);
+ dma_unmap_sg(dev, ctx->req->src, 1, DMA_FROM_DEVICE);
goto out;
}
err:
- dma_unmap_sg(dev, ctx->req->dst, sg_nents(ctx->req->dst), DMA_FROM_DEVICE);
- dma_unmap_sg(dev, ctx->req->src, sg_nents(ctx->req->src), DMA_TO_DEVICE);
+ dma_unmap_sg(dev, ctx->req->dst, 1, DMA_FROM_DEVICE);
+ dma_unmap_sg(dev, ctx->req->src, 1, DMA_TO_DEVICE);
out:
if (ret != 0)
dev_dbg(dev, "asynchronous %s failed ret=%d\n",
@@ -2022,8 +2022,8 @@ static int iaa_comp_acompress(struct acomp_req *req)
dev = &wq->idxd->pdev->dev;
- nr_sgs = dma_map_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE);
- if (nr_sgs <= 0 || nr_sgs > 1) {
+ nr_sgs = dma_map_sg(dev, req->src, 1, DMA_TO_DEVICE);
+ if (unlikely(nr_sgs <= 0 || nr_sgs > 1)) {
dev_dbg(dev, "couldn't map src sg for iaa device %d,"
" wq %d: ret=%d\n", iaa_wq->iaa_device->idxd->id,
iaa_wq->wq->id, ret);
@@ -2032,8 +2032,8 @@ static int iaa_comp_acompress(struct acomp_req *req)
}
src_addr = sg_dma_address(req->src);
- nr_sgs = dma_map_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE);
- if (nr_sgs <= 0 || nr_sgs > 1) {
+ nr_sgs = dma_map_sg(dev, req->dst, 1, DMA_FROM_DEVICE);
+ if (unlikely(nr_sgs <= 0 || nr_sgs > 1)) {
dev_dbg(dev, "couldn't map dst sg for iaa device %d,"
" wq %d: ret=%d\n", iaa_wq->iaa_device->idxd->id,
iaa_wq->wq->id, ret);
@@ -2059,18 +2059,18 @@ static int iaa_comp_acompress(struct acomp_req *req)
if (ret)
dev_dbg(dev, "asynchronous compress verification failed ret=%d\n", ret);
- dma_unmap_sg(dev, req->dst, sg_nents(req->dst), DMA_TO_DEVICE);
- dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_FROM_DEVICE);
+ dma_unmap_sg(dev, req->dst, 1, DMA_TO_DEVICE);
+ dma_unmap_sg(dev, req->src, 1, DMA_FROM_DEVICE);
goto out;
}
- if (ret)
+ if (unlikely(ret))
dev_dbg(dev, "asynchronous compress failed ret=%d\n", ret);
- dma_unmap_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE);
+ dma_unmap_sg(dev, req->dst, 1, DMA_FROM_DEVICE);
err_map_dst:
- dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE);
+ dma_unmap_sg(dev, req->src, 1, DMA_TO_DEVICE);
out:
percpu_ref_put(&iaa_wq->ref);
@@ -2103,8 +2103,8 @@ static int iaa_comp_adecompress(struct acomp_req *req)
dev = &wq->idxd->pdev->dev;
- nr_sgs = dma_map_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE);
- if (nr_sgs <= 0 || nr_sgs > 1) {
+ nr_sgs = dma_map_sg(dev, req->src, 1, DMA_TO_DEVICE);
+ if (unlikely(nr_sgs <= 0 || nr_sgs > 1)) {
dev_dbg(dev, "couldn't map src sg for iaa device %d,"
" wq %d: ret=%d\n", iaa_wq->iaa_device->idxd->id,
iaa_wq->wq->id, ret);
@@ -2113,8 +2113,8 @@ static int iaa_comp_adecompress(struct acomp_req *req)
}
src_addr = sg_dma_address(req->src);
- nr_sgs = dma_map_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE);
- if (nr_sgs <= 0 || nr_sgs > 1) {
+ nr_sgs = dma_map_sg(dev, req->dst, 1, DMA_FROM_DEVICE);
+ if (unlikely(nr_sgs <= 0 || nr_sgs > 1)) {
dev_dbg(dev, "couldn't map dst sg for iaa device %d,"
" wq %d: ret=%d\n", iaa_wq->iaa_device->idxd->id,
iaa_wq->wq->id, ret);
@@ -2128,12 +2128,12 @@ static int iaa_comp_adecompress(struct acomp_req *req)
if (ret == -EINPROGRESS)
return ret;
- if (ret != 0)
+ if (unlikely(ret != 0))
dev_dbg(dev, "asynchronous decompress failed ret=%d\n", ret);
- dma_unmap_sg(dev, req->dst, sg_nents(req->dst), DMA_FROM_DEVICE);
+ dma_unmap_sg(dev, req->dst, 1, DMA_FROM_DEVICE);
err_map_dst:
- dma_unmap_sg(dev, req->src, sg_nents(req->src), DMA_TO_DEVICE);
+ dma_unmap_sg(dev, req->src, 1, DMA_TO_DEVICE);
out:
percpu_ref_put(&iaa_wq->ref);
--
2.27.0
^ permalink raw reply	[flat|nested] 48+ messages in thread
* [PATCH v14 12/26] crypto: iaa - Rearchitect iaa_crypto to have clean interfaces with crypto_acomp.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (10 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 11/26] crypto: iaa - Expect a single scatterlist for a [de]compress request's src/dst Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-02-06 10:49 ` Herbert Xu
2026-01-25 3:35 ` [PATCH v14 13/26] crypto: acomp - Define a unit_size in struct acomp_req to enable batching Kanchana P Sridhar
` (14 subsequent siblings)
26 siblings, 1 reply; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This patch modifies the core functions in the iaa_crypto driver to be
independent of crypto_acomp, by adding a layer between the core driver
functionality and the crypto API. The core driver code is moved under
this layer, which relies only on idxd, DMA and scatterlists. This leads
to a cleaner interface.
We introduce a new "struct iaa_req" data structure, and light-weight
internal translation routines to/from crypto_acomp, namely,
acomp_to_iaa() and iaa_to_acomp().
The exception is that the driver defines a "static struct crypto_acomp
*deflate_crypto_acomp" for the software decompress fallback path.
The acomp_alg .compress() and .decompress() interfaces call into
iaa_comp_acompress_main() and iaa_comp_adecompress_main(), which are
wrappers around the core crypto-independent driver functions.
These iaa_crypto interfaces will continue to be available through
crypto_acomp for use in zswap:
int crypto_acomp_compress(struct acomp_req *req);
int crypto_acomp_decompress(struct acomp_req *req);
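Schematically, the acomp entry points become thin shims over the
crypto-independent core (abridged from the diff below; the registered-mode
lookup is elided here):

	static int iaa_crypto_acomp_acompress_main(struct acomp_req *areq)
	{
		struct iaa_compression_ctx *ctx = iaa_ctx[IAA_MODE_FIXED];
		struct iaa_req req;
		int ret;

		acomp_to_iaa(areq, &req, ctx);		/* copy src/dst/slen/dlen */
		ret = iaa_comp_acompress(ctx, &req);	/* crypto-independent core */
		iaa_to_acomp(ret ? ret : req.dlen, areq);
		return ret;
	}

so iaa_comp_acompress()/iaa_comp_adecompress() only see struct iaa_req,
scatterlists and idxd.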
Additionally, this patch resolves a race condition triggered when
IAA wqs and devices are continuously disabled/enabled while workloads
are using IAA for compression/decompression. This commit, in
combination with patches 0002 ("crypto: iaa - New architecture for IAA
device WQ comp/decomp usage & core mapping.") and 0005 ("crypto: iaa -
iaa_wq uses percpu_refs for get/put reference counting.") in this
series, fixes the race condition. This has been verified by bisecting.
One other change made towards a cleaner architecture: the iaa_crypto
symbol namespace is changed from "IDXD" to "CRYPTO_DEV_IAA_CRYPTO".
Fixes: ea7a5cbb4369 ("crypto: iaa - Add Intel IAA Compression Accelerator crypto driver core")
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
drivers/crypto/intel/iaa/Makefile | 2 +-
drivers/crypto/intel/iaa/iaa_crypto.h | 24 +-
drivers/crypto/intel/iaa/iaa_crypto_main.c | 283 ++++++++++++++++-----
3 files changed, 244 insertions(+), 65 deletions(-)
diff --git a/drivers/crypto/intel/iaa/Makefile b/drivers/crypto/intel/iaa/Makefile
index 55bda7770fac..ebfa1a425f80 100644
--- a/drivers/crypto/intel/iaa/Makefile
+++ b/drivers/crypto/intel/iaa/Makefile
@@ -3,7 +3,7 @@
# Makefile for IAA crypto device drivers
#
-ccflags-y += -I $(srctree)/drivers/dma/idxd -DDEFAULT_SYMBOL_NAMESPACE='"IDXD"'
+ccflags-y += -I $(srctree)/drivers/dma/idxd -DDEFAULT_SYMBOL_NAMESPACE='"CRYPTO_DEV_IAA_CRYPTO"'
obj-$(CONFIG_CRYPTO_DEV_IAA_CRYPTO) := iaa_crypto.o
diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/iaa/iaa_crypto.h
index 9611f2518f42..4dfb65c88f83 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto.h
+++ b/drivers/crypto/intel/iaa/iaa_crypto.h
@@ -7,6 +7,7 @@
#include <linux/crypto.h>
#include <linux/idxd.h>
#include <uapi/linux/idxd.h>
+#include <linux/scatterlist.h>
#define IDXD_SUBDRIVER_NAME "crypto"
@@ -29,8 +30,6 @@
#define IAA_ERROR_COMP_BUF_OVERFLOW 0x19
#define IAA_ERROR_WATCHDOG_EXPIRED 0x24
-#define IAA_COMP_MODES_MAX 2
-
#define FIXED_HDR 0x2
#define FIXED_HDR_SIZE 3
@@ -42,6 +41,23 @@
IAA_DECOMP_CHECK_FOR_EOB | \
IAA_DECOMP_STOP_ON_EOB)
+#define IAA_COMP_MODES_MAX IAA_MODE_NONE
+
+enum iaa_mode {
+ IAA_MODE_FIXED = 0,
+ IAA_MODE_NONE = 1,
+};
+
+struct iaa_req {
+ struct scatterlist *src;
+ struct scatterlist *dst;
+ unsigned int slen;
+ unsigned int dlen;
+ u32 flags;
+ u32 compression_crc;
+ void *drv_data; /* for driver internal use */
+};
+
/* Representation of IAA workqueue */
struct iaa_wq {
struct list_head list;
@@ -138,10 +154,6 @@ int add_iaa_compression_mode(const char *name,
void remove_iaa_compression_mode(const char *name);
-enum iaa_mode {
- IAA_MODE_FIXED,
-};
-
struct iaa_compression_ctx {
enum iaa_mode mode;
u16 alloc_comp_desc_timeout;
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index aafa8d4afcf4..d4b0c09bff21 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -52,6 +52,10 @@ static struct wq_table_entry **pkg_global_decomp_wqs;
/* All comp wqs from IAAs on a package. */
static struct wq_table_entry **pkg_global_comp_wqs;
+/* For software deflate fallback compress/decompress. */
+static struct crypto_acomp *deflate_crypto_acomp;
+DEFINE_MUTEX(deflate_crypto_acomp_lock);
+
LIST_HEAD(iaa_devices);
DEFINE_MUTEX(iaa_devices_lock);
@@ -94,9 +98,18 @@ static atomic_t iaa_crypto_enabled = ATOMIC_INIT(0);
static struct idxd_wq *first_wq_found;
DEFINE_MUTEX(first_wq_found_lock);
-static bool iaa_crypto_registered;
+const char *iaa_compression_mode_names[IAA_COMP_MODES_MAX] = {
+ "fixed",
+};
+
+const char *iaa_compression_alg_names[IAA_COMP_MODES_MAX] = {
+ "deflate-iaa",
+};
static struct iaa_compression_mode *iaa_compression_modes[IAA_COMP_MODES_MAX];
+static struct iaa_compression_ctx *iaa_ctx[IAA_COMP_MODES_MAX];
+static bool iaa_mode_registered[IAA_COMP_MODES_MAX];
+static u8 num_iaa_modes_registered;
/* Distribute decompressions across all IAAs on the package. */
static bool iaa_distribute_decomps;
@@ -354,6 +367,20 @@ static struct iaa_compression_mode *find_iaa_compression_mode(const char *name,
return NULL;
}
+static bool iaa_alg_is_registered(const char *name, int *idx)
+{
+ int i;
+
+ for (i = 0; i < IAA_COMP_MODES_MAX; ++i) {
+ if (!strcmp(name, iaa_compression_alg_names[i]) && iaa_mode_registered[i]) {
+ *idx = i;
+ return true;
+ }
+ }
+
+ return false;
+}
+
static void free_iaa_compression_mode(struct iaa_compression_mode *mode)
{
kfree(mode->name);
@@ -467,6 +494,7 @@ int add_iaa_compression_mode(const char *name,
mode->name, idx);
iaa_compression_modes[idx] = mode;
+ ++num_iaa_modes_registered;
ret = 0;
out:
@@ -1441,19 +1469,46 @@ static struct idxd_wq *comp_wq_table_next_wq(int cpu)
* Core iaa_crypto compress/decompress functions.
*************************************************/
-static int deflate_generic_decompress(struct acomp_req *req)
+static int deflate_generic_decompress(struct iaa_req *req)
{
- ACOMP_FBREQ_ON_STACK(fbreq, req);
+ ACOMP_REQUEST_ON_STACK(fbreq, deflate_crypto_acomp);
int ret;
+ acomp_request_set_callback(fbreq, 0, NULL, NULL);
+ acomp_request_set_params(fbreq, req->src, req->dst, req->slen,
+ PAGE_SIZE);
+
+ mutex_lock(&deflate_crypto_acomp_lock);
+
ret = crypto_acomp_decompress(fbreq);
req->dlen = fbreq->dlen;
+ mutex_unlock(&deflate_crypto_acomp_lock);
+
update_total_sw_decomp_calls();
return ret;
}
+static __always_inline void acomp_to_iaa(struct acomp_req *areq,
+ struct iaa_req *req,
+ struct iaa_compression_ctx *ctx)
+{
+ req->src = areq->src;
+ req->dst = areq->dst;
+ req->slen = areq->slen;
+ req->dlen = areq->dlen;
+ req->flags = 0;
+ if (unlikely(ctx->use_irq))
+ req->drv_data = areq;
+}
+
+static __always_inline void iaa_to_acomp(int dlen, struct acomp_req *areq)
+{
+ areq->dst->length = dlen;
+ areq->dlen = dlen;
+}
+
static inline int check_completion(struct device *dev,
struct iax_completion_record *comp,
bool compress,
@@ -1515,7 +1570,7 @@ static inline int check_completion(struct device *dev,
}
static int iaa_remap_for_verify(struct device *dev, struct iaa_wq *iaa_wq,
- struct acomp_req *req,
+ struct iaa_req *req,
dma_addr_t *src_addr, dma_addr_t *dst_addr)
{
int ret = 0;
@@ -1554,13 +1609,11 @@ static int iaa_remap_for_verify(struct device *dev, struct iaa_wq *iaa_wq,
return ret;
}
-static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
+static int iaa_compress_verify(struct iaa_compression_ctx *ctx, struct iaa_req *req,
struct idxd_wq *wq,
dma_addr_t src_addr, unsigned int slen,
dma_addr_t dst_addr, unsigned int dlen)
{
- struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
- u32 *compression_crc = acomp_request_ctx(req);
struct iaa_device *iaa_device;
struct idxd_desc *idxd_desc = ERR_PTR(-EAGAIN);
u16 alloc_desc_retries = 0;
@@ -1613,10 +1666,10 @@ static int iaa_compress_verify(struct crypto_tfm *tfm, struct acomp_req *req,
goto err;
}
- if (*compression_crc != idxd_desc->iax_completion->crc) {
+ if (req->compression_crc != idxd_desc->iax_completion->crc) {
ret = -EINVAL;
dev_dbg(dev, "(verify) iaa comp/decomp crc mismatch:"
- " comp=0x%x, decomp=0x%x\n", *compression_crc,
+ " comp=0x%x, decomp=0x%x\n", req->compression_crc,
idxd_desc->iax_completion->crc);
print_hex_dump(KERN_INFO, "cmp-rec: ", DUMP_PREFIX_OFFSET,
8, 1, idxd_desc->iax_completion, 64, 0);
@@ -1642,6 +1695,7 @@ static void iaa_desc_complete(struct idxd_desc *idxd_desc,
struct iaa_wq *iaa_wq;
struct pci_dev *pdev;
struct device *dev;
+ struct iaa_req req;
int ret, err = 0;
compression_ctx = crypto_tfm_ctx(ctx->tfm);
@@ -1667,19 +1721,25 @@ static void iaa_desc_complete(struct idxd_desc *idxd_desc,
pr_warn("%s: falling back to deflate-generic decompress, "
"analytics error code %x\n", __func__,
idxd_desc->iax_completion->error_code);
- ret = deflate_generic_decompress(ctx->req);
+
+ acomp_to_iaa(ctx->req, &req, compression_ctx);
+ ret = deflate_generic_decompress(&req);
+
if (ret) {
dev_dbg(dev, "%s: deflate-generic failed ret=%d\n",
__func__, ret);
err = -EIO;
goto err;
+ } else {
+ iaa_to_acomp(req.dlen, ctx->req);
+ goto verify;
}
} else {
err = -EIO;
goto err;
}
} else {
- ctx->req->dlen = idxd_desc->iax_completion->output_size;
+ iaa_to_acomp(idxd_desc->iax_completion->output_size, ctx->req);
}
/* Update stats */
@@ -1691,21 +1751,24 @@ static void iaa_desc_complete(struct idxd_desc *idxd_desc,
update_wq_decomp_bytes(iaa_wq->wq, ctx->req->slen);
}
+verify:
if (ctx->compress && compression_ctx->verify_compress) {
- u32 *compression_crc = acomp_request_ctx(ctx->req);
dma_addr_t src_addr, dst_addr;
- *compression_crc = idxd_desc->iax_completion->crc;
+ acomp_to_iaa(ctx->req, &req, compression_ctx);
+ req.compression_crc = idxd_desc->iax_completion->crc;
+
+ ret = iaa_remap_for_verify(dev, iaa_wq, &req, &src_addr, &dst_addr);
- ret = iaa_remap_for_verify(dev, iaa_wq, ctx->req, &src_addr, &dst_addr);
if (ret) {
pr_err("%s: compress verify remap failed ret=%d\n", __func__, ret);
err = -EIO;
goto out;
}
- ret = iaa_compress_verify(ctx->tfm, ctx->req, iaa_wq->wq, src_addr,
+ ret = iaa_compress_verify(compression_ctx, &req, iaa_wq->wq, src_addr,
ctx->req->slen, dst_addr, ctx->req->dlen);
+
if (ret) {
pr_err("%s: compress verify failed ret=%d\n", __func__, ret);
err = -EIO;
@@ -1720,9 +1783,11 @@ static void iaa_desc_complete(struct idxd_desc *idxd_desc,
dma_unmap_sg(dev, ctx->req->dst, 1, DMA_FROM_DEVICE);
dma_unmap_sg(dev, ctx->req->src, 1, DMA_TO_DEVICE);
out:
- if (ret != 0)
+ if (ret) {
+ iaa_to_acomp(ret, ctx->req);
dev_dbg(dev, "asynchronous %s failed ret=%d\n",
ctx->compress ? "compress":"decompress", ret);
+ }
if (ctx->req->base.complete)
acomp_request_complete(ctx->req, err);
@@ -1802,13 +1867,11 @@ static __always_inline void iaa_submit_desc_movdir64b(struct idxd_wq *wq,
iosubmit_cmds512(portal, desc->hw, 1);
}
-static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
+static int iaa_compress(struct iaa_compression_ctx *ctx, struct iaa_req *req,
struct idxd_wq *wq,
dma_addr_t src_addr, unsigned int slen,
dma_addr_t dst_addr, unsigned int *dlen)
{
- struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
- u32 *compression_crc = acomp_request_ctx(req);
struct iaa_device *iaa_device;
struct idxd_desc *idxd_desc = ERR_PTR(-EAGAIN);
u16 alloc_desc_retries = 0;
@@ -1856,17 +1919,18 @@ static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
}
*dlen = idxd_desc->iax_completion->output_size;
+ req->compression_crc = idxd_desc->iax_completion->crc;
/* Update stats */
update_total_comp_bytes_out(*dlen);
update_wq_comp_bytes(wq, *dlen);
-
- *compression_crc = idxd_desc->iax_completion->crc;
} else {
+ struct acomp_req *areq = req->drv_data;
+
desc->flags |= IDXD_OP_FLAG_RCI;
- idxd_desc->crypto.req = req;
- idxd_desc->crypto.tfm = tfm;
+ idxd_desc->crypto.req = areq;
+ idxd_desc->crypto.tfm = areq->base.tfm;
idxd_desc->crypto.src_addr = src_addr;
idxd_desc->crypto.dst_addr = dst_addr;
idxd_desc->crypto.compress = true;
@@ -1890,12 +1954,11 @@ static int iaa_compress(struct crypto_tfm *tfm, struct acomp_req *req,
return ret;
}
-static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
+static int iaa_decompress(struct iaa_compression_ctx *ctx, struct iaa_req *req,
struct idxd_wq *wq,
dma_addr_t src_addr, unsigned int slen,
dma_addr_t dst_addr, unsigned int *dlen)
{
- struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
struct iaa_device *iaa_device;
struct idxd_desc *idxd_desc = ERR_PTR(-EAGAIN);
u16 alloc_desc_retries = 0;
@@ -1939,10 +2002,12 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
ret = check_completion(dev, idxd_desc->iax_completion, false, false);
} else {
+ struct acomp_req *areq = req->drv_data;
+
desc->flags |= IDXD_OP_FLAG_RCI;
- idxd_desc->crypto.req = req;
- idxd_desc->crypto.tfm = tfm;
+ idxd_desc->crypto.req = areq;
+ idxd_desc->crypto.tfm = areq->base.tfm;
idxd_desc->crypto.src_addr = src_addr;
idxd_desc->crypto.dst_addr = dst_addr;
idxd_desc->crypto.compress = false;
@@ -1993,20 +2058,16 @@ static int iaa_decompress(struct crypto_tfm *tfm, struct acomp_req *req,
return ret;
}
-static int iaa_comp_acompress(struct acomp_req *req)
+static int iaa_comp_acompress(struct iaa_compression_ctx *ctx, struct iaa_req *req)
{
- struct iaa_compression_ctx *compression_ctx;
- struct crypto_tfm *tfm = req->base.tfm;
dma_addr_t src_addr, dst_addr;
int nr_sgs, cpu, ret = 0;
struct iaa_wq *iaa_wq;
struct idxd_wq *wq;
struct device *dev;
- compression_ctx = crypto_tfm_ctx(tfm);
-
- if (!req->src || !req->slen) {
- pr_debug("invalid src, not compressing\n");
+ if (!req->src || !req->slen || !req->dst) {
+ pr_debug("invalid src/dst, not compressing\n");
return -EINVAL;
}
@@ -2042,19 +2103,19 @@ static int iaa_comp_acompress(struct acomp_req *req)
}
dst_addr = sg_dma_address(req->dst);
- ret = iaa_compress(tfm, req, wq, src_addr, req->slen, dst_addr,
+ ret = iaa_compress(ctx, req, wq, src_addr, req->slen, dst_addr,
&req->dlen);
if (ret == -EINPROGRESS)
return ret;
- if (!ret && compression_ctx->verify_compress) {
+ if (!ret && ctx->verify_compress) {
ret = iaa_remap_for_verify(dev, iaa_wq, req, &src_addr, &dst_addr);
if (ret) {
dev_dbg(dev, "%s: compress verify remap failed ret=%d\n", __func__, ret);
goto out;
}
- ret = iaa_compress_verify(tfm, req, wq, src_addr, req->slen,
+ ret = iaa_compress_verify(ctx, req, wq, src_addr, req->slen,
dst_addr, req->dlen);
if (ret)
dev_dbg(dev, "asynchronous compress verification failed ret=%d\n", ret);
@@ -2077,9 +2138,8 @@ static int iaa_comp_acompress(struct acomp_req *req)
return ret;
}
-static int iaa_comp_adecompress(struct acomp_req *req)
+static int iaa_comp_adecompress(struct iaa_compression_ctx *ctx, struct iaa_req *req)
{
- struct crypto_tfm *tfm = req->base.tfm;
dma_addr_t src_addr, dst_addr;
int nr_sgs, cpu, ret = 0;
struct iaa_wq *iaa_wq;
@@ -2123,7 +2183,7 @@ static int iaa_comp_adecompress(struct acomp_req *req)
}
dst_addr = sg_dma_address(req->dst);
- ret = iaa_decompress(tfm, req, wq, src_addr, req->slen,
+ ret = iaa_decompress(ctx, req, wq, src_addr, req->slen,
dst_addr, &req->dlen);
if (ret == -EINPROGRESS)
return ret;
@@ -2140,8 +2200,9 @@ static int iaa_comp_adecompress(struct acomp_req *req)
return ret;
}
-static void compression_ctx_init(struct iaa_compression_ctx *ctx)
+static void compression_ctx_init(struct iaa_compression_ctx *ctx, enum iaa_mode mode)
{
+ ctx->mode = mode;
ctx->alloc_comp_desc_timeout = IAA_ALLOC_DESC_COMP_TIMEOUT;
ctx->alloc_decomp_desc_timeout = IAA_ALLOC_DESC_DECOMP_TIMEOUT;
ctx->verify_compress = iaa_verify_compress;
@@ -2153,22 +2214,56 @@ static void compression_ctx_init(struct iaa_compression_ctx *ctx)
* Interfaces to crypto_alg and crypto_acomp.
*********************************************/
-static int iaa_comp_init_fixed(struct crypto_acomp *acomp_tfm)
+static int iaa_crypto_acomp_acompress_main(struct acomp_req *areq)
+{
+ struct crypto_tfm *tfm = areq->base.tfm;
+ struct iaa_compression_ctx *ctx;
+ struct iaa_req parent_req;
+ int ret = -ENODEV, idx;
+
+ if (iaa_alg_is_registered(crypto_tfm_alg_driver_name(tfm), &idx)) {
+ ctx = iaa_ctx[idx];
+
+ acomp_to_iaa(areq, &parent_req, ctx);
+ ret = iaa_comp_acompress(ctx, &parent_req);
+ iaa_to_acomp(unlikely(ret) ? ret : parent_req.dlen, areq);
+ }
+
+ return ret;
+}
+
+static int iaa_crypto_acomp_adecompress_main(struct acomp_req *areq)
+{
+ struct crypto_tfm *tfm = areq->base.tfm;
+ struct iaa_compression_ctx *ctx;
+ struct iaa_req parent_req;
+ int ret = -ENODEV, idx;
+
+ if (iaa_alg_is_registered(crypto_tfm_alg_driver_name(tfm), &idx)) {
+ ctx = iaa_ctx[idx];
+
+ acomp_to_iaa(areq, &parent_req, ctx);
+ ret = iaa_comp_adecompress(ctx, &parent_req);
+ iaa_to_acomp(unlikely(ret) ? ret : parent_req.dlen, areq);
+ }
+
+ return ret;
+}
+
+static int iaa_crypto_acomp_init_fixed(struct crypto_acomp *acomp_tfm)
{
struct crypto_tfm *tfm = crypto_acomp_tfm(acomp_tfm);
struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
- ctx->mode = IAA_MODE_FIXED;
-
- compression_ctx_init(ctx);
+ ctx = iaa_ctx[IAA_MODE_FIXED];
return 0;
}
static struct acomp_alg iaa_acomp_fixed_deflate = {
- .init = iaa_comp_init_fixed,
- .compress = iaa_comp_acompress,
- .decompress = iaa_comp_adecompress,
+ .init = iaa_crypto_acomp_init_fixed,
+ .compress = iaa_crypto_acomp_acompress_main,
+ .decompress = iaa_crypto_acomp_adecompress_main,
.base = {
.cra_name = "deflate",
.cra_driver_name = "deflate-iaa",
@@ -2180,29 +2275,76 @@ static struct acomp_alg iaa_acomp_fixed_deflate = {
}
};
+/*******************************************
+ * Implement idxd_device_driver interfaces.
+ *******************************************/
+
+static void iaa_unregister_compression_device(void)
+{
+ unsigned int i;
+
+ atomic_set(&iaa_crypto_enabled, 0);
+
+ for (i = 0; i < IAA_COMP_MODES_MAX; ++i) {
+ iaa_mode_registered[i] = false;
+ kfree(iaa_ctx[i]);
+ iaa_ctx[i] = NULL;
+ }
+
+ num_iaa_modes_registered = 0;
+}
+
static int iaa_register_compression_device(void)
{
- int ret;
+ struct iaa_compression_mode *mode;
+ int i, idx;
+
+ for (i = 0; i < IAA_COMP_MODES_MAX; ++i) {
+ iaa_mode_registered[i] = false;
+ mode = find_iaa_compression_mode(iaa_compression_mode_names[i], &idx);
+ if (mode) {
+ iaa_ctx[i] = kmalloc(sizeof(struct iaa_compression_ctx), GFP_KERNEL);
+ if (!iaa_ctx[i])
+ goto err;
+
+ compression_ctx_init(iaa_ctx[i], (enum iaa_mode)i);
+ iaa_mode_registered[i] = true;
+ }
+ }
+
+ if (iaa_mode_registered[IAA_MODE_FIXED])
+ return 0;
+
+ pr_err("%s: IAA_MODE_FIXED is not registered.", __func__);
+
+err:
+ iaa_unregister_compression_device();
+ return -ENODEV;
+}
+
+static int iaa_register_acomp_compression_device(void)
+{
+ int ret = -ENOMEM;
ret = crypto_register_acomp(&iaa_acomp_fixed_deflate);
if (ret) {
pr_err("deflate algorithm acomp fixed registration failed (%d)\n", ret);
- goto out;
+ goto err_fixed;
}
- iaa_crypto_registered = true;
-out:
+ return 0;
+
+err_fixed:
+ iaa_unregister_compression_device();
return ret;
}
-static int iaa_unregister_compression_device(void)
+static void iaa_unregister_acomp_compression_device(void)
{
atomic_set(&iaa_crypto_enabled, 0);
- if (iaa_crypto_registered)
+ if (iaa_mode_registered[IAA_MODE_FIXED])
crypto_unregister_acomp(&iaa_acomp_fixed_deflate);
-
- return 0;
}
static int iaa_crypto_probe(struct idxd_dev *idxd_dev)
@@ -2272,6 +2414,12 @@ static int iaa_crypto_probe(struct idxd_dev *idxd_dev)
goto err_register;
}
+ ret = iaa_register_acomp_compression_device();
+ if (ret != 0) {
+ dev_dbg(dev, "IAA compression device acomp registration failed\n");
+ goto err_register;
+ }
+
if (!rebalance_wq_table()) {
dev_dbg(dev, "%s: Rerun after registration: IAA rebalancing device wq tables failed\n", __func__);
goto err_register;
@@ -2348,6 +2496,8 @@ static void iaa_crypto_remove(struct idxd_dev *idxd_dev)
pkg_global_wqs_dealloc();
free_wq_tables();
WARN_ON(!list_empty(&iaa_devices));
+ iaa_unregister_acomp_compression_device();
+ iaa_unregister_compression_device();
INIT_LIST_HEAD(&iaa_devices);
module_put(THIS_MODULE);
@@ -2389,6 +2539,13 @@ static int __init iaa_crypto_init_module(void)
nr_cpus_per_package = topology_num_cores_per_package();
nr_packages = topology_max_packages();
+ /* Software fallback compressor */
+ deflate_crypto_acomp = crypto_alloc_acomp("deflate", 0, 0);
+ if (IS_ERR_OR_NULL(deflate_crypto_acomp)) {
+ ret = -ENODEV;
+ goto err_deflate_acomp;
+ }
+
ret = iaa_aecs_init_fixed();
if (ret < 0) {
pr_debug("IAA fixed compression mode init failed\n");
@@ -2460,14 +2617,19 @@ static int __init iaa_crypto_init_module(void)
err_driver_reg:
iaa_aecs_cleanup_fixed();
err_aecs_init:
+ if (!IS_ERR_OR_NULL(deflate_crypto_acomp)) {
+ crypto_free_acomp(deflate_crypto_acomp);
+ deflate_crypto_acomp = NULL;
+ }
+err_deflate_acomp:
goto out;
}
static void __exit iaa_crypto_cleanup_module(void)
{
- if (iaa_unregister_compression_device())
- pr_debug("IAA compression device unregister failed\n");
+ iaa_unregister_acomp_compression_device();
+ iaa_unregister_compression_device();
iaa_crypto_debugfs_cleanup();
driver_remove_file(&iaa_crypto_driver.drv,
@@ -2483,6 +2645,11 @@ static void __exit iaa_crypto_cleanup_module(void)
idxd_driver_unregister(&iaa_crypto_driver);
iaa_aecs_cleanup_fixed();
+ if (!IS_ERR_OR_NULL(deflate_crypto_acomp)) {
+ crypto_free_acomp(deflate_crypto_acomp);
+ deflate_crypto_acomp = NULL;
+ }
+
pr_debug("cleaned up\n");
}
--
2.27.0
^ permalink raw reply	[flat|nested] 48+ messages in thread
* Re: [PATCH v14 12/26] crypto: iaa - Rearchitect iaa_crypto to have clean interfaces with crypto_acomp.
2026-01-25 3:35 ` [PATCH v14 12/26] crypto: iaa - Rearchitect iaa_crypto to have clean interfaces with crypto_acomp Kanchana P Sridhar
@ 2026-02-06 10:49 ` Herbert Xu
0 siblings, 0 replies; 48+ messages in thread
From: Herbert Xu @ 2026-02-06 10:49 UTC (permalink / raw)
To: Kanchana P Sridhar
Cc: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, davem, clabbe, ardb,
ebiggers, surenb, kristen.c.accardi, vinicius.gomes,
giovanni.cabiddu, wajdi.k.feghali
On Sat, Jan 24, 2026 at 07:35:23PM -0800, Kanchana P Sridhar wrote:
>
> +static int iaa_crypto_acomp_init_fixed(struct crypto_acomp *acomp_tfm)
> {
> struct crypto_tfm *tfm = crypto_acomp_tfm(acomp_tfm);
> struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
>
> - ctx->mode = IAA_MODE_FIXED;
> -
> - compression_ctx_init(ctx);
> + ctx = iaa_ctx[IAA_MODE_FIXED];
Gcc warns about ctx, as this line does nothing. What is it meant to
initialize?
Thanks,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
^ permalink raw reply [flat|nested] 48+ messages in thread
* [PATCH v14 13/26] crypto: acomp - Define a unit_size in struct acomp_req to enable batching.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (11 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 12/26] crypto: iaa - Rearchitect iaa_crypto to have clean interfaces with crypto_acomp Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 14/26] crypto: acomp - Add bit to indicate segmentation support Kanchana P Sridhar
` (13 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
mm: zswap: Set the unit size for zswap to PAGE_SIZE.
We add a new @unit_size data member to struct acomp_req along with a
helper function acomp_request_set_unit_size() for kernel modules to set
the unit size to use while breaking down the request's src/dst
scatterlists.
An acomp_alg can implement batching by using the @req->unit_size to
break down the SG lists passed in via @req->dst and/or @req->src, to
submit individual @req->slen/@req->unit_size compress jobs or
@req->dlen/@req->unit_size decompress jobs, for batch compression and
batch decompression respectively.
In case of batch compression, the folio's pages for the batch can be
retrieved from the @req->src scatterlist by using a struct sg_page_iter
after determining the number of pages as @req->slen/@req->unit_size.
1) acomp_request_set_callback() sets the @req->unit_size to 0.
2) In zswap_cpu_comp_prepare(), after the call to
acomp_request_set_callback(), we call:
acomp_request_set_unit_size(acomp_ctx->req, PAGE_SIZE);
to set the unit size for zswap to PAGE_SIZE.
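As an illustration, here is a hedged sketch of how a batching-aware user
could describe a compress batch to crypto_acomp (the names nr_pages, pages[],
dst_bufs[], src_sgt and dst_sgt are illustrative; the per-unit dlen of
PAGE_SIZE is an assumption):

	sg_init_table(src_sgt, nr_pages);
	for (i = 0; i < nr_pages; i++)
		sg_set_page(&src_sgt[i], pages[i], PAGE_SIZE, 0);

	sg_init_table(dst_sgt, nr_pages);
	for (i = 0; i < nr_pages; i++)
		sg_set_buf(&dst_sgt[i], dst_bufs[i], PAGE_SIZE);

	acomp_request_set_callback(req, 0, NULL, NULL);	/* resets unit_size to 0 */
	acomp_request_set_unit_size(req, PAGE_SIZE);	/* opt in to batching */
	acomp_request_set_params(req, src_sgt, dst_sgt,
				 nr_pages * PAGE_SIZE, PAGE_SIZE);

A batching acomp_alg can then derive the number of compress jobs as
req->slen / req->unit_size.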
Suggested-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
include/crypto/acompress.h | 48 ++++++++++++++++++++++++++++++++++++++
mm/zswap.c | 3 +++
2 files changed, 51 insertions(+)
diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h
index 9eacb9fa375d..23a1a659843c 100644
--- a/include/crypto/acompress.h
+++ b/include/crypto/acompress.h
@@ -79,6 +79,7 @@ struct acomp_req_chain {
* @dvirt: Destination virtual address
* @slen: Size of the input buffer
* @dlen: Size of the output buffer and number of bytes produced
+ * @unit_size: Unit size for the request for use in batching
* @chain: Private API code data, do not use
* @__ctx: Start of private context data
*/
@@ -94,6 +95,7 @@ struct acomp_req {
};
unsigned int slen;
unsigned int dlen;
+ unsigned int unit_size;
struct acomp_req_chain chain;
@@ -328,9 +330,55 @@ static inline void acomp_request_set_callback(struct acomp_req *req,
{
flgs &= ~CRYPTO_ACOMP_REQ_PRIVATE;
flgs |= req->base.flags & CRYPTO_ACOMP_REQ_PRIVATE;
+ req->unit_size = 0;
crypto_request_set_callback(&req->base, flgs, cmpl, data);
}
+/**
+ * acomp_request_set_unit_size() -- Sets the unit size for the request.
+ *
+ * This is a helper function that enables batching for zswap, IPComp, etc.
+ * It allows multiple independent compression (or decompression) operations to
+ * be submitted in a single request's SG lists, where each SG list ("segment")
+ * is processed independently. The unit size helps derive segments from a
+ * single request. crypto_acomp does not expect the segments to be related in
+ * any way.
+ *
+ * Example usage model:
+ *
+ * A module such as zswap that's configured to use a batching compressor, can
+ * accomplish batch compression of "nr_pages" with crypto_acomp by creating an
+ * output SG table for the batch, initialized to contain "nr_pages" SG
+ * lists. Each scatterlist is mapped to the nth destination buffer for the
+ * batch. Depending on whether the @req is used for batch compress/decompress,
+ * zswap must set the @req's source/destination length to be
+ * "nr_pages * @req->unit_size" respectively.
+ *
+ * An acomp_alg can implement batch compression by using the @req->unit_size
+ * to break down the SG lists passed in via @req->dst to submit individual
+ * "@req->slen/@req->unit_size" compress jobs to be processed as a batch.
+ *
+ * Similarly, zswap can implement batch decompression by passing an
+ * SG table with "nr_pages" SG lists via @req->src to process
+ * "@req->dlen/@req->unit_size" decompress jobs as a batch.
+ *
+ * This API must be called after acomp_request_set_callback(),
+ * which sets @req->unit_size to 0. This makes it easy for users of
+ * crypto_acomp to rely on a default of not opting in to batching.
+ * Users such as zswap opt in to batching by defining @req->unit_size
+ * to a non-zero value for use by acomp_algs supporting batching.
+ *
+ * @du would be PAGE_SIZE for zswap, it could be the MTU for IPsec.
+ *
+ * @req: asynchronous compress/decompress request
+ * @du: data unit size of the input/output buffer scatterlist.
+ */
+static inline void acomp_request_set_unit_size(struct acomp_req *req,
+ unsigned int du)
+{
+ req->unit_size = du;
+}
+
/**
* acomp_request_set_params() -- Sets request parameters
*
diff --git a/mm/zswap.c b/mm/zswap.c
index a3811b05ab57..038e240c03dd 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -781,6 +781,9 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
acomp_ctx->buffer = buffer;
acomp_ctx->acomp = acomp;
acomp_ctx->req = req;
+
+ acomp_request_set_unit_size(acomp_ctx->req, PAGE_SIZE);
+
mutex_unlock(&acomp_ctx->mutex);
return 0;
--
2.27.0
^ permalink raw reply	[flat|nested] 48+ messages in thread
* [PATCH v14 14/26] crypto: acomp - Add bit to indicate segmentation support
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (12 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 13/26] crypto: acomp - Define a unit_size in struct acomp_req to enable batching Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 15/26] crypto: acomp - Add trivial segmentation wrapper Kanchana P Sridhar
` (12 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This patch adds the ability for compression algorithms to advertise
segmentation support.
Add a bit to the crypto_alg flags to indicate support for segmentation.
Also add a helper for acomp to test whether a given tfm supports
segmentation.
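A hedged sketch of how the new bit would be used (the algorithm and variable
names below are illustrative, not from this patch):

	/* an acomp implementation advertising segmentation support */
	static struct acomp_alg my_seg_alg = {
		/* ... ops ... */
		.base.cra_flags	= CRYPTO_ALG_ASYNC | CRYPTO_ALG_REQ_SEG,
	};

	/* API-internal code deciding how many segments to hand to the tfm */
	if (crypto_acomp_req_seg(acomp))
		nr_segs = batch_size;	/* driver handles multiple segments */
	else
		nr_segs = 1;		/* one segment at a time */

crypto_acomp_req_seg() is added to the internal acompress header, so it is
intended for API-internal use rather than for end users of crypto_acomp.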
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
include/crypto/algapi.h | 5 +++++
include/crypto/internal/acompress.h | 5 +++++
include/linux/crypto.h | 3 +++
3 files changed, 13 insertions(+)
diff --git a/include/crypto/algapi.h b/include/crypto/algapi.h
index 05deea9dac5e..7d406cfe5751 100644
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -280,6 +280,11 @@ static inline bool crypto_tfm_req_virt(struct crypto_tfm *tfm)
return tfm->__crt_alg->cra_flags & CRYPTO_ALG_REQ_VIRT;
}
+static inline bool crypto_tfm_req_seg(struct crypto_tfm *tfm)
+{
+ return tfm->__crt_alg->cra_flags & CRYPTO_ALG_REQ_SEG;
+}
+
static inline u32 crypto_request_flags(struct crypto_async_request *req)
{
return req->flags & ~CRYPTO_TFM_REQ_ON_STACK;
diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h
index 2d97440028ff..366dbdb987e8 100644
--- a/include/crypto/internal/acompress.h
+++ b/include/crypto/internal/acompress.h
@@ -188,6 +188,11 @@ static inline bool crypto_acomp_req_virt(struct crypto_acomp *tfm)
return crypto_tfm_req_virt(&tfm->base);
}
+static inline bool crypto_acomp_req_seg(struct crypto_acomp *tfm)
+{
+ return crypto_tfm_req_seg(&tfm->base);
+}
+
void crypto_acomp_free_streams(struct crypto_acomp_streams *s);
int crypto_acomp_alloc_streams(struct crypto_acomp_streams *s);
diff --git a/include/linux/crypto.h b/include/linux/crypto.h
index a2137e19be7d..89b9c3f87f4d 100644
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -139,6 +139,9 @@
/* Set if the algorithm cannot have a fallback (e.g., phmac). */
#define CRYPTO_ALG_NO_FALLBACK 0x00080000
+/* Set if the algorithm supports segmentation. */
+#define CRYPTO_ALG_REQ_SEG 0x00100000
+
/* The high bits 0xff000000 are reserved for type-specific flags. */
/*
--
2.27.0
* [PATCH v14 15/26] crypto: acomp - Add trivial segmentation wrapper
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (13 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 14/26] crypto: acomp - Add bit to indicate segmentation support Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 16/26] crypto: iaa - IAA Batching for parallel compressions/decompressions Kanchana P Sridhar
` (11 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This patch provides a wrapper for existing algorithms so that they can
accept a single segment while returning the compressed length or error
through the dst SG list length.
This trivial segmentation wrapper only supports compression with a
segment count of exactly one.
The reason is that the first user, zswap, will only allocate the
extra memory if the underlying algorithm supports segmentation;
otherwise, only one segment will be given at a time.
Having this wrapper means that the same calling convention can
be used for all algorithms, regardless of segmentation support.
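For illustration only (not part of this patch), the single-segment
calling convention handled by this wrapper looks roughly as follows
from the caller's side; req, wait (a struct crypto_wait), src_buf and
dst_buf are assumed to be set up elsewhere:
    struct scatterlist src_sg, dst_sg;
    int err;

    sg_init_one(&src_sg, src_buf, PAGE_SIZE);   /* one input segment */
    sg_init_one(&dst_sg, dst_buf, PAGE_SIZE);   /* one output segment */

    acomp_request_set_params(req, &src_sg, &dst_sg, PAGE_SIZE, PAGE_SIZE);
    acomp_request_set_unit_size(req, PAGE_SIZE);

    err = crypto_wait_req(crypto_acomp_compress(req), &wait);
    /* dst_sg.length now holds the compressed length, or an error
     * value stored by the wrapper. */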
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
crypto/acompress.c | 33 ++++++++++++++++++++++++++-------
include/crypto/acompress.h | 1 +
2 files changed, 27 insertions(+), 7 deletions(-)
diff --git a/crypto/acompress.c b/crypto/acompress.c
index be28cbfd22e3..cfb8ede02cf4 100644
--- a/crypto/acompress.c
+++ b/crypto/acompress.c
@@ -170,8 +170,13 @@ static void acomp_save_req(struct acomp_req *req, crypto_completion_t cplt)
state->compl = req->base.complete;
state->data = req->base.data;
+ state->unit_size = req->unit_size;
+ state->flags = req->base.flags & (CRYPTO_ACOMP_REQ_SRC_VIRT |
+ CRYPTO_ACOMP_REQ_DST_VIRT);
+
req->base.complete = cplt;
req->base.data = state;
+ req->unit_size = 0;
}
static void acomp_restore_req(struct acomp_req *req)
@@ -180,6 +185,7 @@ static void acomp_restore_req(struct acomp_req *req)
req->base.complete = state->compl;
req->base.data = state->data;
+ req->unit_size = state->unit_size;
}
static void acomp_reqchain_virt(struct acomp_req *req)
@@ -198,9 +204,6 @@ static void acomp_virt_to_sg(struct acomp_req *req)
{
struct acomp_req_chain *state = &req->chain;
- state->flags = req->base.flags & (CRYPTO_ACOMP_REQ_SRC_VIRT |
- CRYPTO_ACOMP_REQ_DST_VIRT);
-
if (acomp_request_src_isvirt(req)) {
unsigned int slen = req->slen;
const u8 *svirt = req->svirt;
@@ -248,6 +251,10 @@ static int acomp_reqchain_finish(struct acomp_req *req, int err)
{
acomp_reqchain_virt(req);
acomp_restore_req(req);
+
+ if (req->unit_size)
+ req->dst->length = unlikely(err) ? err : req->dlen;
+
return err;
}
@@ -268,14 +275,17 @@ static void acomp_reqchain_done(void *data, int err)
compl(data, err);
}
-static int acomp_do_req_chain(struct acomp_req *req, bool comp)
+static __always_inline int acomp_do_req_chain(struct acomp_req *req, bool comp)
{
int err;
+ if (unlikely(req->unit_size && req->slen > req->unit_size))
+ return -ENOSYS;
+
acomp_save_req(req, acomp_reqchain_done);
err = acomp_do_one_req(req, comp);
- if (err == -EBUSY || err == -EINPROGRESS)
+ if (unlikely(err == -EBUSY || err == -EINPROGRESS))
return err;
return acomp_reqchain_finish(req, err);
@@ -287,8 +297,17 @@ int crypto_acomp_compress(struct acomp_req *req)
if (acomp_req_on_stack(req) && acomp_is_async(tfm))
return -EAGAIN;
+
+ if (req->unit_size && acomp_request_issg(req)) {
+ if (!crypto_acomp_req_seg(tfm))
+ return acomp_do_req_chain(req, true);
+
+ return tfm->compress(req);
+ }
+
if (crypto_acomp_req_virt(tfm) || acomp_request_issg(req))
- return crypto_acomp_reqtfm(req)->compress(req);
+ return tfm->compress(req);
+
return acomp_do_req_chain(req, true);
}
EXPORT_SYMBOL_GPL(crypto_acomp_compress);
@@ -300,7 +319,7 @@ int crypto_acomp_decompress(struct acomp_req *req)
if (acomp_req_on_stack(req) && acomp_is_async(tfm))
return -EAGAIN;
if (crypto_acomp_req_virt(tfm) || acomp_request_issg(req))
- return crypto_acomp_reqtfm(req)->decompress(req);
+ return tfm->decompress(req);
return acomp_do_req_chain(req, false);
}
EXPORT_SYMBOL_GPL(crypto_acomp_decompress);
diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h
index 23a1a659843c..86e4932cd112 100644
--- a/include/crypto/acompress.h
+++ b/include/crypto/acompress.h
@@ -67,6 +67,7 @@ struct acomp_req_chain {
struct folio *dfolio;
};
u32 flags;
+ u32 unit_size;
};
/**
--
2.27.0
* [PATCH v14 16/26] crypto: iaa - IAA Batching for parallel compressions/decompressions.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (14 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 15/26] crypto: acomp - Add trivial segmentation wrapper Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 17/26] crypto: iaa - Submit the two largest source buffers first in batch decompress Kanchana P Sridhar
` (10 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This patch adds core batching capabilities in the IAA driver for kernel
users such as zswap to compress/decompress multiple pages/buffers in
parallel using IAA hardware acceleration, without the use of
interrupts. Instead, this is accomplished using an async "submit-poll"
mechanism. IAA Batching significantly improves swapout/swapin latency
and throughput.
To achieve this, we break down a compress/decompress job into two
separate activities if the driver is configured for non-irq async mode:
1) Submit a descriptor after caching the "idxd_desc" descriptor in the
req->drv_data, and return -EINPROGRESS.
2) Poll: Given a request, retrieve the descriptor and poll its completion
status for success/error.
This is enabled by the following additions in the driver:
1) The idxd_desc is cached in the "drv_data" member of "struct iaa_req".
2) IAA_REQ_POLL_FLAG: if set in the iaa_req's flags, this tells
the driver that it should submit the descriptor and return
-EINPROGRESS. If not set, the driver will proceed to call
check_completion() in fully synchronous mode, until the hardware
returns a completion status.
3) iaa_comp_poll() procedure: This routine is intended to be called
after submission returns -EINPROGRESS. It will check the completion
status once, and return -EAGAIN if the job has not completed. If the
job has completed, it will return the completion status.
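Putting the two activities together, a minimal sketch (not literal
driver code) of the submit-then-poll flow for one request is:
    /* Submit: returns -EINPROGRESS with the idxd_desc cached in
     * req->drv_data when IAA_REQ_POLL_FLAG is set. */
    err = iaa_comp_acompress(ctx, req);

    if (err == -EINPROGRESS) {
            /* Poll: -EAGAIN while the hardware is still working. */
            do {
                    err = iaa_comp_poll(ctx, req);
            } while (err == -EAGAIN);
    }
    /* err is now 0 on success, or the completion error code. */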
The purpose of this commit is to allow kernel users of iaa_crypto, such
as zswap, to invoke the crypto_acomp_compress() API in fully
synchronous mode for sequential/non-batching use cases (i.e. today's
status-quo), wherein zswap calls:
crypto_wait_req(crypto_acomp_compress(req), wait);
and to enable invoking fully asynchronous batch compress/decompress
functionality. Both use cases need to reuse the same code paths in the
driver to interface with the hardware: the IAA_REQ_POLL_FLAG allows this
shared code to determine whether we need to process an iaa_req
synchronously/asynchronously. The idea is to simplify iaa_crypto's
sequential/batching interfaces for use by swap modules.
Thus, regardless of the iaa_crypto driver's 'sync_mode' setting, it
can still be forced to use synchronous mode by *not setting* the
IAA_REQ_POLL_FLAG in iaa_req->flags: this is the default to support
sequential use cases in zswap today. In other words, both these
conditions need to be met for a request to be processed in fully async
submit-poll mode:
1) use_irq should be "false"
2) iaa_req->flags & IAA_REQ_POLL_FLAG should be "true"
The patch defines an iaa_crypto constant, IAA_CRYPTO_MAX_BATCH_SIZE
(currently set to 8U). This is the maximum batch-size for IAA, i.e. the
maximum number of pages/buffers that can be compressed/decompressed
in parallel.
In order to support IAA batching, the iaa_crypto driver allocates
IAA_CRYPTO_MAX_BATCH_SIZE "struct iaa_req *reqs" per-CPU, upon
initialization, and statically annotates them for batch-parallelism by
setting the IAA_REQ_POLL_FLAG. Notably, the task of allocating multiple
requests to submit to the hardware for parallel [de]compressions is
taken over by iaa_crypto, so that zswap doesn't need to allocate the
reqs.
Within the core IAA batching routines, the driver uses these per-CPU
"iaa_batch_ctx->reqs" to submit descriptors for each request in the
batch in iaa_[de]compress(), and returns -EINPROGRESS. The hardware will
begin processing each request as soon as it is submitted; essentially
all compress/decompress jobs will be parallelized.
The polling function, "iaa_comp_poll()", will retrieve the descriptor
from each iaa_req->drv_data to check its completion status.
Compress batching is expected to be called by kernel modules such as
zswap by passing the folio pages as the "source" SG list of the
acomp_req, and by constructing an SG table of SG lists for the output
buffers and setting the acomp_req's "dst" to the head of this list of
scatterlists. Thanks to Herbert Xu for suggesting this batching
architecture.
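As an illustrative sketch (not part of this patch), the output-side SG
table such a caller might build could look like the following;
dst_bufs[] and nr_pages are assumptions of the example:
    struct sg_table dst_sgt;
    int i;

    if (sg_alloc_table(&dst_sgt, nr_pages, GFP_KERNEL))
            return -ENOMEM;

    /* One PAGE_SIZE output buffer per page in the batch. */
    for (i = 0; i < nr_pages; i++)
            sg_set_buf(&dst_sgt.sgl[i], dst_bufs[i], PAGE_SIZE);

    /* The head of this list becomes the acomp_req's dst. */
    acomp_request_set_dst_sg(req, dst_sgt.sgl, nr_pages * PAGE_SIZE);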
Within the iaa_crypto driver's core compress batching function:
1) The per-CPU iaa_reqs are populated from the acomp_req's src/dst SG
lists.
2) All iaa_reqs are submitted to the hardware in async mode, using
movdir64b. This enables hardware parallelism, because we don't wait
for one compress/decompress job to finish before submitting the next
one.
3) The iaa_reqs submitted are polled for completion statuses in a
non-blocking manner in a while loop: each request that is still
pending is polled once, and this repeats, until all requests have
completed.
The core IAA batching functions are:
static int iaa_comp_acompress_batch(
struct iaa_compression_ctx *ctx,
struct iaa_req *parent_req,
unsigned int unit_size);
static int iaa_comp_adecompress_batch(
struct iaa_compression_ctx *ctx,
struct iaa_req *parent_req,
unsigned int unit_size);
The @unit_size parameter is the unit size in bytes used to split the
source or destination length (@parent_req->slen or @parent_req->dlen),
and the SG lists passed in through @parent_req->src and
@parent_req->dst, into the individual requests of the batch.
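For example, with @unit_size = PAGE_SIZE (4KB) and @parent_req->slen =
32KB, the compress batch is split into 32KB / 4KB = 8 sub-requests,
which is exactly IAA_CRYPTO_MAX_BATCH_SIZE.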
Suggested-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
drivers/crypto/intel/iaa/iaa_crypto.h | 35 +++
drivers/crypto/intel/iaa/iaa_crypto_main.c | 346 ++++++++++++++++++++-
2 files changed, 374 insertions(+), 7 deletions(-)
diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/iaa/iaa_crypto.h
index 4dfb65c88f83..db83c21e92f1 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto.h
+++ b/drivers/crypto/intel/iaa/iaa_crypto.h
@@ -41,6 +41,40 @@
IAA_DECOMP_CHECK_FOR_EOB | \
IAA_DECOMP_STOP_ON_EOB)
+/*
+ * If set, the driver must have a way to submit the req, then
+ * poll its completion status for success/error.
+ */
+#define IAA_REQ_POLL_FLAG 0x00000002
+
+/*
+ * The maximum compress/decompress batch size for IAA's batch compression
+ * and batch decompression functionality.
+ */
+#define IAA_CRYPTO_MAX_BATCH_SIZE 8U
+
+/*
+ * Used to create per-CPU structure comprising of IAA_CRYPTO_MAX_BATCH_SIZE
+ * reqs for batch [de]compressions.
+ *
+ * @reqs: Used to submit up to IAA_CRYPTO_MAX_BATCH_SIZE parallel
+ * compress/decompress jobs to the accelerator. The driver statically
+ * sets the IAA_REQ_POLL_FLAG on @reqs to indicate that these need to
+ * be processed asynchronously: submit for parallel processing
+ * and return; then polled for completion statuses.
+ *
+ * @mutex: Used to protect the per-CPU batch compression/decompression context
+ * from preemption/process migration; and to allow upper layers in the
+ * kernel to use synchronous/asynchronous compress/decompress calls to
+ * IAA. In other words, make no assumptions about the calling context;
+ * always hold this mutex while using the per-CPU batch reqs.
+ *
+ */
+struct iaa_batch_ctx {
+ struct iaa_req **reqs;
+ struct mutex mutex;
+};
+
#define IAA_COMP_MODES_MAX IAA_MODE_NONE
enum iaa_mode {
@@ -51,6 +85,7 @@ enum iaa_mode {
struct iaa_req {
struct scatterlist *src;
struct scatterlist *dst;
+ struct scatterlist sg_src;
unsigned int slen;
unsigned int dlen;
u32 flags;
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index d4b0c09bff21..a447555f4eb9 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -56,6 +56,9 @@ static struct wq_table_entry **pkg_global_comp_wqs;
static struct crypto_acomp *deflate_crypto_acomp;
DEFINE_MUTEX(deflate_crypto_acomp_lock);
+/* Per-cpu iaa_reqs for batching. */
+static struct iaa_batch_ctx __percpu *iaa_batch_ctx;
+
LIST_HEAD(iaa_devices);
DEFINE_MUTEX(iaa_devices_lock);
@@ -1614,6 +1617,8 @@ static int iaa_compress_verify(struct iaa_compression_ctx *ctx, struct iaa_req *
dma_addr_t src_addr, unsigned int slen,
dma_addr_t dst_addr, unsigned int dlen)
{
+ u16 alloc_decomp_desc_timeout = ctx ?
+ ctx->alloc_decomp_desc_timeout : IAA_ALLOC_DESC_DECOMP_TIMEOUT;
struct iaa_device *iaa_device;
struct idxd_desc *idxd_desc = ERR_PTR(-EAGAIN);
u16 alloc_desc_retries = 0;
@@ -1630,7 +1635,7 @@ static int iaa_compress_verify(struct iaa_compression_ctx *ctx, struct iaa_req *
pdev = idxd->pdev;
dev = &pdev->dev;
- while ((idxd_desc == ERR_PTR(-EAGAIN)) && (alloc_desc_retries++ < ctx->alloc_decomp_desc_timeout)) {
+ while ((idxd_desc == ERR_PTR(-EAGAIN)) && (alloc_desc_retries++ < alloc_decomp_desc_timeout)) {
idxd_desc = idxd_alloc_desc(wq, IDXD_OP_NONBLOCK);
cpu_relax();
}
@@ -1902,14 +1907,15 @@ static int iaa_compress(struct iaa_compression_ctx *ctx, struct iaa_req *req,
desc = iaa_setup_compress_hw_desc(idxd_desc, src_addr, slen, dst_addr, *dlen,
ctx->mode, iaa_device->compression_modes[ctx->mode]);
- if (likely(!ctx->use_irq)) {
+ if (likely(!ctx->use_irq || (req->flags & IAA_REQ_POLL_FLAG))) {
+ req->drv_data = idxd_desc;
iaa_submit_desc_movdir64b(wq, idxd_desc);
/* Update stats */
update_total_comp_calls();
update_wq_comp_calls(wq);
- if (ctx->async_mode)
+ if (req->flags & IAA_REQ_POLL_FLAG)
return -EINPROGRESS;
ret = check_completion(dev, idxd_desc->iax_completion, true, false);
@@ -1990,14 +1996,15 @@ static int iaa_decompress(struct iaa_compression_ctx *ctx, struct iaa_req *req,
desc = iaa_setup_decompress_hw_desc(idxd_desc, src_addr, slen, dst_addr, *dlen);
- if (likely(!ctx->use_irq)) {
+ if (likely(!ctx->use_irq || (req->flags & IAA_REQ_POLL_FLAG))) {
+ req->drv_data = idxd_desc;
iaa_submit_desc_movdir64b(wq, idxd_desc);
/* Update stats */
update_total_decomp_calls();
update_wq_decomp_calls(wq);
- if (ctx->async_mode)
+ if (req->flags & IAA_REQ_POLL_FLAG)
return -EINPROGRESS;
ret = check_completion(dev, idxd_desc->iax_completion, false, false);
@@ -2200,6 +2207,268 @@ static int iaa_comp_adecompress(struct iaa_compression_ctx *ctx, struct iaa_req
return ret;
}
+static int iaa_comp_poll(struct iaa_compression_ctx *ctx, struct iaa_req *req)
+{
+ struct idxd_desc *idxd_desc;
+ struct idxd_device *idxd;
+ struct iaa_wq *iaa_wq;
+ struct pci_dev *pdev;
+ struct device *dev;
+ struct idxd_wq *wq;
+ bool compress_op;
+ int ret;
+
+ idxd_desc = req->drv_data;
+ if (!idxd_desc)
+ return -EAGAIN;
+
+ compress_op = (idxd_desc->iax_hw->opcode == IAX_OPCODE_COMPRESS);
+ wq = idxd_desc->wq;
+ iaa_wq = idxd_wq_get_private(wq);
+ idxd = iaa_wq->iaa_device->idxd;
+ pdev = idxd->pdev;
+ dev = &pdev->dev;
+
+ ret = check_completion(dev, idxd_desc->iax_completion, compress_op, true);
+ if (ret == -EAGAIN)
+ return ret;
+ if (ret)
+ goto out;
+
+ req->dlen = idxd_desc->iax_completion->output_size;
+
+ /* Update stats */
+ if (compress_op) {
+ update_total_comp_bytes_out(req->dlen);
+ update_wq_comp_bytes(wq, req->dlen);
+ } else {
+ update_total_decomp_bytes_in(req->slen);
+ update_wq_decomp_bytes(wq, req->slen);
+ }
+
+ if (compress_op && iaa_verify_compress) {
+ dma_addr_t src_addr, dst_addr;
+
+ req->compression_crc = idxd_desc->iax_completion->crc;
+
+ dma_sync_sg_for_device(dev, req->dst, 1, DMA_FROM_DEVICE);
+ dma_sync_sg_for_device(dev, req->src, 1, DMA_TO_DEVICE);
+
+ src_addr = sg_dma_address(req->src);
+ dst_addr = sg_dma_address(req->dst);
+
+ ret = iaa_compress_verify(ctx, req, wq, src_addr, req->slen,
+ dst_addr, req->dlen);
+ }
+
+out:
+ /* caller doesn't call crypto_wait_req, so no acomp_request_complete() */
+ dma_unmap_sg(dev, req->dst, 1, DMA_FROM_DEVICE);
+ dma_unmap_sg(dev, req->src, 1, DMA_TO_DEVICE);
+
+ idxd_free_desc(idxd_desc->wq, idxd_desc);
+ percpu_ref_put(&iaa_wq->ref);
+
+ return ret;
+}
+
+static __always_inline int iaa_comp_submit_acompress_batch(
+ struct iaa_compression_ctx *ctx,
+ struct iaa_req *parent_req,
+ struct iaa_req **reqs,
+ int nr_reqs,
+ unsigned int unit_size)
+{
+ struct sg_page_iter sgiter;
+ struct scatterlist *sg;
+ int i, err, ret = 0;
+
+ __sg_page_iter_start(&sgiter, parent_req->src, nr_reqs,
+ parent_req->src->offset/unit_size);
+
+ for (i = 0; i < nr_reqs; ++i, ++sgiter.sg_pgoffset) {
+ sg_set_page(reqs[i]->src, sg_page_iter_page(&sgiter), PAGE_SIZE, 0);
+ reqs[i]->slen = PAGE_SIZE;
+ }
+
+ /*
+ * Prepare and submit the batch of iaa_reqs to IAA. IAA will process
+ * these compress jobs in parallel.
+ */
+ for_each_sg(parent_req->dst, sg, nr_reqs, i) {
+ sg->length = PAGE_SIZE;
+ reqs[i]->dst = sg;
+ reqs[i]->dlen = PAGE_SIZE;
+
+ err = iaa_comp_acompress(ctx, reqs[i]);
+
+ if (likely(err == -EINPROGRESS)) {
+ reqs[i]->dst->length = -EAGAIN;
+ } else if (unlikely(err)) {
+ reqs[i]->dst->length = err;
+ ret = -EINVAL;
+ } else {
+ reqs[i]->dst->length = reqs[i]->dlen;
+ }
+ }
+
+ return ret;
+}
+
+static __always_inline int iaa_comp_submit_adecompress_batch(
+ struct iaa_compression_ctx *ctx,
+ struct iaa_req *parent_req,
+ struct iaa_req **reqs,
+ int nr_reqs)
+{
+ struct scatterlist *sg;
+ int i, err, ret = 0;
+
+ for_each_sg(parent_req->src, sg, nr_reqs, i) {
+ reqs[i]->src = sg;
+ reqs[i]->slen = sg->length;
+ }
+
+ for_each_sg(parent_req->dst, sg, nr_reqs, i) {
+ reqs[i]->dst = sg;
+ reqs[i]->dlen = PAGE_SIZE;
+ }
+
+ /*
+ * Prepare and submit the batch of iaa_reqs to IAA. IAA will process
+ * these decompress jobs in parallel.
+ */
+ for (i = 0; i < nr_reqs; ++i) {
+ err = iaa_comp_adecompress(ctx, reqs[i]);
+
+ /*
+ * In case of idxd desc allocation/submission errors, the
+ * software decompress fallback path is taken, which will set
+ * @err to 0 or an error value.
+ */
+ if (likely(err == -EINPROGRESS)) {
+ reqs[i]->dst->length = -EAGAIN;
+ } else if (unlikely(err)) {
+ reqs[i]->dst->length = err;
+ ret = -EINVAL;
+ } else {
+ reqs[i]->dst->length = reqs[i]->dlen;
+ }
+ }
+
+ return ret;
+}
+
+static int iaa_comp_batch_completed(struct iaa_compression_ctx *ctx,
+ struct iaa_req **reqs,
+ int nr_reqs)
+{
+ bool batch_completed = false;
+ int i, *err, ret = 0;
+
+ while (!batch_completed) {
+ batch_completed = true;
+
+ for (i = 0; i < nr_reqs; ++i) {
+ err = &reqs[i]->dst->length;
+
+ /*
+ * Skip, if the compression/decompression has already
+ * completed successfully or with an error.
+ */
+ if (*err != -EAGAIN)
+ continue;
+
+ *err = iaa_comp_poll(ctx, reqs[i]);
+
+ if (*err) {
+ if (likely(*err == -EAGAIN))
+ batch_completed = false;
+ else
+ ret = -EINVAL;
+ } else {
+ *err = reqs[i]->dlen;
+ }
+ }
+ }
+
+ return ret;
+}
+
+/**
+ * This API implements the core IAA compress batching functionality.
+ *
+ * @ctx: compression ctx for the requested IAA mode (fixed/dynamic).
+ * @parent_req: The "parent" iaa_req that contains SG lists for the batch's
+ * inputs and outputs.
+ * @unit_size: The unit size to apply to @parent_req->slen to get the number of
+ * scatterlists it contains.
+ *
+ * The caller should check the individual sg->lengths in the @parent_req for
+ * errors, including incompressible page errors.
+ *
+ * Returns 0 if all compress requests in the batch complete successfully,
+ * -EINVAL otherwise.
+ */
+static int __maybe_unused iaa_comp_acompress_batch(
+ struct iaa_compression_ctx *ctx,
+ struct iaa_req *parent_req,
+ unsigned int unit_size)
+{
+ struct iaa_batch_ctx *cpu_ctx = raw_cpu_ptr(iaa_batch_ctx);
+ int ret, nr_reqs = parent_req->slen / unit_size;
+ struct iaa_req **reqs;
+
+ mutex_lock(&cpu_ctx->mutex);
+
+ reqs = cpu_ctx->reqs;
+
+ ret = iaa_comp_submit_acompress_batch(ctx, parent_req, reqs, nr_reqs, unit_size);
+
+ ret |= iaa_comp_batch_completed(ctx, reqs, nr_reqs);
+
+ mutex_unlock(&cpu_ctx->mutex);
+
+ return ret;
+}
+
+/**
+ * This API implements the core IAA decompress batching functionality.
+ *
+ * @ctx: compression ctx for the requested IAA mode (fixed/dynamic).
+ * @parent_req: The "parent" iaa_req that contains SG lists for the batch's
+ * inputs and outputs.
+ * @unit_size: The unit size to apply to @parent_req->dlen to get the number of
+ * scatterlists it contains.
+ *
+ * The caller should check @parent_req->dst scatterlist's component SG lists'
+ * @length for errors and handle @length != PAGE_SIZE.
+ *
+ * Returns 0 if all decompress requests complete successfully,
+ * -EINVAL otherwise.
+ */
+static int __maybe_unused iaa_comp_adecompress_batch(
+ struct iaa_compression_ctx *ctx,
+ struct iaa_req *parent_req,
+ unsigned int unit_size)
+{
+ struct iaa_batch_ctx *cpu_ctx = raw_cpu_ptr(iaa_batch_ctx);
+ int ret, nr_reqs = parent_req->dlen / unit_size;
+ struct iaa_req **reqs;
+
+ mutex_lock(&cpu_ctx->mutex);
+
+ reqs = cpu_ctx->reqs;
+
+ ret = iaa_comp_submit_adecompress_batch(ctx, parent_req, reqs, nr_reqs);
+
+ ret |= iaa_comp_batch_completed(ctx, reqs, nr_reqs);
+
+ mutex_unlock(&cpu_ctx->mutex);
+
+ return ret;
+}
+
static void compression_ctx_init(struct iaa_compression_ctx *ctx, enum iaa_mode mode)
{
ctx->mode = mode;
@@ -2244,7 +2513,7 @@ static int iaa_crypto_acomp_adecompress_main(struct acomp_req *areq)
acomp_to_iaa(areq, &parent_req, ctx);
ret = iaa_comp_adecompress(ctx, &parent_req);
- iaa_to_acomp(unlikely(ret) ? ret : parent_req.dlen, areq);
+ iaa_to_acomp(parent_req.dlen, areq);
}
return ret;
@@ -2529,9 +2798,31 @@ static struct idxd_device_driver iaa_crypto_driver = {
* Module init/exit.
********************/
+static void iaa_batch_ctx_dealloc(void)
+{
+ int cpu;
+ u8 i;
+
+ if (!iaa_batch_ctx)
+ return;
+
+ for_each_possible_cpu(cpu) {
+ struct iaa_batch_ctx *cpu_ctx = per_cpu_ptr(iaa_batch_ctx, cpu);
+
+ if (cpu_ctx && cpu_ctx->reqs) {
+ for (i = 0; i < IAA_CRYPTO_MAX_BATCH_SIZE; ++i)
+ kfree(cpu_ctx->reqs[i]);
+ kfree(cpu_ctx->reqs);
+ }
+ }
+
+ free_percpu(iaa_batch_ctx);
+}
+
static int __init iaa_crypto_init_module(void)
{
- int ret = 0;
+ int cpu, ret = 0;
+ u8 i;
INIT_LIST_HEAD(&iaa_devices);
@@ -2593,6 +2884,41 @@ static int __init iaa_crypto_init_module(void)
goto err_sync_attr_create;
}
+ /* Allocate batching resources for iaa_crypto. */
+ iaa_batch_ctx = alloc_percpu_gfp(struct iaa_batch_ctx, GFP_KERNEL | __GFP_ZERO);
+ if (!iaa_batch_ctx) {
+ pr_debug("Failed to allocate per-cpu iaa_batch_ctx\n");
+ goto batch_ctx_fail;
+ }
+
+ for_each_possible_cpu(cpu) {
+ struct iaa_batch_ctx *cpu_ctx = per_cpu_ptr(iaa_batch_ctx, cpu);
+ int nid = cpu_to_node(cpu);
+
+ cpu_ctx->reqs = kcalloc_node(IAA_CRYPTO_MAX_BATCH_SIZE,
+ sizeof(struct iaa_req *),
+ GFP_KERNEL, nid);
+
+ if (!cpu_ctx->reqs)
+ goto reqs_fail;
+
+ for (i = 0; i < IAA_CRYPTO_MAX_BATCH_SIZE; ++i) {
+ cpu_ctx->reqs[i] = kzalloc_node(sizeof(struct iaa_req),
+ GFP_KERNEL, nid);
+ if (!cpu_ctx->reqs[i]) {
+ pr_debug("Could not alloc iaa_req reqs[%d]\n", i);
+ goto reqs_fail;
+ }
+
+ sg_init_table(&cpu_ctx->reqs[i]->sg_src, 1);
+ cpu_ctx->reqs[i]->src = &cpu_ctx->reqs[i]->sg_src;
+
+ cpu_ctx->reqs[i]->flags |= IAA_REQ_POLL_FLAG;
+ }
+
+ mutex_init(&cpu_ctx->mutex);
+ }
+
if (iaa_crypto_debugfs_init())
pr_warn("debugfs init failed, stats not available\n");
@@ -2600,6 +2926,11 @@ static int __init iaa_crypto_init_module(void)
out:
return ret;
+reqs_fail:
+ iaa_batch_ctx_dealloc();
+batch_ctx_fail:
+ driver_remove_file(&iaa_crypto_driver.drv,
+ &driver_attr_sync_mode);
err_sync_attr_create:
driver_remove_file(&iaa_crypto_driver.drv,
&driver_attr_verify_compress);
@@ -2631,6 +2962,7 @@ static void __exit iaa_crypto_cleanup_module(void)
iaa_unregister_acomp_compression_device();
iaa_unregister_compression_device();
+ iaa_batch_ctx_dealloc();
iaa_crypto_debugfs_cleanup();
driver_remove_file(&iaa_crypto_driver.drv,
&driver_attr_sync_mode);
--
2.27.0
* [PATCH v14 17/26] crypto: iaa - Submit the two largest source buffers first in batch decompress.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (15 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 16/26] crypto: iaa - IAA Batching for parallel compressions/decompressions Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 18/26] crypto: acomp, iaa - crypto_acomp integration of IAA Batching Kanchana P Sridhar
` (9 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This patch finds the two largest source buffers in a given decompression
batch, and submits them first to the IAA decompress engines.
This improves decompress batching latency because the hardware has a
head start on decompressing the highest-latency source buffers in the
batch. Workload performance is also significantly improved as a result
of this optimization.
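For example (illustrative values only), if a 4-buffer batch has slen
values {3000, 1000, 4000, 2000}, the computed submit order is
{2, 0, 1, 3}: the 4000-byte and 3000-byte buffers are submitted first,
and the remaining buffers follow in their original batch order.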
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
drivers/crypto/intel/iaa/iaa_crypto_main.c | 49 ++++++++++++++++++++--
1 file changed, 45 insertions(+), 4 deletions(-)
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index a447555f4eb9..8d83a1ea15d7 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -2315,12 +2315,46 @@ static __always_inline int iaa_comp_submit_acompress_batch(
return ret;
}
+/*
+ * Find the two largest source buffers in @reqs for a decompress batch,
+ * based on @reqs[i]->slen. Save their indices as the first two elements in
+ * @submit_order, and the rest of the indices from the batch order.
+ */
+static void get_decompress_batch_submit_order(
+ struct iaa_req *reqs[],
+ int nr_pages,
+ int submit_order[])
+{
+ int i, j = 0, max_i = 0, next_max_i = 0;
+
+ for (i = 0; i < nr_pages; ++i) {
+ if (reqs[i]->slen >= reqs[max_i]->slen) {
+ next_max_i = max_i;
+ max_i = i;
+ } else if ((next_max_i == max_i) ||
+ (reqs[i]->slen > reqs[next_max_i]->slen)) {
+ next_max_i = i;
+ }
+ }
+
+ submit_order[j++] = max_i;
+
+ if (next_max_i != max_i)
+ submit_order[j++] = next_max_i;
+
+ for (i = 0; i < nr_pages; ++i) {
+ if ((i != max_i) && (i != next_max_i))
+ submit_order[j++] = i;
+ }
+}
+
static __always_inline int iaa_comp_submit_adecompress_batch(
struct iaa_compression_ctx *ctx,
struct iaa_req *parent_req,
struct iaa_req **reqs,
int nr_reqs)
{
+ int submit_order[IAA_CRYPTO_MAX_BATCH_SIZE];
struct scatterlist *sg;
int i, err, ret = 0;
@@ -2334,12 +2368,19 @@ static __always_inline int iaa_comp_submit_adecompress_batch(
reqs[i]->dlen = PAGE_SIZE;
}
+ /*
+ * Construct the submit order by finding the indices of the two largest
+ * compressed data buffers in the batch, so that they are submitted
+ * first. This improves latency of the batch.
+ */
+ get_decompress_batch_submit_order(reqs, nr_reqs, submit_order);
+
/*
* Prepare and submit the batch of iaa_reqs to IAA. IAA will process
* these decompress jobs in parallel.
*/
for (i = 0; i < nr_reqs; ++i) {
- err = iaa_comp_adecompress(ctx, reqs[i]);
+ err = iaa_comp_adecompress(ctx, reqs[submit_order[i]]);
/*
* In case of idxd desc allocation/submission errors, the
@@ -2347,12 +2388,12 @@ static __always_inline int iaa_comp_submit_adecompress_batch(
* @err to 0 or an error value.
*/
if (likely(err == -EINPROGRESS)) {
- reqs[i]->dst->length = -EAGAIN;
+ reqs[submit_order[i]]->dst->length = -EAGAIN;
} else if (unlikely(err)) {
- reqs[i]->dst->length = err;
+ reqs[submit_order[i]]->dst->length = err;
ret = -EINVAL;
} else {
- reqs[i]->dst->length = reqs[i]->dlen;
+ reqs[submit_order[i]]->dst->length = reqs[submit_order[i]]->dlen;
}
}
--
2.27.0
* [PATCH v14 18/26] crypto: acomp, iaa - crypto_acomp integration of IAA Batching.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (16 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 17/26] crypto: iaa - Submit the two largest source buffers first in batch decompress Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-02-05 4:14 ` Herbert Xu
2026-01-25 3:35 ` [PATCH v14 19/26] crypto: iaa - Enable async mode and make it the default Kanchana P Sridhar
` (8 subsequent siblings)
26 siblings, 1 reply; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This commit makes the necessary changes for correctly integrating IAA
compress/decompress batching with the crypto_acomp API as per the
discussions in [1]. Further, IAA sets crypto_alg flags to indicate
support for segmentation.
To provide context from the perspective of a kernel user such as zswap:
zswap interfaces with this batching support by setting up the acomp_req
through the following crypto API calls, to designate multiple src/dst SG
lists representing the batch being sent to iaa_crypto:
acomp_request_set_src_folio()
acomp_request_set_dst_sg()
acomp_request_set_unit_size()
before proceeding to invoke batch compression using the existing
crypto_acomp_compress() interface.
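A rough zswap-side sketch of that sequence (illustrative only; the
folio, the dst_sgl scatterlist chain and nr_pages are assumed to be
set up by the caller) would be:
    /* Describe a batch of nr_pages folio pages and dst buffers. */
    acomp_request_set_src_folio(req, folio, 0, nr_pages * PAGE_SIZE);
    acomp_request_set_dst_sg(req, dst_sgl, nr_pages * PAGE_SIZE);
    acomp_request_set_unit_size(req, PAGE_SIZE);

    err = crypto_acomp_compress(req);
    /* On completion, each dst SG list's ->length holds that page's
     * compressed length or an error value. */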
Within crypto_acomp_compress(), an acomp_req whose tfm supports
segmentation is further tested for an "slen" that is greater than the
request's unit_size. If so, we invoke "acomp_do_req_batch_parallel()",
similar to the "acomp_do_req_chain()" case.
acomp_do_req_batch_parallel() creates a wait_queue_head
"batch_parallel_wq", stores it in the acomp_req's "__ctx", then calls
tfm->compress()/tfm->decompress().
Next, the iaa_crypto driver alg's compress() implementation submits the
batch's requests and immediately returns to
acomp_do_req_batch_parallel(), which then waits on the
"batch_parallel_wq" until tfm->batch_completed() signals completion.
To support this, a "batch_completed()" API is added to
"struct crypto_acomp" and "struct acomp_alg".
The iaa_crypto driver alg's batch_completed() implementation waits for
each batch sub-request to complete and notifies the batch_parallel_wq.
If any sub-request has an error, -EINVAL is returned to the acomp_req's
callback, else 0.
[1]: https://lore.kernel.org/all/aRqSqQxR4eHzvb2g@gondor.apana.org.au/
Suggested-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
crypto/acompress.c | 63 ++++++++++
drivers/crypto/intel/iaa/iaa_crypto.h | 3 +
drivers/crypto/intel/iaa/iaa_crypto_main.c | 137 +++++++++++++++++++--
include/crypto/acompress.h | 7 ++
include/crypto/internal/acompress.h | 7 ++
5 files changed, 210 insertions(+), 7 deletions(-)
diff --git a/crypto/acompress.c b/crypto/acompress.c
index cfb8ede02cf4..c48a1a20e21f 100644
--- a/crypto/acompress.c
+++ b/crypto/acompress.c
@@ -105,6 +105,7 @@ static int crypto_acomp_init_tfm(struct crypto_tfm *tfm)
acomp->compress = alg->compress;
acomp->decompress = alg->decompress;
+ acomp->batch_completed = alg->batch_completed;
acomp->reqsize = alg->base.cra_reqsize;
acomp->base.exit = crypto_acomp_exit_tfm;
@@ -291,6 +292,65 @@ static __always_inline int acomp_do_req_chain(struct acomp_req *req, bool comp)
return acomp_reqchain_finish(req, err);
}
+static int acomp_do_req_batch_parallel(struct acomp_req *req, bool comp)
+{
+ struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
+ unsigned long *bpwq_addr = acomp_request_ctx(req);
+ wait_queue_head_t batch_parallel_wq;
+ int ret;
+
+ init_waitqueue_head(&batch_parallel_wq);
+ *bpwq_addr = (unsigned long)&batch_parallel_wq;
+
+ ret = comp ? tfm->compress(req) : tfm->decompress(req);
+
+ wait_event(batch_parallel_wq, tfm->batch_completed(req, comp));
+
+ if (req->slen < 0)
+ ret |= -EINVAL;
+
+ return ret;
+}
+
+/**
+ * Please note:
+ * ============
+ *
+ * 1) If @req->unit_size is 0, there is no impact to existing acomp users.
+ *
+ * 2) If @req->unit_size is non-0 (for e.g. zswap compress batching) and
+ * @req->src and @req->dst are scatterlists:
+ *
+ * a) Algorithms that do not support segmentation:
+ *
+ * We call acomp_do_req_chain() that handles the trivial case when
+ * the caller has passed exactly one segment. The dst SG list's length is
+ * set to the compression error/compressed length for that segment.
+ *
+ * b) Algorithms that support segmentation:
+ *
+ * If the source length is more than @req->unit_size,
+ * acomp_do_req_batch_parallel() is invoked: this calls the tfm's
+ * compress() API, which uses the @req->slen being greater than
+ * @req->unit_size to ascertain that it needs to do batching. The algorithm's
+ * compress() implementation submits the batch's sub-requests for
+ * compression and returns.
+ *
+ * Algorithms that support batching must provide a batch_completed() API.
+ * When the batch's compression sub-requests have completed, they must
+ * notify a wait_queue using the batch_completed() API. The batching tfm
+ * implementation must set the dst SG lists to contain the individual
+ * sub-requests' error/compressed lengths.
+ *
+ * If the source length == @req->unit_size, the tfm's compress() API is
+ * invoked. The assumption is that segmentation algorithms will internally
+ * set the dst SG list's length to indicate error/compressed length in
+ * this case, similar to the batching case.
+ *
+ * 3) To prevent functional/performance regressions, we preserve existing
+ * behavior in all other cases, such as, when @req->unit_size is non-0 and
+ * @req->src and/or @req->dst is virtual; instead of returning an error.
+ */
int crypto_acomp_compress(struct acomp_req *req)
{
struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
@@ -302,6 +362,9 @@ int crypto_acomp_compress(struct acomp_req *req)
if (!crypto_acomp_req_seg(tfm))
return acomp_do_req_chain(req, true);
+ if (likely((req->slen > req->unit_size) && tfm->batch_completed))
+ return acomp_do_req_batch_parallel(req, true);
+
return tfm->compress(req);
}
diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/iaa/iaa_crypto.h
index db83c21e92f1..d85a8f1cbb93 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto.h
+++ b/drivers/crypto/intel/iaa/iaa_crypto.h
@@ -69,10 +69,13 @@
* IAA. In other words, don't make any assumptions, and protect
* compression/decompression data.
*
+ * @data: Driver internal data to interface with crypto_acomp.
+ *
*/
struct iaa_batch_ctx {
struct iaa_req **reqs;
struct mutex mutex;
+ void *data;
};
#define IAA_COMP_MODES_MAX IAA_MODE_NONE
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index 8d83a1ea15d7..915bf9b17b39 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -2524,6 +2524,71 @@ static void compression_ctx_init(struct iaa_compression_ctx *ctx, enum iaa_mode
* Interfaces to crypto_alg and crypto_acomp.
*********************************************/
+static __always_inline int iaa_crypto_acomp_acompress_batch(
+ struct iaa_compression_ctx *ctx,
+ struct iaa_req *parent_req,
+ struct iaa_req **reqs,
+ unsigned int unit_size)
+{
+ int nr_reqs = parent_req->slen / unit_size;
+
+ return iaa_comp_submit_acompress_batch(ctx, parent_req, reqs, nr_reqs, unit_size);
+}
+
+static __always_inline int iaa_crypto_acomp_adecompress_batch(
+ struct iaa_compression_ctx *ctx,
+ struct iaa_req *parent_req,
+ struct iaa_req **reqs,
+ unsigned int unit_size)
+{
+ int nr_reqs = parent_req->dlen / unit_size;
+
+ return iaa_comp_submit_adecompress_batch(ctx, parent_req, reqs, nr_reqs);
+}
+
+static bool iaa_crypto_acomp_batch_completed(struct acomp_req *areq, bool comp)
+{
+ unsigned long *cpu_ctx_addr = acomp_request_ctx(areq);
+ struct iaa_batch_ctx *cpu_ctx = (struct iaa_batch_ctx *)*cpu_ctx_addr;
+ wait_queue_head_t *batch_parallel_wq = (wait_queue_head_t *)cpu_ctx->data;
+ struct iaa_req **reqs = cpu_ctx->reqs;
+ int nr_reqs = (comp ? areq->slen : areq->dlen) / areq->unit_size;
+
+ /*
+ * Since both, compress and decompress require the eventual
+ * caller (zswap) to verify @areq->dlen, we use @areq->slen to
+ * flag the batch's success/error to crypto_acomp, which will
+ * return this as the @err status to the crypto_acomp callback
+ * function.
+ */
+ if (iaa_comp_batch_completed(NULL, reqs, nr_reqs))
+ areq->slen = -EINVAL;
+
+ /*
+ * Set the acomp_req's dlen to be the first SG list's
+ * compressed/decompressed length/error value to enable zswap code
+ * equivalence for non-batching and batching acomp_algs.
+ */
+ areq->dlen = areq->dst->length;
+
+ /* All sub-requests have finished. Notify the @batch_parallel_wq. */
+ if (waitqueue_active(batch_parallel_wq))
+ wake_up(batch_parallel_wq);
+
+ mutex_unlock(&cpu_ctx->mutex);
+
+ return true;
+}
+
+/*
+ * Main compression API for kernel users of crypto_acomp, such as zswap.
+ *
+ * crypto_acomp_compress() calls into this procedure for:
+ * - Sequential compression of a single page,
+ * - Parallel batch compression of multiple pages.
+ *
+ * @areq: asynchronous compress request
+ */
static int iaa_crypto_acomp_acompress_main(struct acomp_req *areq)
{
struct crypto_tfm *tfm = areq->base.tfm;
@@ -2534,14 +2599,47 @@ static int iaa_crypto_acomp_acompress_main(struct acomp_req *areq)
if (iaa_alg_is_registered(crypto_tfm_alg_driver_name(tfm), &idx)) {
ctx = iaa_ctx[idx];
- acomp_to_iaa(areq, &parent_req, ctx);
- ret = iaa_comp_acompress(ctx, &parent_req);
- iaa_to_acomp(unlikely(ret) ? ret : parent_req.dlen, areq);
+ if (likely(areq->slen == areq->unit_size) || !areq->unit_size) {
+ acomp_to_iaa(areq, &parent_req, ctx);
+ ret = iaa_comp_acompress(ctx, &parent_req);
+ iaa_to_acomp(unlikely(ret) ? ret : parent_req.dlen, areq);
+ } else {
+ struct iaa_batch_ctx *cpu_ctx = raw_cpu_ptr(iaa_batch_ctx);
+ struct iaa_req **reqs;
+ unsigned long *cpu_ctx_addr, *bpwq_addr;
+
+ acomp_to_iaa(areq, &parent_req, ctx);
+
+ mutex_lock(&cpu_ctx->mutex);
+
+ bpwq_addr = acomp_request_ctx(areq);
+ /* Save the wait_queue_head. */
+ cpu_ctx->data = (wait_queue_head_t *)*bpwq_addr;
+
+ reqs = cpu_ctx->reqs;
+
+ ret = iaa_crypto_acomp_acompress_batch(ctx,
+ &parent_req,
+ reqs,
+ areq->unit_size);
+
+ cpu_ctx_addr = acomp_request_ctx(areq);
+ *cpu_ctx_addr = (unsigned long)cpu_ctx;
+ }
}
return ret;
}
+/*
+ * Main decompression API for kernel users of crypto_acomp, such as zswap.
+ *
+ * crypto_acomp_decompress() calls into this procedure for:
+ * - Sequential decompression of a single buffer,
+ * - Parallel batch decompression of multiple buffers.
+ *
+ * @areq: asynchronous decompress request
+ */
static int iaa_crypto_acomp_adecompress_main(struct acomp_req *areq)
{
struct crypto_tfm *tfm = areq->base.tfm;
@@ -2552,9 +2650,33 @@ static int iaa_crypto_acomp_adecompress_main(struct acomp_req *areq)
if (iaa_alg_is_registered(crypto_tfm_alg_driver_name(tfm), &idx)) {
ctx = iaa_ctx[idx];
- acomp_to_iaa(areq, &parent_req, ctx);
- ret = iaa_comp_adecompress(ctx, &parent_req);
- iaa_to_acomp(parent_req.dlen, areq);
+ if (likely(areq->dlen == areq->unit_size) || !areq->unit_size) {
+ acomp_to_iaa(areq, &parent_req, ctx);
+ ret = iaa_comp_adecompress(ctx, &parent_req);
+ iaa_to_acomp(parent_req.dlen, areq);
+ } else {
+ struct iaa_batch_ctx *cpu_ctx = raw_cpu_ptr(iaa_batch_ctx);
+ struct iaa_req **reqs;
+ unsigned long *cpu_ctx_addr, *bpwq_addr;
+
+ acomp_to_iaa(areq, &parent_req, ctx);
+
+ mutex_lock(&cpu_ctx->mutex);
+
+ bpwq_addr = acomp_request_ctx(areq);
+ /* Save the wait_queue_head. */
+ cpu_ctx->data = (wait_queue_head_t *)*bpwq_addr;
+
+ reqs = cpu_ctx->reqs;
+
+ ret = iaa_crypto_acomp_adecompress_batch(ctx,
+ &parent_req,
+ reqs,
+ areq->unit_size);
+
+ cpu_ctx_addr = acomp_request_ctx(areq);
+ *cpu_ctx_addr = (unsigned long)cpu_ctx;
+ }
}
return ret;
@@ -2574,10 +2696,11 @@ static struct acomp_alg iaa_acomp_fixed_deflate = {
.init = iaa_crypto_acomp_init_fixed,
.compress = iaa_crypto_acomp_acompress_main,
.decompress = iaa_crypto_acomp_adecompress_main,
+ .batch_completed = iaa_crypto_acomp_batch_completed,
.base = {
.cra_name = "deflate",
.cra_driver_name = "deflate-iaa",
- .cra_flags = CRYPTO_ALG_ASYNC,
+ .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_REQ_SEG,
.cra_ctxsize = sizeof(struct iaa_compression_ctx),
.cra_reqsize = sizeof(u32),
.cra_module = THIS_MODULE,
diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h
index 86e4932cd112..752110a7719c 100644
--- a/include/crypto/acompress.h
+++ b/include/crypto/acompress.h
@@ -109,6 +109,12 @@ struct acomp_req {
*
* @compress: Function performs a compress operation
* @decompress: Function performs a de-compress operation
+ * @batch_completed: Waits for batch completion of parallel
+ * compress/decompress requests submitted via
+ * @compress/@decompress. Returns bool status
+ * of all batch sub-requests having completed.
+ * Returns an error code in @req->slen if any
+ * of the sub-requests completed with an error.
* @reqsize: Context size for (de)compression requests
* @fb: Synchronous fallback tfm
* @base: Common crypto API algorithm data structure
@@ -116,6 +122,7 @@ struct acomp_req {
struct crypto_acomp {
int (*compress)(struct acomp_req *req);
int (*decompress)(struct acomp_req *req);
+ bool (*batch_completed)(struct acomp_req *req, bool comp);
unsigned int reqsize;
struct crypto_tfm base;
};
diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h
index 366dbdb987e8..7c4e14491d59 100644
--- a/include/crypto/internal/acompress.h
+++ b/include/crypto/internal/acompress.h
@@ -28,6 +28,12 @@
*
* @compress: Function performs a compress operation
* @decompress: Function performs a de-compress operation
+ * @batch_completed: Waits for batch completion of parallel
+ * compress/decompress requests submitted via
+ * @compress/@decompress. Returns bool status
+ * of all batch sub-requests having completed.
+ * Returns an error code in @req->slen if any
+ * of the sub-requests completed with an error.
* @init: Initialize the cryptographic transformation object.
* This function is used to initialize the cryptographic
* transformation object. This function is called only once at
@@ -46,6 +52,7 @@
struct acomp_alg {
int (*compress)(struct acomp_req *req);
int (*decompress)(struct acomp_req *req);
+ bool (*batch_completed)(struct acomp_req *req, bool comp);
int (*init)(struct crypto_acomp *tfm);
void (*exit)(struct crypto_acomp *tfm);
--
2.27.0
* Re: [PATCH v14 18/26] crypto: acomp, iaa - crypto_acomp integration of IAA Batching.
2026-01-25 3:35 ` [PATCH v14 18/26] crypto: acomp, iaa - crypto_acomp integration of IAA Batching Kanchana P Sridhar
@ 2026-02-05 4:14 ` Herbert Xu
0 siblings, 0 replies; 48+ messages in thread
From: Herbert Xu @ 2026-02-05 4:14 UTC (permalink / raw)
To: Kanchana P Sridhar
Cc: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, davem, clabbe, ardb,
ebiggers, surenb, kristen.c.accardi, vinicius.gomes,
giovanni.cabiddu, wajdi.k.feghali
On Sat, Jan 24, 2026 at 07:35:29PM -0800, Kanchana P Sridhar wrote:
>
> @@ -291,6 +292,65 @@ static __always_inline int acomp_do_req_chain(struct acomp_req *req, bool comp)
> return acomp_reqchain_finish(req, err);
> }
>
> +static int acomp_do_req_batch_parallel(struct acomp_req *req, bool comp)
> +{
> + struct crypto_acomp *tfm = crypto_acomp_reqtfm(req);
> + unsigned long *bpwq_addr = acomp_request_ctx(req);
> + wait_queue_head_t batch_parallel_wq;
> + int ret;
> +
> + init_waitqueue_head(&batch_parallel_wq);
> + *bpwq_addr = (unsigned long)&batch_parallel_wq;
> +
> + ret = comp ? tfm->compress(req) : tfm->decompress(req);
> +
> + wait_event(batch_parallel_wq, tfm->batch_completed(req, comp));
> +
> + if (req->slen < 0)
> + ret |= -EINVAL;
> +
> + return ret;
> +}
I don't think we should have this in acomp. Just return EINPROGRESS
and let the user check each unit for the success/error.
Thanks,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt
* [PATCH v14 19/26] crypto: iaa - Enable async mode and make it the default.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (17 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 18/26] crypto: acomp, iaa - crypto_acomp integration of IAA Batching Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 20/26] crypto: iaa - Disable iaa_verify_compress by default Kanchana P Sridhar
` (7 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This patch enables the 'async' sync_mode in the driver and makes it the
default. This allows IAA hardware acceleration in the iaa_crypto driver
to be loaded by default in the most efficient/recommended 'async' mode
for parallel compressions/decompressions, namely asynchronous submission
of descriptors followed by polling for job completions. Previously,
'sync' mode was the default.
The iaa_crypto driver documentation has been updated with these
changes.
This way, anyone who wants to use IAA as a zswap compressor, can do so
right after building the kernel. Specifically, they *do not* have to go
through these steps to use async mode:
1) disable all the IAA device/wq bindings that happen at boot time
2) rmmod iaa_crypto
3) modprobe iaa_crypto
4) echo async > /sys/bus/dsa/drivers/crypto/sync_mode
5) re-run initialization of the IAA devices and wqs
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
Documentation/driver-api/crypto/iaa/iaa-crypto.rst | 11 ++---------
drivers/crypto/intel/iaa/iaa_crypto_main.c | 4 ++--
2 files changed, 4 insertions(+), 11 deletions(-)
diff --git a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
index 0ff4ec603b43..d5e610ef4612 100644
--- a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
+++ b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
@@ -272,7 +272,7 @@ The available attributes are:
echo async_irq > /sys/bus/dsa/drivers/crypto/sync_mode
Async mode without interrupts (caller must poll) can be enabled by
- writing 'async' to it (please see Caveat)::
+ writing 'async' to it::
echo async > /sys/bus/dsa/drivers/crypto/sync_mode
@@ -281,14 +281,7 @@ The available attributes are:
echo sync > /sys/bus/dsa/drivers/crypto/sync_mode
- The default mode is 'sync'.
-
- Caveat: since the only mechanism that iaa_crypto currently implements
- for async polling without interrupts is via the 'sync' mode as
- described earlier, writing 'async' to
- '/sys/bus/dsa/drivers/crypto/sync_mode' will internally enable the
- 'sync' mode. This is to ensure correct iaa_crypto behavior until true
- async polling without interrupts is enabled in iaa_crypto.
+ The default mode is 'async'.
- g_comp_wqs_per_iaa
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index 915bf9b17b39..f6e18f458fbf 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -153,7 +153,7 @@ static bool iaa_verify_compress = true;
*/
/* Use async mode */
-static bool async_mode;
+static bool async_mode = true;
/* Use interrupts */
static bool use_irq;
@@ -207,7 +207,7 @@ static int set_iaa_sync_mode(const char *name)
async_mode = false;
use_irq = false;
} else if (sysfs_streq(name, "async")) {
- async_mode = false;
+ async_mode = true;
use_irq = false;
} else if (sysfs_streq(name, "async_irq")) {
async_mode = true;
--
2.27.0
* [PATCH v14 20/26] crypto: iaa - Disable iaa_verify_compress by default.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (18 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 19/26] crypto: iaa - Enable async mode and make it the default Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 21/26] crypto: iaa - Add deflate-iaa-dynamic compression mode Kanchana P Sridhar
` (6 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This patch loads the iaa_crypto driver with "iaa_verify_compress"
disabled by default, to facilitate performance comparisons with software
compressors (which do not run compress verification). Previously,
iaa_crypto compress verification was enabled by default.
The iaa_crypto driver documentation has been updated with this change.
With this patch, if users want to enable compress verification, they can do
so with these steps:
1) disable all the IAA device/wq bindings that happen at boot time
2) rmmod iaa_crypto
3) modprobe iaa_crypto
4) echo 1 > /sys/bus/dsa/drivers/crypto/verify_compress
5) re-run initialization of the IAA devices and wqs
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
Documentation/driver-api/crypto/iaa/iaa-crypto.rst | 2 +-
drivers/crypto/intel/iaa/iaa_crypto_main.c | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
index d5e610ef4612..81a7dbd15f8b 100644
--- a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
+++ b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
@@ -239,7 +239,7 @@ The available attributes are:
echo 0 > /sys/bus/dsa/drivers/crypto/verify_compress
- The default setting is '1' - verify all compresses.
+ The default setting is '0' - to not verify compresses.
- sync_mode
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index f6e18f458fbf..7bc4a80bd68b 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -121,7 +121,7 @@ static bool iaa_distribute_decomps;
static bool iaa_distribute_comps = true;
/* Verify results of IAA compress or not */
-static bool iaa_verify_compress = true;
+static bool iaa_verify_compress;
/*
* The iaa crypto driver supports three 'sync' methods determining how
--
2.27.0
* [PATCH v14 21/26] crypto: iaa - Add deflate-iaa-dynamic compression mode.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (19 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 20/26] crypto: iaa - Disable iaa_verify_compress by default Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 22/26] crypto: acomp - Add crypto_acomp_batch_size() to get an algorithm's batch-size Kanchana P Sridhar
` (5 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
Some versions of Intel IAA, such as Granite Rapids, support dynamic
compression, where the hardware dynamically computes the Huffman tables
and generates a Deflate header if the input size is no larger than
4KB. This patch will use IAA for dynamic compression if an appropriate
IAA is present and the input size is not greater than 4KB. If an IAA is
not present, the algorithm will not be available. Otherwise, if the size
of the input is greater than PAGE_SIZE, zlib is used to do the
compression. If the algorithm is selected, IAA will be used for
decompression. If the compressed stream contains a reference whose
distance is greater than 4KB, hardware decompression will fail, and the
decompression will be done with zlib.
Intel IAA dynamic compression results in a compression ratio that is
better than or equal to the currently supported "fixed" compression mode
on the same data set. Compressing a data set of 4300 4KB pages sampled
from SPEC CPU17 workloads produces a compression ratio of 3.14 for IAA
dynamic compression and 2.69 for IAA fixed compression.
If an appropriate IAA exists, dynamic mode can be chosen as the IAA
compression mode by selecting the corresponding algorithm.
For example, to use IAA dynamic mode in zswap:
echo deflate-iaa-dynamic > /sys/module/zswap/parameters/compressor
This patch also adds a deflate_generic_compress() fallback when dynamic
mode is selected and the input size is over 4KB; along with stats
support that will count these software fallback calls as
"total_sw_comp_calls" in the driver's global_stats.
Furthermore, we define IAA_DYN_ALLOC_DESC_COMP_TIMEOUT as 2000 for
dynamic mode compression on Granite Rapids.
The acomp_alg flags for deflate-iaa-dynamic indicate support for
segmentation.
Signed-off-by: Andre Glover <andre.glover@linux.intel.com>
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
.../driver-api/crypto/iaa/iaa-crypto.rst | 21 ++++
crypto/testmgr.c | 10 ++
crypto/testmgr.h | 74 +++++++++++++
drivers/crypto/intel/iaa/Makefile | 2 +-
drivers/crypto/intel/iaa/iaa_crypto.h | 8 +-
.../intel/iaa/iaa_crypto_comp_dynamic.c | 22 ++++
drivers/crypto/intel/iaa/iaa_crypto_main.c | 104 ++++++++++++++++--
drivers/crypto/intel/iaa/iaa_crypto_stats.c | 8 ++
drivers/crypto/intel/iaa/iaa_crypto_stats.h | 2 +
9 files changed, 241 insertions(+), 10 deletions(-)
create mode 100644 drivers/crypto/intel/iaa/iaa_crypto_comp_dynamic.c
diff --git a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
index 81a7dbd15f8b..e841a33564db 100644
--- a/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
+++ b/Documentation/driver-api/crypto/iaa/iaa-crypto.rst
@@ -33,6 +33,8 @@ compresses and decompresses.
Currently, there is only one compression modes available, 'fixed'
mode.
+'dynamic' mode is available on certain generations of IAA hardware.
+
The 'fixed' compression mode implements the compression scheme
specified by RFC 1951 and is given the crypto algorithm name
'deflate-iaa'. (Because the IAA hardware has a 4k history-window
@@ -43,6 +45,25 @@ the IAA fixed mode deflate algorithm is given its own algorithm name
rather than simply 'deflate').
+The 'dynamic' compression mode implements a compression scheme where
+the IAA hardware will internally do one pass through the data, compute the
+Huffman tables and generate a Deflate header, then automatically do a
+second pass through the data, generating the final compressed output. IAA
+dynamic compression can be used if an appropriate IAA is present and the
+input size is not too big. If an appropriate IAA is not present, the
+algorithm will not be available. Otherwise, if the size of the input is too
+big, zlib is used to do the compression. If the algorithm is selected,
+IAA will be used for decompression. If the compressed stream contains a
+reference whose distance is greater than 4KB, hardware decompression will
+fail, and the decompression will be done with zlib. When an appropriate IAA
+exists, 'dynamic' compression is available; it is implemented by the
+'deflate-iaa-dynamic' crypto algorithm.
+
+A zswap device can use the IAA 'dynamic' mode by selecting the
+'deflate-iaa-dynamic' crypto compression algorithm::
+
+ # echo deflate-iaa-dynamic > /sys/module/zswap/parameters/compressor
+
Config options and other setup
==============================
diff --git a/crypto/testmgr.c b/crypto/testmgr.c
index a302be53896d..de91d23e6d40 100644
--- a/crypto/testmgr.c
+++ b/crypto/testmgr.c
@@ -4571,6 +4571,16 @@ static const struct alg_test_desc alg_test_descs[] = {
.decomp = __VECS(deflate_decomp_tv_template)
}
}
+ }, {
+ .alg = "deflate-iaa-dynamic",
+ .test = alg_test_comp,
+ .fips_allowed = 1,
+ .suite = {
+ .comp = {
+ .comp = __VECS(deflate_iaa_dynamic_comp_tv_template),
+ .decomp = __VECS(deflate_iaa_dynamic_decomp_tv_template)
+ }
+ }
}, {
.alg = "dh",
.test = alg_test_kpp,
diff --git a/crypto/testmgr.h b/crypto/testmgr.h
index 80bf5f1b67a6..819503131cdd 100644
--- a/crypto/testmgr.h
+++ b/crypto/testmgr.h
@@ -33364,6 +33364,80 @@ static const struct comp_testvec deflate_decomp_tv_template[] = {
},
};
+static const struct comp_testvec deflate_iaa_dynamic_comp_tv_template[] = {
+ {
+ .inlen = 70,
+ .outlen = 46,
+ .input = "Join us now and share the software "
+ "Join us now and share the software ",
+ .output = "\x85\xca\xc1\x09\x00\x20\x08\x05"
+ "\xd0\x55\xfe\x3c\x6e\x21\x64\xd8"
+ "\x45\x21\x0d\xd7\xb7\x26\xe8\xf8"
+ "\xe0\x91\x2f\xc3\x09\x98\x17\xd8"
+ "\x06\x42\x79\x0b\x52\x05\xe1\x33"
+ "\xeb\x81\x3e\xe5\xa2\x01",
+ }, {
+ .inlen = 191,
+ .outlen = 121,
+ .input = "This document describes a compression method based on the DEFLATE "
+ "compression algorithm. This document defines the application of "
+ "the DEFLATE algorithm to the IP Payload Compression Protocol.",
+ .output = "\x5d\x8d\xc1\x0d\xc2\x30\x10\x04"
+ "\x5b\xd9\x0a\xd2\x03\x82\x20\x21"
+ "\xf1\xf0\x23\x0d\x5c\xec\x0b\xb6"
+ "\x64\xfb\x2c\xdf\xf1\xa0\x7b\x12"
+ "\x3e\x58\x79\xae\x76\x67\x76\x89"
+ "\x49\x11\xc4\xbf\x0b\x57\x43\x60"
+ "\xf5\x3d\xad\xac\x20\x78\x29\xad"
+ "\xb3\x6a\x92\x8a\xc2\x16\x25\x60"
+ "\x25\xe5\x80\x3d\x5b\x64\xdc\xe6"
+ "\xfb\xf3\xb2\xcc\xe3\x8c\xf2\x4b"
+ "\x7a\xb2\x58\x26\xe0\x2c\xde\x52"
+ "\xdd\xb5\x07\x48\xad\xe5\xe4\xc9"
+ "\x0e\x42\xb6\xd1\xf5\x17\xc0\xe4"
+ "\x57\x3c\x1c\x1c\x7d\xb2\x50\xc0"
+ "\x75\x38\x72\x5d\x4c\xbc\xe4\xe9"
+ "\x0b",
+ },
+};
+
+static const struct comp_testvec deflate_iaa_dynamic_decomp_tv_template[] = {
+ {
+ .inlen = 121,
+ .outlen = 191,
+ .input = "\x5d\x8d\xc1\x0d\xc2\x30\x10\x04"
+ "\x5b\xd9\x0a\xd2\x03\x82\x20\x21"
+ "\xf1\xf0\x23\x0d\x5c\xec\x0b\xb6"
+ "\x64\xfb\x2c\xdf\xf1\xa0\x7b\x12"
+ "\x3e\x58\x79\xae\x76\x67\x76\x89"
+ "\x49\x11\xc4\xbf\x0b\x57\x43\x60"
+ "\xf5\x3d\xad\xac\x20\x78\x29\xad"
+ "\xb3\x6a\x92\x8a\xc2\x16\x25\x60"
+ "\x25\xe5\x80\x3d\x5b\x64\xdc\xe6"
+ "\xfb\xf3\xb2\xcc\xe3\x8c\xf2\x4b"
+ "\x7a\xb2\x58\x26\xe0\x2c\xde\x52"
+ "\xdd\xb5\x07\x48\xad\xe5\xe4\xc9"
+ "\x0e\x42\xb6\xd1\xf5\x17\xc0\xe4"
+ "\x57\x3c\x1c\x1c\x7d\xb2\x50\xc0"
+ "\x75\x38\x72\x5d\x4c\xbc\xe4\xe9"
+ "\x0b",
+ .output = "This document describes a compression method based on the DEFLATE "
+ "compression algorithm. This document defines the application of "
+ "the DEFLATE algorithm to the IP Payload Compression Protocol.",
+ }, {
+ .inlen = 46,
+ .outlen = 70,
+ .input = "\x85\xca\xc1\x09\x00\x20\x08\x05"
+ "\xd0\x55\xfe\x3c\x6e\x21\x64\xd8"
+ "\x45\x21\x0d\xd7\xb7\x26\xe8\xf8"
+ "\xe0\x91\x2f\xc3\x09\x98\x17\xd8"
+ "\x06\x42\x79\x0b\x52\x05\xe1\x33"
+ "\xeb\x81\x3e\xe5\xa2\x01",
+ .output = "Join us now and share the software "
+ "Join us now and share the software ",
+ },
+};
+
/*
* LZO test vectors (null-terminated strings).
*/
diff --git a/drivers/crypto/intel/iaa/Makefile b/drivers/crypto/intel/iaa/Makefile
index ebfa1a425f80..96f22cd39924 100644
--- a/drivers/crypto/intel/iaa/Makefile
+++ b/drivers/crypto/intel/iaa/Makefile
@@ -7,6 +7,6 @@ ccflags-y += -I $(srctree)/drivers/dma/idxd -DDEFAULT_SYMBOL_NAMESPACE='"CRYPTO_
obj-$(CONFIG_CRYPTO_DEV_IAA_CRYPTO) := iaa_crypto.o
-iaa_crypto-y := iaa_crypto_main.o iaa_crypto_comp_fixed.o
+iaa_crypto-y := iaa_crypto_main.o iaa_crypto_comp_fixed.o iaa_crypto_comp_dynamic.o
iaa_crypto-$(CONFIG_CRYPTO_DEV_IAA_CRYPTO_STATS) += iaa_crypto_stats.o
diff --git a/drivers/crypto/intel/iaa/iaa_crypto.h b/drivers/crypto/intel/iaa/iaa_crypto.h
index d85a8f1cbb93..e523e4476282 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto.h
+++ b/drivers/crypto/intel/iaa/iaa_crypto.h
@@ -19,12 +19,15 @@
#define IAA_COMP_FLUSH_OUTPUT BIT(1)
#define IAA_COMP_APPEND_EOB BIT(2)
+#define IAA_COMP_GEN_HDR_1_PASS (BIT(12) | BIT(13))
#define IAA_COMPLETION_TIMEOUT 1000000
#define IAA_ALLOC_DESC_COMP_TIMEOUT 1000
#define IAA_ALLOC_DESC_DECOMP_TIMEOUT 500
+#define IAA_DYN_ALLOC_DESC_COMP_TIMEOUT 2000
+
#define IAA_ANALYTICS_ERROR 0x0a
#define IAA_ERROR_DECOMP_BUF_OVERFLOW 0x0b
#define IAA_ERROR_COMP_BUF_OVERFLOW 0x19
@@ -82,7 +85,8 @@ struct iaa_batch_ctx {
enum iaa_mode {
IAA_MODE_FIXED = 0,
- IAA_MODE_NONE = 1,
+ IAA_MODE_DYNAMIC = 1,
+ IAA_MODE_NONE = 2,
};
struct iaa_req {
@@ -168,6 +172,8 @@ struct aecs_comp_table_record {
int iaa_aecs_init_fixed(void);
void iaa_aecs_cleanup_fixed(void);
+int iaa_aecs_init_dynamic(void);
+void iaa_aecs_cleanup_dynamic(void);
typedef int (*iaa_dev_comp_init_fn_t) (struct iaa_device_compression_mode *mode);
typedef int (*iaa_dev_comp_free_fn_t) (struct iaa_device_compression_mode *mode);
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_comp_dynamic.c b/drivers/crypto/intel/iaa/iaa_crypto_comp_dynamic.c
new file mode 100644
index 000000000000..3a93d7913443
--- /dev/null
+++ b/drivers/crypto/intel/iaa/iaa_crypto_comp_dynamic.c
@@ -0,0 +1,22 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Copyright(c) 2023 Intel Corporation. All rights rsvd. */
+
+#include "idxd.h"
+#include "iaa_crypto.h"
+
+int iaa_aecs_init_dynamic(void)
+{
+ int ret;
+
+ ret = add_iaa_compression_mode("dynamic", NULL, 0, NULL, 0, NULL, NULL);
+
+ if (!ret)
+ pr_debug("IAA dynamic compression mode initialized\n");
+
+ return ret;
+}
+
+void iaa_aecs_cleanup_dynamic(void)
+{
+ remove_iaa_compression_mode("dynamic");
+}
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index 7bc4a80bd68b..fe9f59ede577 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -103,10 +103,12 @@ DEFINE_MUTEX(first_wq_found_lock);
const char *iaa_compression_mode_names[IAA_COMP_MODES_MAX] = {
"fixed",
+ "dynamic",
};
const char *iaa_compression_alg_names[IAA_COMP_MODES_MAX] = {
"deflate-iaa",
+ "deflate-iaa-dynamic",
};
static struct iaa_compression_mode *iaa_compression_modes[IAA_COMP_MODES_MAX];
@@ -1493,6 +1495,27 @@ static int deflate_generic_decompress(struct iaa_req *req)
return ret;
}
+static int deflate_generic_compress(struct iaa_req *req)
+{
+ ACOMP_REQUEST_ON_STACK(fbreq, deflate_crypto_acomp);
+ int ret;
+
+ acomp_request_set_callback(fbreq, 0, NULL, NULL);
+ acomp_request_set_params(fbreq, req->src, req->dst, req->slen,
+ PAGE_SIZE);
+
+ mutex_lock(&deflate_crypto_acomp_lock);
+
+ ret = crypto_acomp_compress(fbreq);
+ req->dlen = fbreq->dlen;
+
+ mutex_unlock(&deflate_crypto_acomp_lock);
+
+ update_total_sw_comp_calls();
+
+ return ret;
+}
+
static __always_inline void acomp_to_iaa(struct acomp_req *areq,
struct iaa_req *req,
struct iaa_compression_ctx *ctx)
@@ -1822,9 +1845,13 @@ iaa_setup_compress_hw_desc(struct idxd_desc *idxd_desc,
desc->src1_size = slen;
desc->dst_addr = (u64)dst_addr;
desc->max_dst_size = dlen;
- desc->flags |= IDXD_OP_FLAG_RD_SRC2_AECS;
- desc->src2_addr = active_compression_mode->aecs_comp_table_dma_addr;
- desc->src2_size = sizeof(struct aecs_comp_table_record);
+ if (mode == IAA_MODE_DYNAMIC) {
+ desc->compr_flags |= IAA_COMP_GEN_HDR_1_PASS;
+ } else {
+ desc->flags |= IDXD_OP_FLAG_RD_SRC2_AECS;
+ desc->src2_addr = active_compression_mode->aecs_comp_table_dma_addr;
+ desc->src2_size = sizeof(struct aecs_comp_table_record);
+ }
desc->completion_addr = idxd_desc->compl_dma;
return desc;
@@ -2078,6 +2105,9 @@ static int iaa_comp_acompress(struct iaa_compression_ctx *ctx, struct iaa_req *r
return -EINVAL;
}
+ if (ctx->mode == IAA_MODE_DYNAMIC && req->slen > PAGE_SIZE)
+ return deflate_generic_compress(req);
+
cpu = get_cpu();
wq = comp_wq_table_next_wq(cpu);
put_cpu();
@@ -2513,7 +2543,9 @@ static int __maybe_unused iaa_comp_adecompress_batch(
static void compression_ctx_init(struct iaa_compression_ctx *ctx, enum iaa_mode mode)
{
ctx->mode = mode;
- ctx->alloc_comp_desc_timeout = IAA_ALLOC_DESC_COMP_TIMEOUT;
+ ctx->alloc_comp_desc_timeout = (mode == IAA_MODE_DYNAMIC ?
+ IAA_DYN_ALLOC_DESC_COMP_TIMEOUT :
+ IAA_ALLOC_DESC_COMP_TIMEOUT);
ctx->alloc_decomp_desc_timeout = IAA_ALLOC_DESC_DECOMP_TIMEOUT;
ctx->verify_compress = iaa_verify_compress;
ctx->async_mode = async_mode;
@@ -2708,6 +2740,32 @@ static struct acomp_alg iaa_acomp_fixed_deflate = {
}
};
+static int iaa_crypto_acomp_init_dynamic(struct crypto_acomp *acomp_tfm)
+{
+ struct crypto_tfm *tfm = crypto_acomp_tfm(acomp_tfm);
+ struct iaa_compression_ctx *ctx = crypto_tfm_ctx(tfm);
+
+ ctx = iaa_ctx[IAA_MODE_DYNAMIC];
+
+ return 0;
+}
+
+static struct acomp_alg iaa_acomp_dynamic_deflate = {
+ .init = iaa_crypto_acomp_init_dynamic,
+ .compress = iaa_crypto_acomp_acompress_main,
+ .decompress = iaa_crypto_acomp_adecompress_main,
+ .batch_completed = iaa_crypto_acomp_batch_completed,
+ .base = {
+ .cra_name = "deflate",
+ .cra_driver_name = "deflate-iaa-dynamic",
+ .cra_flags = CRYPTO_ALG_ASYNC | CRYPTO_ALG_REQ_SEG,
+ .cra_ctxsize = sizeof(struct iaa_compression_ctx),
+ .cra_reqsize = sizeof(u32),
+ .cra_module = THIS_MODULE,
+ .cra_priority = IAA_ALG_PRIORITY + 1,
+ }
+};
+
/*******************************************
* Implement idxd_device_driver interfaces.
*******************************************/
@@ -2727,7 +2785,7 @@ static void iaa_unregister_compression_device(void)
num_iaa_modes_registered = 0;
}
-static int iaa_register_compression_device(void)
+static int iaa_register_compression_device(struct idxd_device *idxd)
{
struct iaa_compression_mode *mode;
int i, idx;
@@ -2736,6 +2794,13 @@ static int iaa_register_compression_device(void)
iaa_mode_registered[i] = false;
mode = find_iaa_compression_mode(iaa_compression_mode_names[i], &idx);
if (mode) {
+ /* Header Generation Capability is required for the dynamic algorithm. */
+ if ((!strcmp(mode->name, "dynamic")) && !idxd->hw.iaa_cap.header_gen) {
+ if (num_iaa_modes_registered > 0)
+ --num_iaa_modes_registered;
+ continue;
+ }
+
iaa_ctx[i] = kmalloc(sizeof(struct iaa_compression_ctx), GFP_KERNEL);
if (!iaa_ctx[i])
goto err;
@@ -2755,7 +2820,7 @@ static int iaa_register_compression_device(void)
return -ENODEV;
}
-static int iaa_register_acomp_compression_device(void)
+static int iaa_register_acomp_compression_device(struct idxd_device *idxd)
{
int ret = -ENOMEM;
@@ -2765,8 +2830,19 @@ static int iaa_register_acomp_compression_device(void)
goto err_fixed;
}
+ if (iaa_mode_registered[IAA_MODE_DYNAMIC]) {
+ ret = crypto_register_acomp(&iaa_acomp_dynamic_deflate);
+ if (ret) {
+ pr_err("deflate algorithm acomp dynamic registration failed (%d)\n", ret);
+ goto err_dynamic;
+ }
+ }
+
return 0;
+err_dynamic:
+ crypto_unregister_acomp(&iaa_acomp_fixed_deflate);
+
err_fixed:
iaa_unregister_compression_device();
return ret;
@@ -2778,6 +2854,9 @@ static void iaa_unregister_acomp_compression_device(void)
if (iaa_mode_registered[IAA_MODE_FIXED])
crypto_unregister_acomp(&iaa_acomp_fixed_deflate);
+
+ if (iaa_mode_registered[IAA_MODE_DYNAMIC])
+ crypto_unregister_acomp(&iaa_acomp_dynamic_deflate);
}
static int iaa_crypto_probe(struct idxd_dev *idxd_dev)
@@ -2841,13 +2920,13 @@ static int iaa_crypto_probe(struct idxd_dev *idxd_dev)
atomic_set(&iaa_crypto_enabled, 1);
if (first_wq) {
- ret = iaa_register_compression_device();
+ ret = iaa_register_compression_device(idxd);
if (ret != 0) {
dev_dbg(dev, "IAA compression device registration failed\n");
goto err_register;
}
- ret = iaa_register_acomp_compression_device();
+ ret = iaa_register_acomp_compression_device(idxd);
if (ret != 0) {
dev_dbg(dev, "IAA compression device acomp registration failed\n");
goto err_register;
@@ -3007,6 +3086,12 @@ static int __init iaa_crypto_init_module(void)
goto err_aecs_init;
}
+ ret = iaa_aecs_init_dynamic();
+ if (ret < 0) {
+ pr_debug("IAA dynamic compression mode init failed\n");
+ goto err_dynamic;
+ }
+
ret = idxd_driver_register(&iaa_crypto_driver);
if (ret) {
pr_debug("IAA wq sub-driver registration failed\n");
@@ -3110,6 +3195,8 @@ static int __init iaa_crypto_init_module(void)
err_g_comp_wqs_per_iaa_attr_create:
idxd_driver_unregister(&iaa_crypto_driver);
err_driver_reg:
+ iaa_aecs_cleanup_dynamic();
+err_dynamic:
iaa_aecs_cleanup_fixed();
err_aecs_init:
if (!IS_ERR_OR_NULL(deflate_crypto_acomp)) {
@@ -3139,6 +3226,7 @@ static void __exit iaa_crypto_cleanup_module(void)
driver_remove_file(&iaa_crypto_driver.drv,
&driver_attr_g_comp_wqs_per_iaa);
idxd_driver_unregister(&iaa_crypto_driver);
+ iaa_aecs_cleanup_dynamic();
iaa_aecs_cleanup_fixed();
if (!IS_ERR_OR_NULL(deflate_crypto_acomp)) {
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_stats.c b/drivers/crypto/intel/iaa/iaa_crypto_stats.c
index f5cc3d29ca19..42aae8a738ac 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_stats.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_stats.c
@@ -19,6 +19,7 @@
static atomic64_t total_comp_calls;
static atomic64_t total_decomp_calls;
+static atomic64_t total_sw_comp_calls;
static atomic64_t total_sw_decomp_calls;
static atomic64_t total_comp_bytes_out;
static atomic64_t total_decomp_bytes_in;
@@ -43,6 +44,11 @@ void update_total_decomp_calls(void)
atomic64_inc(&total_decomp_calls);
}
+void update_total_sw_comp_calls(void)
+{
+ atomic64_inc(&total_sw_comp_calls);
+}
+
void update_total_sw_decomp_calls(void)
{
atomic64_inc(&total_sw_decomp_calls);
@@ -174,6 +180,8 @@ static int global_stats_show(struct seq_file *m, void *v)
atomic64_read(&total_comp_calls));
seq_printf(m, " total_decomp_calls: %llu\n",
atomic64_read(&total_decomp_calls));
+ seq_printf(m, " total_sw_comp_calls: %llu\n",
+ atomic64_read(&total_sw_comp_calls));
seq_printf(m, " total_sw_decomp_calls: %llu\n",
atomic64_read(&total_sw_decomp_calls));
seq_printf(m, " total_comp_bytes_out: %llu\n",
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_stats.h b/drivers/crypto/intel/iaa/iaa_crypto_stats.h
index 3787a5f507eb..6e0c6f9939bf 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_stats.h
+++ b/drivers/crypto/intel/iaa/iaa_crypto_stats.h
@@ -11,6 +11,7 @@ void iaa_crypto_debugfs_cleanup(void);
void update_total_comp_calls(void);
void update_total_comp_bytes_out(int n);
void update_total_decomp_calls(void);
+void update_total_sw_comp_calls(void);
void update_total_sw_decomp_calls(void);
void update_total_decomp_bytes_in(int n);
void update_completion_einval_errs(void);
@@ -29,6 +30,7 @@ static inline void iaa_crypto_debugfs_cleanup(void) {}
static inline void update_total_comp_calls(void) {}
static inline void update_total_comp_bytes_out(int n) {}
static inline void update_total_decomp_calls(void) {}
+static inline void update_total_sw_comp_calls(void) {}
static inline void update_total_sw_decomp_calls(void) {}
static inline void update_total_decomp_bytes_in(int n) {}
static inline void update_completion_einval_errs(void) {}
--
2.27.0
^ permalink raw reply [flat|nested] 48+ messages in thread
* [PATCH v14 22/26] crypto: acomp - Add crypto_acomp_batch_size() to get an algorithm's batch-size.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (20 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 21/26] crypto: iaa - Add deflate-iaa-dynamic compression mode Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-25 3:35 ` [PATCH v14 23/26] mm: zswap: Tie per-CPU acomp_ctx lifetime to the pool Kanchana P Sridhar
` (4 subsequent siblings)
26 siblings, 0 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
This commit adds a @batch_size data member to struct acomp_alg.
An acomp_alg compression algorithm that supports batching of
compressions and decompressions must provide a @batch_size greater than
one, representing the maximum batch-size that the compressor supports.
This allows kernel users of crypto_acomp, such as zswap, to allocate
resources for submitting multiple compress/decompress jobs that can be
batched, and to invoke batching of [de]compressions.
The new crypto_acomp_batch_size() API queries the crypto_acomp's
acomp_alg for the batch-size. If the acomp_alg has registered a
@batch_size greater than 1, this is returned. If not, a default of "1"
is returned.
zswap can invoke crypto_acomp_batch_size() to query the maximum number
of requests that can be batch [de]compressed. Based on this, zswap
can use the minimum of any zswap-specific upper limits for batch-size
and the compressor's max @batch_size, to allocate batching resources.
The IAA acomp_algs Fixed ("deflate-iaa") and Dynamic
("deflate-iaa-dynamic") register @batch_size as
IAA_CRYPTO_MAX_BATCH_SIZE.
This enables zswap to compress/decompress pages in parallel in the IAA
hardware accelerator to improve swapout/swapin performance and memory
savings.
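As an illustration only (ZSWAP_MAX_BATCH_SIZE and the acomp_ctx naming are
assumptions borrowed from the zswap patches later in this series), a
crypto_acomp user might size its batching resources as follows:

    /*
     * Cap the compressor's advertised batch size with the client's own
     * limit before allocating per-CPU batching resources. Compressors
     * that don't declare a batch_size > 1 yield 1 here.
     */
    unsigned int nr_reqs = min(crypto_acomp_batch_size(acomp_ctx->acomp),
                               ZSWAP_MAX_BATCH_SIZE);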
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
crypto/acompress.c | 14 ++++++++++++++
drivers/crypto/intel/iaa/iaa_crypto_main.c | 2 ++
include/crypto/acompress.h | 12 ++++++++++++
include/crypto/internal/acompress.h | 3 +++
4 files changed, 31 insertions(+)
diff --git a/crypto/acompress.c b/crypto/acompress.c
index c48a1a20e21f..02c25c79c0d4 100644
--- a/crypto/acompress.c
+++ b/crypto/acompress.c
@@ -387,6 +387,20 @@ int crypto_acomp_decompress(struct acomp_req *req)
}
EXPORT_SYMBOL_GPL(crypto_acomp_decompress);
+unsigned int crypto_acomp_batch_size(struct crypto_acomp *tfm)
+{
+ if (acomp_is_async(tfm) &&
+ (crypto_comp_alg_common(tfm)->base.cra_flags & CRYPTO_ALG_TYPE_ACOMPRESS)) {
+ struct acomp_alg *alg = crypto_acomp_alg(tfm);
+
+ if (alg && alg->batch_size > 1)
+ return alg->batch_size;
+ }
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(crypto_acomp_batch_size);
+
void comp_prepare_alg(struct comp_alg_common *alg)
{
struct crypto_alg *base = &alg->base;
diff --git a/drivers/crypto/intel/iaa/iaa_crypto_main.c b/drivers/crypto/intel/iaa/iaa_crypto_main.c
index fe9f59ede577..e735aa01dce8 100644
--- a/drivers/crypto/intel/iaa/iaa_crypto_main.c
+++ b/drivers/crypto/intel/iaa/iaa_crypto_main.c
@@ -2729,6 +2729,7 @@ static struct acomp_alg iaa_acomp_fixed_deflate = {
.compress = iaa_crypto_acomp_acompress_main,
.decompress = iaa_crypto_acomp_adecompress_main,
.batch_completed = iaa_crypto_acomp_batch_completed,
+ .batch_size = IAA_CRYPTO_MAX_BATCH_SIZE,
.base = {
.cra_name = "deflate",
.cra_driver_name = "deflate-iaa",
@@ -2755,6 +2756,7 @@ static struct acomp_alg iaa_acomp_dynamic_deflate = {
.compress = iaa_crypto_acomp_acompress_main,
.decompress = iaa_crypto_acomp_adecompress_main,
.batch_completed = iaa_crypto_acomp_batch_completed,
+ .batch_size = IAA_CRYPTO_MAX_BATCH_SIZE,
.base = {
.cra_name = "deflate",
.cra_driver_name = "deflate-iaa-dynamic",
diff --git a/include/crypto/acompress.h b/include/crypto/acompress.h
index 752110a7719c..1448b20de492 100644
--- a/include/crypto/acompress.h
+++ b/include/crypto/acompress.h
@@ -598,6 +598,18 @@ int crypto_acomp_compress(struct acomp_req *req);
*/
int crypto_acomp_decompress(struct acomp_req *req);
+/**
+ * crypto_acomp_batch_size() -- Get the algorithm's batch size
+ *
+ * Function returns the algorithm's batch size for batching operations
+ *
+ * @tfm: ACOMPRESS tfm handle allocated with crypto_alloc_acomp()
+ *
+ * Return: @tfm's acomp_alg's @batch_size, if it has defined a
+ * @batch_size greater than 1; else return 1.
+ */
+unsigned int crypto_acomp_batch_size(struct crypto_acomp *tfm);
+
static inline struct acomp_req *acomp_request_on_stack_init(
char *buf, struct crypto_acomp *tfm)
{
diff --git a/include/crypto/internal/acompress.h b/include/crypto/internal/acompress.h
index 7c4e14491d59..dc126a8cfea2 100644
--- a/include/crypto/internal/acompress.h
+++ b/include/crypto/internal/acompress.h
@@ -34,6 +34,8 @@
* of all batch sub-requests having completed.
* Returns an error code in @req->slen if any
* of the sub-requests completed with an error.
+ * @batch_size: Maximum batch-size for batching compress/decompress
+ * operations.
* @init: Initialize the cryptographic transformation object.
* This function is used to initialize the cryptographic
* transformation object. This function is called only once at
@@ -53,6 +55,7 @@ struct acomp_alg {
int (*compress)(struct acomp_req *req);
int (*decompress)(struct acomp_req *req);
bool (*batch_completed)(struct acomp_req *req, bool comp);
+ unsigned int batch_size;
int (*init)(struct crypto_acomp *tfm);
void (*exit)(struct crypto_acomp *tfm);
--
2.27.0
^ permalink raw reply [flat|nested] 48+ messages in thread
* [PATCH v14 23/26] mm: zswap: Tie per-CPU acomp_ctx lifetime to the pool.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (21 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 22/26] crypto: acomp - Add crypto_acomp_batch_size() to get an algorithm's batch-size Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-02-04 16:29 ` Yosry Ahmed
2026-01-25 3:35 ` [PATCH v14 24/26] mm: zswap: Consistently use IS_ERR_OR_NULL() to check acomp_ctx resources Kanchana P Sridhar
` (3 subsequent siblings)
26 siblings, 1 reply; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
Currently, per-CPU acomp_ctx are allocated on pool creation and/or CPU
hotplug, and destroyed on pool destruction or CPU hotunplug. This
complicates the lifetime management to save memory while a CPU is
offlined, which is not very common.
Simplify lifetime management by allocating per-CPU acomp_ctx once on
pool creation (or CPU hotplug for CPUs onlined later), and keeping them
allocated until the pool is destroyed.
Refactor cleanup code from zswap_cpu_comp_dead() into
acomp_ctx_dealloc() to be used elsewhere.
The main benefit of using the CPU hotplug multi state instance startup
callback to allocate the acomp_ctx resources is that it prevents the
cores from being offlined until the multi state instance addition call
returns.
From Documentation/core-api/cpu_hotplug.rst:
"The node list add/remove operations and the callback invocations are
serialized against CPU hotplug operations."
Furthermore, zswap_[de]compress() cannot contend with
zswap_cpu_comp_prepare() because:
- During pool creation/deletion, the pool is not in the zswap_pools
list.
- During CPU hot[un]plug, the CPU is not yet online, as Yosry pointed
out. zswap_cpu_comp_prepare() will be run on a control CPU,
since CPUHP_MM_ZSWP_POOL_PREPARE is in the PREPARE section of "enum
cpuhp_state".
In both these cases, any recursions into zswap reclaim from
zswap_cpu_comp_prepare() will be handled by the old pool.
The above two observations enable the following simplifications:
1) zswap_cpu_comp_prepare():
a) acomp_ctx mutex locking:
If the process gets migrated while zswap_cpu_comp_prepare() is
running, it will complete on the new CPU. In case of failures, we
pass the acomp_ctx pointer obtained at the start of
zswap_cpu_comp_prepare() to acomp_ctx_dealloc(), which again, can
only undergo migration. There appear to be no contention
scenarios that might cause inconsistent values of acomp_ctx's
members. Hence, it seems there is no need for
mutex_lock(&acomp_ctx->mutex) in zswap_cpu_comp_prepare().
b) acomp_ctx mutex initialization:
Since the pool is not yet on zswap_pools list, we don't need to
initialize the per-CPU acomp_ctx mutex in
zswap_pool_create(). This has been restored to occur in
zswap_cpu_comp_prepare().
c) Subsequent CPU offline-online transitions:
zswap_cpu_comp_prepare() checks upfront if acomp_ctx->acomp is
valid. If so, it returns success. This should handle any CPU
hotplug online-offline transitions after pool creation is done.
2) CPU offline vis-a-vis zswap ops:
Let's suppose the process is migrated to another CPU before the
current CPU is dysfunctional. If zswap_[de]compress() holds the
acomp_ctx->mutex lock of the offlined CPU, that mutex will be
released once it completes on the new CPU. Since there is no
teardown callback, there is no possibility of UAF.
3) Pool creation/deletion and process migration to another CPU:
During pool creation/deletion, the pool is not in the zswap_pools
list. Hence it cannot contend with zswap ops on that CPU. However,
the process can get migrated.
a) Pool creation --> zswap_cpu_comp_prepare()
--> process migrated:
* Old CPU offline: no-op.
* zswap_cpu_comp_prepare() continues
to run on the new CPU to finish
allocating acomp_ctx resources for
the offlined CPU.
b) Pool deletion --> acomp_ctx_dealloc()
--> process migrated:
* Old CPU offline: no-op.
* acomp_ctx_dealloc() continues
to run on the new CPU to finish
de-allocating acomp_ctx resources
for the offlined CPU.
4) Pool deletion vis-a-vis CPU onlining:
The call to cpuhp_state_remove_instance() cannot race with
zswap_cpu_comp_prepare() because of hotplug synchronization.
The current acomp_ctx_get_cpu_lock()/acomp_ctx_put_unlock() are
deleted. Instead, zswap_[de]compress() directly call
mutex_[un]lock(&acomp_ctx->mutex).
The per-CPU memory cost of not deleting the acomp_ctx resources upon CPU
offlining, and only deleting them when the pool is destroyed, is as
follows, on x86_64:
IAA with 8 dst buffers for batching: 64.34 KB
Software compressors with 1 dst buffer: 8.28 KB
This cost is only paid when a CPU is offlined, until it is onlined
again.
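The resulting lifetime can be summarized with this condensed sketch
(abridged from the diff below; not verbatim code):

    /* Allocation: hotplug 'prepare' callback; idempotent across
     * offline-online-online transitions of the same CPU. */
    static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
    {
            struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
            struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);

            if (acomp_ctx->acomp)   /* already initialized earlier */
                    return 0;
            /* allocate buffer, acomp tfm and request; init the mutex */
    }

    /* Teardown: only on pool destruction, never on CPU offline
     * (no hotunplug callback is registered). */
    static void zswap_pool_destroy(struct zswap_pool *pool)
    {
            for_each_possible_cpu(cpu)
                    acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
    }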
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
mm/zswap.c | 164 +++++++++++++++++++++--------------------------------
1 file changed, 66 insertions(+), 98 deletions(-)
diff --git a/mm/zswap.c b/mm/zswap.c
index 038e240c03dd..9480d54264e4 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -241,6 +241,20 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
**********************************/
static void __zswap_pool_empty(struct percpu_ref *ref);
+static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
+{
+ if (IS_ERR_OR_NULL(acomp_ctx))
+ return;
+
+ if (!IS_ERR_OR_NULL(acomp_ctx->req))
+ acomp_request_free(acomp_ctx->req);
+
+ if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
+ crypto_free_acomp(acomp_ctx->acomp);
+
+ kfree(acomp_ctx->buffer);
+}
+
static struct zswap_pool *zswap_pool_create(char *compressor)
{
struct zswap_pool *pool;
@@ -262,19 +276,27 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
- pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
+ /* Many things rely on the zero-initialization. */
+ pool->acomp_ctx = alloc_percpu_gfp(*pool->acomp_ctx,
+ GFP_KERNEL | __GFP_ZERO);
if (!pool->acomp_ctx) {
pr_err("percpu alloc failed\n");
goto error;
}
- for_each_possible_cpu(cpu)
- mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex);
-
+ /*
+ * This is serialized against CPU hotplug operations. Hence, cores
+ * cannot be offlined until this finishes.
+ */
ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
&pool->node);
+
+ /*
+ * cpuhp_state_add_instance() will not cleanup on failure since
+ * we don't register a hotunplug callback.
+ */
if (ret)
- goto error;
+ goto cpuhp_add_fail;
/* being the current pool takes 1 ref; this func expects the
* caller to always add the new pool as the current pool
@@ -291,6 +313,10 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
ref_fail:
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
+
+cpuhp_add_fail:
+ for_each_possible_cpu(cpu)
+ acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
error:
if (pool->acomp_ctx)
free_percpu(pool->acomp_ctx);
@@ -321,9 +347,15 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
static void zswap_pool_destroy(struct zswap_pool *pool)
{
+ int cpu;
+
zswap_pool_debug("destroying", pool);
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
+
+ for_each_possible_cpu(cpu)
+ acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
+
free_percpu(pool->acomp_ctx);
zs_destroy_pool(pool->zs_pool);
@@ -735,39 +767,36 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
- struct crypto_acomp *acomp = NULL;
- struct acomp_req *req = NULL;
- u8 *buffer = NULL;
- int ret;
+ int ret = -ENOMEM;
- buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
- if (!buffer) {
- ret = -ENOMEM;
- goto fail;
+ /*
+ * To handle cases where the CPU goes through online-offline-online
+ * transitions, we return if the acomp_ctx has already been initialized.
+ */
+ if (acomp_ctx->acomp) {
+ WARN_ON_ONCE(IS_ERR(acomp_ctx->acomp));
+ return 0;
}
- acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
- if (IS_ERR(acomp)) {
+ acomp_ctx->buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
+ if (!acomp_ctx->buffer)
+ return ret;
+
+ acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
+ if (IS_ERR(acomp_ctx->acomp)) {
pr_err("could not alloc crypto acomp %s : %ld\n",
- pool->tfm_name, PTR_ERR(acomp));
- ret = PTR_ERR(acomp);
+ pool->tfm_name, PTR_ERR(acomp_ctx->acomp));
+ ret = PTR_ERR(acomp_ctx->acomp);
goto fail;
}
- req = acomp_request_alloc(acomp);
- if (!req) {
+ acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
+ if (!acomp_ctx->req) {
pr_err("could not alloc crypto acomp_request %s\n",
pool->tfm_name);
- ret = -ENOMEM;
goto fail;
}
- /*
- * Only hold the mutex after completing allocations, otherwise we may
- * recurse into zswap through reclaim and attempt to hold the mutex
- * again resulting in a deadlock.
- */
- mutex_lock(&acomp_ctx->mutex);
crypto_init_wait(&acomp_ctx->wait);
/*
@@ -775,83 +804,19 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
* crypto_wait_req(); if the backend of acomp is scomp, the callback
* won't be called, crypto_wait_req() will return without blocking.
*/
- acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
- acomp_ctx->buffer = buffer;
- acomp_ctx->acomp = acomp;
- acomp_ctx->req = req;
-
acomp_request_set_unit_size(acomp_ctx->req, PAGE_SIZE);
- mutex_unlock(&acomp_ctx->mutex);
+ mutex_init(&acomp_ctx->mutex);
return 0;
fail:
- if (!IS_ERR_OR_NULL(acomp))
- crypto_free_acomp(acomp);
- kfree(buffer);
+ acomp_ctx_dealloc(acomp_ctx);
return ret;
}
-static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
-{
- struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
- struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
- struct acomp_req *req;
- struct crypto_acomp *acomp;
- u8 *buffer;
-
- if (IS_ERR_OR_NULL(acomp_ctx))
- return 0;
-
- mutex_lock(&acomp_ctx->mutex);
- req = acomp_ctx->req;
- acomp = acomp_ctx->acomp;
- buffer = acomp_ctx->buffer;
- acomp_ctx->req = NULL;
- acomp_ctx->acomp = NULL;
- acomp_ctx->buffer = NULL;
- mutex_unlock(&acomp_ctx->mutex);
-
- /*
- * Do the actual freeing after releasing the mutex to avoid subtle
- * locking dependencies causing deadlocks.
- */
- if (!IS_ERR_OR_NULL(req))
- acomp_request_free(req);
- if (!IS_ERR_OR_NULL(acomp))
- crypto_free_acomp(acomp);
- kfree(buffer);
-
- return 0;
-}
-
-static struct crypto_acomp_ctx *acomp_ctx_get_cpu_lock(struct zswap_pool *pool)
-{
- struct crypto_acomp_ctx *acomp_ctx;
-
- for (;;) {
- acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
- mutex_lock(&acomp_ctx->mutex);
- if (likely(acomp_ctx->req))
- return acomp_ctx;
- /*
- * It is possible that we were migrated to a different CPU after
- * getting the per-CPU ctx but before the mutex was acquired. If
- * the old CPU got offlined, zswap_cpu_comp_dead() could have
- * already freed ctx->req (among other things) and set it to
- * NULL. Just try again on the new CPU that we ended up on.
- */
- mutex_unlock(&acomp_ctx->mutex);
- }
-}
-
-static void acomp_ctx_put_unlock(struct crypto_acomp_ctx *acomp_ctx)
-{
- mutex_unlock(&acomp_ctx->mutex);
-}
-
static bool zswap_compress(struct page *page, struct zswap_entry *entry,
struct zswap_pool *pool)
{
@@ -864,7 +829,9 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
u8 *dst;
bool mapped = false;
- acomp_ctx = acomp_ctx_get_cpu_lock(pool);
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+ mutex_lock(&acomp_ctx->mutex);
+
dst = acomp_ctx->buffer;
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
@@ -927,7 +894,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
else if (alloc_ret)
zswap_reject_alloc_fail++;
- acomp_ctx_put_unlock(acomp_ctx);
+ mutex_unlock(&acomp_ctx->mutex);
return comp_ret == 0 && alloc_ret == 0;
}
@@ -939,7 +906,8 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
int decomp_ret = 0, dlen = PAGE_SIZE;
u8 *src, *obj;
- acomp_ctx = acomp_ctx_get_cpu_lock(pool);
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
+ mutex_lock(&acomp_ctx->mutex);
obj = zs_obj_read_begin(pool->zs_pool, entry->handle, entry->length,
acomp_ctx->buffer);
@@ -971,7 +939,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
read_done:
zs_obj_read_end(pool->zs_pool, entry->handle, entry->length, obj);
- acomp_ctx_put_unlock(acomp_ctx);
+ mutex_unlock(&acomp_ctx->mutex);
if (!decomp_ret && dlen == PAGE_SIZE)
return true;
@@ -1797,7 +1765,7 @@ static int zswap_setup(void)
ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
"mm/zswap_pool:prepare",
zswap_cpu_comp_prepare,
- zswap_cpu_comp_dead);
+ NULL);
if (ret)
goto hp_fail;
--
2.27.0
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH v14 23/26] mm: zswap: Tie per-CPU acomp_ctx lifetime to the pool.
2026-01-25 3:35 ` [PATCH v14 23/26] mm: zswap: Tie per-CPU acomp_ctx lifetime to the pool Kanchana P Sridhar
@ 2026-02-04 16:29 ` Yosry Ahmed
0 siblings, 0 replies; 48+ messages in thread
From: Yosry Ahmed @ 2026-02-04 16:29 UTC (permalink / raw)
To: Kanchana P Sridhar
Cc: linux-kernel, linux-mm, hannes, nphamcs, chengming.zhou,
usamaarif642, ryan.roberts, 21cnbao, ying.huang, akpm,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, kristen.c.accardi, vinicius.gomes,
giovanni.cabiddu, wajdi.k.feghali
On Sat, Jan 24, 2026 at 07:35:34PM -0800, Kanchana P Sridhar wrote:
> Currently, per-CPU acomp_ctx are allocated on pool creation and/or CPU
> hotplug, and destroyed on pool destruction or CPU hotunplug. This
> complicates the lifetime management to save memory while a CPU is
> offlined, which is not very common.
>
> Simplify lifetime management by allocating per-CPU acomp_ctx once on
> pool creation (or CPU hotplug for CPUs onlined later), and keeping them
> allocated until the pool is destroyed.
>
> Refactor cleanup code from zswap_cpu_comp_dead() into
> acomp_ctx_dealloc() to be used elsewhere.
>
> The main benefit of using the CPU hotplug multi state instance startup
> callback to allocate the acomp_ctx resources is that it prevents the
> cores from being offlined until the multi state instance addition call
> returns.
>
> From Documentation/core-api/cpu_hotplug.rst:
>
> "The node list add/remove operations and the callback invocations are
> serialized against CPU hotplug operations."
>
> Furthermore, zswap_[de]compress() cannot contend with
> zswap_cpu_comp_prepare() because:
>
> - During pool creation/deletion, the pool is not in the zswap_pools
> list.
>
> - During CPU hot[un]plug, the CPU is not yet online, as Yosry pointed
> out. zswap_cpu_comp_prepare() will be run on a control CPU,
> since CPUHP_MM_ZSWP_POOL_PREPARE is in the PREPARE section of "enum
> cpuhp_state".
>
> In both these cases, any recursions into zswap reclaim from
> zswap_cpu_comp_prepare() will be handled by the old pool.
>
> The above two observations enable the following simplifications:
>
> 1) zswap_cpu_comp_prepare():
>
> a) acomp_ctx mutex locking:
>
> If the process gets migrated while zswap_cpu_comp_prepare() is
> running, it will complete on the new CPU. In case of failures, we
> pass the acomp_ctx pointer obtained at the start of
> zswap_cpu_comp_prepare() to acomp_ctx_dealloc(), which again, can
> only undergo migration. There appear to be no contention
> scenarios that might cause inconsistent values of acomp_ctx's
> members. Hence, it seems there is no need for
> mutex_lock(&acomp_ctx->mutex) in zswap_cpu_comp_prepare().
>
> b) acomp_ctx mutex initialization:
>
> Since the pool is not yet on zswap_pools list, we don't need to
> initialize the per-CPU acomp_ctx mutex in
> zswap_pool_create(). This has been restored to occur in
> zswap_cpu_comp_prepare().
>
> c) Subsequent CPU offline-online transitions:
>
> zswap_cpu_comp_prepare() checks upfront if acomp_ctx->acomp is
> valid. If so, it returns success. This should handle any CPU
> hotplug online-offline transitions after pool creation is done.
>
> 2) CPU offline vis-a-vis zswap ops:
>
> Let's suppose the process is migrated to another CPU before the
> current CPU is dysfunctional. If zswap_[de]compress() holds the
> acomp_ctx->mutex lock of the offlined CPU, that mutex will be
> released once it completes on the new CPU. Since there is no
> teardown callback, there is no possibility of UAF.
>
> 3) Pool creation/deletion and process migration to another CPU:
>
> During pool creation/deletion, the pool is not in the zswap_pools
> list. Hence it cannot contend with zswap ops on that CPU. However,
> the process can get migrated.
>
> a) Pool creation --> zswap_cpu_comp_prepare()
> --> process migrated:
> * Old CPU offline: no-op.
> * zswap_cpu_comp_prepare() continues
> to run on the new CPU to finish
> allocating acomp_ctx resources for
> the offlined CPU.
>
> b) Pool deletion --> acomp_ctx_dealloc()
> --> process migrated:
> * Old CPU offline: no-op.
> * acomp_ctx_dealloc() continues
> to run on the new CPU to finish
> de-allocating acomp_ctx resources
> for the offlined CPU.
>
> 4) Pool deletion vis-a-vis CPU onlining:
>
> The call to cpuhp_state_remove_instance() cannot race with
> zswap_cpu_comp_prepare() because of hotplug synchronization.
>
> The current acomp_ctx_get_cpu_lock()/acomp_ctx_put_unlock() are
> deleted. Instead, zswap_[de]compress() directly call
> mutex_[un]lock(&acomp_ctx->mutex).
>
> The per-CPU memory cost of not deleting the acomp_ctx resources upon CPU
> offlining, and only deleting them when the pool is destroyed, is as
> follows, on x86_64:
>
> IAA with 8 dst buffers for batching: 64.34 KB
> Software compressors with 1 dst buffer: 8.28 KB
>
> This cost is only paid when a CPU is offlined, until it is onlined
> again.
>
> Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
LGTM with a small nit below:
Acked-by: Yosry Ahmed <yosry.ahmed@linux.dev>
> ---
> mm/zswap.c | 164 +++++++++++++++++++++--------------------------------
> 1 file changed, 66 insertions(+), 98 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 038e240c03dd..9480d54264e4 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -241,6 +241,20 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
> **********************************/
> static void __zswap_pool_empty(struct percpu_ref *ref);
>
> +static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
> +{
> + if (IS_ERR_OR_NULL(acomp_ctx))
> + return;
> +
> + if (!IS_ERR_OR_NULL(acomp_ctx->req))
> + acomp_request_free(acomp_ctx->req);
> +
> + if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
> + crypto_free_acomp(acomp_ctx->acomp);
Should we set acomp_ctx->req, acomp_ctx->acomp, and acomp_ctx->buffer to
NULL here?
zswap_cpu_comp_prepare() uses NULL to detect that we need to initialize
acomp_ctx.
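For clarity, the suggestion would look roughly like this (illustrative
sketch only, not a tested change):

    if (!IS_ERR_OR_NULL(acomp_ctx->req))
            acomp_request_free(acomp_ctx->req);
    acomp_ctx->req = NULL;

    if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
            crypto_free_acomp(acomp_ctx->acomp);
    acomp_ctx->acomp = NULL;

    kfree(acomp_ctx->buffer);
    acomp_ctx->buffer = NULL;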
> +
> + kfree(acomp_ctx->buffer);
> +}
> +
> static struct zswap_pool *zswap_pool_create(char *compressor)
> {
> struct zswap_pool *pool;
> @@ -262,19 +276,27 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
>
> strscpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
>
> - pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
> + /* Many things rely on the zero-initialization. */
> + pool->acomp_ctx = alloc_percpu_gfp(*pool->acomp_ctx,
> + GFP_KERNEL | __GFP_ZERO);
> if (!pool->acomp_ctx) {
> pr_err("percpu alloc failed\n");
> goto error;
> }
>
> - for_each_possible_cpu(cpu)
> - mutex_init(&per_cpu_ptr(pool->acomp_ctx, cpu)->mutex);
> -
> + /*
> + * This is serialized against CPU hotplug operations. Hence, cores
> + * cannot be offlined until this finishes.
> + */
> ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
> &pool->node);
> +
> + /*
> + * cpuhp_state_add_instance() will not cleanup on failure since
> + * we don't register a hotunplug callback.
> + */
> if (ret)
> - goto error;
> + goto cpuhp_add_fail;
>
> /* being the current pool takes 1 ref; this func expects the
> * caller to always add the new pool as the current pool
> @@ -291,6 +313,10 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
>
> ref_fail:
> cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> +
> +cpuhp_add_fail:
> + for_each_possible_cpu(cpu)
> + acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> error:
> if (pool->acomp_ctx)
> free_percpu(pool->acomp_ctx);
> @@ -321,9 +347,15 @@ static struct zswap_pool *__zswap_pool_create_fallback(void)
>
> static void zswap_pool_destroy(struct zswap_pool *pool)
> {
> + int cpu;
> +
> zswap_pool_debug("destroying", pool);
>
> cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
> +
> + for_each_possible_cpu(cpu)
> + acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> +
> free_percpu(pool->acomp_ctx);
>
> zs_destroy_pool(pool->zs_pool);
> @@ -735,39 +767,36 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> {
> struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> - struct crypto_acomp *acomp = NULL;
> - struct acomp_req *req = NULL;
> - u8 *buffer = NULL;
> - int ret;
> + int ret = -ENOMEM;
>
> - buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
> - if (!buffer) {
> - ret = -ENOMEM;
> - goto fail;
> + /*
> + * To handle cases where the CPU goes through online-offline-online
> + * transitions, we return if the acomp_ctx has already been initialized.
> + */
> + if (acomp_ctx->acomp) {
> + WARN_ON_ONCE(IS_ERR(acomp_ctx->acomp));
> + return 0;
> }
>
> - acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
> - if (IS_ERR(acomp)) {
> + acomp_ctx->buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
> + if (!acomp_ctx->buffer)
> + return ret;
> +
> + acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
> + if (IS_ERR(acomp_ctx->acomp)) {
> pr_err("could not alloc crypto acomp %s : %ld\n",
> - pool->tfm_name, PTR_ERR(acomp));
> - ret = PTR_ERR(acomp);
> + pool->tfm_name, PTR_ERR(acomp_ctx->acomp));
> + ret = PTR_ERR(acomp_ctx->acomp);
> goto fail;
> }
>
> - req = acomp_request_alloc(acomp);
> - if (!req) {
> + acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
> + if (!acomp_ctx->req) {
> pr_err("could not alloc crypto acomp_request %s\n",
> pool->tfm_name);
> - ret = -ENOMEM;
> goto fail;
> }
>
> - /*
> - * Only hold the mutex after completing allocations, otherwise we may
> - * recurse into zswap through reclaim and attempt to hold the mutex
> - * again resulting in a deadlock.
> - */
> - mutex_lock(&acomp_ctx->mutex);
> crypto_init_wait(&acomp_ctx->wait);
>
> /*
> @@ -775,83 +804,19 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> * crypto_wait_req(); if the backend of acomp is scomp, the callback
> * won't be called, crypto_wait_req() will return without blocking.
> */
> - acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
> + acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
> crypto_req_done, &acomp_ctx->wait);
>
> - acomp_ctx->buffer = buffer;
> - acomp_ctx->acomp = acomp;
> - acomp_ctx->req = req;
> -
> acomp_request_set_unit_size(acomp_ctx->req, PAGE_SIZE);
>
> - mutex_unlock(&acomp_ctx->mutex);
> + mutex_init(&acomp_ctx->mutex);
> return 0;
>
> fail:
> - if (!IS_ERR_OR_NULL(acomp))
> - crypto_free_acomp(acomp);
> - kfree(buffer);
> + acomp_ctx_dealloc(acomp_ctx);
> return ret;
> }
>
[..]
^ permalink raw reply [flat|nested] 48+ messages in thread
* [PATCH v14 24/26] mm: zswap: Consistently use IS_ERR_OR_NULL() to check acomp_ctx resources.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (22 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 23/26] mm: zswap: Tie per-CPU acomp_ctx lifetime to the pool Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-30 23:53 ` Nhat Pham
2026-01-25 3:35 ` [PATCH v14 25/26] mm: zswap: Store large folios in batches Kanchana P Sridhar
` (2 subsequent siblings)
26 siblings, 1 reply; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
Use IS_ERR_OR_NULL() in zswap_cpu_comp_prepare() to check for valid
acomp/req, making it consistent with acomp_ctx_dealloc().
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Acked-by: Yosry Ahmed <yosry.ahmed@linux.dev>
---
mm/zswap.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mm/zswap.c b/mm/zswap.c
index 9480d54264e4..0d56390342b7 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -783,7 +783,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
return ret;
acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
- if (IS_ERR(acomp_ctx->acomp)) {
+ if (IS_ERR_OR_NULL(acomp_ctx->acomp)) {
pr_err("could not alloc crypto acomp %s : %ld\n",
pool->tfm_name, PTR_ERR(acomp_ctx->acomp));
ret = PTR_ERR(acomp_ctx->acomp);
@@ -791,7 +791,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
}
acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
- if (!acomp_ctx->req) {
+ if (IS_ERR_OR_NULL(acomp_ctx->req)) {
pr_err("could not alloc crypto acomp_request %s\n",
pool->tfm_name);
goto fail;
--
2.27.0
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH v14 24/26] mm: zswap: Consistently use IS_ERR_OR_NULL() to check acomp_ctx resources.
2026-01-25 3:35 ` [PATCH v14 24/26] mm: zswap: Consistently use IS_ERR_OR_NULL() to check acomp_ctx resources Kanchana P Sridhar
@ 2026-01-30 23:53 ` Nhat Pham
2026-01-31 1:15 ` Sridhar, Kanchana P
0 siblings, 1 reply; 48+ messages in thread
From: Nhat Pham @ 2026-01-30 23:53 UTC (permalink / raw)
To: Kanchana P Sridhar
Cc: linux-kernel, linux-mm, hannes, yosry.ahmed, chengming.zhou,
usamaarif642, ryan.roberts, 21cnbao, ying.huang, akpm,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, kristen.c.accardi, vinicius.gomes,
giovanni.cabiddu, wajdi.k.feghali
On Sat, Jan 24, 2026 at 7:36 PM Kanchana P Sridhar
<kanchana.p.sridhar@intel.com> wrote:
>
> Use IS_ERR_OR_NULL() in zswap_cpu_comp_prepare() to check for valid
> acomp/req, making it consistent with acomp_ctx_dealloc().
>
> Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
> Acked-by: Yosry Ahmed <yosry.ahmed@linux.dev>
LGTM. I wonder if this is technically a fix?
Also, consider submitting this separately if the patch series stalls
- so that you don't have to carry one extra patch around every time :)
Anyway:
Acked-by: Nhat Pham <nphamcs@gmail.com>
^ permalink raw reply [flat|nested] 48+ messages in thread
* RE: [PATCH v14 24/26] mm: zswap: Consistently use IS_ERR_OR_NULL() to check acomp_ctx resources.
2026-01-30 23:53 ` Nhat Pham
@ 2026-01-31 1:15 ` Sridhar, Kanchana P
0 siblings, 0 replies; 48+ messages in thread
From: Sridhar, Kanchana P @ 2026-01-31 1:15 UTC (permalink / raw)
To: Nhat Pham
Cc: linux-kernel, linux-mm, hannes, yosry.ahmed, chengming.zhou,
usamaarif642, ryan.roberts, 21cnbao, ying.huang, akpm,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, Accardi, Kristen C, Gomes, Vinicius,
Cabiddu, Giovanni, Feghali, Wajdi K, Sridhar, Kanchana P
> -----Original Message-----
> From: Nhat Pham <nphamcs@gmail.com>
> Sent: Friday, January 30, 2026 3:53 PM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> hannes@cmpxchg.org; yosry.ahmed@linux.dev; chengming.zhou@linux.dev;
> usamaarif642@gmail.com; ryan.roberts@arm.com; 21cnbao@gmail.com;
> ying.huang@linux.alibaba.com; akpm@linux-foundation.org;
> senozhatsky@chromium.org; sj@kernel.org; kasong@tencent.com; linux-
> crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com; Accardi, Kristen C
> <kristen.c.accardi@intel.com>; Gomes, Vinicius <vinicius.gomes@intel.com>;
> Cabiddu, Giovanni <giovanni.cabiddu@intel.com>; Feghali, Wajdi K
> <wajdi.k.feghali@intel.com>
> Subject: Re: [PATCH v14 24/26] mm: zswap: Consistently use
> IS_ERR_OR_NULL() to check acomp_ctx resources.
>
> On Sat, Jan 24, 2026 at 7:36 PM Kanchana P Sridhar
> <kanchana.p.sridhar@intel.com> wrote:
> >
> > Use IS_ERR_OR_NULL() in zswap_cpu_comp_prepare() to check for valid
> > acomp/req, making it consistent with acomp_ctx_dealloc().
> >
> > Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
> > Acked-by: Yosry Ahmed <yosry.ahmed@linux.dev>
>
> LGTM. I wonder if this is technically a fix?
Hi Nhat,
Thanks for the review comments and for the Ack!
As to whether this is technically a fix: I think the answer is "not really",
because:
1) The failure handling has been consolidated to acomp_ctx_dealloc().
2) acomp_ctx_dealloc() replaces the existing zswap_cpu_comp_dead(),
which uses the same checks with IS_ERR_OR_NULL() for the
acomp_ctx->acomp and acomp_ctx->req.
3) Hence, this patch brings the error condition checks for these two
acomp_ctx members' allocation to be consistent with (1) and (2).
So I suppose this is a consistency change rather than a fix. Please
correct me if I am wrong.
>
> Also, consider submitting this separately if the patch series stalls
> - so that you don't have to carry one extra patch around every time :)
Sure, thanks for the suggestion.
I would appreciate it if you and Yosry could review the other
zswap-related patches in this series. I have addressed all but one v13 comment,
as mentioned in the cover letter in the "v14 Performance Summary"
section.
>
> Anyway:
> Acked-by: Nhat Pham <nphamcs@gmail.com>
Thanks!
Kanchana
^ permalink raw reply [flat|nested] 48+ messages in thread
* [PATCH v14 25/26] mm: zswap: Store large folios in batches.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (23 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 24/26] mm: zswap: Consistently use IS_ERR_OR_NULL() to check acomp_ctx resources Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-31 0:33 ` Nhat Pham
2026-02-04 16:57 ` Yosry Ahmed
2026-01-25 3:35 ` [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for compress batching of large folios Kanchana P Sridhar
2026-02-04 18:21 ` [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Yosry Ahmed
26 siblings, 2 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
Support batching when storing large folios in zswap. If the underlying
compressor supports batching (e.g. hardware parallel compression),
allocate multiple compression buffers; otherwise, allocate one. The
number of buffers is bounded by a new constant, ZSWAP_MAX_BATCH_SIZE, to
limit the memory overhead. For existing software compressors, the only
extra overhead is the extra 'buffers' pointer, i.e. 8 bytes per-CPU on
x86_64.
Only the first buffer is currently used, but subsequent changes will use
the remaining buffers for hardware compression batching.
Regardless of compression batching, always process large folios in
batches. For hardware compressors, the batch size is the compressor's
batch size; otherwise, ZSWAP_MAX_BATCH_SIZE is used.
zswap_store_page() is replaced with zswap_store_pages(), which processes
a batch of pages and allows for batching optimizations. For now, only
optimize allocating entries by using batch allocations from the slab
cache.
Since batch allocations do not support specifying a node id, store the
node id in the zswap entry instead of relying on the zswap_entry being
allocated on the same node. The size of the zswap_entry remains
unchanged as 'referenced' is lumped in with the 'length' (as it doesn't
need a full unsigned int anyway).
Avoid repeatedly calling mem_cgroup_zswap_writeback_enabled() for every
page and only call it once for the folio, since the entire folio is
charged to a single memcg.
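A rough sketch of the resulting store loop (the zswap_store_pages()
signature shown here is an assumption for illustration; see the diff
below for the actual code):

    /*
     * Process the folio in batches: the compressor's batch size for
     * batching (hardware) compressors, ZSWAP_MAX_BATCH_SIZE otherwise,
     * so that entry allocation from the slab cache is batched either way.
     */
    unsigned int batch, index, nr_pages = folio_nr_pages(folio);

    batch = (pool->compr_batch_size > 1) ? pool->compr_batch_size :
                                           ZSWAP_MAX_BATCH_SIZE;

    for (index = 0; index < nr_pages; index += batch) {
            unsigned int nr = min(batch, nr_pages - index);

            /* hypothetical helper signature, for illustration only */
            if (!zswap_store_pages(folio, index, nr, objcg, pool, nid))
                    goto store_failed;
    }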
Suggested-by: Nhat Pham <nphamcs@gmail.com>
Suggested-by: Yosry Ahmed <yosry.ahmed@linux.dev>
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
mm/zswap.c | 351 +++++++++++++++++++++++++++++++++++++----------------
1 file changed, 248 insertions(+), 103 deletions(-)
diff --git a/mm/zswap.c b/mm/zswap.c
index 0d56390342b7..6a22add63220 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -82,6 +82,11 @@ static bool zswap_pool_reached_full;
#define ZSWAP_PARAM_UNSET ""
+/* Limit the batch size to limit per-CPU memory usage for dst buffers. */
+#define ZSWAP_MAX_BATCH_SIZE 8U
+#define ZSWAP_ENTRY_SPARE_4BYTES 32U
+#define ZSWAP_ENTRY_REF_BIT 1U
+
static int zswap_setup(void);
/* Enable/disable zswap */
@@ -139,7 +144,7 @@ struct crypto_acomp_ctx {
struct crypto_acomp *acomp;
struct acomp_req *req;
struct crypto_wait wait;
- u8 *buffer;
+ u8 **buffers;
struct mutex mutex;
};
@@ -148,6 +153,9 @@ struct crypto_acomp_ctx {
* The only case where lru_lock is not acquired while holding tree.lock is
* when a zswap_entry is taken off the lru for writeback, in that case it
* needs to be verified that it's still valid in the tree.
+ *
+ * @compr_batch_size: The max batch size of the compression algorithm,
+ * bounded by ZSWAP_MAX_BATCH_SIZE.
*/
struct zswap_pool {
struct zs_pool *zs_pool;
@@ -157,6 +165,7 @@ struct zswap_pool {
struct work_struct release_work;
struct hlist_node node;
char tfm_name[CRYPTO_MAX_ALG_NAME];
+ u8 compr_batch_size;
};
/* Global LRU lists shared by all zswap pools. */
@@ -181,6 +190,7 @@ static struct shrinker *zswap_shrinker;
* writeback logic. The entry is only reclaimed by the writeback
* logic if referenced is unset. See comments in the shrinker
* section for context.
+ * nid - NUMA node id of the page for which this is the zswap entry.
* pool - the zswap_pool the entry's data is in
* handle - zsmalloc allocation handle that stores the compressed page data
* objcg - the obj_cgroup that the compressed memory is charged to
@@ -188,8 +198,11 @@ static struct shrinker *zswap_shrinker;
*/
struct zswap_entry {
swp_entry_t swpentry;
- unsigned int length;
- bool referenced;
+ struct {
+ unsigned int length:(ZSWAP_ENTRY_SPARE_4BYTES - ZSWAP_ENTRY_REF_BIT);
+ bool referenced:ZSWAP_ENTRY_REF_BIT;
+ };
+ int nid;
struct zswap_pool *pool;
unsigned long handle;
struct obj_cgroup *objcg;
@@ -241,8 +254,10 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
**********************************/
static void __zswap_pool_empty(struct percpu_ref *ref);
-static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
+static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx, u8 nr_buffers)
{
+ u8 i;
+
if (IS_ERR_OR_NULL(acomp_ctx))
return;
@@ -252,7 +267,11 @@ static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
crypto_free_acomp(acomp_ctx->acomp);
- kfree(acomp_ctx->buffer);
+ if (acomp_ctx->buffers) {
+ for (i = 0; i < nr_buffers; ++i)
+ kfree(acomp_ctx->buffers[i]);
+ kfree(acomp_ctx->buffers);
+ }
}
static struct zswap_pool *zswap_pool_create(char *compressor)
@@ -264,6 +283,7 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
if (!zswap_has_pool && !strcmp(compressor, ZSWAP_PARAM_UNSET))
return NULL;
+ /* Many things rely on the zero-initialization. */
pool = kzalloc(sizeof(*pool), GFP_KERNEL);
if (!pool)
return NULL;
@@ -316,7 +336,9 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
cpuhp_add_fail:
for_each_possible_cpu(cpu)
- acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
+ acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
+ pool->compr_batch_size);
+
error:
if (pool->acomp_ctx)
free_percpu(pool->acomp_ctx);
@@ -354,7 +376,8 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
for_each_possible_cpu(cpu)
- acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
+ acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
+ pool->compr_batch_size);
free_percpu(pool->acomp_ctx);
@@ -645,14 +668,8 @@ static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry
}
#endif
-static inline int entry_to_nid(struct zswap_entry *entry)
-{
- return page_to_nid(virt_to_page(entry));
-}
-
static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
{
- int nid = entry_to_nid(entry);
struct mem_cgroup *memcg;
/*
@@ -669,19 +686,18 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
rcu_read_lock();
memcg = mem_cgroup_from_entry(entry);
/* will always succeed */
- list_lru_add(list_lru, &entry->lru, nid, memcg);
+ list_lru_add(list_lru, &entry->lru, entry->nid, memcg);
rcu_read_unlock();
}
static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
{
- int nid = entry_to_nid(entry);
struct mem_cgroup *memcg;
rcu_read_lock();
memcg = mem_cgroup_from_entry(entry);
/* will always succeed */
- list_lru_del(list_lru, &entry->lru, nid, memcg);
+ list_lru_del(list_lru, &entry->lru, entry->nid, memcg);
rcu_read_unlock();
}
@@ -741,6 +757,56 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
kmem_cache_free(zswap_entry_cache, entry);
}
+static __always_inline void zswap_entries_cache_free_batch(
+ struct zswap_entry **entries,
+ u8 nr_entries)
+{
+ /*
+ * It is okay to use this to free entries allocated separately
+ * by zswap_entry_cache_alloc().
+ */
+ kmem_cache_free_bulk(zswap_entry_cache, nr_entries, (void **)entries);
+}
+
+static __always_inline bool zswap_entries_cache_alloc_batch(
+ struct zswap_entry **entries,
+ u8 nr_entries,
+ gfp_t gfp,
+ int nid)
+{
+ int nr_alloc = kmem_cache_alloc_bulk(zswap_entry_cache, gfp,
+ nr_entries, (void **)entries);
+
+ /*
+ * kmem_cache_alloc_bulk() should return @nr_entries on success
+ * and 0 on failure.
+ */
+ if (likely(nr_alloc == nr_entries))
+ return true;
+
+ if (WARN_ON_ONCE(unlikely(nr_alloc && (nr_alloc != nr_entries)))) {
+ zswap_reject_kmemcache_fail++;
+ zswap_entries_cache_free_batch(entries, nr_alloc);
+ nr_alloc = 0;
+ }
+
+ if (unlikely(!nr_alloc)) {
+ unsigned int i;
+
+ for (i = 0; i < nr_entries; ++i) {
+ entries[i] = zswap_entry_cache_alloc(GFP_KERNEL, nid);
+
+ if (unlikely(!entries[i])) {
+ zswap_reject_kmemcache_fail++;
+ zswap_entries_cache_free_batch(entries, i);
+ return false;
+ }
+ }
+ }
+
+ return true;
+}
+
/*
* Carries out the common pattern of freeing an entry's zsmalloc allocation,
* freeing the entry itself, and decrementing the number of stored pages.
@@ -767,7 +833,9 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ int nid = cpu_to_node(cpu);
int ret = -ENOMEM;
+ u8 i;
/*
* To handle cases where the CPU goes through online-offline-online
@@ -778,11 +846,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
return 0;
}
- acomp_ctx->buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
- if (!acomp_ctx->buffer)
- return ret;
-
- acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
+ acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, nid);
if (IS_ERR_OR_NULL(acomp_ctx->acomp)) {
pr_err("could not alloc crypto acomp %s : %ld\n",
pool->tfm_name, PTR_ERR(acomp_ctx->acomp));
@@ -790,20 +854,39 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
goto fail;
}
+ /*
+ * Allocate up to ZSWAP_MAX_BATCH_SIZE dst buffers if the
+ * compressor supports batching.
+ */
+ pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
+ crypto_acomp_batch_size(acomp_ctx->acomp));
+
acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
+
if (IS_ERR_OR_NULL(acomp_ctx->req)) {
pr_err("could not alloc crypto acomp_request %s\n",
pool->tfm_name);
goto fail;
}
- crypto_init_wait(&acomp_ctx->wait);
+ acomp_ctx->buffers = kcalloc_node(pool->compr_batch_size, sizeof(u8 *),
+ GFP_KERNEL, nid);
+ if (!acomp_ctx->buffers)
+ goto fail;
+
+ for (i = 0; i < pool->compr_batch_size; ++i) {
+ acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE, GFP_KERNEL, nid);
+ if (!acomp_ctx->buffers[i])
+ goto fail;
+ }
/*
* if the backend of acomp is async zip, crypto_req_done() will wakeup
* crypto_wait_req(); if the backend of acomp is scomp, the callback
* won't be called, crypto_wait_req() will return without blocking.
*/
+ crypto_init_wait(&acomp_ctx->wait);
+
acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
crypto_req_done, &acomp_ctx->wait);
@@ -813,12 +896,12 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
return 0;
fail:
- acomp_ctx_dealloc(acomp_ctx);
+ acomp_ctx_dealloc(acomp_ctx, pool->compr_batch_size);
return ret;
}
static bool zswap_compress(struct page *page, struct zswap_entry *entry,
- struct zswap_pool *pool)
+ struct zswap_pool *pool, bool wb_enabled)
{
struct crypto_acomp_ctx *acomp_ctx;
struct scatterlist input, output;
@@ -832,7 +915,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
mutex_lock(&acomp_ctx->mutex);
- dst = acomp_ctx->buffer;
+ dst = acomp_ctx->buffers[0];
sg_init_table(&input, 1);
sg_set_page(&input, page, PAGE_SIZE, 0);
@@ -862,8 +945,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
* to the active LRU list in the case.
*/
if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
- if (!mem_cgroup_zswap_writeback_enabled(
- folio_memcg(page_folio(page)))) {
+ if (!wb_enabled) {
comp_ret = comp_ret ? comp_ret : -EINVAL;
goto unlock;
}
@@ -909,7 +991,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
mutex_lock(&acomp_ctx->mutex);
obj = zs_obj_read_begin(pool->zs_pool, entry->handle, entry->length,
- acomp_ctx->buffer);
+ acomp_ctx->buffers[0]);
/* zswap entries of length PAGE_SIZE are not compressed. */
if (entry->length == PAGE_SIZE) {
@@ -919,15 +1001,15 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
/*
* zs_obj_read_begin() might return a kmap address of highmem when
- * acomp_ctx->buffer is not used. However, sg_init_one() does not
- * handle highmem addresses, so copy the object to acomp_ctx->buffer.
+ * acomp_ctx->buffers[0] is not used. However, sg_init_one() does not
+ * handle highmem addresses, so copy the object to acomp_ctx->buffers[0].
*/
if (virt_addr_valid(obj)) {
src = obj;
} else {
- WARN_ON_ONCE(obj == acomp_ctx->buffer);
- memcpy(acomp_ctx->buffer, obj, entry->length);
- src = acomp_ctx->buffer;
+ WARN_ON_ONCE(obj == acomp_ctx->buffers[0]);
+ memcpy(acomp_ctx->buffers[0], obj, entry->length);
+ src = acomp_ctx->buffers[0];
}
sg_init_one(&input, src, entry->length);
@@ -1381,95 +1463,136 @@ static void shrink_worker(struct work_struct *w)
* main API
**********************************/
-static bool zswap_store_page(struct page *page,
- struct obj_cgroup *objcg,
- struct zswap_pool *pool)
+/*
+ * Store multiple pages in @folio, starting from the page at index @start up to
+ * the page at index @end-1.
+ */
+static bool zswap_store_pages(struct folio *folio,
+ long start,
+ long end,
+ struct zswap_pool *pool,
+ struct crypto_acomp_ctx *acomp_ctx,
+ int nid,
+ bool wb_enabled,
+ struct obj_cgroup *objcg)
{
- swp_entry_t page_swpentry = page_swap_entry(page);
- struct zswap_entry *entry, *old;
+ struct zswap_entry *entries[ZSWAP_MAX_BATCH_SIZE];
+ u8 i, store_fail_idx = 0, nr_pages = end - start;
- /* allocate entry */
- entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
- if (!entry) {
- zswap_reject_kmemcache_fail++;
+ VM_WARN_ON_ONCE(nr_pages > ZSWAP_MAX_BATCH_SIZE);
+
+ if (unlikely(!zswap_entries_cache_alloc_batch(entries, nr_pages,
+ GFP_KERNEL, nid)))
return false;
- }
- if (!zswap_compress(page, entry, pool))
- goto compress_failed;
+ /*
+ * We co-locate entry initialization as much as possible here to
+ * minimize potential cache misses.
+ */
+ for (i = 0; i < nr_pages; ++i) {
+ entries[i]->handle = (unsigned long)ERR_PTR(-EINVAL);
+ entries[i]->pool = pool;
+ entries[i]->swpentry = page_swap_entry(folio_page(folio, start + i));
+ entries[i]->objcg = objcg;
+ entries[i]->referenced = true;
+ entries[i]->nid = nid;
+ INIT_LIST_HEAD(&entries[i]->lru);
+ }
- old = xa_store(swap_zswap_tree(page_swpentry),
- swp_offset(page_swpentry),
- entry, GFP_KERNEL);
- if (xa_is_err(old)) {
- int err = xa_err(old);
+ for (i = 0; i < nr_pages; ++i) {
+ struct page *page = folio_page(folio, start + i);
- WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
- zswap_reject_alloc_fail++;
- goto store_failed;
+ if (!zswap_compress(page, entries[i], pool, wb_enabled))
+ goto store_pages_failed;
}
- /*
- * We may have had an existing entry that became stale when
- * the folio was redirtied and now the new version is being
- * swapped out. Get rid of the old.
- */
- if (old)
- zswap_entry_free(old);
+ for (i = 0; i < nr_pages; ++i) {
+ struct zswap_entry *old, *entry = entries[i];
- /*
- * The entry is successfully compressed and stored in the tree, there is
- * no further possibility of failure. Grab refs to the pool and objcg,
- * charge zswap memory, and increment zswap_stored_pages.
- * The opposite actions will be performed by zswap_entry_free()
- * when the entry is removed from the tree.
- */
- zswap_pool_get(pool);
- if (objcg) {
- obj_cgroup_get(objcg);
- obj_cgroup_charge_zswap(objcg, entry->length);
- }
- atomic_long_inc(&zswap_stored_pages);
- if (entry->length == PAGE_SIZE)
- atomic_long_inc(&zswap_stored_incompressible_pages);
+ old = xa_store(swap_zswap_tree(entry->swpentry),
+ swp_offset(entry->swpentry),
+ entry, GFP_KERNEL);
+ if (unlikely(xa_is_err(old))) {
+ int err = xa_err(old);
- /*
- * We finish initializing the entry while it's already in xarray.
- * This is safe because:
- *
- * 1. Concurrent stores and invalidations are excluded by folio lock.
- *
- * 2. Writeback is excluded by the entry not being on the LRU yet.
- * The publishing order matters to prevent writeback from seeing
- * an incoherent entry.
- */
- entry->pool = pool;
- entry->swpentry = page_swpentry;
- entry->objcg = objcg;
- entry->referenced = true;
- if (entry->length) {
- INIT_LIST_HEAD(&entry->lru);
- zswap_lru_add(&zswap_list_lru, entry);
+ WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
+ zswap_reject_alloc_fail++;
+ /*
+ * Entries up to this point have been stored in the
+ * xarray. zswap_store() will erase them from the xarray
+ * and call zswap_entry_free(). Local cleanup in
+ * 'store_pages_failed' only needs to happen for
+ * entries from [@i to @nr_pages).
+ */
+ store_fail_idx = i;
+ goto store_pages_failed;
+ }
+
+ /*
+ * We may have had an existing entry that became stale when
+ * the folio was redirtied and now the new version is being
+ * swapped out. Get rid of the old.
+ */
+ if (unlikely(old))
+ zswap_entry_free(old);
+
+ /*
+ * The entry is successfully compressed and stored in the tree,
+ * and further failures will be cleaned up in zswap_store().
+ * Grab refs to the pool and objcg, charge zswap memory, and
+ * increment zswap_stored_pages. The opposite actions will be
+ * performed by zswap_entry_free() when the entry is removed
+ * from the tree.
+ */
+ zswap_pool_get(pool);
+ if (objcg) {
+ obj_cgroup_get(objcg);
+ obj_cgroup_charge_zswap(objcg, entry->length);
+ }
+ atomic_long_inc(&zswap_stored_pages);
+ if (entry->length == PAGE_SIZE)
+ atomic_long_inc(&zswap_stored_incompressible_pages);
+
+ /*
+ * We finish by adding the entry to the LRU while it's already
+ * in xarray. This is safe because:
+ *
+ * 1. Concurrent stores and invalidations are excluded by folio lock.
+ *
+ * 2. Writeback is excluded by the entry not being on the LRU yet.
+ * The publishing order matters to prevent writeback from seeing
+ * an incoherent entry.
+ */
+ if (likely(entry->length))
+ zswap_lru_add(&zswap_list_lru, entry);
}
return true;
-store_failed:
- zs_free(pool->zs_pool, entry->handle);
-compress_failed:
- zswap_entry_cache_free(entry);
+store_pages_failed:
+ for (i = store_fail_idx; i < nr_pages; ++i) {
+ if (!IS_ERR_VALUE(entries[i]->handle))
+ zs_free(pool->zs_pool, entries[i]->handle);
+ }
+ zswap_entries_cache_free_batch(&entries[store_fail_idx],
+ nr_pages - store_fail_idx);
+
return false;
}
bool zswap_store(struct folio *folio)
{
+ bool wb_enabled = mem_cgroup_zswap_writeback_enabled(folio_memcg(folio));
long nr_pages = folio_nr_pages(folio);
+ struct crypto_acomp_ctx *acomp_ctx;
swp_entry_t swp = folio->swap;
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg = NULL;
+ int nid = folio_nid(folio);
struct zswap_pool *pool;
+ u8 store_batch_size;
bool ret = false;
- long index;
+ long start, end;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1503,10 +1626,32 @@ bool zswap_store(struct folio *folio)
mem_cgroup_put(memcg);
}
- for (index = 0; index < nr_pages; ++index) {
- struct page *page = folio_page(folio, index);
+ /*
+ * For batching compressors, store the folio in batches of the
+ * compressor's batch_size.
+ *
+ * For non-batching compressors, store the folio in batches
+ * of ZSWAP_MAX_BATCH_SIZE, where each page in the batch is
+ * compressed sequentially. This gives better performance than
+ * invoking zswap_store_pages() per-page, due to cache locality
+ * of working set structures.
+ */
+ store_batch_size = (pool->compr_batch_size > 1) ?
+ pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
+
+ acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
- if (!zswap_store_page(page, objcg, pool))
+ for (start = 0; start < nr_pages; start += store_batch_size) {
+ end = min(start + store_batch_size, nr_pages);
+
+ if (unlikely(!zswap_store_pages(folio,
+ start,
+ end,
+ pool,
+ acomp_ctx,
+ nid,
+ wb_enabled,
+ objcg)))
goto put_pool;
}
@@ -1536,9 +1681,9 @@ bool zswap_store(struct folio *folio)
struct zswap_entry *entry;
struct xarray *tree;
- for (index = 0; index < nr_pages; ++index) {
- tree = swap_zswap_tree(swp_entry(type, offset + index));
- entry = xa_erase(tree, offset + index);
+ for (start = 0; start < nr_pages; ++start) {
+ tree = swap_zswap_tree(swp_entry(type, offset + start));
+ entry = xa_erase(tree, offset + start);
if (entry)
zswap_entry_free(entry);
}
--
2.27.0
* Re: [PATCH v14 25/26] mm: zswap: Store large folios in batches.
2026-01-25 3:35 ` [PATCH v14 25/26] mm: zswap: Store large folios in batches Kanchana P Sridhar
@ 2026-01-31 0:33 ` Nhat Pham
2026-01-31 20:22 ` Sridhar, Kanchana P
2026-02-04 16:57 ` Yosry Ahmed
1 sibling, 1 reply; 48+ messages in thread
From: Nhat Pham @ 2026-01-31 0:33 UTC (permalink / raw)
To: Kanchana P Sridhar
Cc: linux-kernel, linux-mm, hannes, yosry.ahmed, chengming.zhou,
usamaarif642, ryan.roberts, 21cnbao, ying.huang, akpm,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, kristen.c.accardi, vinicius.gomes,
giovanni.cabiddu, wajdi.k.feghali
On Sat, Jan 24, 2026 at 7:36 PM Kanchana P Sridhar
<kanchana.p.sridhar@intel.com> wrote:
>
> Support batching when storing large folios in zswap. If the underlying
> compressor supports batching (e.g. hardware parallel compression),
> allocate multiple compression buffers, otherwise allocate one. The
> number of buffers is bounded by a new constant, ZSWAP_MAX_BATCH_SIZE, to
> limit the memory overhead. For existing software compressors, the only
> extra overhead is the extra 'buffers' pointer, so 8 bytes per-CPU on
> x86_64.
>
> Only the first buffer is currently used, but subsequent changes will use
> the remaining buffers for hardware compression batching.
>
> Regardless of compression batching, always process large folios in
> batches. For hardware compressors, the batch size is the compressor
> batch size, otherwise ZSWAP_MAX_BATCH_SIZE is used.
>
> zswap_store_page() is replaced with zswap_store_pages(), which processes
> a batch of pages and allows for batching optimizations. For now, only
> optimize allocating entries by using batch allocations from the slab
> cache.
>
> Since batch allocations do not support specifying a node id, store the
> node id in the zswap entry instead of relying on the zswap_entry being
> allocated on the same node. The size of the zswap_entry remains
> unchanged as 'referenced' is lumped in with the 'length' (as it doesn't
> need a full unsigned int anyway).
>
> Avoid repeatedly calling mem_cgroup_zswap_writeback_enabled() for every
> page and only call it once for the folio, since the entire folio is
> charged to a single memcg.
>
> Suggested-by: Nhat Pham <nphamcs@gmail.com>
> Suggested-by: Yosry Ahmed <yosry.ahmed@linux.dev>
> Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
> ---
> mm/zswap.c | 351 +++++++++++++++++++++++++++++++++++++----------------
> 1 file changed, 248 insertions(+), 103 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 0d56390342b7..6a22add63220 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -82,6 +82,11 @@ static bool zswap_pool_reached_full;
>
> #define ZSWAP_PARAM_UNSET ""
>
> +/* Limit the batch size to limit per-CPU memory usage for dst buffers. */
> +#define ZSWAP_MAX_BATCH_SIZE 8U
> +#define ZSWAP_ENTRY_SPARE_4BYTES 32U
> +#define ZSWAP_ENTRY_REF_BIT 1U
> +
> static int zswap_setup(void);
>
> /* Enable/disable zswap */
> @@ -139,7 +144,7 @@ struct crypto_acomp_ctx {
> struct crypto_acomp *acomp;
> struct acomp_req *req;
> struct crypto_wait wait;
> - u8 *buffer;
> + u8 **buffers;
> struct mutex mutex;
> };
>
> @@ -148,6 +153,9 @@ struct crypto_acomp_ctx {
> * The only case where lru_lock is not acquired while holding tree.lock is
> * when a zswap_entry is taken off the lru for writeback, in that case it
> * needs to be verified that it's still valid in the tree.
> + *
> + * @compr_batch_size: The max batch size of the compression algorithm,
> + * bounded by ZSWAP_MAX_BATCH_SIZE.
> */
> struct zswap_pool {
> struct zs_pool *zs_pool;
> @@ -157,6 +165,7 @@ struct zswap_pool {
> struct work_struct release_work;
> struct hlist_node node;
> char tfm_name[CRYPTO_MAX_ALG_NAME];
> + u8 compr_batch_size;
> };
>
> /* Global LRU lists shared by all zswap pools. */
> @@ -181,6 +190,7 @@ static struct shrinker *zswap_shrinker;
> * writeback logic. The entry is only reclaimed by the writeback
> * logic if referenced is unset. See comments in the shrinker
> * section for context.
> + * nid - NUMA node id of the page for which this is the zswap entry.
> * pool - the zswap_pool the entry's data is in
> * handle - zsmalloc allocation handle that stores the compressed page data
> * objcg - the obj_cgroup that the compressed memory is charged to
> @@ -188,8 +198,11 @@ static struct shrinker *zswap_shrinker;
> */
> struct zswap_entry {
> swp_entry_t swpentry;
> - unsigned int length;
> - bool referenced;
> + struct {
> + unsigned int length:(ZSWAP_ENTRY_SPARE_4BYTES - ZSWAP_ENTRY_REF_BIT);
> + bool referenced:ZSWAP_ENTRY_REF_BIT;
Hmm, I thought Yosry confirmed that using the values directly rather than
macros (i.e., 32 and 1 instead of ZSWAP_ENTRY_SPARE_4BYTES and
ZSWAP_ENTRY_REF_BIT) was the convention? :)
https://lore.kernel.org/linux-mm/gnm6hcqlzna4p3unrad2sur7pnyovr7f2sfuiufzweu2zbfb2r@ia422moyti7v/
I was just copying zsmalloc's format ;) Either way, a fixlet
should be sufficient. No big deal...
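For concreteness, a minimal sketch of what such a fixlet could look like, assuming
the plain-number convention from the link above (the widths are identical to the
macro form; only the spelling changes, and the remaining fields stay untouched):

struct zswap_entry {
        swp_entry_t swpentry;
        struct {
                unsigned int length:31; /* 32 bits minus the referenced bit */
                bool referenced:1;
        };
        int nid;
        /* ... remaining fields unchanged ... */
};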
> + };
> + int nid;
> struct zswap_pool *pool;
> unsigned long handle;
> struct obj_cgroup *objcg;
> @@ -241,8 +254,10 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
> **********************************/
> static void __zswap_pool_empty(struct percpu_ref *ref);
>
> -static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
> +static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx, u8 nr_buffers)
> {
> + u8 i;
> +
> if (IS_ERR_OR_NULL(acomp_ctx))
> return;
>
> @@ -252,7 +267,11 @@ static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
> if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
> crypto_free_acomp(acomp_ctx->acomp);
>
> - kfree(acomp_ctx->buffer);
> + if (acomp_ctx->buffers) {
> + for (i = 0; i < nr_buffers; ++i)
> + kfree(acomp_ctx->buffers[i]);
> + kfree(acomp_ctx->buffers);
> + }
> }
>
> static struct zswap_pool *zswap_pool_create(char *compressor)
> @@ -264,6 +283,7 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
> if (!zswap_has_pool && !strcmp(compressor, ZSWAP_PARAM_UNSET))
> return NULL;
>
> + /* Many things rely on the zero-initialization. */
> pool = kzalloc(sizeof(*pool), GFP_KERNEL);
> if (!pool)
> return NULL;
> @@ -316,7 +336,9 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
>
> cpuhp_add_fail:
> for_each_possible_cpu(cpu)
> - acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> + acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
> + pool->compr_batch_size);
> +
> error:
> if (pool->acomp_ctx)
> free_percpu(pool->acomp_ctx);
> @@ -354,7 +376,8 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
> cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
>
> for_each_possible_cpu(cpu)
> - acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> + acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
> + pool->compr_batch_size);
>
> free_percpu(pool->acomp_ctx);
>
> @@ -645,14 +668,8 @@ static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry
> }
> #endif
>
> -static inline int entry_to_nid(struct zswap_entry *entry)
> -{
> - return page_to_nid(virt_to_page(entry));
> -}
> -
> static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
> {
> - int nid = entry_to_nid(entry);
> struct mem_cgroup *memcg;
>
> /*
> @@ -669,19 +686,18 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
> rcu_read_lock();
> memcg = mem_cgroup_from_entry(entry);
> /* will always succeed */
> - list_lru_add(list_lru, &entry->lru, nid, memcg);
> + list_lru_add(list_lru, &entry->lru, entry->nid, memcg);
> rcu_read_unlock();
> }
>
> static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
> {
> - int nid = entry_to_nid(entry);
> struct mem_cgroup *memcg;
>
> rcu_read_lock();
> memcg = mem_cgroup_from_entry(entry);
> /* will always succeed */
> - list_lru_del(list_lru, &entry->lru, nid, memcg);
> + list_lru_del(list_lru, &entry->lru, entry->nid, memcg);
> rcu_read_unlock();
> }
>
> @@ -741,6 +757,56 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
> kmem_cache_free(zswap_entry_cache, entry);
> }
>
> +static __always_inline void zswap_entries_cache_free_batch(
> + struct zswap_entry **entries,
> + u8 nr_entries)
> +{
> + /*
> + * It is okay to use this to free entries allocated separately
> + * by zswap_entry_cache_alloc().
> + */
> + kmem_cache_free_bulk(zswap_entry_cache, nr_entries, (void **)entries);
> +}
> +
> +static __always_inline bool zswap_entries_cache_alloc_batch(
> + struct zswap_entry **entries,
> + u8 nr_entries,
> + gfp_t gfp,
> + int nid)
> +{
> + int nr_alloc = kmem_cache_alloc_bulk(zswap_entry_cache, gfp,
> + nr_entries, (void **)entries);
> +
> + /*
> + * kmem_cache_alloc_bulk() should return @nr_entries on success
> + * and 0 on failure.
> + */
> + if (likely(nr_alloc == nr_entries))
> + return true;
> +
> + if (WARN_ON_ONCE(unlikely(nr_alloc && (nr_alloc != nr_entries)))) {
> + zswap_reject_kmemcache_fail++;
> + zswap_entries_cache_free_batch(entries, nr_alloc);
> + nr_alloc = 0;
> + }
Can partial allocation happen? I checked a couple of callers of
kmem_cache_alloc_bulk() and none of them check for the case where
nr_alloc && nr_alloc != nr_entries.
In fact, one caller (__io_alloc_req_refill() in io_uring/io_uring.c)
even explicitly documents:
ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
/*
* Bulk alloc is all-or-nothing. If we fail to get a batch,
* retry single alloc to be on the safe side.
*/
if (unlikely(ret <= 0)) {
reqs[0] = kmem_cache_alloc(req_cachep, gfp);
if (!reqs[0])
return false;
ret = 1;
}
Other callers don't even bother checking the negative case (i.e. ret <
0) - only the 0 case. I'm not terribly familiar with bulk allocation
though. Please fact-check me :)
> +
> + if (unlikely(!nr_alloc)) {
> + unsigned int i;
> +
> + for (i = 0; i < nr_entries; ++i) {
> + entries[i] = zswap_entry_cache_alloc(GFP_KERNEL, nid);
> +
> + if (unlikely(!entries[i])) {
> + zswap_reject_kmemcache_fail++;
> + zswap_entries_cache_free_batch(entries, i);
> + return false;
> + }
> + }
> + }
> +
> + return true;
> +}
> +
> /*
> * Carries out the common pattern of freeing an entry's zsmalloc allocation,
> * freeing the entry itself, and decrementing the number of stored pages.
> @@ -767,7 +833,9 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> {
> struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> + int nid = cpu_to_node(cpu);
> int ret = -ENOMEM;
> + u8 i;
>
> /*
> * To handle cases where the CPU goes through online-offline-online
> @@ -778,11 +846,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> return 0;
> }
>
> - acomp_ctx->buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
> - if (!acomp_ctx->buffer)
> - return ret;
> -
> - acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
> + acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, nid);
> if (IS_ERR_OR_NULL(acomp_ctx->acomp)) {
> pr_err("could not alloc crypto acomp %s : %ld\n",
> pool->tfm_name, PTR_ERR(acomp_ctx->acomp));
> @@ -790,20 +854,39 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> goto fail;
> }
>
> + /*
> + * Allocate up to ZSWAP_MAX_BATCH_SIZE dst buffers if the
> + * compressor supports batching.
> + */
> + pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> + crypto_acomp_batch_size(acomp_ctx->acomp));
> +
I assume this is going to be 0 for zstd?
> acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
> +
> if (IS_ERR_OR_NULL(acomp_ctx->req)) {
> pr_err("could not alloc crypto acomp_request %s\n",
> pool->tfm_name);
> goto fail;
> }
>
> - crypto_init_wait(&acomp_ctx->wait);
> + acomp_ctx->buffers = kcalloc_node(pool->compr_batch_size, sizeof(u8 *),
> + GFP_KERNEL, nid);
> + if (!acomp_ctx->buffers)
> + goto fail;
> +
> + for (i = 0; i < pool->compr_batch_size; ++i) {
> + acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE, GFP_KERNEL, nid);
> + if (!acomp_ctx->buffers[i])
> + goto fail;
> + }
>
> /*
> * if the backend of acomp is async zip, crypto_req_done() will wakeup
> * crypto_wait_req(); if the backend of acomp is scomp, the callback
> * won't be called, crypto_wait_req() will return without blocking.
> */
> + crypto_init_wait(&acomp_ctx->wait);
> +
> acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
> crypto_req_done, &acomp_ctx->wait);
>
> @@ -813,12 +896,12 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> return 0;
>
> fail:
> - acomp_ctx_dealloc(acomp_ctx);
> + acomp_ctx_dealloc(acomp_ctx, pool->compr_batch_size);
> return ret;
> }
>
> static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> - struct zswap_pool *pool)
> + struct zswap_pool *pool, bool wb_enabled)
> {
> struct crypto_acomp_ctx *acomp_ctx;
> struct scatterlist input, output;
> @@ -832,7 +915,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
> mutex_lock(&acomp_ctx->mutex);
>
> - dst = acomp_ctx->buffer;
> + dst = acomp_ctx->buffers[0];
> sg_init_table(&input, 1);
> sg_set_page(&input, page, PAGE_SIZE, 0);
>
> @@ -862,8 +945,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> * to the active LRU list in the case.
> */
> if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
> - if (!mem_cgroup_zswap_writeback_enabled(
> - folio_memcg(page_folio(page)))) {
> + if (!wb_enabled) {
> comp_ret = comp_ret ? comp_ret : -EINVAL;
> goto unlock;
> }
> @@ -909,7 +991,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
> mutex_lock(&acomp_ctx->mutex);
> obj = zs_obj_read_begin(pool->zs_pool, entry->handle, entry->length,
> - acomp_ctx->buffer);
> + acomp_ctx->buffers[0]);
>
> /* zswap entries of length PAGE_SIZE are not compressed. */
> if (entry->length == PAGE_SIZE) {
> @@ -919,15 +1001,15 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>
> /*
> * zs_obj_read_begin() might return a kmap address of highmem when
> - * acomp_ctx->buffer is not used. However, sg_init_one() does not
> - * handle highmem addresses, so copy the object to acomp_ctx->buffer.
> + * acomp_ctx->buffers[0] is not used. However, sg_init_one() does not
> + * handle highmem addresses, so copy the object to acomp_ctx->buffers[0].
> */
> if (virt_addr_valid(obj)) {
> src = obj;
> } else {
> - WARN_ON_ONCE(obj == acomp_ctx->buffer);
> - memcpy(acomp_ctx->buffer, obj, entry->length);
> - src = acomp_ctx->buffer;
> + WARN_ON_ONCE(obj == acomp_ctx->buffers[0]);
> + memcpy(acomp_ctx->buffers[0], obj, entry->length);
> + src = acomp_ctx->buffers[0];
> }
>
> sg_init_one(&input, src, entry->length);
> @@ -1381,95 +1463,136 @@ static void shrink_worker(struct work_struct *w)
> * main API
> **********************************/
>
> -static bool zswap_store_page(struct page *page,
> - struct obj_cgroup *objcg,
> - struct zswap_pool *pool)
> +/*
> + * Store multiple pages in @folio, starting from the page at index @start up to
> + * the page at index @end-1.
> + */
> +static bool zswap_store_pages(struct folio *folio,
> + long start,
> + long end,
> + struct zswap_pool *pool,
> + struct crypto_acomp_ctx *acomp_ctx,
> + int nid,
> + bool wb_enabled,
> + struct obj_cgroup *objcg)
> {
> - swp_entry_t page_swpentry = page_swap_entry(page);
> - struct zswap_entry *entry, *old;
> + struct zswap_entry *entries[ZSWAP_MAX_BATCH_SIZE];
> + u8 i, store_fail_idx = 0, nr_pages = end - start;
>
> - /* allocate entry */
> - entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
> - if (!entry) {
> - zswap_reject_kmemcache_fail++;
> + VM_WARN_ON_ONCE(nr_pages > ZSWAP_MAX_BATCH_SIZE);
> +
> + if (unlikely(!zswap_entries_cache_alloc_batch(entries, nr_pages,
> + GFP_KERNEL, nid)))
> return false;
> - }
>
> - if (!zswap_compress(page, entry, pool))
> - goto compress_failed;
> + /*
> + * We co-locate entry initialization as much as possible here to
> + * minimize potential cache misses.
> + */
> + for (i = 0; i < nr_pages; ++i) {
> + entries[i]->handle = (unsigned long)ERR_PTR(-EINVAL);
> + entries[i]->pool = pool;
> + entries[i]->swpentry = page_swap_entry(folio_page(folio, start + i));
> + entries[i]->objcg = objcg;
> + entries[i]->referenced = true;
> + entries[i]->nid = nid;
> + INIT_LIST_HEAD(&entries[i]->lru);
> + }
>
> - old = xa_store(swap_zswap_tree(page_swpentry),
> - swp_offset(page_swpentry),
> - entry, GFP_KERNEL);
> - if (xa_is_err(old)) {
> - int err = xa_err(old);
> + for (i = 0; i < nr_pages; ++i) {
> + struct page *page = folio_page(folio, start + i);
>
> - WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
> - zswap_reject_alloc_fail++;
> - goto store_failed;
> + if (!zswap_compress(page, entries[i], pool, wb_enabled))
> + goto store_pages_failed;
> }
>
> - /*
> - * We may have had an existing entry that became stale when
> - * the folio was redirtied and now the new version is being
> - * swapped out. Get rid of the old.
> - */
> - if (old)
> - zswap_entry_free(old);
> + for (i = 0; i < nr_pages; ++i) {
> + struct zswap_entry *old, *entry = entries[i];
>
> - /*
> - * The entry is successfully compressed and stored in the tree, there is
> - * no further possibility of failure. Grab refs to the pool and objcg,
> - * charge zswap memory, and increment zswap_stored_pages.
> - * The opposite actions will be performed by zswap_entry_free()
> - * when the entry is removed from the tree.
> - */
> - zswap_pool_get(pool);
> - if (objcg) {
> - obj_cgroup_get(objcg);
> - obj_cgroup_charge_zswap(objcg, entry->length);
> - }
> - atomic_long_inc(&zswap_stored_pages);
> - if (entry->length == PAGE_SIZE)
> - atomic_long_inc(&zswap_stored_incompressible_pages);
> + old = xa_store(swap_zswap_tree(entry->swpentry),
> + swp_offset(entry->swpentry),
> + entry, GFP_KERNEL);
Future follow-up: perhaps we can use advanced xarray API (xas_*) to
take the lock only once.
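Just to sketch that follow-up idea (illustrative only, not something this patch
needs): assuming every entry in the batch maps to the same swap_zswap_tree(),
and leaving out the xas_nomem() retry loop, stale-entry freeing and error
unwind, the insertion loop could take the xarray lock once along these lines:

        XA_STATE(xas, swap_zswap_tree(entries[0]->swpentry),
                 swp_offset(entries[0]->swpentry));

        xas_lock(&xas);
        for (i = 0; i < nr_pages; ++i) {
                /* Reposition the cursor for each entry in the batch. */
                xas_set(&xas, swp_offset(entries[i]->swpentry));
                xas_store(&xas, entries[i]);
                if (xas_error(&xas))
                        break;
        }
        xas_unlock(&xas);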
> + if (unlikely(xa_is_err(old))) {
> + int err = xa_err(old);
>
> - /*
> - * We finish initializing the entry while it's already in xarray.
> - * This is safe because:
> - *
> - * 1. Concurrent stores and invalidations are excluded by folio lock.
> - *
> - * 2. Writeback is excluded by the entry not being on the LRU yet.
> - * The publishing order matters to prevent writeback from seeing
> - * an incoherent entry.
> - */
> - entry->pool = pool;
> - entry->swpentry = page_swpentry;
> - entry->objcg = objcg;
> - entry->referenced = true;
> - if (entry->length) {
> - INIT_LIST_HEAD(&entry->lru);
> - zswap_lru_add(&zswap_list_lru, entry);
> + WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
> + zswap_reject_alloc_fail++;
> + /*
> + * Entries up to this point have been stored in the
> + * xarray. zswap_store() will erase them from the xarray
> + * and call zswap_entry_free(). Local cleanup in
> + * 'store_pages_failed' only needs to happen for
> + * entries from [@i to @nr_pages).
> + */
> + store_fail_idx = i;
> + goto store_pages_failed;
> + }
> +
> + /*
> + * We may have had an existing entry that became stale when
> + * the folio was redirtied and now the new version is being
> + * swapped out. Get rid of the old.
> + */
> + if (unlikely(old))
> + zswap_entry_free(old);
> +
> + /*
> + * The entry is successfully compressed and stored in the tree,
> + * and further failures will be cleaned up in zswap_store().
> + * Grab refs to the pool and objcg, charge zswap memory, and
> + * increment zswap_stored_pages. The opposite actions will be
> + * performed by zswap_entry_free() when the entry is removed
> + * from the tree.
> + */
> + zswap_pool_get(pool);
> + if (objcg) {
> + obj_cgroup_get(objcg);
> + obj_cgroup_charge_zswap(objcg, entry->length);
> + }
> + atomic_long_inc(&zswap_stored_pages);
> + if (entry->length == PAGE_SIZE)
> + atomic_long_inc(&zswap_stored_incompressible_pages);
> +
> + /*
> + * We finish by adding the entry to the LRU while it's already
> + * in xarray. This is safe because:
> + *
> + * 1. Concurrent stores and invalidations are excluded by folio lock.
> + *
> + * 2. Writeback is excluded by the entry not being on the LRU yet.
> + * The publishing order matters to prevent writeback from seeing
> + * an incoherent entry.
> + */
> + if (likely(entry->length))
> + zswap_lru_add(&zswap_list_lru, entry);
Hang on - how can entry->length == 0? This is probably a holdover from
back when zero pages were still managed in zswap?
Future follow-up work: remove this check if that's the case...
The rest looks solid to me - I'll defer to Yosry and Johannes.
* RE: [PATCH v14 25/26] mm: zswap: Store large folios in batches.
2026-01-31 0:33 ` Nhat Pham
@ 2026-01-31 20:22 ` Sridhar, Kanchana P
0 siblings, 0 replies; 48+ messages in thread
From: Sridhar, Kanchana P @ 2026-01-31 20:22 UTC (permalink / raw)
To: Nhat Pham
Cc: linux-kernel, linux-mm, hannes, yosry.ahmed, chengming.zhou,
usamaarif642, ryan.roberts, 21cnbao, ying.huang, akpm,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, Accardi, Kristen C, Gomes, Vinicius,
Cabiddu, Giovanni, Feghali, Wajdi K, Sridhar, Kanchana P
> -----Original Message-----
> From: Nhat Pham <nphamcs@gmail.com>
> Sent: Friday, January 30, 2026 4:33 PM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> hannes@cmpxchg.org; yosry.ahmed@linux.dev; chengming.zhou@linux.dev;
> usamaarif642@gmail.com; ryan.roberts@arm.com; 21cnbao@gmail.com;
> ying.huang@linux.alibaba.com; akpm@linux-foundation.org;
> senozhatsky@chromium.org; sj@kernel.org; kasong@tencent.com; linux-
> crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com; Accardi, Kristen C
> <kristen.c.accardi@intel.com>; Gomes, Vinicius <vinicius.gomes@intel.com>;
> Cabiddu, Giovanni <giovanni.cabiddu@intel.com>; Feghali, Wajdi K
> <wajdi.k.feghali@intel.com>
> Subject: Re: [PATCH v14 25/26] mm: zswap: Store large folios in batches.
>
> On Sat, Jan 24, 2026 at 7:36 PM Kanchana P Sridhar
> <kanchana.p.sridhar@intel.com> wrote:
> >
> > Support batching when storing large folios in zswap. If the underlying
> > compressor supports batching (e.g. hardware parallel compression),
> > allocate multiple compression buffers, otherwise allocate one. The
> > number of buffers is bounded by a new constant, ZSWAP_MAX_BATCH_SIZE,
> to
> > limit the memory overhead. For existing software compressors, the only
> > extra overhead is the extra 'buffers' pointer, so 8 bytes per-CPU on
> > x86_64.
> >
> > Only the first buffer is currently used, but subsequent changes will use
> > the remaining buffers for hardware compression batching.
> >
> > Regardless of compression batching, always process large folios in
> > batches. For hardware compressors, the batch size is the compressor
> > batch size, otherwise ZSWAP_MAX_BATCH_SIZE is used.
> >
> > zswap_store_page() is replaced with zswap_store_pages(), which processes
> > a batch of pages and allows for batching optimizations. For now, only
> > optimize allocating entries by using batch allocations from the slab
> > cache.
> >
> > Since batch allocations do not support specifying a node id, store the
> > node id in the zswap entry instead of relying on the zswap_entry being
> > allocated on the same node. The size of the zswap_entry remains
> > unchanged as 'referenced' is lumped in with the 'length' (as it doesn't
> > need a full unsigned int anyway).
> >
> > Avoid repeatedly calling mem_cgroup_zswap_writeback_enabled() for
> every
> > page and only call it once for the folio, since the entire folio is
> > charged to a single memcg.
> >
> > Suggested-by: Nhat Pham <nphamcs@gmail.com>
> > Suggested-by: Yosry Ahmed <yosry.ahmed@linux.dev>
> > Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
> > ---
> > mm/zswap.c | 351 +++++++++++++++++++++++++++++++++++++-----------
> -----
> > 1 file changed, 248 insertions(+), 103 deletions(-)
> >
> > diff --git a/mm/zswap.c b/mm/zswap.c
> > index 0d56390342b7..6a22add63220 100644
> > --- a/mm/zswap.c
> > +++ b/mm/zswap.c
> > @@ -82,6 +82,11 @@ static bool zswap_pool_reached_full;
> >
> > #define ZSWAP_PARAM_UNSET ""
> >
> > +/* Limit the batch size to limit per-CPU memory usage for dst buffers. */
> > +#define ZSWAP_MAX_BATCH_SIZE 8U
> > +#define ZSWAP_ENTRY_SPARE_4BYTES 32U
> > +#define ZSWAP_ENTRY_REF_BIT 1U
> > +
> > static int zswap_setup(void);
> >
> > /* Enable/disable zswap */
> > @@ -139,7 +144,7 @@ struct crypto_acomp_ctx {
> > struct crypto_acomp *acomp;
> > struct acomp_req *req;
> > struct crypto_wait wait;
> > - u8 *buffer;
> > + u8 **buffers;
> > struct mutex mutex;
> > };
> >
> > @@ -148,6 +153,9 @@ struct crypto_acomp_ctx {
> > * The only case where lru_lock is not acquired while holding tree.lock is
> > * when a zswap_entry is taken off the lru for writeback, in that case it
> > * needs to be verified that it's still valid in the tree.
> > + *
> > + * @compr_batch_size: The max batch size of the compression algorithm,
> > + * bounded by ZSWAP_MAX_BATCH_SIZE.
> > */
> > struct zswap_pool {
> > struct zs_pool *zs_pool;
> > @@ -157,6 +165,7 @@ struct zswap_pool {
> > struct work_struct release_work;
> > struct hlist_node node;
> > char tfm_name[CRYPTO_MAX_ALG_NAME];
> > + u8 compr_batch_size;
> > };
> >
> > /* Global LRU lists shared by all zswap pools. */
> > @@ -181,6 +190,7 @@ static struct shrinker *zswap_shrinker;
> > * writeback logic. The entry is only reclaimed by the writeback
> > * logic if referenced is unset. See comments in the shrinker
> > * section for context.
> > + * nid - NUMA node id of the page for which this is the zswap entry.
> > * pool - the zswap_pool the entry's data is in
> > * handle - zsmalloc allocation handle that stores the compressed page data
> > * objcg - the obj_cgroup that the compressed memory is charged to
> > @@ -188,8 +198,11 @@ static struct shrinker *zswap_shrinker;
> > */
> > struct zswap_entry {
> > swp_entry_t swpentry;
> > - unsigned int length;
> > - bool referenced;
> > + struct {
> > + unsigned int length:(ZSWAP_ENTRY_SPARE_4BYTES -
> ZSWAP_ENTRY_REF_BIT);
> > + bool referenced:ZSWAP_ENTRY_REF_BIT;
>
> Hmm I thought Yosry confirmed that using values directly rather than
> macros (i.e 32 and 1 instead of ZSWAP_ENTRY_SPARE_4BYTES and
> ZSWAP_ENTRY_REF_BIT) was the convention? :)
>
> https://lore.kernel.org/linux-
> mm/gnm6hcqlzna4p3unrad2sur7pnyovr7f2sfuiufzweu2zbfb2r@ia422moyti7v
> /
>
> I was just copying zsmalloc's format ;) Anyway, either way a fixlet
> should be sufficient. No big deal...
My apologies, Nhat and Yosry. I may have missed this. I can submit a
fixlet.
>
> > + };
> > + int nid;
> > struct zswap_pool *pool;
> > unsigned long handle;
> > struct obj_cgroup *objcg;
> > @@ -241,8 +254,10 @@ static inline struct xarray
> *swap_zswap_tree(swp_entry_t swp)
> > **********************************/
> > static void __zswap_pool_empty(struct percpu_ref *ref);
> >
> > -static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
> > +static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx, u8
> nr_buffers)
> > {
> > + u8 i;
> > +
> > if (IS_ERR_OR_NULL(acomp_ctx))
> > return;
> >
> > @@ -252,7 +267,11 @@ static void acomp_ctx_dealloc(struct
> crypto_acomp_ctx *acomp_ctx)
> > if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
> > crypto_free_acomp(acomp_ctx->acomp);
> >
> > - kfree(acomp_ctx->buffer);
> > + if (acomp_ctx->buffers) {
> > + for (i = 0; i < nr_buffers; ++i)
> > + kfree(acomp_ctx->buffers[i]);
> > + kfree(acomp_ctx->buffers);
> > + }
> > }
> >
> > static struct zswap_pool *zswap_pool_create(char *compressor)
> > @@ -264,6 +283,7 @@ static struct zswap_pool *zswap_pool_create(char
> *compressor)
> > if (!zswap_has_pool && !strcmp(compressor, ZSWAP_PARAM_UNSET))
> > return NULL;
> >
> > + /* Many things rely on the zero-initialization. */
> > pool = kzalloc(sizeof(*pool), GFP_KERNEL);
> > if (!pool)
> > return NULL;
> > @@ -316,7 +336,9 @@ static struct zswap_pool *zswap_pool_create(char
> *compressor)
> >
> > cpuhp_add_fail:
> > for_each_possible_cpu(cpu)
> > - acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> > + acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
> > + pool->compr_batch_size);
> > +
> > error:
> > if (pool->acomp_ctx)
> > free_percpu(pool->acomp_ctx);
> > @@ -354,7 +376,8 @@ static void zswap_pool_destroy(struct zswap_pool
> *pool)
> > cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
> &pool->node);
> >
> > for_each_possible_cpu(cpu)
> > - acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> > + acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
> > + pool->compr_batch_size);
> >
> > free_percpu(pool->acomp_ctx);
> >
> > @@ -645,14 +668,8 @@ static inline struct mem_cgroup
> *mem_cgroup_from_entry(struct zswap_entry *entry
> > }
> > #endif
> >
> > -static inline int entry_to_nid(struct zswap_entry *entry)
> > -{
> > - return page_to_nid(virt_to_page(entry));
> > -}
> > -
> > static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry
> *entry)
> > {
> > - int nid = entry_to_nid(entry);
> > struct mem_cgroup *memcg;
> >
> > /*
> > @@ -669,19 +686,18 @@ static void zswap_lru_add(struct list_lru *list_lru,
> struct zswap_entry *entry)
> > rcu_read_lock();
> > memcg = mem_cgroup_from_entry(entry);
> > /* will always succeed */
> > - list_lru_add(list_lru, &entry->lru, nid, memcg);
> > + list_lru_add(list_lru, &entry->lru, entry->nid, memcg);
> > rcu_read_unlock();
> > }
> >
> > static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry
> *entry)
> > {
> > - int nid = entry_to_nid(entry);
> > struct mem_cgroup *memcg;
> >
> > rcu_read_lock();
> > memcg = mem_cgroup_from_entry(entry);
> > /* will always succeed */
> > - list_lru_del(list_lru, &entry->lru, nid, memcg);
> > + list_lru_del(list_lru, &entry->lru, entry->nid, memcg);
> > rcu_read_unlock();
> > }
> >
> > @@ -741,6 +757,56 @@ static void zswap_entry_cache_free(struct
> zswap_entry *entry)
> > kmem_cache_free(zswap_entry_cache, entry);
> > }
> >
> > +static __always_inline void zswap_entries_cache_free_batch(
> > + struct zswap_entry **entries,
> > + u8 nr_entries)
> > +{
> > + /*
> > + * It is okay to use this to free entries allocated separately
> > + * by zswap_entry_cache_alloc().
> > + */
> > + kmem_cache_free_bulk(zswap_entry_cache, nr_entries, (void
> **)entries);
> > +}
> > +
> > +static __always_inline bool zswap_entries_cache_alloc_batch(
> > + struct zswap_entry **entries,
> > + u8 nr_entries,
> > + gfp_t gfp,
> > + int nid)
> > +{
> > + int nr_alloc = kmem_cache_alloc_bulk(zswap_entry_cache, gfp,
> > + nr_entries, (void **)entries);
> > +
> > + /*
> > + * kmem_cache_alloc_bulk() should return @nr_entries on success
> > + * and 0 on failure.
> > + */
> > + if (likely(nr_alloc == nr_entries))
> > + return true;
> > +
> > + if (WARN_ON_ONCE(unlikely(nr_alloc && (nr_alloc != nr_entries)))) {
> > + zswap_reject_kmemcache_fail++;
> > + zswap_entries_cache_free_batch(entries, nr_alloc);
> > + nr_alloc = 0;
> > + }
>
> Can partial allocation happen? I checked a couple callers of
> kmem_cache_alloc_bulk() and none of them check the case nr_alloc &&
> nr_alloc != nr_entries.
>
> In fact, one caller (__io_alloc_req_refill() in io_uring/io_uring.c)
> even explicitly document:
>
> ret = kmem_cache_alloc_bulk(req_cachep, gfp, ARRAY_SIZE(reqs), reqs);
>
> /*
> * Bulk alloc is all-or-nothing. If we fail to get a batch,
> * retry single alloc to be on the safe side.
> */
> if (unlikely(ret <= 0)) {
> reqs[0] = kmem_cache_alloc(req_cachep, gfp);
> if (!reqs[0])
> return false;
> ret = 1;
> }
>
> Other callsers don't even bother checking the negative case (i.e ret <
> 0) - only the 0 case. I'm not terribly familiar with bulk allocation
> though. Please fact check me :)
All great observations! Yosry and I had discussed this in [1] and agreed
to keep the WARN_ON_ONCE() mainly as future-proofing. You are absolutely
right that bulk allocation is currently all-or-nothing. There is no
possibility of a negative error return value, though.
[1]: https://patchwork.kernel.org/comment/26697103/
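For illustration, if zswap chose to rely on that all-or-nothing behavior the
way io_uring does, the helper could shrink to roughly the following (just a
sketch for discussion, reusing the helper names from this patch and dropping
the partial-allocation WARN):

static __always_inline bool zswap_entries_cache_alloc_batch(
                struct zswap_entry **entries, u8 nr_entries, gfp_t gfp, int nid)
{
        u8 i;

        /* Bulk allocation is currently all-or-nothing. */
        if (likely(kmem_cache_alloc_bulk(zswap_entry_cache, gfp, nr_entries,
                                         (void **)entries) == nr_entries))
                return true;

        /* Fall back to per-entry, node-aware allocations. */
        for (i = 0; i < nr_entries; ++i) {
                entries[i] = zswap_entry_cache_alloc(gfp, nid);
                if (unlikely(!entries[i])) {
                        zswap_reject_kmemcache_fail++;
                        zswap_entries_cache_free_batch(entries, i);
                        return false;
                }
        }

        return true;
}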
>
> > +
> > + if (unlikely(!nr_alloc)) {
> > + unsigned int i;
> > +
> > + for (i = 0; i < nr_entries; ++i) {
> > + entries[i] = zswap_entry_cache_alloc(GFP_KERNEL, nid);
> > +
> > + if (unlikely(!entries[i])) {
> > + zswap_reject_kmemcache_fail++;
> > + zswap_entries_cache_free_batch(entries, i);
> > + return false;
> > + }
> > + }
> > + }
> > +
> > + return true;
> > +}
> > +
> > /*
> > * Carries out the common pattern of freeing an entry's zsmalloc allocation,
> > * freeing the entry itself, and decrementing the number of stored pages.
> > @@ -767,7 +833,9 @@ static int zswap_cpu_comp_prepare(unsigned int
> cpu, struct hlist_node *node)
> > {
> > struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> > struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx,
> cpu);
> > + int nid = cpu_to_node(cpu);
> > int ret = -ENOMEM;
> > + u8 i;
> >
> > /*
> > * To handle cases where the CPU goes through online-offline-online
> > @@ -778,11 +846,7 @@ static int zswap_cpu_comp_prepare(unsigned int
> cpu, struct hlist_node *node)
> > return 0;
> > }
> >
> > - acomp_ctx->buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL,
> cpu_to_node(cpu));
> > - if (!acomp_ctx->buffer)
> > - return ret;
> > -
> > - acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0,
> 0, cpu_to_node(cpu));
> > + acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0,
> 0, nid);
> > if (IS_ERR_OR_NULL(acomp_ctx->acomp)) {
> > pr_err("could not alloc crypto acomp %s : %ld\n",
> > pool->tfm_name, PTR_ERR(acomp_ctx->acomp));
> > @@ -790,20 +854,39 @@ static int zswap_cpu_comp_prepare(unsigned int
> cpu, struct hlist_node *node)
> > goto fail;
> > }
> >
> > + /*
> > + * Allocate up to ZSWAP_MAX_BATCH_SIZE dst buffers if the
> > + * compressor supports batching.
> > + */
> > + pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> > + crypto_acomp_batch_size(acomp_ctx->acomp));
> > +
>
> I asssume this is going to be 0 for zstd?
This will be 1 for zstd based on patch 22 in this series.
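To spell out how the clamp and the store batch size then work out (illustrative
values only; the batching figure of 16 below is hypothetical, not IAA's actual
batch size):

        /* zstd (non-batching): crypto_acomp_batch_size() returns 1 per patch 22 */
        pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE, 1U);  /* == 1 dst buffer */
        store_batch_size = ZSWAP_MAX_BATCH_SIZE;                  /* == 8 pages */

        /* hypothetical batching compressor advertising a batch size of 16 */
        pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE, 16U); /* == 8 dst buffers */
        store_batch_size = pool->compr_batch_size;                /* == 8 pages */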
>
> > acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
> > +
> > if (IS_ERR_OR_NULL(acomp_ctx->req)) {
> > pr_err("could not alloc crypto acomp_request %s\n",
> > pool->tfm_name);
> > goto fail;
> > }
> >
> > - crypto_init_wait(&acomp_ctx->wait);
> > + acomp_ctx->buffers = kcalloc_node(pool->compr_batch_size, sizeof(u8
> *),
> > + GFP_KERNEL, nid);
> > + if (!acomp_ctx->buffers)
> > + goto fail;
> > +
> > + for (i = 0; i < pool->compr_batch_size; ++i) {
> > + acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE, GFP_KERNEL,
> nid);
> > + if (!acomp_ctx->buffers[i])
> > + goto fail;
> > + }
> >
> > /*
> > * if the backend of acomp is async zip, crypto_req_done() will wakeup
> > * crypto_wait_req(); if the backend of acomp is scomp, the callback
> > * won't be called, crypto_wait_req() will return without blocking.
> > */
> > + crypto_init_wait(&acomp_ctx->wait);
> > +
> > acomp_request_set_callback(acomp_ctx->req,
> CRYPTO_TFM_REQ_MAY_BACKLOG,
> > crypto_req_done, &acomp_ctx->wait);
> >
> > @@ -813,12 +896,12 @@ static int zswap_cpu_comp_prepare(unsigned int
> cpu, struct hlist_node *node)
> > return 0;
> >
> > fail:
> > - acomp_ctx_dealloc(acomp_ctx);
> > + acomp_ctx_dealloc(acomp_ctx, pool->compr_batch_size);
> > return ret;
> > }
> >
> > static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> > - struct zswap_pool *pool)
> > + struct zswap_pool *pool, bool wb_enabled)
> > {
> > struct crypto_acomp_ctx *acomp_ctx;
> > struct scatterlist input, output;
> > @@ -832,7 +915,7 @@ static bool zswap_compress(struct page *page,
> struct zswap_entry *entry,
> > acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
> > mutex_lock(&acomp_ctx->mutex);
> >
> > - dst = acomp_ctx->buffer;
> > + dst = acomp_ctx->buffers[0];
> > sg_init_table(&input, 1);
> > sg_set_page(&input, page, PAGE_SIZE, 0);
> >
> > @@ -862,8 +945,7 @@ static bool zswap_compress(struct page *page,
> struct zswap_entry *entry,
> > * to the active LRU list in the case.
> > */
> > if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
> > - if (!mem_cgroup_zswap_writeback_enabled(
> > - folio_memcg(page_folio(page)))) {
> > + if (!wb_enabled) {
> > comp_ret = comp_ret ? comp_ret : -EINVAL;
> > goto unlock;
> > }
> > @@ -909,7 +991,7 @@ static bool zswap_decompress(struct zswap_entry
> *entry, struct folio *folio)
> > acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
> > mutex_lock(&acomp_ctx->mutex);
> > obj = zs_obj_read_begin(pool->zs_pool, entry->handle, entry->length,
> > - acomp_ctx->buffer);
> > + acomp_ctx->buffers[0]);
> >
> > /* zswap entries of length PAGE_SIZE are not compressed. */
> > if (entry->length == PAGE_SIZE) {
> > @@ -919,15 +1001,15 @@ static bool zswap_decompress(struct
> zswap_entry *entry, struct folio *folio)
> >
> > /*
> > * zs_obj_read_begin() might return a kmap address of highmem when
> > - * acomp_ctx->buffer is not used. However, sg_init_one() does not
> > - * handle highmem addresses, so copy the object to acomp_ctx-
> >buffer.
> > + * acomp_ctx->buffers[0] is not used. However, sg_init_one() does not
> > + * handle highmem addresses, so copy the object to acomp_ctx-
> >buffers[0].
> > */
> > if (virt_addr_valid(obj)) {
> > src = obj;
> > } else {
> > - WARN_ON_ONCE(obj == acomp_ctx->buffer);
> > - memcpy(acomp_ctx->buffer, obj, entry->length);
> > - src = acomp_ctx->buffer;
> > + WARN_ON_ONCE(obj == acomp_ctx->buffers[0]);
> > + memcpy(acomp_ctx->buffers[0], obj, entry->length);
> > + src = acomp_ctx->buffers[0];
> > }
> >
> > sg_init_one(&input, src, entry->length);
> > @@ -1381,95 +1463,136 @@ static void shrink_worker(struct work_struct
> *w)
> > * main API
> > **********************************/
> >
> > -static bool zswap_store_page(struct page *page,
> > - struct obj_cgroup *objcg,
> > - struct zswap_pool *pool)
> > +/*
> > + * Store multiple pages in @folio, starting from the page at index @start up
> to
> > + * the page at index @end-1.
> > + */
> > +static bool zswap_store_pages(struct folio *folio,
> > + long start,
> > + long end,
> > + struct zswap_pool *pool,
> > + struct crypto_acomp_ctx *acomp_ctx,
> > + int nid,
> > + bool wb_enabled,
> > + struct obj_cgroup *objcg)
> > {
> > - swp_entry_t page_swpentry = page_swap_entry(page);
> > - struct zswap_entry *entry, *old;
> > + struct zswap_entry *entries[ZSWAP_MAX_BATCH_SIZE];
> > + u8 i, store_fail_idx = 0, nr_pages = end - start;
> >
> > - /* allocate entry */
> > - entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
> > - if (!entry) {
> > - zswap_reject_kmemcache_fail++;
> > + VM_WARN_ON_ONCE(nr_pages > ZSWAP_MAX_BATCH_SIZE);
> > +
> > + if (unlikely(!zswap_entries_cache_alloc_batch(entries, nr_pages,
> > + GFP_KERNEL, nid)))
> > return false;
> > - }
> >
> > - if (!zswap_compress(page, entry, pool))
> > - goto compress_failed;
> > + /*
> > + * We co-locate entry initialization as much as possible here to
> > + * minimize potential cache misses.
> > + */
> > + for (i = 0; i < nr_pages; ++i) {
> > + entries[i]->handle = (unsigned long)ERR_PTR(-EINVAL);
> > + entries[i]->pool = pool;
> > + entries[i]->swpentry = page_swap_entry(folio_page(folio, start + i));
> > + entries[i]->objcg = objcg;
> > + entries[i]->referenced = true;
> > + entries[i]->nid = nid;
> > + INIT_LIST_HEAD(&entries[i]->lru);
> > + }
> >
> > - old = xa_store(swap_zswap_tree(page_swpentry),
> > - swp_offset(page_swpentry),
> > - entry, GFP_KERNEL);
> > - if (xa_is_err(old)) {
> > - int err = xa_err(old);
> > + for (i = 0; i < nr_pages; ++i) {
> > + struct page *page = folio_page(folio, start + i);
> >
> > - WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
> > - zswap_reject_alloc_fail++;
> > - goto store_failed;
> > + if (!zswap_compress(page, entries[i], pool, wb_enabled))
> > + goto store_pages_failed;
> > }
> >
> > - /*
> > - * We may have had an existing entry that became stale when
> > - * the folio was redirtied and now the new version is being
> > - * swapped out. Get rid of the old.
> > - */
> > - if (old)
> > - zswap_entry_free(old);
> > + for (i = 0; i < nr_pages; ++i) {
> > + struct zswap_entry *old, *entry = entries[i];
> >
> > - /*
> > - * The entry is successfully compressed and stored in the tree, there is
> > - * no further possibility of failure. Grab refs to the pool and objcg,
> > - * charge zswap memory, and increment zswap_stored_pages.
> > - * The opposite actions will be performed by zswap_entry_free()
> > - * when the entry is removed from the tree.
> > - */
> > - zswap_pool_get(pool);
> > - if (objcg) {
> > - obj_cgroup_get(objcg);
> > - obj_cgroup_charge_zswap(objcg, entry->length);
> > - }
> > - atomic_long_inc(&zswap_stored_pages);
> > - if (entry->length == PAGE_SIZE)
> > - atomic_long_inc(&zswap_stored_incompressible_pages);
> > + old = xa_store(swap_zswap_tree(entry->swpentry),
> > + swp_offset(entry->swpentry),
> > + entry, GFP_KERNEL);
>
> Future follow-up: perhaps we can use advanced xarray API (xas_*) to
> take the lock only once.
Sure, thanks for the suggestion; we should definitely look into this
optimization.
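A rough, untested sketch of what that might look like (assuming, for
illustration only, that the whole batch maps to a single zswap tree, and
eliding stale-entry cleanup and error handling):

        pgoff_t offset = swp_offset(entries[0]->swpentry);
        XA_STATE(xas, swap_zswap_tree(entries[0]->swpentry), offset);
        u8 i;

        do {
                xas_lock(&xas);
                xas_set(&xas, offset);
                for (i = 0; i < nr_pages; i++) {
                        /* xas_store() returns any old (stale) entry. */
                        xas_store(&xas, entries[i]);
                        xas_next(&xas);
                }
                xas_unlock(&xas);
        } while (xas_nomem(&xas, GFP_KERNEL));

That would take the tree lock once per batch instead of once per page.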
>
> > + if (unlikely(xa_is_err(old))) {
> > + int err = xa_err(old);
> >
> > - /*
> > - * We finish initializing the entry while it's already in xarray.
> > - * This is safe because:
> > - *
> > - * 1. Concurrent stores and invalidations are excluded by folio lock.
> > - *
> > - * 2. Writeback is excluded by the entry not being on the LRU yet.
> > - * The publishing order matters to prevent writeback from seeing
> > - * an incoherent entry.
> > - */
> > - entry->pool = pool;
> > - entry->swpentry = page_swpentry;
> > - entry->objcg = objcg;
> > - entry->referenced = true;
> > - if (entry->length) {
> > - INIT_LIST_HEAD(&entry->lru);
> > - zswap_lru_add(&zswap_list_lru, entry);
> > + WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
> > + zswap_reject_alloc_fail++;
> > + /*
> > + * Entries up to this point have been stored in the
> > + * xarray. zswap_store() will erase them from the xarray
> > + * and call zswap_entry_free(). Local cleanup in
> > + * 'store_pages_failed' only needs to happen for
> > + * entries from [@i to @nr_pages).
> > + */
> > + store_fail_idx = i;
> > + goto store_pages_failed;
> > + }
> > +
> > + /*
> > + * We may have had an existing entry that became stale when
> > + * the folio was redirtied and now the new version is being
> > + * swapped out. Get rid of the old.
> > + */
> > + if (unlikely(old))
> > + zswap_entry_free(old);
> > +
> > + /*
> > + * The entry is successfully compressed and stored in the tree,
> > + * and further failures will be cleaned up in zswap_store().
> > + * Grab refs to the pool and objcg, charge zswap memory, and
> > + * increment zswap_stored_pages. The opposite actions will be
> > + * performed by zswap_entry_free() when the entry is removed
> > + * from the tree.
> > + */
> > + zswap_pool_get(pool);
> > + if (objcg) {
> > + obj_cgroup_get(objcg);
> > + obj_cgroup_charge_zswap(objcg, entry->length);
> > + }
> > + atomic_long_inc(&zswap_stored_pages);
> > + if (entry->length == PAGE_SIZE)
> > + atomic_long_inc(&zswap_stored_incompressible_pages);
> > +
> > + /*
> > + * We finish by adding the entry to the LRU while it's already
> > + * in xarray. This is safe because:
> > + *
> > + * 1. Concurrent stores and invalidations are excluded by folio lock.
> > + *
> > + * 2. Writeback is excluded by the entry not being on the LRU yet.
> > + * The publishing order matters to prevent writeback from seeing
> > + * an incoherent entry.
> > + */
> > + if (likely(entry->length))
> > + zswap_lru_add(&zswap_list_lru, entry);
>
> Hang on - how can entry->length == 0? This is probably holdover from
> back when zero pages are still managed in zswap?
I think so too, and figured not to change this, in case it was kept as a
paranoia-check.
>
> Future follow-up work: remove this check if that's the case...
I agree.
>
> The rest looks solid to me - I'll defer to Yosry and Johannes.
Thanks Nhat! I would appreciate review by Yosry and Johannes.
Thanks,
Kanchana
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH v14 25/26] mm: zswap: Store large folios in batches.
2026-01-25 3:35 ` [PATCH v14 25/26] mm: zswap: Store large folios in batches Kanchana P Sridhar
2026-01-31 0:33 ` Nhat Pham
@ 2026-02-04 16:57 ` Yosry Ahmed
1 sibling, 0 replies; 48+ messages in thread
From: Yosry Ahmed @ 2026-02-04 16:57 UTC (permalink / raw)
To: Kanchana P Sridhar
Cc: linux-kernel, linux-mm, hannes, nphamcs, chengming.zhou,
usamaarif642, ryan.roberts, 21cnbao, ying.huang, akpm,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, kristen.c.accardi, vinicius.gomes,
giovanni.cabiddu, wajdi.k.feghali
On Sat, Jan 24, 2026 at 07:35:36PM -0800, Kanchana P Sridhar wrote:
> Support batching when storing large folios in zswap. If the underlying
> compressor supports batching (e.g. hardware parallel compression),
> allocate multiple compression buffers, otherwise allocate one. The
> number of buffers is bounded by a new constant, ZSWAP_MAX_BATCH_SIZE, to
> limit the memory overhead. For existing software compressors, the only
> extra overhead is the extra 'buffers' pointer, so 8 bytes per-CPU on
> x86_64.
>
> Only the first buffer is currently used, but subsequent changes will use
> the remaining buffers for hardware compression batching.
>
> Regardless of compression batching, always process large folios in
> batches. For hardware compressors, the batch size is the compressor
> batch size, otherwise ZSWAP_MAX_BATCH_SIZE is used.
>
> zswap_store_page() is replaced with zswap_store_pages(), which processes
> a batch of pages and allows for batching optimizations. For now, only
> optimize allocating entries by using batch allocations from the slab
> cache.
>
> Since batch allocations do not support specifying a node id, store the
> node id in the zswap entry instead of relying on the zswap_entry being
> allocated on the same node. The size of the zswap_entry remains
> unchanged as 'referenced' is lumped in with the 'length' (as it doesn't
> need a full unsigned int anyway).
>
> Avoid repeatedly calling mem_cgroup_zswap_writeback_enabled() for every
> page and only call it once for the folio, since the entire folio is
> charged to a single memcg.
>
> Suggested-by: Nhat Pham <nphamcs@gmail.com>
> Suggested-by: Yosry Ahmed <yosry.ahmed@linux.dev>
> Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Generally LGTM with a few comments below. I suspect you'll need to
update this patch anyway due to conflicts.
> ---
> mm/zswap.c | 351 +++++++++++++++++++++++++++++++++++++----------------
> 1 file changed, 248 insertions(+), 103 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 0d56390342b7..6a22add63220 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -82,6 +82,11 @@ static bool zswap_pool_reached_full;
>
> #define ZSWAP_PARAM_UNSET ""
>
> +/* Limit the batch size to limit per-CPU memory usage for dst buffers. */
> +#define ZSWAP_MAX_BATCH_SIZE 8U
> +#define ZSWAP_ENTRY_SPARE_4BYTES 32U
> +#define ZSWAP_ENTRY_REF_BIT 1U
> +
> static int zswap_setup(void);
>
> /* Enable/disable zswap */
> @@ -139,7 +144,7 @@ struct crypto_acomp_ctx {
> struct crypto_acomp *acomp;
> struct acomp_req *req;
> struct crypto_wait wait;
> - u8 *buffer;
> + u8 **buffers;
> struct mutex mutex;
> };
>
> @@ -148,6 +153,9 @@ struct crypto_acomp_ctx {
> * The only case where lru_lock is not acquired while holding tree.lock is
> * when a zswap_entry is taken off the lru for writeback, in that case it
> * needs to be verified that it's still valid in the tree.
> + *
> + * @compr_batch_size: The max batch size of the compression algorithm,
> + * bounded by ZSWAP_MAX_BATCH_SIZE.
> */
> struct zswap_pool {
> struct zs_pool *zs_pool;
> @@ -157,6 +165,7 @@ struct zswap_pool {
> struct work_struct release_work;
> struct hlist_node node;
> char tfm_name[CRYPTO_MAX_ALG_NAME];
> + u8 compr_batch_size;
> };
>
> /* Global LRU lists shared by all zswap pools. */
> @@ -181,6 +190,7 @@ static struct shrinker *zswap_shrinker;
> * writeback logic. The entry is only reclaimed by the writeback
> * logic if referenced is unset. See comments in the shrinker
> * section for context.
> + * nid - NUMA node id of the page for which this is the zswap entry.
> * pool - the zswap_pool the entry's data is in
> * handle - zsmalloc allocation handle that stores the compressed page data
> * objcg - the obj_cgroup that the compressed memory is charged to
> @@ -188,8 +198,11 @@ static struct shrinker *zswap_shrinker;
> */
> struct zswap_entry {
> swp_entry_t swpentry;
> - unsigned int length;
> - bool referenced;
> + struct {
> + unsigned int length:(ZSWAP_ENTRY_SPARE_4BYTES - ZSWAP_ENTRY_REF_BIT);
> + bool referenced:ZSWAP_ENTRY_REF_BIT;
> + };
As Nhat mentioned, please avoid the macros here. Also, if there's a new
version (which I suspect there will be due to conflicts), please move
adding the nid to the entry (instead of allocating it on the correct
node) to a separate patch.
> + int nid;
> struct zswap_pool *pool;
> unsigned long handle;
> struct obj_cgroup *objcg;
> @@ -241,8 +254,10 @@ static inline struct xarray *swap_zswap_tree(swp_entry_t swp)
> **********************************/
> static void __zswap_pool_empty(struct percpu_ref *ref);
>
> -static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
> +static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx, u8 nr_buffers)
> {
> + u8 i;
> +
> if (IS_ERR_OR_NULL(acomp_ctx))
> return;
>
> @@ -252,7 +267,11 @@ static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx)
> if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
> crypto_free_acomp(acomp_ctx->acomp);
>
> - kfree(acomp_ctx->buffer);
> + if (acomp_ctx->buffers) {
> + for (i = 0; i < nr_buffers; ++i)
> + kfree(acomp_ctx->buffers[i]);
> + kfree(acomp_ctx->buffers);
> + }
> }
>
> static struct zswap_pool *zswap_pool_create(char *compressor)
> @@ -264,6 +283,7 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
> if (!zswap_has_pool && !strcmp(compressor, ZSWAP_PARAM_UNSET))
> return NULL;
>
> + /* Many things rely on the zero-initialization. */
> pool = kzalloc(sizeof(*pool), GFP_KERNEL);
> if (!pool)
> return NULL;
> @@ -316,7 +336,9 @@ static struct zswap_pool *zswap_pool_create(char *compressor)
>
> cpuhp_add_fail:
> for_each_possible_cpu(cpu)
> - acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> + acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
> + pool->compr_batch_size);
> +
> error:
> if (pool->acomp_ctx)
> free_percpu(pool->acomp_ctx);
> @@ -354,7 +376,8 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
> cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
>
> for_each_possible_cpu(cpu)
> - acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu));
> + acomp_ctx_dealloc(per_cpu_ptr(pool->acomp_ctx, cpu),
> + pool->compr_batch_size);
>
> free_percpu(pool->acomp_ctx);
>
> @@ -645,14 +668,8 @@ static inline struct mem_cgroup *mem_cgroup_from_entry(struct zswap_entry *entry
> }
> #endif
>
> -static inline int entry_to_nid(struct zswap_entry *entry)
> -{
> - return page_to_nid(virt_to_page(entry));
> -}
> -
> static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
> {
> - int nid = entry_to_nid(entry);
> struct mem_cgroup *memcg;
>
> /*
> @@ -669,19 +686,18 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
> rcu_read_lock();
> memcg = mem_cgroup_from_entry(entry);
> /* will always succeed */
> - list_lru_add(list_lru, &entry->lru, nid, memcg);
> + list_lru_add(list_lru, &entry->lru, entry->nid, memcg);
> rcu_read_unlock();
> }
>
> static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
> {
> - int nid = entry_to_nid(entry);
> struct mem_cgroup *memcg;
>
> rcu_read_lock();
> memcg = mem_cgroup_from_entry(entry);
> /* will always succeed */
> - list_lru_del(list_lru, &entry->lru, nid, memcg);
> + list_lru_del(list_lru, &entry->lru, entry->nid, memcg);
> rcu_read_unlock();
> }
>
> @@ -741,6 +757,56 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
> kmem_cache_free(zswap_entry_cache, entry);
> }
>
> +static __always_inline void zswap_entries_cache_free_batch(
> + struct zswap_entry **entries,
> + u8 nr_entries)
> +{
> + /*
> + * It is okay to use this to free entries allocated separately
> + * by zswap_entry_cache_alloc().
> + */
> + kmem_cache_free_bulk(zswap_entry_cache, nr_entries, (void **)entries);
> +}
> +
> +static __always_inline bool zswap_entries_cache_alloc_batch(
> + struct zswap_entry **entries,
> + u8 nr_entries,
> + gfp_t gfp,
> + int nid)
> +{
> + int nr_alloc = kmem_cache_alloc_bulk(zswap_entry_cache, gfp,
> + nr_entries, (void **)entries);
> +
> + /*
> + * kmem_cache_alloc_bulk() should return @nr_entries on success
> + * and 0 on failure.
> + */
> + if (likely(nr_alloc == nr_entries))
> + return true;
> +
> + if (WARN_ON_ONCE(unlikely(nr_alloc && (nr_alloc != nr_entries)))) {
We don't need to check 'nr_alloc != nr_entries' here, as we just checked
the opposite of it and returned above.
> + zswap_reject_kmemcache_fail++;
I don't think we want to increment the counter if batch allocation
fails, only if the fallback fails below. Not a big deal as this should
never happen though.
> + zswap_entries_cache_free_batch(entries, nr_alloc);
> + nr_alloc = 0;
> + }
> +
> + if (unlikely(!nr_alloc)) {
We don't need to check this here as well, nr_alloc should always be 0,
right? If anything, it's *very* likely() :)
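Something like the following (rough, untested sketch) would avoid both
checks, relying on the all-or-nothing return of kmem_cache_alloc_bulk()
noted in the comment above:

        unsigned int i;

        if (likely(kmem_cache_alloc_bulk(zswap_entry_cache, gfp, nr_entries,
                                         (void **)entries) == nr_entries))
                return true;

        /* Bulk alloc returned 0; fall back to allocating entries one by one. */
        for (i = 0; i < nr_entries; ++i) {
                entries[i] = zswap_entry_cache_alloc(GFP_KERNEL, nid);
                if (unlikely(!entries[i])) {
                        zswap_reject_kmemcache_fail++;
                        zswap_entries_cache_free_batch(entries, i);
                        return false;
                }
        }
        return true;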
> + unsigned int i;
> +
> + for (i = 0; i < nr_entries; ++i) {
> + entries[i] = zswap_entry_cache_alloc(GFP_KERNEL, nid);
> +
> + if (unlikely(!entries[i])) {
> + zswap_reject_kmemcache_fail++;
> + zswap_entries_cache_free_batch(entries, i);
> + return false;
> + }
> + }
> + }
> +
> + return true;
> +}
> +
> /*
> * Carries out the common pattern of freeing an entry's zsmalloc allocation,
> * freeing the entry itself, and decrementing the number of stored pages.
> @@ -767,7 +833,9 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> {
> struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> + int nid = cpu_to_node(cpu);
> int ret = -ENOMEM;
> + u8 i;
>
> /*
> * To handle cases where the CPU goes through online-offline-online
> @@ -778,11 +846,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> return 0;
> }
>
> - acomp_ctx->buffer = kmalloc_node(PAGE_SIZE, GFP_KERNEL, cpu_to_node(cpu));
> - if (!acomp_ctx->buffer)
> - return ret;
> -
> - acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
> + acomp_ctx->acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, nid);
> if (IS_ERR_OR_NULL(acomp_ctx->acomp)) {
> pr_err("could not alloc crypto acomp %s : %ld\n",
> pool->tfm_name, PTR_ERR(acomp_ctx->acomp));
> @@ -790,20 +854,39 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> goto fail;
> }
>
> + /*
> + * Allocate up to ZSWAP_MAX_BATCH_SIZE dst buffers if the
> + * compressor supports batching.
> + */
> + pool->compr_batch_size = min(ZSWAP_MAX_BATCH_SIZE,
> + crypto_acomp_batch_size(acomp_ctx->acomp));
> +
> acomp_ctx->req = acomp_request_alloc(acomp_ctx->acomp);
> +
> if (IS_ERR_OR_NULL(acomp_ctx->req)) {
> pr_err("could not alloc crypto acomp_request %s\n",
> pool->tfm_name);
> goto fail;
> }
>
> - crypto_init_wait(&acomp_ctx->wait);
> + acomp_ctx->buffers = kcalloc_node(pool->compr_batch_size, sizeof(u8 *),
> + GFP_KERNEL, nid);
> + if (!acomp_ctx->buffers)
> + goto fail;
> +
> + for (i = 0; i < pool->compr_batch_size; ++i) {
> + acomp_ctx->buffers[i] = kmalloc_node(PAGE_SIZE, GFP_KERNEL, nid);
> + if (!acomp_ctx->buffers[i])
> + goto fail;
> + }
>
> /*
> * if the backend of acomp is async zip, crypto_req_done() will wakeup
> * crypto_wait_req(); if the backend of acomp is scomp, the callback
> * won't be called, crypto_wait_req() will return without blocking.
> */
> + crypto_init_wait(&acomp_ctx->wait);
> +
> acomp_request_set_callback(acomp_ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
> crypto_req_done, &acomp_ctx->wait);
>
> @@ -813,12 +896,12 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> return 0;
>
> fail:
> - acomp_ctx_dealloc(acomp_ctx);
> + acomp_ctx_dealloc(acomp_ctx, pool->compr_batch_size);
> return ret;
> }
>
> static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> - struct zswap_pool *pool)
> + struct zswap_pool *pool, bool wb_enabled)
> {
> struct crypto_acomp_ctx *acomp_ctx;
> struct scatterlist input, output;
> @@ -832,7 +915,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
> mutex_lock(&acomp_ctx->mutex);
>
> - dst = acomp_ctx->buffer;
> + dst = acomp_ctx->buffers[0];
> sg_init_table(&input, 1);
> sg_set_page(&input, page, PAGE_SIZE, 0);
>
> @@ -862,8 +945,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> * to the active LRU list in the case.
> */
> if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
> - if (!mem_cgroup_zswap_writeback_enabled(
> - folio_memcg(page_folio(page)))) {
> + if (!wb_enabled) {
> comp_ret = comp_ret ? comp_ret : -EINVAL;
> goto unlock;
> }
> @@ -909,7 +991,7 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
> mutex_lock(&acomp_ctx->mutex);
> obj = zs_obj_read_begin(pool->zs_pool, entry->handle, entry->length,
> - acomp_ctx->buffer);
> + acomp_ctx->buffers[0]);
>
> /* zswap entries of length PAGE_SIZE are not compressed. */
> if (entry->length == PAGE_SIZE) {
> @@ -919,15 +1001,15 @@ static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
>
> /*
> * zs_obj_read_begin() might return a kmap address of highmem when
> - * acomp_ctx->buffer is not used. However, sg_init_one() does not
> - * handle highmem addresses, so copy the object to acomp_ctx->buffer.
> + * acomp_ctx->buffers[0] is not used. However, sg_init_one() does not
> + * handle highmem addresses, so copy the object to acomp_ctx->buffers[0].
> */
> if (virt_addr_valid(obj)) {
> src = obj;
> } else {
> - WARN_ON_ONCE(obj == acomp_ctx->buffer);
> - memcpy(acomp_ctx->buffer, obj, entry->length);
> - src = acomp_ctx->buffer;
> + WARN_ON_ONCE(obj == acomp_ctx->buffers[0]);
> + memcpy(acomp_ctx->buffers[0], obj, entry->length);
> + src = acomp_ctx->buffers[0];
> }
This code no longer exists (the conflict I referred to earlier).
> sg_init_one(&input, src, entry->length);
> @@ -1381,95 +1463,136 @@ static void shrink_worker(struct work_struct *w)
> * main API
> **********************************/
>
> -static bool zswap_store_page(struct page *page,
> - struct obj_cgroup *objcg,
> - struct zswap_pool *pool)
> +/*
> + * Store multiple pages in @folio, starting from the page at index @start up to
> + * the page at index @end-1.
> + */
> +static bool zswap_store_pages(struct folio *folio,
> + long start,
> + long end,
> + struct zswap_pool *pool,
> + struct crypto_acomp_ctx *acomp_ctx,
I don't see where acomp_ctx is used here?
> + int nid,
> + bool wb_enabled,
> + struct obj_cgroup *objcg)
> {
> - swp_entry_t page_swpentry = page_swap_entry(page);
> - struct zswap_entry *entry, *old;
> + struct zswap_entry *entries[ZSWAP_MAX_BATCH_SIZE];
> + u8 i, store_fail_idx = 0, nr_pages = end - start;
>
> - /* allocate entry */
> - entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page));
> - if (!entry) {
> - zswap_reject_kmemcache_fail++;
> + VM_WARN_ON_ONCE(nr_pages > ZSWAP_MAX_BATCH_SIZE);
> +
> + if (unlikely(!zswap_entries_cache_alloc_batch(entries, nr_pages,
> + GFP_KERNEL, nid)))
> return false;
> - }
>
> - if (!zswap_compress(page, entry, pool))
> - goto compress_failed;
> + /*
> + * We co-locate entry initialization as much as possible here to
> + * minimize potential cache misses.
> + */
> + for (i = 0; i < nr_pages; ++i) {
> + entries[i]->handle = (unsigned long)ERR_PTR(-EINVAL);
> + entries[i]->pool = pool;
> + entries[i]->swpentry = page_swap_entry(folio_page(folio, start + i));
> + entries[i]->objcg = objcg;
> + entries[i]->referenced = true;
> + entries[i]->nid = nid;
> + INIT_LIST_HEAD(&entries[i]->lru);
> + }
>
> - old = xa_store(swap_zswap_tree(page_swpentry),
> - swp_offset(page_swpentry),
> - entry, GFP_KERNEL);
> - if (xa_is_err(old)) {
> - int err = xa_err(old);
> + for (i = 0; i < nr_pages; ++i) {
> + struct page *page = folio_page(folio, start + i);
>
> - WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
> - zswap_reject_alloc_fail++;
> - goto store_failed;
> + if (!zswap_compress(page, entries[i], pool, wb_enabled))
> + goto store_pages_failed;
> }
>
> - /*
> - * We may have had an existing entry that became stale when
> - * the folio was redirtied and now the new version is being
> - * swapped out. Get rid of the old.
> - */
> - if (old)
> - zswap_entry_free(old);
> + for (i = 0; i < nr_pages; ++i) {
> + struct zswap_entry *old, *entry = entries[i];
>
> - /*
> - * The entry is successfully compressed and stored in the tree, there is
> - * no further possibility of failure. Grab refs to the pool and objcg,
> - * charge zswap memory, and increment zswap_stored_pages.
> - * The opposite actions will be performed by zswap_entry_free()
> - * when the entry is removed from the tree.
> - */
> - zswap_pool_get(pool);
> - if (objcg) {
> - obj_cgroup_get(objcg);
> - obj_cgroup_charge_zswap(objcg, entry->length);
> - }
> - atomic_long_inc(&zswap_stored_pages);
> - if (entry->length == PAGE_SIZE)
> - atomic_long_inc(&zswap_stored_incompressible_pages);
> + old = xa_store(swap_zswap_tree(entry->swpentry),
> + swp_offset(entry->swpentry),
> + entry, GFP_KERNEL);
> + if (unlikely(xa_is_err(old))) {
> + int err = xa_err(old);
>
> - /*
> - * We finish initializing the entry while it's already in xarray.
> - * This is safe because:
> - *
> - * 1. Concurrent stores and invalidations are excluded by folio lock.
> - *
> - * 2. Writeback is excluded by the entry not being on the LRU yet.
> - * The publishing order matters to prevent writeback from seeing
> - * an incoherent entry.
> - */
> - entry->pool = pool;
> - entry->swpentry = page_swpentry;
> - entry->objcg = objcg;
> - entry->referenced = true;
> - if (entry->length) {
> - INIT_LIST_HEAD(&entry->lru);
> - zswap_lru_add(&zswap_list_lru, entry);
> + WARN_ONCE(err != -ENOMEM, "unexpected xarray error: %d\n", err);
> + zswap_reject_alloc_fail++;
> + /*
> + * Entries up to this point have been stored in the
> + * xarray. zswap_store() will erase them from the xarray
> + * and call zswap_entry_free(). Local cleanup in
> + * 'store_pages_failed' only needs to happen for
> + * entries from [@i to @nr_pages).
> + */
> + store_fail_idx = i;
> + goto store_pages_failed;
> + }
> +
> + /*
> + * We may have had an existing entry that became stale when
> + * the folio was redirtied and now the new version is being
> + * swapped out. Get rid of the old.
> + */
> + if (unlikely(old))
> + zswap_entry_free(old);
> +
> + /*
> + * The entry is successfully compressed and stored in the tree,
> + * and further failures will be cleaned up in zswap_store().
> + * Grab refs to the pool and objcg, charge zswap memory, and
> + * increment zswap_stored_pages. The opposite actions will be
> + * performed by zswap_entry_free() when the entry is removed
> + * from the tree.
> + */
> + zswap_pool_get(pool);
> + if (objcg) {
> + obj_cgroup_get(objcg);
> + obj_cgroup_charge_zswap(objcg, entry->length);
> + }
> + atomic_long_inc(&zswap_stored_pages);
> + if (entry->length == PAGE_SIZE)
> + atomic_long_inc(&zswap_stored_incompressible_pages);
> +
> + /*
> + * We finish by adding the entry to the LRU while it's already
> + * in xarray. This is safe because:
> + *
> + * 1. Concurrent stores and invalidations are excluded by folio lock.
> + *
> + * 2. Writeback is excluded by the entry not being on the LRU yet.
> + * The publishing order matters to prevent writeback from seeing
> + * an incoherent entry.
> + */
> + if (likely(entry->length))
> + zswap_lru_add(&zswap_list_lru, entry);
> }
>
> return true;
>
> -store_failed:
> - zs_free(pool->zs_pool, entry->handle);
> -compress_failed:
> - zswap_entry_cache_free(entry);
> +store_pages_failed:
> + for (i = store_fail_idx; i < nr_pages; ++i) {
> + if (!IS_ERR_VALUE(entries[i]->handle))
> + zs_free(pool->zs_pool, entries[i]->handle);
> + }
> + zswap_entries_cache_free_batch(&entries[store_fail_idx],
> + nr_pages - store_fail_idx);
> +
> return false;
> }
>
> bool zswap_store(struct folio *folio)
> {
> + bool wb_enabled = mem_cgroup_zswap_writeback_enabled(folio_memcg(folio));
> long nr_pages = folio_nr_pages(folio);
> + struct crypto_acomp_ctx *acomp_ctx;
> swp_entry_t swp = folio->swap;
> struct obj_cgroup *objcg = NULL;
> struct mem_cgroup *memcg = NULL;
> + int nid = folio_nid(folio);
> struct zswap_pool *pool;
> + u8 store_batch_size;
> bool ret = false;
> - long index;
> + long start, end;
>
> VM_WARN_ON_ONCE(!folio_test_locked(folio));
> VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
> @@ -1503,10 +1626,32 @@ bool zswap_store(struct folio *folio)
> mem_cgroup_put(memcg);
> }
>
> - for (index = 0; index < nr_pages; ++index) {
> - struct page *page = folio_page(folio, index);
> + /*
> + * For batching compressors, store the folio in batches of the
> + * compressor's batch_size.
> + *
> + * For non-batching compressors, store the folio in batches
> + * of ZSWAP_MAX_BATCH_SIZE, where each page in the batch is
> + * compressed sequentially. This gives better performance than
> + * invoking zswap_store_pages() per-page, due to cache locality
> + * of working set structures.
> + */
> + store_batch_size = (pool->compr_batch_size > 1) ?
> + pool->compr_batch_size : ZSWAP_MAX_BATCH_SIZE;
> +
> + acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
>
> - if (!zswap_store_page(page, objcg, pool))
> + for (start = 0; start < nr_pages; start += store_batch_size) {
> + end = min(start + store_batch_size, nr_pages);
> +
> + if (unlikely(!zswap_store_pages(folio,
> + start,
> + end,
> + pool,
> + acomp_ctx,
> + nid,
I think acomp_ctx is not used in zswap_store_pages(). If it is, we don't
need to pass it from here. Same for nid, no need to pass it from here.
We should be able to shrink the arg list and make this more concise. I
would drop unlikely() if it doesn't result in a meaningful improvement.
Otherwise you can also make this line a bit shorter if you keep
unlikely():
        ret = zswap_store_pages(..);
        if (unlikely(!ret))
                goto put_pool;
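FWIW, with acomp_ctx and nid dropped, the caller could end up roughly
like this (just a sketch, exact argument order up to you):

        for (start = 0; start < nr_pages; start += store_batch_size) {
                end = min(start + store_batch_size, nr_pages);

                if (!zswap_store_pages(folio, start, end, pool, objcg, wb_enabled))
                        goto put_pool;
        }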
> + wb_enabled,
> + objcg)))
> goto put_pool;
> }
>
> @@ -1536,9 +1681,9 @@ bool zswap_store(struct folio *folio)
> struct zswap_entry *entry;
> struct xarray *tree;
>
> - for (index = 0; index < nr_pages; ++index) {
> - tree = swap_zswap_tree(swp_entry(type, offset + index));
> - entry = xa_erase(tree, offset + index);
> + for (start = 0; start < nr_pages; ++start) {
> + tree = swap_zswap_tree(swp_entry(type, offset + start));
> + entry = xa_erase(tree, offset + start);
> if (entry)
> zswap_entry_free(entry);
> }
> --
> 2.27.0
>
^ permalink raw reply [flat|nested] 48+ messages in thread
* [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for compress batching of large folios.
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (24 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 25/26] mm: zswap: Store large folios in batches Kanchana P Sridhar
@ 2026-01-25 3:35 ` Kanchana P Sridhar
2026-01-31 1:12 ` Nhat Pham
` (3 more replies)
2026-02-04 18:21 ` [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Yosry Ahmed
26 siblings, 4 replies; 48+ messages in thread
From: Kanchana P Sridhar @ 2026-01-25 3:35 UTC (permalink / raw)
To: linux-kernel, linux-mm, hannes, yosry.ahmed, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu
Cc: wajdi.k.feghali, kanchana.p.sridhar
We introduce a new batching implementation of zswap_compress() for
compressors that do and do not support batching. This eliminates code
duplication and facilitates code maintainability with the introduction
of compress batching.
The vectorized implementation of calling the earlier zswap_compress()
sequentially, one page at a time in zswap_store_pages(), is replaced
with this new version of zswap_compress() that accepts multiple pages to
compress as a batch.
If the compressor does not support batching, each page in the batch is
compressed and stored sequentially. If the compressor supports batching,
e.g. 'deflate-iaa', the Intel IAA hardware accelerator, the batch
is compressed in parallel in hardware.
If the batch is compressed without errors, the compressed buffers for
the batch are stored in zsmalloc. In case of compression errors, the
current behavior, which depends on whether the folio is enabled for zswap
writeback, is preserved.
The batched zswap_compress() incorporates Herbert's suggestion for
SG lists to represent the batch's inputs/outputs to interface with the
crypto API [1].
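At a high level, each batch is handed to the crypto layer roughly as
follows (simplified from the zswap_compress() hunk below, with error
handling elided):

        /* Source: @nr_batch_pages pages of @folio, starting at @folio_start. */
        acomp_request_set_src_folio(acomp_ctx->req, folio,
                                    (folio_start + batch_start) * PAGE_SIZE, slen);

        /* Destination: the per-CPU dst buffers, pre-mapped into an SG table. */
        acomp_request_set_dst_sg(acomp_ctx->req, acomp_ctx->sg_table->sgl, slen);

        err = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req),
                              &acomp_ctx->wait);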
Performance data:
=================
As suggested by Barry, this is the performance data gathered on Intel
Sapphire Rapids with two workloads:
1) 30 usemem processes in a 150 GB memory limited cgroup, each
allocates 10G, i.e, effectively running at 50% memory pressure.
2) kernel_compilation "defconfig", 32 threads, cgroup memory limit set
to 1.7 GiB (50% memory pressure, since baseline memory usage is 3.4
GiB): data averaged across 10 runs.
To keep comparisons simple, all testing was done without the
zswap shrinker.
=========================================================================
IAA mm-unstable-1-23-2026 v14
=========================================================================
zswap compressor deflate-iaa deflate-iaa IAA Batching
vs.
IAA Sequential
=========================================================================
usemem30, 64K folios:
Total throughput (KB/s) 6,226,967 10,551,714 69%
Average throughput (KB/s) 207,565 351,723 69%
elapsed time (sec) 99.19 67.45 -32%
sys time (sec) 2,356.19 1,580.47 -33%
usemem30, PMD folios:
Total throughput (KB/s) 6,347,201 11,315,500 78%
Average throughput (KB/s) 211,573 377,183 78%
elapsed time (sec) 88.14 63.37 -28%
sys time (sec) 2,025.53 1,455.23 -28%
kernel_compilation, 64K folios:
elapsed time (sec) 100.10 98.74 -1.4%
sys time (sec) 308.72 301.23 -2%
kernel_compilation, PMD folios:
elapsed time (sec) 95.29 93.44 -1.9%
sys time (sec) 346.21 344.48 -0.5%
=========================================================================
=========================================================================
ZSTD mm-unstable-1-23-2026 v14
=========================================================================
zswap compressor zstd zstd v14 ZSTD
Improvement
=========================================================================
usemem30, 64K folios:
Total throughput (KB/s) 6,032,326 6,047,448 0.3%
Average throughput (KB/s) 201,077 201,581 0.3%
elapsed time (sec) 97.52 95.33 -2.2%
sys time (sec) 2,415.40 2,328.38 -4%
usemem30, PMD folios:
Total throughput (KB/s) 6,570,404 6,623,962 0.8%
Average throughput (KB/s) 219,013 220,798 0.8%
elapsed time (sec) 89.17 88.25 -1%
sys time (sec) 2,126.69 2,043.08 -4%
kernel_compilation, 64K folios:
elapsed time (sec) 100.89 99.98 -0.9%
sys time (sec) 417.49 414.62 -0.7%
kernel_compilation, PMD folios:
elapsed time (sec) 98.26 97.38 -0.9%
sys time (sec) 487.14 473.16 -2.9%
=========================================================================
Architectural considerations for the zswap batching framework:
==============================================================
We have designed the zswap batching framework to be
hardware-agnostic. It has no dependencies on Intel-specific features and
can be leveraged by any hardware accelerator or software-based
compressor. In other words, the framework is open and inclusive by
design.
Potential future clients of the batching framework:
===================================================
This patch-series demonstrates the performance benefits of compression
batching when used in zswap_store() of large folios. Compression
batching can be used for other use cases such as batching compression in
zram, batch compression of different folios during reclaim, kcompressd,
file systems, etc. Decompression batching can be used to improve
efficiency of zswap writeback (Thanks Nhat for this idea), batching
decompressions in zram, etc.
Experiments with kernel_compilation "allmodconfig" that combine zswap
compress batching, folio reclaim batching, and writeback batching show
that 0 pages are written back with deflate-iaa and zstd. For comparison,
the baselines for these compressors see 200K-800K pages written to disk.
Reclaim batching relieves memory pressure faster than reclaiming one
folio at a time, hence alleviates the need to scan slab memory for
writeback.
[1]: https://lore.kernel.org/all/aJ7Fk6RpNc815Ivd@gondor.apana.org.au/T/#m99aea2ce3d284e6c5a3253061d97b08c4752a798
Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
---
mm/zswap.c | 260 ++++++++++++++++++++++++++++++++++++++---------------
1 file changed, 190 insertions(+), 70 deletions(-)
diff --git a/mm/zswap.c b/mm/zswap.c
index 6a22add63220..399112af2c54 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -145,6 +145,7 @@ struct crypto_acomp_ctx {
struct acomp_req *req;
struct crypto_wait wait;
u8 **buffers;
+ struct sg_table *sg_table;
struct mutex mutex;
};
@@ -272,6 +273,11 @@ static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx, u8 nr_buffers)
kfree(acomp_ctx->buffers[i]);
kfree(acomp_ctx->buffers);
}
+
+ if (acomp_ctx->sg_table) {
+ sg_free_table(acomp_ctx->sg_table);
+ kfree(acomp_ctx->sg_table);
+ }
}
static struct zswap_pool *zswap_pool_create(char *compressor)
@@ -834,6 +840,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
int nid = cpu_to_node(cpu);
+ struct scatterlist *sg;
int ret = -ENOMEM;
u8 i;
@@ -880,6 +887,22 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
goto fail;
}
+ acomp_ctx->sg_table = kmalloc(sizeof(*acomp_ctx->sg_table),
+ GFP_KERNEL);
+ if (!acomp_ctx->sg_table)
+ goto fail;
+
+ if (sg_alloc_table(acomp_ctx->sg_table, pool->compr_batch_size,
+ GFP_KERNEL))
+ goto fail;
+
+ /*
+ * Statically map the per-CPU destination buffers to the per-CPU
+ * SG lists.
+ */
+ for_each_sg(acomp_ctx->sg_table->sgl, sg, pool->compr_batch_size, i)
+ sg_set_buf(sg, acomp_ctx->buffers[i], PAGE_SIZE);
+
/*
* if the backend of acomp is async zip, crypto_req_done() will wakeup
* crypto_wait_req(); if the backend of acomp is scomp, the callback
@@ -900,84 +923,177 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
return ret;
}
-static bool zswap_compress(struct page *page, struct zswap_entry *entry,
- struct zswap_pool *pool, bool wb_enabled)
+/*
+ * zswap_compress() batching implementation for sequential and batching
+ * compressors.
+ *
+ * Description:
+ * ============
+ *
+ * Compress @nr_pages pages in @folio, starting at the @folio_start index, in
+ * batches of @nr_batch_pages.
+ *
+ * It is assumed that @nr_pages <= ZSWAP_MAX_BATCH_SIZE. zswap_store() makes
+ * sure of this by design and zswap_store_pages() warns if this is not true.
+ *
+ * @nr_pages can be in (1, ZSWAP_MAX_BATCH_SIZE] even if the compressor does not
+ * support batching.
+ *
+ * If @nr_batch_pages is 1, each page is processed sequentially.
+ *
+ * If @nr_batch_pages is > 1, compression batching is invoked within
+ * the algorithm's driver, except if @nr_pages is 1: if so, the driver can
+ * choose to call its sequential/non-batching compress routine.
+ *
+ * In both cases, if all compressions are successful, the compressed buffers
+ * are stored in zsmalloc.
+ *
+ * Design notes for batching compressors:
+ * ======================================
+ *
+ * Traversing SG lists when @nr_batch_pages is > 1 is expensive, and
+ * impacts batching performance if repeated:
+ * - to map destination buffers to each SG list in @acomp_ctx->sg_table.
+ * - to initialize each output @sg->length to PAGE_SIZE.
+ *
+ * Design choices made to optimize batching with SG lists:
+ *
+ * 1) The source folio pages in the batch are directly submitted to
+ * crypto_acomp via acomp_request_set_src_folio().
+ *
+ * 2) The per-CPU @acomp_ctx->sg_table scatterlists are statically mapped
+ * to the per-CPU dst @buffers at pool creation time.
+ *
+ * 3) zswap_compress() sets the output SG list length to PAGE_SIZE for
+ * non-batching compressors. The batching compressor's driver should do this
+ * as part of iterating through the dst SG lists for batch compression setup.
+ *
+ * Considerations for non-batching and batching compressors:
+ * =========================================================
+ *
+ * For each output SG list in @acomp_ctx->req->sg_table->sgl, the @sg->length
+ * should be set to either the page's compressed length (success), or its
+ * compression error value.
+ */
+static bool zswap_compress(struct folio *folio,
+ long folio_start,
+ u8 nr_pages,
+ u8 nr_batch_pages,
+ struct zswap_entry *entries[],
+ struct zs_pool *zs_pool,
+ struct crypto_acomp_ctx *acomp_ctx,
+ int nid,
+ bool wb_enabled)
{
- struct crypto_acomp_ctx *acomp_ctx;
- struct scatterlist input, output;
- int comp_ret = 0, alloc_ret = 0;
- unsigned int dlen = PAGE_SIZE;
+ gfp_t gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
+ unsigned int slen = nr_batch_pages * PAGE_SIZE;
+ u8 batch_start, batch_iter, compr_batch_size_iter;
+ struct scatterlist *sg;
unsigned long handle;
- gfp_t gfp;
- u8 *dst;
- bool mapped = false;
-
- acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
- mutex_lock(&acomp_ctx->mutex);
-
- dst = acomp_ctx->buffers[0];
- sg_init_table(&input, 1);
- sg_set_page(&input, page, PAGE_SIZE, 0);
-
- sg_init_one(&output, dst, PAGE_SIZE);
- acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
+ int err, dlen;
+ void *dst;
/*
- * it maybe looks a little bit silly that we send an asynchronous request,
- * then wait for its completion synchronously. This makes the process look
- * synchronous in fact.
- * Theoretically, acomp supports users send multiple acomp requests in one
- * acomp instance, then get those requests done simultaneously. but in this
- * case, zswap actually does store and load page by page, there is no
- * existing method to send the second page before the first page is done
- * in one thread doing zswap.
- * but in different threads running on different cpu, we have different
- * acomp instance, so multiple threads can do (de)compression in parallel.
+ * Locking the acomp_ctx mutex once per store batch results in better
+ * performance as compared to locking per compress batch.
*/
- comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
- dlen = acomp_ctx->req->dlen;
+ mutex_lock(&acomp_ctx->mutex);
/*
- * If a page cannot be compressed into a size smaller than PAGE_SIZE,
- * save the content as is without a compression, to keep the LRU order
- * of writebacks. If writeback is disabled, reject the page since it
- * only adds metadata overhead. swap_writeout() will put the page back
- * to the active LRU list in the case.
+ * Compress the @nr_pages in @folio starting at index @folio_start
+ * in batches of @nr_batch_pages.
*/
- if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
- if (!wb_enabled) {
- comp_ret = comp_ret ? comp_ret : -EINVAL;
- goto unlock;
- }
- comp_ret = 0;
- dlen = PAGE_SIZE;
- dst = kmap_local_page(page);
- mapped = true;
- }
+ for (batch_start = 0; batch_start < nr_pages;
+ batch_start += nr_batch_pages) {
+ /*
+ * Send @nr_batch_pages to crypto_acomp for compression:
+ *
+ * These pages are in @folio's range of indices in the interval
+ * [@folio_start + @batch_start,
+ * @folio_start + @batch_start + @nr_batch_pages).
+ *
+ * @slen indicates the total source length bytes for @nr_batch_pages.
+ *
+ * The pool's compressor batch size is at least @nr_batch_pages,
+ * hence the acomp_ctx has at least @nr_batch_pages dst @buffers.
+ */
+ acomp_request_set_src_folio(acomp_ctx->req, folio,
+ (folio_start + batch_start) * PAGE_SIZE,
+ slen);
+
+ acomp_ctx->sg_table->sgl->length = slen;
+
+ acomp_request_set_dst_sg(acomp_ctx->req,
+ acomp_ctx->sg_table->sgl,
+ slen);
+
+ err = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req),
+ &acomp_ctx->wait);
+
+ /*
+ * If a page cannot be compressed into a size smaller than
+ * PAGE_SIZE, save the content as is without a compression, to
+ * keep the LRU order of writebacks. If writeback is disabled,
+ * reject the page since it only adds metadata overhead.
+ * swap_writeout() will put the page back to the active LRU list
+ * in the case.
+ *
+ * It is assumed that any compressor that sets the output length
+ * to 0 or a value >= PAGE_SIZE will also return a negative
+ * error status in @err; i.e., will not return a successful
+ * compression status in @err in this case.
+ */
+ if (unlikely(err && !wb_enabled))
+ goto compress_error;
+
+ for_each_sg(acomp_ctx->sg_table->sgl, sg, nr_batch_pages,
+ compr_batch_size_iter) {
+ batch_iter = batch_start + compr_batch_size_iter;
+ dst = acomp_ctx->buffers[compr_batch_size_iter];
+ dlen = sg->length;
+
+ if (dlen < 0) {
+ dlen = PAGE_SIZE;
+ dst = kmap_local_page(folio_page(folio,
+ folio_start + batch_iter));
+ }
+
+ handle = zs_malloc(zs_pool, dlen, gfp, nid);
+
+ if (unlikely(IS_ERR_VALUE(handle))) {
+ if (PTR_ERR((void *)handle) == -ENOSPC)
+ zswap_reject_compress_poor++;
+ else
+ zswap_reject_alloc_fail++;
- gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
- handle = zs_malloc(pool->zs_pool, dlen, gfp, page_to_nid(page));
- if (IS_ERR_VALUE(handle)) {
- alloc_ret = PTR_ERR((void *)handle);
- goto unlock;
+ goto err_unlock;
+ }
+
+ zs_obj_write(zs_pool, handle, dst, dlen);
+ entries[batch_iter]->handle = handle;
+ entries[batch_iter]->length = dlen;
+ if (dst != acomp_ctx->buffers[compr_batch_size_iter])
+ kunmap_local(dst);
+ }
}
- zs_obj_write(pool->zs_pool, handle, dst, dlen);
- entry->handle = handle;
- entry->length = dlen;
+ mutex_unlock(&acomp_ctx->mutex);
+ return true;
-unlock:
- if (mapped)
- kunmap_local(dst);
- if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
- zswap_reject_compress_poor++;
- else if (comp_ret)
- zswap_reject_compress_fail++;
- else if (alloc_ret)
- zswap_reject_alloc_fail++;
+compress_error:
+ for_each_sg(acomp_ctx->sg_table->sgl, sg, nr_batch_pages,
+ compr_batch_size_iter) {
+ if ((int)sg->length < 0) {
+ if ((int)sg->length == -ENOSPC)
+ zswap_reject_compress_poor++;
+ else
+ zswap_reject_compress_fail++;
+ }
+ }
+err_unlock:
mutex_unlock(&acomp_ctx->mutex);
- return comp_ret == 0 && alloc_ret == 0;
+ return false;
}
static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
@@ -1499,12 +1615,16 @@ static bool zswap_store_pages(struct folio *folio,
INIT_LIST_HEAD(&entries[i]->lru);
}
- for (i = 0; i < nr_pages; ++i) {
- struct page *page = folio_page(folio, start + i);
-
- if (!zswap_compress(page, entries[i], pool, wb_enabled))
- goto store_pages_failed;
- }
+ if (unlikely(!zswap_compress(folio,
+ start,
+ nr_pages,
+ min(nr_pages, pool->compr_batch_size),
+ entries,
+ pool->zs_pool,
+ acomp_ctx,
+ nid,
+ wb_enabled)))
+ goto store_pages_failed;
for (i = 0; i < nr_pages; ++i) {
struct zswap_entry *old, *entry = entries[i];
--
2.27.0
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for compress batching of large folios.
2026-01-25 3:35 ` [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for compress batching of large folios Kanchana P Sridhar
@ 2026-01-31 1:12 ` Nhat Pham
2026-01-31 20:31 ` Sridhar, Kanchana P
2026-02-04 0:30 ` Nhat Pham
` (2 subsequent siblings)
3 siblings, 1 reply; 48+ messages in thread
From: Nhat Pham @ 2026-01-31 1:12 UTC (permalink / raw)
To: Kanchana P Sridhar
Cc: linux-kernel, linux-mm, hannes, yosry.ahmed, chengming.zhou,
usamaarif642, ryan.roberts, 21cnbao, ying.huang, akpm,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, kristen.c.accardi, vinicius.gomes,
giovanni.cabiddu, wajdi.k.feghali
On Sat, Jan 24, 2026 at 7:36 PM Kanchana P Sridhar
<kanchana.p.sridhar@intel.com> wrote:
>
> We introduce a new batching implementation of zswap_compress() for
> compressors that do and do not support batching. This eliminates code
> duplication and facilitates code maintainability with the introduction
> of compress batching.
>
> The vectorized implementation of calling the earlier zswap_compress()
> sequentially, one page at a time in zswap_store_pages(), is replaced
> with this new version of zswap_compress() that accepts multiple pages to
> compress as a batch.
>
> If the compressor does not support batching, each page in the batch is
> compressed and stored sequentially. If the compressor supports batching,
> for e.g., 'deflate-iaa', the Intel IAA hardware accelerator, the batch
> is compressed in parallel in hardware.
>
> If the batch is compressed without errors, the compressed buffers for
> the batch are stored in zsmalloc. In case of compression errors, the
> current behavior based on whether the folio is enabled for zswap
> writeback, is preserved.
>
> The batched zswap_compress() incorporates Herbert's suggestion for
> SG lists to represent the batch's inputs/outputs to interface with the
> crypto API [1].
>
> Performance data:
> =================
> As suggested by Barry, this is the performance data gathered on Intel
> Sapphire Rapids with two workloads:
>
> 1) 30 usemem processes in a 150 GB memory limited cgroup, each
> allocates 10G, i.e, effectively running at 50% memory pressure.
> 2) kernel_compilation "defconfig", 32 threads, cgroup memory limit set
> to 1.7 GiB (50% memory pressure, since baseline memory usage is 3.4
> GiB): data averaged across 10 runs.
>
> To keep comparisons simple, all testing was done without the
> zswap shrinker.
>
> =========================================================================
> IAA mm-unstable-1-23-2026 v14
> =========================================================================
> zswap compressor deflate-iaa deflate-iaa IAA Batching
> vs.
> IAA Sequential
> =========================================================================
> usemem30, 64K folios:
>
> Total throughput (KB/s) 6,226,967 10,551,714 69%
> Average throughput (KB/s) 207,565 351,723 69%
> elapsed time (sec) 99.19 67.45 -32%
> sys time (sec) 2,356.19 1,580.47 -33%
>
> usemem30, PMD folios:
>
> Total throughput (KB/s) 6,347,201 11,315,500 78%
> Average throughput (KB/s) 211,573 377,183 78%
> elapsed time (sec) 88.14 63.37 -28%
> sys time (sec) 2,025.53 1,455.23 -28%
>
> kernel_compilation, 64K folios:
>
> elapsed time (sec) 100.10 98.74 -1.4%
> sys time (sec) 308.72 301.23 -2%
>
> kernel_compilation, PMD folios:
>
> elapsed time (sec) 95.29 93.44 -1.9%
> sys time (sec) 346.21 344.48 -0.5%
> =========================================================================
>
> =========================================================================
> ZSTD mm-unstable-1-23-2026 v14
> =========================================================================
> zswap compressor zstd zstd v14 ZSTD
> Improvement
> =========================================================================
> usemem30, 64K folios:
>
> Total throughput (KB/s) 6,032,326 6,047,448 0.3%
> Average throughput (KB/s) 201,077 201,581 0.3%
> elapsed time (sec) 97.52 95.33 -2.2%
> sys time (sec) 2,415.40 2,328.38 -4%
>
> usemem30, PMD folios:
>
> Total throughput (KB/s) 6,570,404 6,623,962 0.8%
> Average throughput (KB/s) 219,013 220,798 0.8%
> elapsed time (sec) 89.17 88.25 -1%
> sys time (sec) 2,126.69 2,043.08 -4%
>
> kernel_compilation, 64K folios:
>
> elapsed time (sec) 100.89 99.98 -0.9%
> sys time (sec) 417.49 414.62 -0.7%
>
> kernel_compilation, PMD folios:
>
> elapsed time (sec) 98.26 97.38 -0.9%
> sys time (sec) 487.14 473.16 -2.9%
> =========================================================================
The rest of the patch changelog (architectural and future
considerations) can stay in the cover letter. Let's not duplicate
information :)
Keep the patch changelog limited to only the changes in the patch
itself (unless we need some clarifications imminently relevant).
I'll review the remainder of the patch later :)
^ permalink raw reply [flat|nested] 48+ messages in thread* RE: [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for compress batching of large folios.
2026-01-31 1:12 ` Nhat Pham
@ 2026-01-31 20:31 ` Sridhar, Kanchana P
2026-02-01 0:48 ` Nhat Pham
0 siblings, 1 reply; 48+ messages in thread
From: Sridhar, Kanchana P @ 2026-01-31 20:31 UTC (permalink / raw)
To: Nhat Pham
Cc: linux-kernel, linux-mm, hannes, yosry.ahmed, chengming.zhou,
usamaarif642, ryan.roberts, 21cnbao, ying.huang, akpm,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, Accardi, Kristen C, Gomes, Vinicius,
Cabiddu, Giovanni, Feghali, Wajdi K, Sridhar, Kanchana P
> -----Original Message-----
> From: Nhat Pham <nphamcs@gmail.com>
> Sent: Friday, January 30, 2026 5:13 PM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> hannes@cmpxchg.org; yosry.ahmed@linux.dev; chengming.zhou@linux.dev;
> usamaarif642@gmail.com; ryan.roberts@arm.com; 21cnbao@gmail.com;
> ying.huang@linux.alibaba.com; akpm@linux-foundation.org;
> senozhatsky@chromium.org; sj@kernel.org; kasong@tencent.com; linux-
> crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com; Accardi, Kristen C
> <kristen.c.accardi@intel.com>; Gomes, Vinicius <vinicius.gomes@intel.com>;
> Cabiddu, Giovanni <giovanni.cabiddu@intel.com>; Feghali, Wajdi K
> <wajdi.k.feghali@intel.com>
> Subject: Re: [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for
> compress batching of large folios.
>
> On Sat, Jan 24, 2026 at 7:36 PM Kanchana P Sridhar
> <kanchana.p.sridhar@intel.com> wrote:
> >
> > We introduce a new batching implementation of zswap_compress() for
> > compressors that do and do not support batching. This eliminates code
> > duplication and facilitates code maintainability with the introduction
> > of compress batching.
> >
> > The vectorized implementation of calling the earlier zswap_compress()
> > sequentially, one page at a time in zswap_store_pages(), is replaced
> > with this new version of zswap_compress() that accepts multiple pages to
> > compress as a batch.
> >
> > If the compressor does not support batching, each page in the batch is
> > compressed and stored sequentially. If the compressor supports batching,
> > for e.g., 'deflate-iaa', the Intel IAA hardware accelerator, the batch
> > is compressed in parallel in hardware.
> >
> > If the batch is compressed without errors, the compressed buffers for
> > the batch are stored in zsmalloc. In case of compression errors, the
> > current behavior based on whether the folio is enabled for zswap
> > writeback, is preserved.
> >
> > The batched zswap_compress() incorporates Herbert's suggestion for
> > SG lists to represent the batch's inputs/outputs to interface with the
> > crypto API [1].
> >
> > Performance data:
> > =================
> > As suggested by Barry, this is the performance data gathered on Intel
> > Sapphire Rapids with two workloads:
> >
> > 1) 30 usemem processes in a 150 GB memory limited cgroup, each
> > allocates 10G, i.e, effectively running at 50% memory pressure.
> > 2) kernel_compilation "defconfig", 32 threads, cgroup memory limit set
> > to 1.7 GiB (50% memory pressure, since baseline memory usage is 3.4
> > GiB): data averaged across 10 runs.
> >
> > To keep comparisons simple, all testing was done without the
> > zswap shrinker.
> >
> >
> > =========================================================================
> > IAA mm-unstable-1-23-2026 v14
> >
> > =========================================================================
> > zswap compressor deflate-iaa deflate-iaa IAA Batching
> > vs.
> > IAA Sequential
> >
> > =========================================================================
> > usemem30, 64K folios:
> >
> > Total throughput (KB/s) 6,226,967 10,551,714 69%
> > Average throughput (KB/s) 207,565 351,723 69%
> > elapsed time (sec) 99.19 67.45 -32%
> > sys time (sec) 2,356.19 1,580.47 -33%
> >
> > usemem30, PMD folios:
> >
> > Total throughput (KB/s) 6,347,201 11,315,500 78%
> > Average throughput (KB/s) 211,573 377,183 78%
> > elapsed time (sec) 88.14 63.37 -28%
> > sys time (sec) 2,025.53 1,455.23 -28%
> >
> > kernel_compilation, 64K folios:
> >
> > elapsed time (sec) 100.10 98.74 -1.4%
> > sys time (sec) 308.72 301.23 -2%
> >
> > kernel_compilation, PMD folios:
> >
> > elapsed time (sec) 95.29 93.44 -1.9%
> > sys time (sec) 346.21 344.48 -0.5%
> >
> > =========================================================================
> >
> >
> > =========================================================================
> > ZSTD mm-unstable-1-23-2026 v14
> >
> > =========================================================================
> > zswap compressor zstd zstd v14 ZSTD
> > Improvement
> >
> > =========================================================================
> > usemem30, 64K folios:
> >
> > Total throughput (KB/s) 6,032,326 6,047,448 0.3%
> > Average throughput (KB/s) 201,077 201,581 0.3%
> > elapsed time (sec) 97.52 95.33 -2.2%
> > sys time (sec) 2,415.40 2,328.38 -4%
> >
> > usemem30, PMD folios:
> >
> > Total throughput (KB/s) 6,570,404 6,623,962 0.8%
> > Average throughput (KB/s) 219,013 220,798 0.8%
> > elapsed time (sec) 89.17 88.25 -1%
> > sys time (sec) 2,126.69 2,043.08 -4%
> >
> > kernel_compilation, 64K folios:
> >
> > elapsed time (sec) 100.89 99.98 -0.9%
> > sys time (sec) 417.49 414.62 -0.7%
> >
> > kernel_compilation, PMD folios:
> >
> > elapsed time (sec) 98.26 97.38 -0.9%
> > sys time (sec) 487.14 473.16 -2.9%
> >
> > =========================================================================
>
> The rest of the patch changelog (architectural and future
> considerations) can stay in the cover letter. Let's not duplicate
> information :)
>
> Keep the patch changelog limited to only the changes in the patch
> itself (unless we need some clarifications imminently relevant).
Hi Nhat,
Thanks for this comment. Yosry had also pointed this out in [1]. I have
been including the architectural and future considerations in this change log
since Andrew had asked me to do so. I hope this is Ok?
[1]: https://patchwork.kernel.org/comment/26706240/
>
> I'll review the remainder of the patch later :)
Sure.
Thanks,
Kanchana
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for compress batching of large folios.
2026-01-31 20:31 ` Sridhar, Kanchana P
@ 2026-02-01 0:48 ` Nhat Pham
2026-02-01 2:53 ` Sridhar, Kanchana P
0 siblings, 1 reply; 48+ messages in thread
From: Nhat Pham @ 2026-02-01 0:48 UTC (permalink / raw)
To: Sridhar, Kanchana P
Cc: linux-kernel, linux-mm, hannes, yosry.ahmed, chengming.zhou,
usamaarif642, ryan.roberts, 21cnbao, ying.huang, akpm,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, Accardi, Kristen C, Gomes, Vinicius,
Cabiddu, Giovanni, Feghali, Wajdi K
On Sat, Jan 31, 2026 at 12:32 PM Sridhar, Kanchana P
<kanchana.p.sridhar@intel.com> wrote:
>
>
> > -----Original Message-----
> > From: Nhat Pham <nphamcs@gmail.com>
> > Sent: Friday, January 30, 2026 5:13 PM
> > To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> > Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> > hannes@cmpxchg.org; yosry.ahmed@linux.dev; chengming.zhou@linux.dev;
> > usamaarif642@gmail.com; ryan.roberts@arm.com; 21cnbao@gmail.com;
> > ying.huang@linux.alibaba.com; akpm@linux-foundation.org;
> > senozhatsky@chromium.org; sj@kernel.org; kasong@tencent.com; linux-
> > crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> > davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> > ebiggers@google.com; surenb@google.com; Accardi, Kristen C
> > <kristen.c.accardi@intel.com>; Gomes, Vinicius <vinicius.gomes@intel.com>;
> > Cabiddu, Giovanni <giovanni.cabiddu@intel.com>; Feghali, Wajdi K
> > <wajdi.k.feghali@intel.com>
> > Subject: Re: [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for
> > compress batching of large folios.
> >
> > On Sat, Jan 24, 2026 at 7:36 PM Kanchana P Sridhar
> > <kanchana.p.sridhar@intel.com> wrote:
> > >
> > > We introduce a new batching implementation of zswap_compress() for
> > > compressors that do and do not support batching. This eliminates code
> > > duplication and facilitates code maintainability with the introduction
> > > of compress batching.
> > >
> > > The vectorized implementation of calling the earlier zswap_compress()
> > > sequentially, one page at a time in zswap_store_pages(), is replaced
> > > with this new version of zswap_compress() that accepts multiple pages to
> > > compress as a batch.
> > >
> > > If the compressor does not support batching, each page in the batch is
> > > compressed and stored sequentially. If the compressor supports batching,
> > > e.g., 'deflate-iaa', the Intel IAA hardware accelerator, the batch
> > > is compressed in parallel in hardware.
> > >
> > > If the batch is compressed without errors, the compressed buffers for
> > > the batch are stored in zsmalloc. In case of compression errors, the
> > > current behavior based on whether the folio is enabled for zswap
> > > writeback, is preserved.
> > >
> > > The batched zswap_compress() incorporates Herbert's suggestion for
> > > SG lists to represent the batch's inputs/outputs to interface with the
> > > crypto API [1].
> > >
> > > Performance data:
> > > =================
> > > As suggested by Barry, this is the performance data gathered on Intel
> > > Sapphire Rapids with two workloads:
> > >
> > > 1) 30 usemem processes in a 150 GB memory limited cgroup, each
> > > allocates 10G, i.e, effectively running at 50% memory pressure.
> > > 2) kernel_compilation "defconfig", 32 threads, cgroup memory limit set
> > > to 1.7 GiB (50% memory pressure, since baseline memory usage is 3.4
> > > GiB): data averaged across 10 runs.
> > >
> > > To keep comparisons simple, all testing was done without the
> > > zswap shrinker.
> > >
> > >
> > > =========================================================================
> > > IAA mm-unstable-1-23-2026 v14
> > >
> > > =========================================================================
> > > zswap compressor deflate-iaa deflate-iaa IAA Batching
> > > vs.
> > > IAA Sequential
> > >
> > > =========================================================================
> > > usemem30, 64K folios:
> > >
> > > Total throughput (KB/s) 6,226,967 10,551,714 69%
> > > Average throughput (KB/s) 207,565 351,723 69%
> > > elapsed time (sec) 99.19 67.45 -32%
> > > sys time (sec) 2,356.19 1,580.47 -33%
> > >
> > > usemem30, PMD folios:
> > >
> > > Total throughput (KB/s) 6,347,201 11,315,500 78%
> > > Average throughput (KB/s) 211,573 377,183 78%
> > > elapsed time (sec) 88.14 63.37 -28%
> > > sys time (sec) 2,025.53 1,455.23 -28%
> > >
> > > kernel_compilation, 64K folios:
> > >
> > > elapsed time (sec) 100.10 98.74 -1.4%
> > > sys time (sec) 308.72 301.23 -2%
> > >
> > > kernel_compilation, PMD folios:
> > >
> > > elapsed time (sec) 95.29 93.44 -1.9%
> > > sys time (sec) 346.21 344.48 -0.5%
> > >
> > > =========================================================================
> > >
> > >
> > > =========================================================================
> > > ZSTD mm-unstable-1-23-2026 v14
> > >
> > > =========================================================================
> > > zswap compressor zstd zstd v14 ZSTD
> > > Improvement
> > >
> > > =========================================================================
> > > usemem30, 64K folios:
> > >
> > > Total throughput (KB/s) 6,032,326 6,047,448 0.3%
> > > Average throughput (KB/s) 201,077 201,581 0.3%
> > > elapsed time (sec) 97.52 95.33 -2.2%
> > > sys time (sec) 2,415.40 2,328.38 -4%
> > >
> > > usemem30, PMD folios:
> > >
> > > Total throughput (KB/s) 6,570,404 6,623,962 0.8%
> > > Average throughput (KB/s) 219,013 220,798 0.8%
> > > elapsed time (sec) 89.17 88.25 -1%
> > > sys time (sec) 2,126.69 2,043.08 -4%
> > >
> > > kernel_compilation, 64K folios:
> > >
> > > elapsed time (sec) 100.89 99.98 -0.9%
> > > sys time (sec) 417.49 414.62 -0.7%
> > >
> > > kernel_compilation, PMD folios:
> > >
> > > elapsed time (sec) 98.26 97.38 -0.9%
> > > sys time (sec) 487.14 473.16 -2.9%
> > >
> > > =========================================================================
> >
> > The rest of the patch changelog (architectural and future
> > considerations) can stay in the cover letter. Let's not duplicate
> > information :)
> >
> > Keep the patch changelog limited to only the changes in the patch
> > itself (unless we need some clarifications imminently relevant).
>
> Hi Nhat,
>
> Thanks for this comment. Yosry had also pointed this out in [1]. I have
> been including the architectural and future considerations in this change log
> since Andrew had asked me to do so. I hope this is Ok?
Ah hmmmmm. For some reason I was under the assumption that usually
Andrew would concatenate the patch cover letter and the patch
changelog before merging. Oh well.
If Andrew prefers including that here then I'm fine with it.
>
> [1]: https://patchwork.kernel.org/comment/26706240/
>
> >
> > I'll review the remainder of the patch later :)
>
> Sure.
>
> Thanks,
> Kanchana
^ permalink raw reply [flat|nested] 48+ messages in thread
* RE: [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for compress batching of large folios.
2026-02-01 0:48 ` Nhat Pham
@ 2026-02-01 2:53 ` Sridhar, Kanchana P
0 siblings, 0 replies; 48+ messages in thread
From: Sridhar, Kanchana P @ 2026-02-01 2:53 UTC (permalink / raw)
To: Nhat Pham
Cc: linux-kernel, linux-mm, hannes, yosry.ahmed, chengming.zhou,
usamaarif642, ryan.roberts, 21cnbao, ying.huang, akpm,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, Accardi, Kristen C, Gomes, Vinicius,
Cabiddu, Giovanni, Feghali, Wajdi K, Sridhar, Kanchana P
> -----Original Message-----
> From: Nhat Pham <nphamcs@gmail.com>
> Sent: Saturday, January 31, 2026 4:49 PM
> To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> hannes@cmpxchg.org; yosry.ahmed@linux.dev; chengming.zhou@linux.dev;
> usamaarif642@gmail.com; ryan.roberts@arm.com; 21cnbao@gmail.com;
> ying.huang@linux.alibaba.com; akpm@linux-foundation.org;
> senozhatsky@chromium.org; sj@kernel.org; kasong@tencent.com; linux-
> crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> ebiggers@google.com; surenb@google.com; Accardi, Kristen C
> <kristen.c.accardi@intel.com>; Gomes, Vinicius <vinicius.gomes@intel.com>;
> Cabiddu, Giovanni <giovanni.cabiddu@intel.com>; Feghali, Wajdi K
> <wajdi.k.feghali@intel.com>
> Subject: Re: [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for
> compress batching of large folios.
>
> On Sat, Jan 31, 2026 at 12:32 PM Sridhar, Kanchana P
> <kanchana.p.sridhar@intel.com> wrote:
> >
> >
> > > -----Original Message-----
> > > From: Nhat Pham <nphamcs@gmail.com>
> > > Sent: Friday, January 30, 2026 5:13 PM
> > > To: Sridhar, Kanchana P <kanchana.p.sridhar@intel.com>
> > > Cc: linux-kernel@vger.kernel.org; linux-mm@kvack.org;
> > > hannes@cmpxchg.org; yosry.ahmed@linux.dev;
> chengming.zhou@linux.dev;
> > > usamaarif642@gmail.com; ryan.roberts@arm.com; 21cnbao@gmail.com;
> > > ying.huang@linux.alibaba.com; akpm@linux-foundation.org;
> > > senozhatsky@chromium.org; sj@kernel.org; kasong@tencent.com; linux-
> > > crypto@vger.kernel.org; herbert@gondor.apana.org.au;
> > > davem@davemloft.net; clabbe@baylibre.com; ardb@kernel.org;
> > > ebiggers@google.com; surenb@google.com; Accardi, Kristen C
> > > <kristen.c.accardi@intel.com>; Gomes, Vinicius
> <vinicius.gomes@intel.com>;
> > > Cabiddu, Giovanni <giovanni.cabiddu@intel.com>; Feghali, Wajdi K
> > > <wajdi.k.feghali@intel.com>
> > > Subject: Re: [PATCH v14 26/26] mm: zswap: Batched zswap_compress()
> for
> > > compress batching of large folios.
> > >
> > > On Sat, Jan 24, 2026 at 7:36 PM Kanchana P Sridhar
> > > <kanchana.p.sridhar@intel.com> wrote:
> > > >
> > > > We introduce a new batching implementation of zswap_compress() for
> > > > compressors that do and do not support batching. This eliminates code
> > > > duplication and facilitates code maintainability with the introduction
> > > > of compress batching.
> > > >
> > > > The vectorized implementation of calling the earlier zswap_compress()
> > > > sequentially, one page at a time in zswap_store_pages(), is replaced
> > > > with this new version of zswap_compress() that accepts multiple pages to
> > > > compress as a batch.
> > > >
> > > > If the compressor does not support batching, each page in the batch is
> > > > compressed and stored sequentially. If the compressor supports batching,
> > > > e.g., 'deflate-iaa', the Intel IAA hardware accelerator, the batch
> > > > is compressed in parallel in hardware.
> > > >
> > > > If the batch is compressed without errors, the compressed buffers for
> > > > the batch are stored in zsmalloc. In case of compression errors, the
> > > > current behavior based on whether the folio is enabled for zswap
> > > > writeback, is preserved.
> > > >
> > > > The batched zswap_compress() incorporates Herbert's suggestion for
> > > > SG lists to represent the batch's inputs/outputs to interface with the
> > > > crypto API [1].
> > > >
> > > > Performance data:
> > > > =================
> > > > As suggested by Barry, this is the performance data gathered on Intel
> > > > Sapphire Rapids with two workloads:
> > > >
> > > > 1) 30 usemem processes in a 150 GB memory limited cgroup, each
> > > > allocates 10G, i.e, effectively running at 50% memory pressure.
> > > > 2) kernel_compilation "defconfig", 32 threads, cgroup memory limit set
> > > > to 1.7 GiB (50% memory pressure, since baseline memory usage is 3.4
> > > > GiB): data averaged across 10 runs.
> > > >
> > > > To keep comparisons simple, all testing was done without the
> > > > zswap shrinker.
> > > >
> > > >
> > > > =========================================================================
> > > > IAA mm-unstable-1-23-2026 v14
> > > >
> > > > =========================================================================
> > > > zswap compressor deflate-iaa deflate-iaa IAA Batching
> > > > vs.
> > > > IAA Sequential
> > > >
> > > > =========================================================================
> > > > usemem30, 64K folios:
> > > >
> > > > Total throughput (KB/s) 6,226,967 10,551,714 69%
> > > > Average throughput (KB/s) 207,565 351,723 69%
> > > > elapsed time (sec) 99.19 67.45 -32%
> > > > sys time (sec) 2,356.19 1,580.47 -33%
> > > >
> > > > usemem30, PMD folios:
> > > >
> > > > Total throughput (KB/s) 6,347,201 11,315,500 78%
> > > > Average throughput (KB/s) 211,573 377,183 78%
> > > > elapsed time (sec) 88.14 63.37 -28%
> > > > sys time (sec) 2,025.53 1,455.23 -28%
> > > >
> > > > kernel_compilation, 64K folios:
> > > >
> > > > elapsed time (sec) 100.10 98.74 -1.4%
> > > > sys time (sec) 308.72 301.23 -2%
> > > >
> > > > kernel_compilation, PMD folios:
> > > >
> > > > elapsed time (sec) 95.29 93.44 -1.9%
> > > > sys time (sec) 346.21 344.48 -0.5%
> > > >
> > > > =========================================================================
> > > >
> > > >
> > > > =========================================================================
> > > > ZSTD mm-unstable-1-23-2026 v14
> > > >
> > > > =========================================================================
> > > > zswap compressor zstd zstd v14 ZSTD
> > > > Improvement
> > > >
> > > > =========================================================================
> > > > usemem30, 64K folios:
> > > >
> > > > Total throughput (KB/s) 6,032,326 6,047,448 0.3%
> > > > Average throughput (KB/s) 201,077 201,581 0.3%
> > > > elapsed time (sec) 97.52 95.33 -2.2%
> > > > sys time (sec) 2,415.40 2,328.38 -4%
> > > >
> > > > usemem30, PMD folios:
> > > >
> > > > Total throughput (KB/s) 6,570,404 6,623,962 0.8%
> > > > Average throughput (KB/s) 219,013 220,798 0.8%
> > > > elapsed time (sec) 89.17 88.25 -1%
> > > > sys time (sec) 2,126.69 2,043.08 -4%
> > > >
> > > > kernel_compilation, 64K folios:
> > > >
> > > > elapsed time (sec) 100.89 99.98 -0.9%
> > > > sys time (sec) 417.49 414.62 -0.7%
> > > >
> > > > kernel_compilation, PMD folios:
> > > >
> > > > elapsed time (sec) 98.26 97.38 -0.9%
> > > > sys time (sec) 487.14 473.16 -2.9%
> > > >
> > > > =========================================================================
> > >
> > > The rest of the patch changelog (architectural and future
> > > considerations) can stay in the cover letter. Let's not duplicate
> > > information :)
> > >
> > > Keep the patch changelog limited to only the changes in the patch
> > > itself (unless we need some clarifications imminently relevant).
> >
> > Hi Nhat,
> >
> > Thanks for this comment. Yosry had also pointed this out in [1]. I have
> > been including the architectural and future considerations in this change log
> > since Andrew had asked me to do so. I hope this is Ok?
>
> Ah hmmmmm. For some reason I was under the assumption that usually
> Andrew would concatenate the patch cover letter and the patch
> changelog before merging. Oh well.
>
> If Andrew prefers including that here then I'm fine with it.
Ok, thank you Nhat!
Best regards,
Kanchana
>
> >
> > [1]: https://patchwork.kernel.org/comment/26706240/
> >
> > >
> > > I'll review the remainder of the patch later :)
> >
> > Sure.
> >
> > Thanks,
> > Kanchana
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for compress batching of large folios.
2026-01-25 3:35 ` [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for compress batching of large folios Kanchana P Sridhar
2026-01-31 1:12 ` Nhat Pham
@ 2026-02-04 0:30 ` Nhat Pham
2026-02-04 18:10 ` Yosry Ahmed
2026-02-04 18:17 ` Yosry Ahmed
2026-02-04 18:17 ` Yosry Ahmed
3 siblings, 1 reply; 48+ messages in thread
From: Nhat Pham @ 2026-02-04 0:30 UTC (permalink / raw)
To: Kanchana P Sridhar
Cc: linux-kernel, linux-mm, hannes, yosry.ahmed, chengming.zhou,
usamaarif642, ryan.roberts, 21cnbao, ying.huang, akpm,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, kristen.c.accardi, vinicius.gomes,
giovanni.cabiddu, wajdi.k.feghali
On Sat, Jan 24, 2026 at 7:36 PM Kanchana P Sridhar
<kanchana.p.sridhar@intel.com> wrote:
>
> We introduce a new batching implementation of zswap_compress() for
> compressors that do and do not support batching. This eliminates code
> duplication and facilitates code maintainability with the introduction
> of compress batching.
>
> The vectorized implementation of calling the earlier zswap_compress()
> sequentially, one page at a time in zswap_store_pages(), is replaced
> with this new version of zswap_compress() that accepts multiple pages to
> compress as a batch.
>
> If the compressor does not support batching, each page in the batch is
> compressed and stored sequentially. If the compressor supports batching,
> e.g., 'deflate-iaa', the Intel IAA hardware accelerator, the batch
> is compressed in parallel in hardware.
>
> If the batch is compressed without errors, the compressed buffers for
> the batch are stored in zsmalloc. In case of compression errors, the
> current behavior based on whether the folio is enabled for zswap
> writeback, is preserved.
>
> The batched zswap_compress() incorporates Herbert's suggestion for
> SG lists to represent the batch's inputs/outputs to interface with the
> crypto API [1].
>
> Performance data:
> =================
> As suggested by Barry, this is the performance data gathered on Intel
> Sapphire Rapids with two workloads:
>
> 1) 30 usemem processes in a 150 GB memory limited cgroup, each
> allocates 10G, i.e, effectively running at 50% memory pressure.
> 2) kernel_compilation "defconfig", 32 threads, cgroup memory limit set
> to 1.7 GiB (50% memory pressure, since baseline memory usage is 3.4
> GiB): data averaged across 10 runs.
>
> To keep comparisons simple, all testing was done without the
> zswap shrinker.
>
> =========================================================================
> IAA mm-unstable-1-23-2026 v14
> =========================================================================
> zswap compressor deflate-iaa deflate-iaa IAA Batching
> vs.
> IAA Sequential
> =========================================================================
> usemem30, 64K folios:
>
> Total throughput (KB/s) 6,226,967 10,551,714 69%
> Average throughput (KB/s) 207,565 351,723 69%
> elapsed time (sec) 99.19 67.45 -32%
> sys time (sec) 2,356.19 1,580.47 -33%
>
> usemem30, PMD folios:
>
> Total throughput (KB/s) 6,347,201 11,315,500 78%
> Average throughput (KB/s) 211,573 377,183 78%
> elapsed time (sec) 88.14 63.37 -28%
> sys time (sec) 2,025.53 1,455.23 -28%
>
> kernel_compilation, 64K folios:
>
> elapsed time (sec) 100.10 98.74 -1.4%
> sys time (sec) 308.72 301.23 -2%
>
> kernel_compilation, PMD folios:
>
> elapsed time (sec) 95.29 93.44 -1.9%
> sys time (sec) 346.21 344.48 -0.5%
> =========================================================================
>
> =========================================================================
> ZSTD mm-unstable-1-23-2026 v14
> =========================================================================
> zswap compressor zstd zstd v14 ZSTD
> Improvement
> =========================================================================
> usemem30, 64K folios:
>
> Total throughput (KB/s) 6,032,326 6,047,448 0.3%
> Average throughput (KB/s) 201,077 201,581 0.3%
> elapsed time (sec) 97.52 95.33 -2.2%
> sys time (sec) 2,415.40 2,328.38 -4%
>
> usemem30, PMD folios:
>
> Total throughput (KB/s) 6,570,404 6,623,962 0.8%
> Average throughput (KB/s) 219,013 220,798 0.8%
> elapsed time (sec) 89.17 88.25 -1%
> sys time (sec) 2,126.69 2,043.08 -4%
>
> kernel_compilation, 64K folios:
>
> elapsed time (sec) 100.89 99.98 -0.9%
> sys time (sec) 417.49 414.62 -0.7%
>
> kernel_compilation, PMD folios:
>
> elapsed time (sec) 98.26 97.38 -0.9%
> sys time (sec) 487.14 473.16 -2.9%
> =========================================================================
>
> Architectural considerations for the zswap batching framework:
> ==============================================================
> We have designed the zswap batching framework to be
> hardware-agnostic. It has no dependencies on Intel-specific features and
> can be leveraged by any hardware accelerator or software-based
> compressor. In other words, the framework is open and inclusive by
> design.
>
> Potential future clients of the batching framework:
> ===================================================
> This patch-series demonstrates the performance benefits of compression
> batching when used in zswap_store() of large folios. Compression
> batching can be used for other use cases such as batching compression in
> zram, batch compression of different folios during reclaim, kcompressd,
> file systems, etc. Decompression batching can be used to improve
> efficiency of zswap writeback (Thanks Nhat for this idea), batching
> decompressions in zram, etc.
>
> Experiments with kernel_compilation "allmodconfig" that combine zswap
> compress batching, folio reclaim batching, and writeback batching show
> that 0 pages are written back with deflate-iaa and zstd. For comparison,
> the baselines for these compressors see 200K-800K pages written to disk.
> Reclaim batching relieves memory pressure faster than reclaiming one
> folio at a time, hence alleviates the need to scan slab memory for
> writeback.
>
> [1]: https://lore.kernel.org/all/aJ7Fk6RpNc815Ivd@gondor.apana.org.au/T/#m99aea2ce3d284e6c5a3253061d97b08c4752a798
>
> Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
> ---
> mm/zswap.c | 260 ++++++++++++++++++++++++++++++++++++++---------------
> 1 file changed, 190 insertions(+), 70 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 6a22add63220..399112af2c54 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -145,6 +145,7 @@ struct crypto_acomp_ctx {
> struct acomp_req *req;
> struct crypto_wait wait;
> u8 **buffers;
> + struct sg_table *sg_table;
> struct mutex mutex;
> };
>
> @@ -272,6 +273,11 @@ static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx, u8 nr_buffers)
> kfree(acomp_ctx->buffers[i]);
> kfree(acomp_ctx->buffers);
> }
> +
> + if (acomp_ctx->sg_table) {
> + sg_free_table(acomp_ctx->sg_table);
> + kfree(acomp_ctx->sg_table);
> + }
> }
>
> static struct zswap_pool *zswap_pool_create(char *compressor)
> @@ -834,6 +840,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> int nid = cpu_to_node(cpu);
> + struct scatterlist *sg;
> int ret = -ENOMEM;
> u8 i;
>
> @@ -880,6 +887,22 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> goto fail;
> }
>
> + acomp_ctx->sg_table = kmalloc(sizeof(*acomp_ctx->sg_table),
> + GFP_KERNEL);
> + if (!acomp_ctx->sg_table)
> + goto fail;
> +
> + if (sg_alloc_table(acomp_ctx->sg_table, pool->compr_batch_size,
> + GFP_KERNEL))
> + goto fail;
> +
> + /*
> + * Statically map the per-CPU destination buffers to the per-CPU
> + * SG lists.
> + */
> + for_each_sg(acomp_ctx->sg_table->sgl, sg, pool->compr_batch_size, i)
> + sg_set_buf(sg, acomp_ctx->buffers[i], PAGE_SIZE);
> +
> /*
> * if the backend of acomp is async zip, crypto_req_done() will wakeup
> * crypto_wait_req(); if the backend of acomp is scomp, the callback
> @@ -900,84 +923,177 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> return ret;
> }
>
> -static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> - struct zswap_pool *pool, bool wb_enabled)
> +/*
> + * zswap_compress() batching implementation for sequential and batching
> + * compressors.
> + *
> + * Description:
> + * ============
> + *
> + * Compress multiple @nr_pages in @folio starting from the @folio_start index in
> + * batches of @nr_batch_pages.
> + *
> + * It is assumed that @nr_pages <= ZSWAP_MAX_BATCH_SIZE. zswap_store() makes
> + * sure of this by design and zswap_store_pages() warns if this is not true.
> + *
> + * @nr_pages can be in (1, ZSWAP_MAX_BATCH_SIZE] even if the compressor does not
> + * support batching.
> + *
> + * If @nr_batch_pages is 1, each page is processed sequentially.
> + *
> + * If @nr_batch_pages is > 1, compression batching is invoked within
> + * the algorithm's driver, except if @nr_pages is 1: if so, the driver can
> > > + * choose to call its sequential/non-batching compress routine.
Hmm, I'm a bit confused by this documentation.
Why is there extra explanation about nr_batch_pages > 1 and nr_pages
== 1? That cannot happen, no?
nr_batch_pages is already determined by the time we enter
zswap_compress() (the computation is done at its callsite, and already
takes into account nr_pages, since it is the min of nr_pages and the
compressor batch size).
I find this batching (for store), then sub-batching (for compression),
confusing, even if I understand it's to maintain/improve performance
for the software compressors... It makes indices in zswap_compress()
very convoluted.
Yosry and Johannes - any thoughts on this?
> + *
> + * In both cases, if all compressions are successful, the compressed buffers
> + * are stored in zsmalloc.
> + *
> + * Design notes for batching compressors:
> + * ======================================
> + *
> + * Traversing SG lists when @nr_batch_pages is > 1 is expensive, and
> + * impacts batching performance if repeated:
> + * - to map destination buffers to each SG list in @acomp_ctx->sg_table.
> + * - to initialize each output @sg->length to PAGE_SIZE.
> + *
> + * Design choices made to optimize batching with SG lists:
> + *
> + * 1) The source folio pages in the batch are directly submitted to
> + * crypto_acomp via acomp_request_set_src_folio().
> + *
> + * 2) The per-CPU @acomp_ctx->sg_table scatterlists are statically mapped
> + * to the per-CPU dst @buffers at pool creation time.
> + *
> + * 3) zswap_compress() sets the output SG list length to PAGE_SIZE for
> + * non-batching compressors. The batching compressor's driver should do this
> + * as part of iterating through the dst SG lists for batch compression setup.
> + *
> + * Considerations for non-batching and batching compressors:
> + * =========================================================
> + *
> + * For each output SG list in @acomp_ctx->req->sg_table->sgl, the @sg->length
> > > + * should be set to either the page's compressed length (success), or its
> + * compression error value.
> + */
> +static bool zswap_compress(struct folio *folio,
> + long folio_start,
> + u8 nr_pages,
> + u8 nr_batch_pages,
> + struct zswap_entry *entries[],
> + struct zs_pool *zs_pool,
> + struct crypto_acomp_ctx *acomp_ctx,
> + int nid,
> + bool wb_enabled)
> {
> - struct crypto_acomp_ctx *acomp_ctx;
> - struct scatterlist input, output;
> - int comp_ret = 0, alloc_ret = 0;
> - unsigned int dlen = PAGE_SIZE;
> + gfp_t gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
> + unsigned int slen = nr_batch_pages * PAGE_SIZE;
> + u8 batch_start, batch_iter, compr_batch_size_iter;
> + struct scatterlist *sg;
> unsigned long handle;
> - gfp_t gfp;
> - u8 *dst;
> - bool mapped = false;
> -
> - acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
> - mutex_lock(&acomp_ctx->mutex);
> -
> - dst = acomp_ctx->buffers[0];
> - sg_init_table(&input, 1);
> - sg_set_page(&input, page, PAGE_SIZE, 0);
> -
> - sg_init_one(&output, dst, PAGE_SIZE);
> - acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
> + int err, dlen;
> + void *dst;
>
> /*
> - * it maybe looks a little bit silly that we send an asynchronous request,
> - * then wait for its completion synchronously. This makes the process look
> - * synchronous in fact.
> - * Theoretically, acomp supports users send multiple acomp requests in one
> - * acomp instance, then get those requests done simultaneously. but in this
> - * case, zswap actually does store and load page by page, there is no
> - * existing method to send the second page before the first page is done
> - * in one thread doing zswap.
> - * but in different threads running on different cpu, we have different
> - * acomp instance, so multiple threads can do (de)compression in parallel.
> + * Locking the acomp_ctx mutex once per store batch results in better
> + * performance as compared to locking per compress batch.
> */
> - comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
> - dlen = acomp_ctx->req->dlen;
> + mutex_lock(&acomp_ctx->mutex);
>
> /*
> - * If a page cannot be compressed into a size smaller than PAGE_SIZE,
> - * save the content as is without a compression, to keep the LRU order
> - * of writebacks. If writeback is disabled, reject the page since it
> - * only adds metadata overhead. swap_writeout() will put the page back
> - * to the active LRU list in the case.
> + * Compress the @nr_pages in @folio starting at index @folio_start
> + * in batches of @nr_batch_pages.
> */
> - if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
> - if (!wb_enabled) {
> - comp_ret = comp_ret ? comp_ret : -EINVAL;
> - goto unlock;
> - }
> - comp_ret = 0;
> - dlen = PAGE_SIZE;
> - dst = kmap_local_page(page);
> - mapped = true;
> - }
> + for (batch_start = 0; batch_start < nr_pages;
> + batch_start += nr_batch_pages) {
> + /*
> + * Send @nr_batch_pages to crypto_acomp for compression:
> + *
> + * These pages are in @folio's range of indices in the interval
> + * [@folio_start + @batch_start,
> + * @folio_start + @batch_start + @nr_batch_pages).
> + *
> + * @slen indicates the total source length bytes for @nr_batch_pages.
> + *
> + * The pool's compressor batch size is at least @nr_batch_pages,
> + * hence the acomp_ctx has at least @nr_batch_pages dst @buffers.
> + */
> + acomp_request_set_src_folio(acomp_ctx->req, folio,
> + (folio_start + batch_start) * PAGE_SIZE,
> + slen);
> +
> + acomp_ctx->sg_table->sgl->length = slen;
> +
> + acomp_request_set_dst_sg(acomp_ctx->req,
> + acomp_ctx->sg_table->sgl,
> + slen);
> +
> + err = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req),
> + &acomp_ctx->wait);
> +
> + /*
> + * If a page cannot be compressed into a size smaller than
> + * PAGE_SIZE, save the content as is without a compression, to
> + * keep the LRU order of writebacks. If writeback is disabled,
> + * reject the page since it only adds metadata overhead.
> + * swap_writeout() will put the page back to the active LRU list
> + * in the case.
> + *
> + * It is assumed that any compressor that sets the output length
> + * to 0 or a value >= PAGE_SIZE will also return a negative
> + * error status in @err; i.e, will not return a successful
> + * compression status in @err in this case.
> + */
> + if (unlikely(err && !wb_enabled))
> + goto compress_error;
> +
> + for_each_sg(acomp_ctx->sg_table->sgl, sg, nr_batch_pages,
> + compr_batch_size_iter) {
> + batch_iter = batch_start + compr_batch_size_iter;
> + dst = acomp_ctx->buffers[compr_batch_size_iter];
> + dlen = sg->length;
> +
> + if (dlen < 0) {
> + dlen = PAGE_SIZE;
> + dst = kmap_local_page(folio_page(folio,
> + folio_start + batch_iter));
> + }
> +
> + handle = zs_malloc(zs_pool, dlen, gfp, nid);
> +
> + if (unlikely(IS_ERR_VALUE(handle))) {
> + if (PTR_ERR((void *)handle) == -ENOSPC)
> + zswap_reject_compress_poor++;
> + else
> + zswap_reject_alloc_fail++;
>
> - gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
> - handle = zs_malloc(pool->zs_pool, dlen, gfp, page_to_nid(page));
> - if (IS_ERR_VALUE(handle)) {
> - alloc_ret = PTR_ERR((void *)handle);
> - goto unlock;
> + goto err_unlock;
> + }
> +
> + zs_obj_write(zs_pool, handle, dst, dlen);
> + entries[batch_iter]->handle = handle;
> + entries[batch_iter]->length = dlen;
> + if (dst != acomp_ctx->buffers[compr_batch_size_iter])
> + kunmap_local(dst);
> + }
> }
>
> - zs_obj_write(pool->zs_pool, handle, dst, dlen);
> - entry->handle = handle;
> - entry->length = dlen;
> + mutex_unlock(&acomp_ctx->mutex);
> + return true;
>
> -unlock:
> - if (mapped)
> - kunmap_local(dst);
> - if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
> - zswap_reject_compress_poor++;
> - else if (comp_ret)
> - zswap_reject_compress_fail++;
> - else if (alloc_ret)
> - zswap_reject_alloc_fail++;
> +compress_error:
> + for_each_sg(acomp_ctx->sg_table->sgl, sg, nr_batch_pages,
> + compr_batch_size_iter) {
> + if ((int)sg->length < 0) {
> + if ((int)sg->length == -ENOSPC)
> + zswap_reject_compress_poor++;
> + else
> + zswap_reject_compress_fail++;
> + }
> + }
>
> +err_unlock:
> mutex_unlock(&acomp_ctx->mutex);
> - return comp_ret == 0 && alloc_ret == 0;
> + return false;
> }
>
> static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> @@ -1499,12 +1615,16 @@ static bool zswap_store_pages(struct folio *folio,
> INIT_LIST_HEAD(&entries[i]->lru);
> }
>
> - for (i = 0; i < nr_pages; ++i) {
> - struct page *page = folio_page(folio, start + i);
> -
> - if (!zswap_compress(page, entries[i], pool, wb_enabled))
> - goto store_pages_failed;
> - }
> + if (unlikely(!zswap_compress(folio,
> + start,
> + nr_pages,
> + min(nr_pages, pool->compr_batch_size),
Hmm, this is a bit confusing. There seem to be multiple kinds of "batch size".
Am I understanding this correctly:

zswap_store(folio)
  -> zswap_store_pages() - handle a batch of nr_pages from start to end (exclusive)
    -> zswap_compress() - compress a batch of min(compr_batch_size, nr_pages)

where:
* compr_batch_size is the batch size prescribed by the compressor (1 for
  zstd, potentially more for IAA).
* nr_pages is the "store batch size", which can be more than 1, even
  for zstd (to take advantage of cache locality in zswap_store_pages()).
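If that reading is correct, the relationship condenses to roughly the
following (purely illustrative; compress_one_sub_batch() is a hypothetical
placeholder for "issue one crypto request", not a function in the patch):

	/*
	 * nr_pages:       the store batch one zswap_store_pages() call handles,
	 *                 at most ZSWAP_MAX_BATCH_SIZE.
	 * nr_batch_pages: the compress sub-batch handed to one crypto request.
	 */
	u8 nr_batch_pages = min(nr_pages, pool->compr_batch_size);
	u8 batch_start;

	/* zswap_compress() then walks the store batch in sub-batch steps: */
	for (batch_start = 0; batch_start < nr_pages; batch_start += nr_batch_pages)
		compress_one_sub_batch(folio, folio_start + batch_start,
				       nr_batch_pages /* , ... */);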
> + entries,
> + pool->zs_pool,
> + acomp_ctx,
> + nid,
> + wb_enabled)))
> + goto store_pages_failed;
>
> for (i = 0; i < nr_pages; ++i) {
> struct zswap_entry *old, *entry = entries[i];
> --
> 2.27.0
>
The rest looks OK to me, but 80% of this patch is using the new crypto
API, so I'll wait for Herbert's Ack on the first half of the patch
series :)
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for compress batching of large folios.
2026-02-04 0:30 ` Nhat Pham
@ 2026-02-04 18:10 ` Yosry Ahmed
0 siblings, 0 replies; 48+ messages in thread
From: Yosry Ahmed @ 2026-02-04 18:10 UTC (permalink / raw)
To: Nhat Pham
Cc: Kanchana P Sridhar, linux-kernel, linux-mm, hannes,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
akpm, senozhatsky, sj, kasong, linux-crypto, herbert, davem,
clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu, wajdi.k.feghali
On Tue, Feb 03, 2026 at 04:30:48PM -0800, Nhat Pham wrote:
[..]
> @@ -900,84 +923,177 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> > return ret;
> > }
> >
> > -static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> > - struct zswap_pool *pool, bool wb_enabled)
> > +/*
> > + * zswap_compress() batching implementation for sequential and batching
> > + * compressors.
> > + *
> > + * Description:
> > + * ============
> > + *
> > + * Compress multiple @nr_pages in @folio starting from the @folio_start index in
> > + * batches of @nr_batch_pages.
> > + *
> > + * It is assumed that @nr_pages <= ZSWAP_MAX_BATCH_SIZE. zswap_store() makes
> > + * sure of this by design and zswap_store_pages() warns if this is not true.
> > + *
> > + * @nr_pages can be in (1, ZSWAP_MAX_BATCH_SIZE] even if the compressor does not
> > + * support batching.
> > + *
> > + * If @nr_batch_pages is 1, each page is processed sequentially.
> > + *
> > + * If @nr_batch_pages is > 1, compression batching is invoked within
> > + * the algorithm's driver, except if @nr_pages is 1: if so, the driver can
> > + * choose to call its sequential/non-batching compress routine.
>
> Hmm, I'm a bit confused by this documentation.
>
> Why is there extra explanation about nr_batch_pages > 1 and nr_pages
> == 1? That cannot happen, no?
>
> nr_batch_pages is already determined by the time we enter
> zswap_compress() (the computation is done at its callsite, and already
> takes into account nr_pages, since it is the min of nr_pages, and the
> compressor batch size).
>
> I find this batching (for store), then sub-batching (for compression),
> confusing, even if I understand it's to maintain/improve performance
> for the software compressors... It makes indices in zswap_compress()
> very convoluted.
>
> Yosry and Johannes - any thoughts on this?
Yeah, not a big fan either. I am really wondering if the perf hit is
real enough to justify this. I would much rather we use the same batch
size for both.
IIUC the problem is that we cannot use the crypto batching interface for
SW compressors as it requires compressor support, and we cannot avoid
batching altogether for SW compressors because they regress.
I wonder if we can add support in the crypto layer to handle batching
for SW compressors without compressor support (just loop on the batch
and compress one-by-one)?
Alternatively, I did suggest we at least introduce an intermediate
function to do the sub-batching to simplify zswap_compress() (e.g.
zswap_compress() and __zswap_compress()). I think this also caused
regressions but I wonder if we can force inline it or sth.
The current design is really confusing.
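For what it's worth, a rough sketch of that split (hypothetical signatures;
unrelated arguments elided as /* ... */; not code from this series):

/* Compress and store one crypto-request's worth of pages. */
static __always_inline bool __zswap_compress(struct folio *folio, long start,
					     u8 nr /* , ... */)
{
	/* src/dst setup, crypto_acomp_compress(), zs_malloc(), zs_obj_write() */
	return true;
}

static bool zswap_compress(struct folio *folio, long folio_start, u8 nr_pages,
			   u8 nr_batch_pages /* , ... */)
{
	u8 off;

	for (off = 0; off < nr_pages; off += nr_batch_pages) {
		u8 nr = min_t(u8, nr_batch_pages, nr_pages - off);

		if (!__zswap_compress(folio, folio_start + off, nr /* , ... */))
			return false;
	}
	return true;
}

The __always_inline is the "force inline" part; whether that is enough to
avoid the zstd regression would need to be re-measured.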
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for compress batching of large folios.
2026-01-25 3:35 ` [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for compress batching of large folios Kanchana P Sridhar
2026-01-31 1:12 ` Nhat Pham
2026-02-04 0:30 ` Nhat Pham
@ 2026-02-04 18:17 ` Yosry Ahmed
2026-02-04 18:17 ` Yosry Ahmed
3 siblings, 0 replies; 48+ messages in thread
From: Yosry Ahmed @ 2026-02-04 18:17 UTC (permalink / raw)
To: Kanchana P Sridhar
Cc: linux-kernel, linux-mm, hannes, nphamcs, chengming.zhou,
usamaarif642, ryan.roberts, 21cnbao, ying.huang, akpm,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, kristen.c.accardi, vinicius.gomes,
giovanni.cabiddu, wajdi.k.feghali
On Sat, Jan 24, 2026 at 07:35:37PM -0800, Kanchana P Sridhar wrote:
[..]
I am still not happy with the batching approach in general, but I will
leave that to the other thread with Nhat. Other comments below.
> ---
> mm/zswap.c | 260 ++++++++++++++++++++++++++++++++++++++---------------
> 1 file changed, 190 insertions(+), 70 deletions(-)
>
> diff --git a/mm/zswap.c b/mm/zswap.c
> index 6a22add63220..399112af2c54 100644
> --- a/mm/zswap.c
> +++ b/mm/zswap.c
> @@ -145,6 +145,7 @@ struct crypto_acomp_ctx {
> struct acomp_req *req;
> struct crypto_wait wait;
> u8 **buffers;
> + struct sg_table *sg_table;
> struct mutex mutex;
> };
>
> @@ -272,6 +273,11 @@ static void acomp_ctx_dealloc(struct crypto_acomp_ctx *acomp_ctx, u8 nr_buffers)
> kfree(acomp_ctx->buffers[i]);
> kfree(acomp_ctx->buffers);
> }
> +
> + if (acomp_ctx->sg_table) {
> + sg_free_table(acomp_ctx->sg_table);
> + kfree(acomp_ctx->sg_table);
> + }
> }
>
> static struct zswap_pool *zswap_pool_create(char *compressor)
> @@ -834,6 +840,7 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
> struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
> int nid = cpu_to_node(cpu);
> + struct scatterlist *sg;
> int ret = -ENOMEM;
> u8 i;
>
> @@ -880,6 +887,22 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> goto fail;
> }
>
> + acomp_ctx->sg_table = kmalloc(sizeof(*acomp_ctx->sg_table),
> + GFP_KERNEL);
> + if (!acomp_ctx->sg_table)
> + goto fail;
> +
> + if (sg_alloc_table(acomp_ctx->sg_table, pool->compr_batch_size,
> + GFP_KERNEL))
> + goto fail;
> +
> + /*
> + * Statically map the per-CPU destination buffers to the per-CPU
> + * SG lists.
> + */
> + for_each_sg(acomp_ctx->sg_table->sgl, sg, pool->compr_batch_size, i)
> + sg_set_buf(sg, acomp_ctx->buffers[i], PAGE_SIZE);
> +
> /*
> * if the backend of acomp is async zip, crypto_req_done() will wakeup
> * crypto_wait_req(); if the backend of acomp is scomp, the callback
> @@ -900,84 +923,177 @@ static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
> return ret;
> }
>
> -static bool zswap_compress(struct page *page, struct zswap_entry *entry,
> - struct zswap_pool *pool, bool wb_enabled)
> +/*
> + * zswap_compress() batching implementation for sequential and batching
> + * compressors.
> + *
> + * Description:
> + * ============
> + *
> + * Compress multiple @nr_pages in @folio starting from the @folio_start index in
> + * batches of @nr_batch_pages.
> + *
> + * It is assumed that @nr_pages <= ZSWAP_MAX_BATCH_SIZE. zswap_store() makes
> + * sure of this by design and zswap_store_pages() warns if this is not true.
These two lines are not necessary; the WARN documents it.
> + *
> + * @nr_pages can be in (1, ZSWAP_MAX_BATCH_SIZE] even if the compressor does not
> + * support batching.
> + *
> + * If @nr_batch_pages is 1, each page is processed sequentially.
> + *
> + * If @nr_batch_pages is > 1, compression batching is invoked within
> + * the algorithm's driver, except if @nr_pages is 1: if so, the driver can
> + * choose to call its sequential/non-batching compress routine.
I think the "except.." part should be dropped? Can we have
nr_batch_pages > nr_pages?
Also, what the driver does is irrelevant here.
We can probably replace the above two sentences with
* if @nr_batch_pages > 1, the compressor may use batching to
* optimize compression.
> + *
> + * In both cases, if all compressions are successful, the compressed buffers
> + * are stored in zsmalloc.
This part is unnecessary.
> + *
> + * Design notes for batching compressors:
> + * ======================================
> + *
> + * Traversing SG lists when @nr_batch_pages is > 1 is expensive, and
> + * impacts batching performance if repeated:
> + * - to map destination buffers to each SG list in @acomp_ctx->sg_table.
> + * - to initialize each output @sg->length to PAGE_SIZE.
> + *
> + * Design choices made to optimize batching with SG lists:
> + *
> + * 1) The source folio pages in the batch are directly submitted to
> + * crypto_acomp via acomp_request_set_src_folio().
I think this part is a given; what else would we do?
> + *
> + * 2) The per-CPU @acomp_ctx->sg_table scatterlists are statically mapped
> + * to the per-CPU dst @buffers at pool creation time.
This is good to document. Although I think documenting it inline where
@acomp_ctx->sg_table is used would be better.
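For example, the note could sit directly on the setup loop quoted above in
zswap_cpu_comp_prepare() (the comment wording below is only a suggestion):

	/*
	 * Map the per-CPU dst buffers into the per-CPU scatterlists once, at
	 * pool creation; zswap_compress() reuses this table for every request
	 * instead of rebuilding it on the compress fast path.
	 */
	for_each_sg(acomp_ctx->sg_table->sgl, sg, pool->compr_batch_size, i)
		sg_set_buf(sg, acomp_ctx->buffers[i], PAGE_SIZE);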
> + *
> + * 3) zswap_compress() sets the output SG list length to PAGE_SIZE for
> + * non-batching compressors. The batching compressor's driver should do this
> + * as part of iterating through the dst SG lists for batch compression setup.
Not sure what this is referring to?
> + *
> + * Considerations for non-batching and batching compressors:
> + * =========================================================
> + *
> + * For each output SG list in @acomp_ctx->req->sg_table->sgl, the @sg->length
> + * should be set to either the page's compressed length (success), or its
> + * compression error value.
Would also be better to move to where it's used (e.g. when iterating the
sglist after compression).
> + */
> +static bool zswap_compress(struct folio *folio,
> + long folio_start,
> + u8 nr_pages,
> + u8 nr_batch_pages,
> + struct zswap_entry *entries[],
> + struct zs_pool *zs_pool,
> + struct crypto_acomp_ctx *acomp_ctx,
> + int nid,
> + bool wb_enabled)
> {
> - struct crypto_acomp_ctx *acomp_ctx;
> - struct scatterlist input, output;
> - int comp_ret = 0, alloc_ret = 0;
> - unsigned int dlen = PAGE_SIZE;
> + gfp_t gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
> + unsigned int slen = nr_batch_pages * PAGE_SIZE;
> + u8 batch_start, batch_iter, compr_batch_size_iter;
> + struct scatterlist *sg;
> unsigned long handle;
> - gfp_t gfp;
> - u8 *dst;
> - bool mapped = false;
> -
> - acomp_ctx = raw_cpu_ptr(pool->acomp_ctx);
> - mutex_lock(&acomp_ctx->mutex);
> -
> - dst = acomp_ctx->buffers[0];
> - sg_init_table(&input, 1);
> - sg_set_page(&input, page, PAGE_SIZE, 0);
> -
> - sg_init_one(&output, dst, PAGE_SIZE);
> - acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
> + int err, dlen;
> + void *dst;
>
> /*
> - * it maybe looks a little bit silly that we send an asynchronous request,
> - * then wait for its completion synchronously. This makes the process look
> - * synchronous in fact.
> - * Theoretically, acomp supports users send multiple acomp requests in one
> - * acomp instance, then get those requests done simultaneously. but in this
> - * case, zswap actually does store and load page by page, there is no
> - * existing method to send the second page before the first page is done
> - * in one thread doing zswap.
> - * but in different threads running on different cpu, we have different
> - * acomp instance, so multiple threads can do (de)compression in parallel.
> + * Locking the acomp_ctx mutex once per store batch results in better
> + * performance as compared to locking per compress batch.
> */
> - comp_ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
> - dlen = acomp_ctx->req->dlen;
> + mutex_lock(&acomp_ctx->mutex);
>
> /*
> - * If a page cannot be compressed into a size smaller than PAGE_SIZE,
> - * save the content as is without a compression, to keep the LRU order
> - * of writebacks. If writeback is disabled, reject the page since it
> - * only adds metadata overhead. swap_writeout() will put the page back
> - * to the active LRU list in the case.
> + * Compress the @nr_pages in @folio starting at index @folio_start
> + * in batches of @nr_batch_pages.
> */
> - if (comp_ret || !dlen || dlen >= PAGE_SIZE) {
> - if (!wb_enabled) {
> - comp_ret = comp_ret ? comp_ret : -EINVAL;
> - goto unlock;
> - }
> - comp_ret = 0;
> - dlen = PAGE_SIZE;
> - dst = kmap_local_page(page);
> - mapped = true;
> - }
> + for (batch_start = 0; batch_start < nr_pages;
> + batch_start += nr_batch_pages) {
> + /*
> + * Send @nr_batch_pages to crypto_acomp for compression:
> + *
> + * These pages are in @folio's range of indices in the interval
> + * [@folio_start + @batch_start,
> + * @folio_start + @batch_start + @nr_batch_pages).
> + *
> + * @slen indicates the total source length bytes for @nr_batch_pages.
> + *
> + * The pool's compressor batch size is at least @nr_batch_pages,
> + * hence the acomp_ctx has at least @nr_batch_pages dst @buffers.
> + */
> + acomp_request_set_src_folio(acomp_ctx->req, folio,
> + (folio_start + batch_start) * PAGE_SIZE,
> + slen);
> +
> + acomp_ctx->sg_table->sgl->length = slen;
> +
> + acomp_request_set_dst_sg(acomp_ctx->req,
> + acomp_ctx->sg_table->sgl,
> + slen);
> +
> + err = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req),
> + &acomp_ctx->wait);
> +
> + /*
> + * If a page cannot be compressed into a size smaller than
> + * PAGE_SIZE, save the content as is without a compression, to
> + * keep the LRU order of writebacks. If writeback is disabled,
> + * reject the page since it only adds metadata overhead.
> + * swap_writeout() will put the page back to the active LRU list
> + * in the case.
> + *
> + * It is assumed that any compressor that sets the output length
> + * to 0 or a value >= PAGE_SIZE will also return a negative
> + * error status in @err; i.e, will not return a successful
> + * compression status in @err in this case.
> + */
> + if (unlikely(err && !wb_enabled))
> + goto compress_error;
> +
> + for_each_sg(acomp_ctx->sg_table->sgl, sg, nr_batch_pages,
> + compr_batch_size_iter) {
> + batch_iter = batch_start + compr_batch_size_iter;
> + dst = acomp_ctx->buffers[compr_batch_size_iter];
> + dlen = sg->length;
> +
> + if (dlen < 0) {
> + dlen = PAGE_SIZE;
> + dst = kmap_local_page(folio_page(folio,
> + folio_start + batch_iter));
> + }
> +
> + handle = zs_malloc(zs_pool, dlen, gfp, nid);
> +
> + if (unlikely(IS_ERR_VALUE(handle))) {
> + if (PTR_ERR((void *)handle) == -ENOSPC)
> + zswap_reject_compress_poor++;
> + else
> + zswap_reject_alloc_fail++;
>
> - gfp = GFP_NOWAIT | __GFP_NORETRY | __GFP_HIGHMEM | __GFP_MOVABLE;
> - handle = zs_malloc(pool->zs_pool, dlen, gfp, page_to_nid(page));
> - if (IS_ERR_VALUE(handle)) {
> - alloc_ret = PTR_ERR((void *)handle);
> - goto unlock;
> + goto err_unlock;
> + }
> +
> + zs_obj_write(zs_pool, handle, dst, dlen);
> + entries[batch_iter]->handle = handle;
> + entries[batch_iter]->length = dlen;
> + if (dst != acomp_ctx->buffers[compr_batch_size_iter])
> + kunmap_local(dst);
> + }
> }
>
> - zs_obj_write(pool->zs_pool, handle, dst, dlen);
> - entry->handle = handle;
> - entry->length = dlen;
> + mutex_unlock(&acomp_ctx->mutex);
> + return true;
>
> -unlock:
> - if (mapped)
> - kunmap_local(dst);
> - if (comp_ret == -ENOSPC || alloc_ret == -ENOSPC)
> - zswap_reject_compress_poor++;
> - else if (comp_ret)
> - zswap_reject_compress_fail++;
> - else if (alloc_ret)
> - zswap_reject_alloc_fail++;
> +compress_error:
> + for_each_sg(acomp_ctx->sg_table->sgl, sg, nr_batch_pages,
> + compr_batch_size_iter) {
> + if ((int)sg->length < 0) {
> + if ((int)sg->length == -ENOSPC)
> + zswap_reject_compress_poor++;
> + else
> + zswap_reject_compress_fail++;
> + }
> + }
>
> +err_unlock:
> mutex_unlock(&acomp_ctx->mutex);
> - return comp_ret == 0 && alloc_ret == 0;
> + return false;
> }
>
> static bool zswap_decompress(struct zswap_entry *entry, struct folio *folio)
> @@ -1499,12 +1615,16 @@ static bool zswap_store_pages(struct folio *folio,
> INIT_LIST_HEAD(&entries[i]->lru);
> }
>
> - for (i = 0; i < nr_pages; ++i) {
> - struct page *page = folio_page(folio, start + i);
> -
> - if (!zswap_compress(page, entries[i], pool, wb_enabled))
> - goto store_pages_failed;
> - }
> + if (unlikely(!zswap_compress(folio,
> + start,
> + nr_pages,
> + min(nr_pages, pool->compr_batch_size),
> + entries,
> + pool->zs_pool,
> + acomp_ctx,
> + nid,
> + wb_enabled)))
> + goto store_pages_failed;
Similar to the previous patch, I don't like the huge arg list. Drop args
that don't have to be passed in (e.g. acomp_ctx, nid..), and either drop
unlikely() or use an intermediate variable.
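A trimmed call site along those lines might look something like this (only a
sketch of the suggestion, not the actual patch; acomp_ctx and nid would then
be looked up inside zswap_compress()):

	bool compressed;

	compressed = zswap_compress(folio, start, nr_pages,
				    min(nr_pages, pool->compr_batch_size),
				    entries, pool, wb_enabled);
	if (!compressed)
		goto store_pages_failed;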
>
> for (i = 0; i < nr_pages; ++i) {
> struct zswap_entry *old, *entry = entries[i];
> --
> 2.27.0
>
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for compress batching of large folios.
2026-01-25 3:35 ` [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for compress batching of large folios Kanchana P Sridhar
` (2 preceding siblings ...)
2026-02-04 18:17 ` Yosry Ahmed
@ 2026-02-04 18:17 ` Yosry Ahmed
3 siblings, 0 replies; 48+ messages in thread
From: Yosry Ahmed @ 2026-02-04 18:17 UTC (permalink / raw)
To: Kanchana P Sridhar
Cc: linux-kernel, linux-mm, hannes, nphamcs, chengming.zhou,
usamaarif642, ryan.roberts, 21cnbao, ying.huang, akpm,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, kristen.c.accardi, vinicius.gomes,
giovanni.cabiddu, wajdi.k.feghali
On Sat, Jan 24, 2026 at 07:35:37PM -0800, Kanchana P Sridhar wrote:
> We introduce a new batching implementation of zswap_compress() for
> compressors that do and do not support batching. This eliminates code
> duplication and facilitates code maintainability with the introduction
> of compress batching.
>
> The vectorized implementation of calling the earlier zswap_compress()
> sequentially, one page at a time in zswap_store_pages(), is replaced
> with this new version of zswap_compress() that accepts multiple pages to
> compress as a batch.
>
> If the compressor does not support batching, each page in the batch is
> compressed and stored sequentially. If the compressor supports batching,
> e.g., 'deflate-iaa', the Intel IAA hardware accelerator, the batch
> is compressed in parallel in hardware.
>
> If the batch is compressed without errors, the compressed buffers for
> the batch are stored in zsmalloc. In case of compression errors, the
> current behavior based on whether the folio is enabled for zswap
> writeback, is preserved.
>
> The batched zswap_compress() incorporates Herbert's suggestion for
> SG lists to represent the batch's inputs/outputs to interface with the
> crypto API [1].
>
> Performance data:
> =================
> As suggested by Barry, this is the performance data gathered on Intel
> Sapphire Rapids with two workloads:
>
> 1) 30 usemem processes in a 150 GB memory limited cgroup, each
> allocates 10G, i.e, effectively running at 50% memory pressure.
> 2) kernel_compilation "defconfig", 32 threads, cgroup memory limit set
> to 1.7 GiB (50% memory pressure, since baseline memory usage is 3.4
> GiB): data averaged across 10 runs.
>
> To keep comparisons simple, all testing was done without the
> zswap shrinker.
>
> =========================================================================
> IAA mm-unstable-1-23-2026 v14
> =========================================================================
> zswap compressor deflate-iaa deflate-iaa IAA Batching
> vs.
> IAA Sequential
> =========================================================================
> usemem30, 64K folios:
>
> Total throughput (KB/s) 6,226,967 10,551,714 69%
> Average throughput (KB/s) 207,565 351,723 69%
> elapsed time (sec) 99.19 67.45 -32%
> sys time (sec) 2,356.19 1,580.47 -33%
>
> usemem30, PMD folios:
>
> Total throughput (KB/s) 6,347,201 11,315,500 78%
> Average throughput (KB/s) 211,573 377,183 78%
> elapsed time (sec) 88.14 63.37 -28%
> sys time (sec) 2,025.53 1,455.23 -28%
>
> kernel_compilation, 64K folios:
>
> elapsed time (sec) 100.10 98.74 -1.4%
> sys time (sec) 308.72 301.23 -2%
>
> kernel_compilation, PMD folios:
>
> elapsed time (sec) 95.29 93.44 -1.9%
> sys time (sec) 346.21 344.48 -0.5%
> =========================================================================
>
> =========================================================================
> ZSTD mm-unstable-1-23-2026 v14
> =========================================================================
> zswap compressor zstd zstd v14 ZSTD
> Improvement
> =========================================================================
> usemem30, 64K folios:
>
> Total throughput (KB/s) 6,032,326 6,047,448 0.3%
> Average throughput (KB/s) 201,077 201,581 0.3%
> elapsed time (sec) 97.52 95.33 -2.2%
> sys time (sec) 2,415.40 2,328.38 -4%
>
> usemem30, PMD folios:
>
> Total throughput (KB/s) 6,570,404 6,623,962 0.8%
> Average throughput (KB/s) 219,013 220,798 0.8%
> elapsed time (sec) 89.17 88.25 -1%
> sys time (sec) 2,126.69 2,043.08 -4%
>
> kernel_compilation, 64K folios:
>
> elapsed time (sec) 100.89 99.98 -0.9%
> sys time (sec) 417.49 414.62 -0.7%
>
> kernel_compilation, PMD folios:
>
> elapsed time (sec) 98.26 97.38 -0.9%
> sys time (sec) 487.14 473.16 -2.9%
> =========================================================================
>
> Architectural considerations for the zswap batching framework:
> ==============================================================
> We have designed the zswap batching framework to be
> hardware-agnostic. It has no dependencies on Intel-specific features and
> can be leveraged by any hardware accelerator or software-based
> compressor. In other words, the framework is open and inclusive by
> design.
>
> Potential future clients of the batching framework:
> ===================================================
> This patch-series demonstrates the performance benefits of compression
> batching when used in zswap_store() of large folios. Compression
> batching can be used for other use cases such as batching compression in
> zram, batch compression of different folios during reclaim, kcompressd,
> file systems, etc. Decompression batching can be used to improve
> efficiency of zswap writeback (Thanks Nhat for this idea), batching
> decompressions in zram, etc.
>
> Experiments with kernel_compilation "allmodconfig" that combine zswap
> compress batching, folio reclaim batching, and writeback batching show
> that 0 pages are written back with deflate-iaa and zstd. For comparison,
> the baselines for these compressors see 200K-800K pages written to disk.
> Reclaim batching relieves memory pressure faster than reclaiming one
> folio at a time, hence alleviates the need to scan slab memory for
> writeback.
>
> [1]: https://lore.kernel.org/all/aJ7Fk6RpNc815Ivd@gondor.apana.org.au/T/#m99aea2ce3d284e6c5a3253061d97b08c4752a798
>
> Signed-off-by: Kanchana P Sridhar <kanchana.p.sridhar@intel.com>
Herbert, could you please review this patch since most of it is using
new crypto APIs?
Thanks!
^ permalink raw reply [flat|nested] 48+ messages in thread
* Re: [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver
2026-01-25 3:35 [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Kanchana P Sridhar
` (25 preceding siblings ...)
2026-01-25 3:35 ` [PATCH v14 26/26] mm: zswap: Batched zswap_compress() for compress batching of large folios Kanchana P Sridhar
@ 2026-02-04 18:21 ` Yosry Ahmed
2026-02-04 18:39 ` Andrew Morton
26 siblings, 1 reply; 48+ messages in thread
From: Yosry Ahmed @ 2026-02-04 18:21 UTC (permalink / raw)
To: Kanchana P Sridhar
Cc: linux-kernel, linux-mm, hannes, nphamcs, chengming.zhou,
usamaarif642, ryan.roberts, 21cnbao, ying.huang, akpm,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, kristen.c.accardi, vinicius.gomes,
giovanni.cabiddu, wajdi.k.feghali
On Sat, Jan 24, 2026 at 07:35:11PM -0800, Kanchana P Sridhar wrote:
[..]
I think this series is really hard to move and respin in its current
form.
Herbert, could we take in the crypto patches separately (if they are
ready)? Not sure if it's better to take them through the crypto tree
(and provide a tag for Andrew?), or through the mm tree. But either way,
most review is on the later zswap patches and respinning all these
crypto patches every time is a pain.
>
> Kanchana P Sridhar (26):
> crypto: iaa - Reorganize the iaa_crypto driver code.
> crypto: iaa - Replace sprintf with sysfs_emit in sysfs show functions
> crypto: iaa - New architecture for IAA device WQ [de]comp usage & core
> mapping.
> crypto: iaa - Simplify, consistency of function parameters, minor
> stats bug fix.
> crypto: iaa - Descriptor allocation timeouts with mitigations.
> crypto: iaa - iaa_wq uses percpu_refs for get/put reference counting.
> crypto: iaa - Simplify the code flow in iaa_compress() and
> iaa_decompress().
> crypto: iaa - Refactor hardware descriptor setup into separate
> procedures.
> crypto: iaa - Simplified, efficient job submissions for non-irq mode.
> crypto: iaa - Deprecate exporting add/remove IAA compression modes.
> crypto: iaa - Expect a single scatterlist for a [de]compress request's
> src/dst.
> crypto: iaa - Rearchitect iaa_crypto to have clean interfaces with
> crypto_acomp.
> crypto: acomp - Define a unit_size in struct acomp_req to enable
> batching.
> crypto: acomp - Add bit to indicate segmentation support
> crypto: acomp - Add trivial segmentation wrapper
> crypto: iaa - IAA Batching for parallel compressions/decompressions.
> crypto: iaa - Submit the two largest source buffers first in batch
> decompress.
> crypto: acomp, iaa - crypto_acomp integration of IAA Batching.
> crypto: iaa - Enable async mode and make it the default.
> crypto: iaa - Disable iaa_verify_compress by default.
> crypto: iaa - Add deflate-iaa-dynamic compression mode.
> crypto: acomp - Add crypto_acomp_batch_size() to get an algorithm's
> batch-size.
> mm: zswap: Tie per-CPU acomp_ctx lifetime to the pool.
> mm: zswap: Consistently use IS_ERR_OR_NULL() to check acomp_ctx
> resources.
Andrew, I think these two zswap patches are in good shape, and are
standalone improvements. Do they apply to mm-unstable? Could we take
them in separately to lighten the load of respinning this?
> mm: zswap: Store large folios in batches.
> mm: zswap: Batched zswap_compress() for compress batching of large
> folios.
>
> .../driver-api/crypto/iaa/iaa-crypto.rst | 168 +-
> crypto/acompress.c | 110 +-
> crypto/testmgr.c | 10 +
> crypto/testmgr.h | 74 +
> drivers/crypto/intel/iaa/Makefile | 4 +-
> drivers/crypto/intel/iaa/iaa_crypto.h | 95 +-
> .../intel/iaa/iaa_crypto_comp_dynamic.c | 22 +
> drivers/crypto/intel/iaa/iaa_crypto_main.c | 2926 ++++++++++++-----
> drivers/crypto/intel/iaa/iaa_crypto_stats.c | 8 +
> drivers/crypto/intel/iaa/iaa_crypto_stats.h | 2 +
> include/crypto/acompress.h | 68 +
> include/crypto/algapi.h | 5 +
> include/crypto/internal/acompress.h | 15 +
> include/linux/crypto.h | 3 +
> mm/zswap.c | 724 ++--
> 15 files changed, 3144 insertions(+), 1090 deletions(-)
> create mode 100644 drivers/crypto/intel/iaa/iaa_crypto_comp_dynamic.c
>
> --
> 2.27.0
>
* Re: [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver
2026-02-04 18:21 ` [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver Yosry Ahmed
@ 2026-02-04 18:39 ` Andrew Morton
2026-02-04 18:49 ` Yosry Ahmed
0 siblings, 1 reply; 48+ messages in thread
From: Andrew Morton @ 2026-02-04 18:39 UTC (permalink / raw)
To: Yosry Ahmed
Cc: Kanchana P Sridhar, linux-kernel, linux-mm, hannes, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, kristen.c.accardi, vinicius.gomes,
giovanni.cabiddu, wajdi.k.feghali
On Wed, 4 Feb 2026 18:21:43 +0000 Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
> On Sat, Jan 24, 2026 at 07:35:11PM -0800, Kanchana P Sridhar wrote:
> [..]
>
> I think this series is really hard to move and respin in its current
> form.
>
> Herbert, could we take in the crypto patches separately (if they are
> ready)? Not sure if it's better to take them through the crypto tree
> (and provide a tag for Andrew?), or through the mm tree.
Keeping everything in the same tree is of course simpler.
> But either way,
> most review is on the later zswap patches and respinning all these
> crypto patches every time is a pain.
It's mainly a crypto patchset by linecount:
: .../driver-api/crypto/iaa/iaa-crypto.rst | 168 +-
: crypto/acompress.c | 110 +-
: crypto/testmgr.c | 10 +
: crypto/testmgr.h | 74 +
: drivers/crypto/intel/iaa/Makefile | 4 +-
: drivers/crypto/intel/iaa/iaa_crypto.h | 95 +-
: .../intel/iaa/iaa_crypto_comp_dynamic.c | 22 +
: drivers/crypto/intel/iaa/iaa_crypto_main.c | 2926 ++++++++++++-----
: drivers/crypto/intel/iaa/iaa_crypto_stats.c | 8 +
: drivers/crypto/intel/iaa/iaa_crypto_stats.h | 2 +
: include/crypto/acompress.h | 68 +
: include/crypto/algapi.h | 5 +
: include/crypto/internal/acompress.h | 15 +
: include/linux/crypto.h | 3 +
: mm/zswap.c | 724 ++--
: 15 files changed, 3144 insertions(+), 1090 deletions(-)
: create mode 100644 drivers/crypto/intel/iaa/iaa_crypto_comp_dynamic.c
So I expect it'll work to take all this into the crypto tree.
> > mm: zswap: Tie per-CPU acomp_ctx lifetime to the pool.
> > mm: zswap: Consistently use IS_ERR_OR_NULL() to check acomp_ctx
> > resources.
>
> Andrew, I think these two zswap patches are in good shape, and are
> standalone improvements. Do they apply to mm-unstable? Could we take
> them in separately to lighten the load of respinning this?
"mm: zswap: Tie per-CPU acomp_ctx lifetime to the pool" throws a few
rejects.
"mm: zswap: Consistently use IS_ERR_OR_NULL() to check acomp_ctx
resources" also throws rejects when applied standalone.
* Re: [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver
2026-02-04 18:39 ` Andrew Morton
@ 2026-02-04 18:49 ` Yosry Ahmed
2026-02-05 4:16 ` Herbert Xu
0 siblings, 1 reply; 48+ messages in thread
From: Yosry Ahmed @ 2026-02-04 18:49 UTC (permalink / raw)
To: Andrew Morton
Cc: Kanchana P Sridhar, linux-kernel, linux-mm, hannes, nphamcs,
chengming.zhou, usamaarif642, ryan.roberts, 21cnbao, ying.huang,
senozhatsky, sj, kasong, linux-crypto, herbert, davem, clabbe,
ardb, ebiggers, surenb, kristen.c.accardi, vinicius.gomes,
giovanni.cabiddu, wajdi.k.feghali
On Wed, Feb 04, 2026 at 10:39:25AM -0800, Andrew Morton wrote:
> On Wed, 4 Feb 2026 18:21:43 +0000 Yosry Ahmed <yosry.ahmed@linux.dev> wrote:
>
> > On Sat, Jan 24, 2026 at 07:35:11PM -0800, Kanchana P Sridhar wrote:
> > [..]
> >
> > I think this series is really hard to move and respin in its current
> > form.
> >
> > Herbert, could we take in the crypto patches separately (if they are
> > ready)? Not sure if it's better to take them through the crypto tree
> > (and provide a tag for Andrew?), or through the mm tree.
>
> Keeping everything in the same tree is of course simpler.
>
> > But either way,
> > most review is on the later zswap patches and respinning all these
> > crypto patches every time is a pain.
>
> It's mainly a crypto patchset by linecount:
>
> : .../driver-api/crypto/iaa/iaa-crypto.rst | 168 +-
> : crypto/acompress.c | 110 +-
> : crypto/testmgr.c | 10 +
> : crypto/testmgr.h | 74 +
> : drivers/crypto/intel/iaa/Makefile | 4 +-
> : drivers/crypto/intel/iaa/iaa_crypto.h | 95 +-
> : .../intel/iaa/iaa_crypto_comp_dynamic.c | 22 +
> : drivers/crypto/intel/iaa/iaa_crypto_main.c | 2926 ++++++++++++-----
> : drivers/crypto/intel/iaa/iaa_crypto_stats.c | 8 +
> : drivers/crypto/intel/iaa/iaa_crypto_stats.h | 2 +
> : include/crypto/acompress.h | 68 +
> : include/crypto/algapi.h | 5 +
> : include/crypto/internal/acompress.h | 15 +
> : include/linux/crypto.h | 3 +
> : mm/zswap.c | 724 ++--
> : 15 files changed, 3144 insertions(+), 1090 deletions(-)
> : create mode 100644 drivers/crypto/intel/iaa/iaa_crypto_comp_dynamic.c
>
> So I expect it'll work to take all this into the crypto tree.
Herbert, are the crypto patches ready to be picked up? If yes, could you
please pick them, then we can figure out how to route the dependent
zswap patches based on the timeline?
>
> > > mm: zswap: Tie per-CPU acomp_ctx lifetime to the pool.
> > > mm: zswap: Consistently use IS_ERR_OR_NULL() to check acomp_ctx
> > > resources.
> >
> > Andrew, I think these two zswap patches are in good shape, and are
> > standalone improvements. Do they apply to mm-unstable? Could we take
> > them in separately to lighten the load of respinning this?
>
> "mm: zswap: Tie per-CPU acomp_ctx lifetime to the pool" throws a few
> rejects.
>
> "mm: zswap: Consistently use IS_ERR_OR_NULL() to check acomp_ctx
> resources" also throws rejects when applied standalone.
Kanchana, could you please respin these 2 changes, and a new change to
store the nid in the zswap_entry, and send them out separately?
We can land them separately from the rest of the series to accelerate
things.
* Re: [PATCH v14 00/26] zswap compression batching with optimized iaa_crypto driver
2026-02-04 18:49 ` Yosry Ahmed
@ 2026-02-05 4:16 ` Herbert Xu
0 siblings, 0 replies; 48+ messages in thread
From: Herbert Xu @ 2026-02-05 4:16 UTC (permalink / raw)
To: Yosry Ahmed
Cc: Andrew Morton, Kanchana P Sridhar, linux-kernel, linux-mm,
hannes, nphamcs, chengming.zhou, usamaarif642, ryan.roberts,
21cnbao, ying.huang, senozhatsky, sj, kasong, linux-crypto,
davem, clabbe, ardb, ebiggers, surenb, kristen.c.accardi,
vinicius.gomes, giovanni.cabiddu, wajdi.k.feghali
On Wed, Feb 04, 2026 at 06:49:03PM +0000, Yosry Ahmed wrote:
>
> Herbert, are the crypto patches ready to be picked up? If yes, could you
> please pick them, then we can figure out how to route the dependent
> zswap patches based on the timeline?
I can take the first half of the series (up to patch 15) and
we can use that as the base for the next revision.
Thanks,
--
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt