[RFC PATCH 0/6] zsmalloc: make zsmalloc preemptible

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

* [RFC PATCH 0/6] zsmalloc: make zsmalloc preemptible
@ 2025-01-27  7:59 Sergey Senozhatsky
  2025-01-27  7:59 ` [RFC PATCH 1/6] zram: deffer slot free notification Sergey Senozhatsky
                   ` (5 more replies)
  0 siblings, 6 replies; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-27  7:59 UTC (permalink / raw)
  To: Andrew Morton, Minchan Kim, Johannes Weiner, Yosry Ahmed, Nhat Pham
  Cc: linux-mm, linux-kernel, Sergey Senozhatsky

RFC

This is Part II of the series [1] that makes zram read() and write()
preemptible.  This part focuses only zsmalloc because zsmalloc imposes
atomicity restrictions on its users.  One notable example is object
mapping API, which returns with:
a) local CPU lock held
b) zspage rwlock held

First, zsmalloc is converted to use sleepable locks for internal
structs/data managements.  Second, a new handle mapping is introduced
(temporarily, until we remove zs_map_object/zs_unmap_object) which
doesn't use per-CPU buffers (and hence local CPU lock) but requires
users to provide a pointer to temp buffer for objects copy-in (when
needed) so the callers now should be able to schedule while having
objects mapped.

The series makes zsmalloc preemptible which, in turn, makes zram read()
preemptible.

[1] https://lore.kernel.org/linux-mm/20250127072932.1289973-1-senozhatsky@chromium.org

Sergey Senozhatsky (6):
  zram: deffer slot free notification
  zsmalloc: make zspage lock preemptible
  zsmalloc: convert to sleepable pool lock
  zsmalloc: make class lock sleepable
  zsmalloc: introduce handle mapping API
  zram: switch over to zshandle mapping API

 drivers/block/zram/zcomp.c    |   4 +-
 drivers/block/zram/zcomp.h    |   2 +
 drivers/block/zram/zram_drv.c | 169 ++++++++++-----
 drivers/block/zram/zram_drv.h |   4 +
 include/linux/zsmalloc.h      |  29 +++
 mm/zsmalloc.c                 | 372 ++++++++++++++++++++++------------
 6 files changed, 404 insertions(+), 176 deletions(-)

-- 
2.48.1.262.g85cc9f2d1e-goog

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [RFC PATCH 1/6] zram: deffer slot free notification
  2025-01-27  7:59 [RFC PATCH 0/6] zsmalloc: make zsmalloc preemptible Sergey Senozhatsky
@ 2025-01-27  7:59 ` Sergey Senozhatsky
  2025-01-27  7:59 ` [RFC PATCH 2/6] zsmalloc: make zspage lock preemptible Sergey Senozhatsky
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-27  7:59 UTC (permalink / raw)
  To: Andrew Morton, Minchan Kim, Johannes Weiner, Yosry Ahmed, Nhat Pham
  Cc: linux-mm, linux-kernel, Sergey Senozhatsky

As of this moment ->swap_slot_free_notify is called from
atomic section (under spin-lock) which makes it impossible
to make zsmalloc fully preemptible.  Deffer slot-free to
a non-atomic context.

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
 drivers/block/zram/zram_drv.c | 66 +++++++++++++++++++++++++++++++++--
 drivers/block/zram/zram_drv.h |  4 +++
 2 files changed, 68 insertions(+), 2 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index ad3e8885b0d2..9c72beb86ab0 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -2315,9 +2315,52 @@ static void zram_submit_bio(struct bio *bio)
 	}
 }
 
+static void async_slot_free(struct work_struct *work)
+{
+	struct zram *zram = container_of(work, struct zram, slot_free_work);
+
+	spin_lock(&zram->slot_free_lock);
+	while (!list_empty(&zram->slot_free_list)) {
+		struct zram_pp_slot *pps;
+
+		pps = list_first_entry(&zram->slot_free_list,
+				       struct zram_pp_slot,
+				       entry);
+		list_del_init(&pps->entry);
+		spin_unlock(&zram->slot_free_lock);
+
+		zram_slot_write_lock(zram, pps->index);
+		if (zram_test_flag(zram, pps->index, ZRAM_PP_SLOT))
+			zram_free_page(zram, pps->index);
+		zram_slot_write_unlock(zram, pps->index);
+
+		kfree(pps);
+		spin_lock(&zram->slot_free_lock);
+	}
+	spin_unlock(&zram->slot_free_lock);
+};
+
+static void zram_kick_slot_free(struct zram *zram)
+{
+	schedule_work(&zram->slot_free_work);
+}
+
+static void zram_flush_slot_free(struct zram *zram)
+{
+	flush_work(&zram->slot_free_work);
+}
+
+static void zram_init_slot_free(struct zram *zram)
+{
+	spin_lock_init(&zram->slot_free_lock);
+	INIT_LIST_HEAD(&zram->slot_free_list);
+	INIT_WORK(&zram->slot_free_work, async_slot_free);
+}
+
 static void zram_slot_free_notify(struct block_device *bdev,
-				unsigned long index)
+				  unsigned long index)
 {
+	struct zram_pp_slot *pps;
 	struct zram *zram;
 
 	zram = bdev->bd_disk->private_data;
@@ -2328,7 +2371,24 @@ static void zram_slot_free_notify(struct block_device *bdev,
 		return;
 	}
 
-	zram_free_page(zram, index);
+	if (zram_test_flag(zram, index, ZRAM_PP_SLOT))
+		goto out;
+
+	pps = kzalloc(sizeof(*pps), GFP_ATOMIC);
+	if (!pps) {
+		atomic64_inc(&zram->stats.miss_free);
+		goto out;
+	}
+
+	INIT_LIST_HEAD(&pps->entry);
+	pps->index = index;
+	zram_set_flag(zram, index, ZRAM_PP_SLOT);
+	spin_lock(&zram->slot_free_lock);
+	list_add(&pps->entry, &zram->slot_free_list);
+	spin_unlock(&zram->slot_free_lock);
+
+	zram_kick_slot_free(zram);
+out:
 	zram_slot_write_unlock(zram, index);
 }
 
@@ -2473,6 +2533,7 @@ static ssize_t reset_store(struct device *dev,
 
 	/* Make sure all the pending I/O are finished */
 	sync_blockdev(disk->part0);
+	zram_flush_slot_free(zram);
 	zram_reset_device(zram);
 
 	mutex_lock(&disk->open_mutex);
@@ -2618,6 +2679,7 @@ static int zram_add(void)
 	atomic_set(&zram->pp_in_progress, 0);
 	zram_comp_params_reset(zram);
 	comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor);
+	zram_init_slot_free(zram);
 
 	/* Actual capacity set using sysfs (/sys/block/zram<id>/disksize */
 	set_capacity(zram->disk, 0);
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index b7e250d6fa02..27ca269f4a4e 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -134,5 +134,9 @@ struct zram {
 	struct dentry *debugfs_dir;
 #endif
 	atomic_t pp_in_progress;
+
+	spinlock_t slot_free_lock;
+	struct list_head slot_free_list;
+	struct work_struct slot_free_work;
 };
 #endif
-- 
2.48.1.262.g85cc9f2d1e-goog



^ permalink raw reply	[flat|nested] 24+ messages in thread

* [RFC PATCH 2/6] zsmalloc: make zspage lock preemptible
  2025-01-27  7:59 [RFC PATCH 0/6] zsmalloc: make zsmalloc preemptible Sergey Senozhatsky
  2025-01-27  7:59 ` [RFC PATCH 1/6] zram: deffer slot free notification Sergey Senozhatsky
@ 2025-01-27  7:59 ` Sergey Senozhatsky
  2025-01-27 20:23   ` Uros Bizjak
  2025-01-27  7:59 ` [RFC PATCH 3/6] zsmalloc: convert to sleepable pool lock Sergey Senozhatsky
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-27  7:59 UTC (permalink / raw)
  To: Andrew Morton, Minchan Kim, Johannes Weiner, Yosry Ahmed, Nhat Pham
  Cc: linux-mm, linux-kernel, Sergey Senozhatsky

Switch over from rwlock_t to a atomic_t variable that takes
negative value when the page is under migration, or positive
values when the page is used by zsmalloc users (object map,
etc.)  Using a rwsem per-zspage is a little too memory heavy,
a simple atomic_t should suffice, after all we only need to
mark zspage as either used-for-write or used-for-read.  This
is needed to make zsmalloc preemtible in the future.

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
 mm/zsmalloc.c | 112 +++++++++++++++++++++++++++++---------------------
 1 file changed, 66 insertions(+), 46 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 817626a351f8..28a75bfbeaa6 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -257,6 +257,9 @@ static inline void free_zpdesc(struct zpdesc *zpdesc)
 	__free_page(page);
 }
 
+#define ZS_PAGE_UNLOCKED	0
+#define ZS_PAGE_WRLOCKED	-1
+
 struct zspage {
 	struct {
 		unsigned int huge:HUGE_BITS;
@@ -269,7 +272,7 @@ struct zspage {
 	struct zpdesc *first_zpdesc;
 	struct list_head list; /* fullness list */
 	struct zs_pool *pool;
-	rwlock_t lock;
+	atomic_t lock;
 };
 
 struct mapping_area {
@@ -290,11 +293,53 @@ static bool ZsHugePage(struct zspage *zspage)
 	return zspage->huge;
 }
 
-static void migrate_lock_init(struct zspage *zspage);
-static void migrate_read_lock(struct zspage *zspage);
-static void migrate_read_unlock(struct zspage *zspage);
-static void migrate_write_lock(struct zspage *zspage);
-static void migrate_write_unlock(struct zspage *zspage);
+static void zspage_lock_init(struct zspage *zspage)
+{
+	atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
+}
+
+static void zspage_read_lock(struct zspage *zspage)
+{
+	atomic_t *lock = &zspage->lock;
+	int old;
+
+	while (1) {
+		old = atomic_read(lock);
+		if (old == ZS_PAGE_WRLOCKED) {
+			cpu_relax();
+			continue;
+		}
+
+		if (atomic_cmpxchg(lock, old, old + 1) == old)
+			return;
+
+		cpu_relax();
+	}
+}
+
+static void zspage_read_unlock(struct zspage *zspage)
+{
+	atomic_dec(&zspage->lock);
+}
+
+static void zspage_write_lock(struct zspage *zspage)
+{
+	atomic_t *lock = &zspage->lock;
+	int old;
+
+	while (1) {
+		old = atomic_cmpxchg(lock, ZS_PAGE_UNLOCKED, ZS_PAGE_WRLOCKED);
+		if (old == ZS_PAGE_UNLOCKED)
+			return;
+
+		cpu_relax();
+	}
+}
+
+static void zspage_write_unlock(struct zspage *zspage)
+{
+	atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
+}
 
 #ifdef CONFIG_COMPACTION
 static void kick_deferred_free(struct zs_pool *pool);
@@ -992,7 +1037,7 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
 		return NULL;
 
 	zspage->magic = ZSPAGE_MAGIC;
-	migrate_lock_init(zspage);
+	zspage_lock_init(zspage);
 
 	for (i = 0; i < class->pages_per_zspage; i++) {
 		struct zpdesc *zpdesc;
@@ -1217,7 +1262,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 	 * zs_unmap_object API so delegate the locking from class to zspage
 	 * which is smaller granularity.
 	 */
-	migrate_read_lock(zspage);
+	zspage_read_lock(zspage);
 	read_unlock(&pool->migrate_lock);
 
 	class = zspage_class(pool, zspage);
@@ -1277,7 +1322,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
 	}
 	local_unlock(&zs_map_area.lock);
 
-	migrate_read_unlock(zspage);
+	zspage_read_unlock(zspage);
 }
 EXPORT_SYMBOL_GPL(zs_unmap_object);
 
@@ -1671,18 +1716,18 @@ static void lock_zspage(struct zspage *zspage)
 	/*
 	 * Pages we haven't locked yet can be migrated off the list while we're
 	 * trying to lock them, so we need to be careful and only attempt to
-	 * lock each page under migrate_read_lock(). Otherwise, the page we lock
+	 * lock each page under zspage_read_lock(). Otherwise, the page we lock
 	 * may no longer belong to the zspage. This means that we may wait for
 	 * the wrong page to unlock, so we must take a reference to the page
-	 * prior to waiting for it to unlock outside migrate_read_lock().
+	 * prior to waiting for it to unlock outside zspage_read_lock().
 	 */
 	while (1) {
-		migrate_read_lock(zspage);
+		zspage_read_lock(zspage);
 		zpdesc = get_first_zpdesc(zspage);
 		if (zpdesc_trylock(zpdesc))
 			break;
 		zpdesc_get(zpdesc);
-		migrate_read_unlock(zspage);
+		zspage_read_unlock(zspage);
 		zpdesc_wait_locked(zpdesc);
 		zpdesc_put(zpdesc);
 	}
@@ -1693,41 +1738,16 @@ static void lock_zspage(struct zspage *zspage)
 			curr_zpdesc = zpdesc;
 		} else {
 			zpdesc_get(zpdesc);
-			migrate_read_unlock(zspage);
+			zspage_read_unlock(zspage);
 			zpdesc_wait_locked(zpdesc);
 			zpdesc_put(zpdesc);
-			migrate_read_lock(zspage);
+			zspage_read_lock(zspage);
 		}
 	}
-	migrate_read_unlock(zspage);
+	zspage_read_unlock(zspage);
 }
 #endif /* CONFIG_COMPACTION */
 
-static void migrate_lock_init(struct zspage *zspage)
-{
-	rwlock_init(&zspage->lock);
-}
-
-static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
-{
-	read_lock(&zspage->lock);
-}
-
-static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
-{
-	read_unlock(&zspage->lock);
-}
-
-static void migrate_write_lock(struct zspage *zspage)
-{
-	write_lock(&zspage->lock);
-}
-
-static void migrate_write_unlock(struct zspage *zspage)
-{
-	write_unlock(&zspage->lock);
-}
-
 #ifdef CONFIG_COMPACTION
 
 static const struct movable_operations zsmalloc_mops;
@@ -1803,8 +1823,8 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
 	 * the class lock protects zpage alloc/free in the zspage.
 	 */
 	spin_lock(&class->lock);
-	/* the migrate_write_lock protects zpage access via zs_map_object */
-	migrate_write_lock(zspage);
+	/* the zspage_write_lock protects zpage access via zs_map_object */
+	zspage_write_lock(zspage);
 
 	offset = get_first_obj_offset(zpdesc);
 	s_addr = kmap_local_zpdesc(zpdesc);
@@ -1835,7 +1855,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
 	 */
 	write_unlock(&pool->migrate_lock);
 	spin_unlock(&class->lock);
-	migrate_write_unlock(zspage);
+	zspage_write_unlock(zspage);
 
 	zpdesc_get(newzpdesc);
 	if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) {
@@ -1971,9 +1991,9 @@ static unsigned long __zs_compact(struct zs_pool *pool,
 		if (!src_zspage)
 			break;
 
-		migrate_write_lock(src_zspage);
+		zspage_write_lock(src_zspage);
 		migrate_zspage(pool, src_zspage, dst_zspage);
-		migrate_write_unlock(src_zspage);
+		zspage_write_unlock(src_zspage);
 
 		fg = putback_zspage(class, src_zspage);
 		if (fg == ZS_INUSE_RATIO_0) {
-- 
2.48.1.262.g85cc9f2d1e-goog



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 2/6] zsmalloc: make zspage lock preemptible
  2025-01-27  7:59 ` [RFC PATCH 2/6] zsmalloc: make zspage lock preemptible Sergey Senozhatsky
@ 2025-01-27 20:23   ` Uros Bizjak
  2025-01-28  0:29     ` Sergey Senozhatsky
  0 siblings, 1 reply; 24+ messages in thread
From: Uros Bizjak @ 2025-01-27 20:23 UTC (permalink / raw)
  To: Sergey Senozhatsky, Andrew Morton, Minchan Kim, Johannes Weiner,
	Yosry Ahmed, Nhat Pham
  Cc: linux-mm, linux-kernel



On 27. 01. 25 08:59, Sergey Senozhatsky wrote:
> Switch over from rwlock_t to a atomic_t variable that takes
> negative value when the page is under migration, or positive
> values when the page is used by zsmalloc users (object map,
> etc.)  Using a rwsem per-zspage is a little too memory heavy,
> a simple atomic_t should suffice, after all we only need to
> mark zspage as either used-for-write or used-for-read.  This
> is needed to make zsmalloc preemtible in the future.
> 
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> ---
>   mm/zsmalloc.c | 112 +++++++++++++++++++++++++++++---------------------
>   1 file changed, 66 insertions(+), 46 deletions(-)
> 
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index 817626a351f8..28a75bfbeaa6 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -257,6 +257,9 @@ static inline void free_zpdesc(struct zpdesc *zpdesc)
>   	__free_page(page);
>   }
>   
> +#define ZS_PAGE_UNLOCKED	0
> +#define ZS_PAGE_WRLOCKED	-1
> +
>   struct zspage {
>   	struct {
>   		unsigned int huge:HUGE_BITS;
> @@ -269,7 +272,7 @@ struct zspage {
>   	struct zpdesc *first_zpdesc;
>   	struct list_head list; /* fullness list */
>   	struct zs_pool *pool;
> -	rwlock_t lock;
> +	atomic_t lock;
>   };
>   
>   struct mapping_area {
> @@ -290,11 +293,53 @@ static bool ZsHugePage(struct zspage *zspage)
>   	return zspage->huge;
>   }
>   
> -static void migrate_lock_init(struct zspage *zspage);
> -static void migrate_read_lock(struct zspage *zspage);
> -static void migrate_read_unlock(struct zspage *zspage);
> -static void migrate_write_lock(struct zspage *zspage);
> -static void migrate_write_unlock(struct zspage *zspage);
> +static void zspage_lock_init(struct zspage *zspage)
> +{
> +	atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
> +}
> +
> +static void zspage_read_lock(struct zspage *zspage)
> +{
> +	atomic_t *lock = &zspage->lock;
> +	int old;
> +
> +	while (1) {
> +		old = atomic_read(lock);
> +		if (old == ZS_PAGE_WRLOCKED) {
> +			cpu_relax();
> +			continue;
> +		}
> +
> +		if (atomic_cmpxchg(lock, old, old + 1) == old)
> +			return;

You can use atomic_try_cmpxchg() here:

if (atomic_try_cmpxchg(lock, &old, old + 1))
         return;

> +
> +		cpu_relax();
> +	}
> +}
> +
> +static void zspage_read_unlock(struct zspage *zspage)
> +{
> +	atomic_dec(&zspage->lock);
> +}
> +
> +static void zspage_write_lock(struct zspage *zspage)
> +{
> +	atomic_t *lock = &zspage->lock;
> +	int old;
> +
> +	while (1) {
> +		old = atomic_cmpxchg(lock, ZS_PAGE_UNLOCKED, ZS_PAGE_WRLOCKED);
> +		if (old == ZS_PAGE_UNLOCKED)
> +			return;

Also, the above code can be rewritten as:

while (1) {
         old = ZS_PAGE_UNLOCKED;
         if (atomic_try_cmpxchg (lock, &old, ZS_PAGE_WRLOCKED))
                 return;	
> +
> +		cpu_relax();
> +	}
> +}

The above change will result in a slightly better generated asm.

Uros.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 2/6] zsmalloc: make zspage lock preemptible
  2025-01-27 20:23   ` Uros Bizjak
@ 2025-01-28  0:29     ` Sergey Senozhatsky
  0 siblings, 0 replies; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-28  0:29 UTC (permalink / raw)
  To: Uros Bizjak
  Cc: Sergey Senozhatsky, Andrew Morton, Minchan Kim, Johannes Weiner,
	Yosry Ahmed, Nhat Pham, linux-mm, linux-kernel

On (25/01/27 21:23), Uros Bizjak wrote:
> > +static void zspage_read_lock(struct zspage *zspage)
> > +{
> > +	atomic_t *lock = &zspage->lock;
> > +	int old;
> > +
> > +	while (1) {
> > +		old = atomic_read(lock);
> > +		if (old == ZS_PAGE_WRLOCKED) {
> > +			cpu_relax();
> > +			continue;
> > +		}
> > +
> > +		if (atomic_cmpxchg(lock, old, old + 1) == old)
> > +			return;
> 
> You can use atomic_try_cmpxchg() here:
> 
> if (atomic_try_cmpxchg(lock, &old, old + 1))
>         return;
> 
> > +
> > +		cpu_relax();
> > +	}
> > +}
> > +
> > +static void zspage_read_unlock(struct zspage *zspage)
> > +{
> > +	atomic_dec(&zspage->lock);
> > +}
> > +
> > +static void zspage_write_lock(struct zspage *zspage)
> > +{
> > +	atomic_t *lock = &zspage->lock;
> > +	int old;
> > +
> > +	while (1) {
> > +		old = atomic_cmpxchg(lock, ZS_PAGE_UNLOCKED, ZS_PAGE_WRLOCKED);
> > +		if (old == ZS_PAGE_UNLOCKED)
> > +			return;
> 
> Also, the above code can be rewritten as:
> 
> while (1) {
>         old = ZS_PAGE_UNLOCKED;
>         if (atomic_try_cmpxchg (lock, &old, ZS_PAGE_WRLOCKED))
>                 return;	
> > +
> > +		cpu_relax();
> > +	}
> > +}
> 
> The above change will result in a slightly better generated asm.

Thanks, I'll take a look for the next version.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* [RFC PATCH 3/6] zsmalloc: convert to sleepable pool lock
  2025-01-27  7:59 [RFC PATCH 0/6] zsmalloc: make zsmalloc preemptible Sergey Senozhatsky
  2025-01-27  7:59 ` [RFC PATCH 1/6] zram: deffer slot free notification Sergey Senozhatsky
  2025-01-27  7:59 ` [RFC PATCH 2/6] zsmalloc: make zspage lock preemptible Sergey Senozhatsky
@ 2025-01-27  7:59 ` Sergey Senozhatsky
  2025-01-27  7:59 ` [RFC PATCH 4/6] zsmalloc: make class lock sleepable Sergey Senozhatsky
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-27  7:59 UTC (permalink / raw)
  To: Andrew Morton, Minchan Kim, Johannes Weiner, Yosry Ahmed, Nhat Pham
  Cc: linux-mm, linux-kernel, Sergey Senozhatsky

Switch over from rwlock_t to rwsemaphore, also introduce
simple helpers to lock/unlock the pool.  This is needed
to make zsmalloc preemptible in the future.

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
 mm/zsmalloc.c | 58 ++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 41 insertions(+), 17 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 28a75bfbeaa6..751871ec533f 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -204,8 +204,8 @@ struct link_free {
 };
 
 struct zs_pool {
-	const char *name;
-
+	/* protect page/zspage migration */
+	struct rw_semaphore migrate_lock;
 	struct size_class *size_class[ZS_SIZE_CLASSES];
 	struct kmem_cache *handle_cachep;
 	struct kmem_cache *zspage_cachep;
@@ -216,6 +216,7 @@ struct zs_pool {
 
 	/* Compact classes */
 	struct shrinker *shrinker;
+	atomic_t compaction_in_progress;
 
 #ifdef CONFIG_ZSMALLOC_STAT
 	struct dentry *stat_dentry;
@@ -223,11 +224,34 @@ struct zs_pool {
 #ifdef CONFIG_COMPACTION
 	struct work_struct free_work;
 #endif
-	/* protect page/zspage migration */
-	rwlock_t migrate_lock;
-	atomic_t compaction_in_progress;
+	const char *name;
 };
 
+static void pool_write_unlock(struct zs_pool *pool)
+{
+	up_write(&pool->migrate_lock);
+}
+
+static void pool_write_lock(struct zs_pool *pool)
+{
+	down_write(&pool->migrate_lock);
+}
+
+static void pool_read_unlock(struct zs_pool *pool)
+{
+	up_read(&pool->migrate_lock);
+}
+
+static void pool_read_lock(struct zs_pool *pool)
+{
+	down_read(&pool->migrate_lock);
+}
+
+static bool zspool_lock_is_contended(struct zs_pool *pool)
+{
+	return rwsem_is_contended(&pool->migrate_lock);
+}
+
 static inline void zpdesc_set_first(struct zpdesc *zpdesc)
 {
 	SetPagePrivate(zpdesc_page(zpdesc));
@@ -1251,7 +1275,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 	BUG_ON(in_interrupt());
 
 	/* It guarantees it can get zspage from handle safely */
-	read_lock(&pool->migrate_lock);
+	pool_read_lock(pool);
 	obj = handle_to_obj(handle);
 	obj_to_location(obj, &zpdesc, &obj_idx);
 	zspage = get_zspage(zpdesc);
@@ -1263,7 +1287,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 	 * which is smaller granularity.
 	 */
 	zspage_read_lock(zspage);
-	read_unlock(&pool->migrate_lock);
+	pool_read_unlock(pool);
 
 	class = zspage_class(pool, zspage);
 	off = offset_in_page(class->size * obj_idx);
@@ -1498,13 +1522,13 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
 	 * The pool->migrate_lock protects the race with zpage's migration
 	 * so it's safe to get the page from handle.
 	 */
-	read_lock(&pool->migrate_lock);
+	pool_read_lock(pool);
 	obj = handle_to_obj(handle);
 	obj_to_zpdesc(obj, &f_zpdesc);
 	zspage = get_zspage(f_zpdesc);
 	class = zspage_class(pool, zspage);
 	spin_lock(&class->lock);
-	read_unlock(&pool->migrate_lock);
+	pool_read_unlock(pool);
 
 	class_stat_sub(class, ZS_OBJS_INUSE, 1);
 	obj_free(class->size, obj);
@@ -1816,7 +1840,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
 	 * The pool migrate_lock protects the race between zpage migration
 	 * and zs_free.
 	 */
-	write_lock(&pool->migrate_lock);
+	pool_write_lock(pool);
 	class = zspage_class(pool, zspage);
 
 	/*
@@ -1853,7 +1877,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
 	 * Since we complete the data copy and set up new zspage structure,
 	 * it's okay to release migration_lock.
 	 */
-	write_unlock(&pool->migrate_lock);
+	pool_write_unlock(pool);
 	spin_unlock(&class->lock);
 	zspage_write_unlock(zspage);
 
@@ -1976,7 +2000,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
 	 * protect the race between zpage migration and zs_free
 	 * as well as zpage allocation/free
 	 */
-	write_lock(&pool->migrate_lock);
+	pool_write_lock(pool);
 	spin_lock(&class->lock);
 	while (zs_can_compact(class)) {
 		int fg;
@@ -2003,14 +2027,14 @@ static unsigned long __zs_compact(struct zs_pool *pool,
 		src_zspage = NULL;
 
 		if (get_fullness_group(class, dst_zspage) == ZS_INUSE_RATIO_100
-		    || rwlock_is_contended(&pool->migrate_lock)) {
+		    || zspool_lock_is_contended(pool)) {
 			putback_zspage(class, dst_zspage);
 			dst_zspage = NULL;
 
 			spin_unlock(&class->lock);
-			write_unlock(&pool->migrate_lock);
+			pool_write_unlock(pool);
 			cond_resched();
-			write_lock(&pool->migrate_lock);
+			pool_write_lock(pool);
 			spin_lock(&class->lock);
 		}
 	}
@@ -2022,7 +2046,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
 		putback_zspage(class, dst_zspage);
 
 	spin_unlock(&class->lock);
-	write_unlock(&pool->migrate_lock);
+	pool_write_unlock(pool);
 
 	return pages_freed;
 }
@@ -2159,7 +2183,7 @@ struct zs_pool *zs_create_pool(const char *name)
 		return NULL;
 
 	init_deferred_free(pool);
-	rwlock_init(&pool->migrate_lock);
+	init_rwsem(&pool->migrate_lock);
 	atomic_set(&pool->compaction_in_progress, 0);
 
 	pool->name = kstrdup(name, GFP_KERNEL);
-- 
2.48.1.262.g85cc9f2d1e-goog



^ permalink raw reply	[flat|nested] 24+ messages in thread

* [RFC PATCH 4/6] zsmalloc: make class lock sleepable
  2025-01-27  7:59 [RFC PATCH 0/6] zsmalloc: make zsmalloc preemptible Sergey Senozhatsky
                   ` (2 preceding siblings ...)
  2025-01-27  7:59 ` [RFC PATCH 3/6] zsmalloc: convert to sleepable pool lock Sergey Senozhatsky
@ 2025-01-27  7:59 ` Sergey Senozhatsky
  2025-01-27  7:59 ` [RFC PATCH 5/6] zsmalloc: introduce handle mapping API Sergey Senozhatsky
  2025-01-27  7:59 ` [RFC PATCH 6/6] zram: switch over to zshandle " Sergey Senozhatsky
  5 siblings, 0 replies; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-27  7:59 UTC (permalink / raw)
  To: Andrew Morton, Minchan Kim, Johannes Weiner, Yosry Ahmed, Nhat Pham
  Cc: linux-mm, linux-kernel, Sergey Senozhatsky

Switch over from spin-lock to mutex, also introduce simple
helpers to lock/unlock size class.  This is needed to make
zsmalloc preemptible in the future.

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
 mm/zsmalloc.c | 54 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 30 insertions(+), 24 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 751871ec533f..a5c1f9852072 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -168,7 +168,7 @@ static struct dentry *zs_stat_root;
 static size_t huge_class_size;
 
 struct size_class {
-	spinlock_t lock;
+	struct mutex lock;
 	struct list_head fullness_list[NR_FULLNESS_GROUPS];
 	/*
 	 * Size of objects stored in this class. Must be multiple
@@ -252,6 +252,16 @@ static bool zspool_lock_is_contended(struct zs_pool *pool)
 	return rwsem_is_contended(&pool->migrate_lock);
 }
 
+static void size_class_lock(struct size_class *class)
+{
+	mutex_lock(&class->lock);
+}
+
+static void size_class_unlock(struct size_class *class)
+{
+	mutex_unlock(&class->lock);
+}
+
 static inline void zpdesc_set_first(struct zpdesc *zpdesc)
 {
 	SetPagePrivate(zpdesc_page(zpdesc));
@@ -657,8 +667,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
 		if (class->index != i)
 			continue;
 
-		spin_lock(&class->lock);
-
+		size_class_lock(class);
 		seq_printf(s, " %5u %5u ", i, class->size);
 		for (fg = ZS_INUSE_RATIO_10; fg < NR_FULLNESS_GROUPS; fg++) {
 			inuse_totals[fg] += class_stat_read(class, fg);
@@ -668,7 +677,7 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
 		obj_allocated = class_stat_read(class, ZS_OBJS_ALLOCATED);
 		obj_used = class_stat_read(class, ZS_OBJS_INUSE);
 		freeable = zs_can_compact(class);
-		spin_unlock(&class->lock);
+		size_class_unlock(class);
 
 		objs_per_zspage = class->objs_per_zspage;
 		pages_used = obj_allocated / objs_per_zspage *
@@ -926,8 +935,6 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
 {
 	struct zpdesc *zpdesc, *next;
 
-	assert_spin_locked(&class->lock);
-
 	VM_BUG_ON(get_zspage_inuse(zspage));
 	VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0);
 
@@ -1443,7 +1450,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
 	class = pool->size_class[get_size_class_index(size)];
 
 	/* class->lock effectively protects the zpage migration */
-	spin_lock(&class->lock);
+	size_class_lock(class);
 	zspage = find_get_zspage(class);
 	if (likely(zspage)) {
 		obj_malloc(pool, zspage, handle);
@@ -1453,8 +1460,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
 
 		goto out;
 	}
-
-	spin_unlock(&class->lock);
+	size_class_unlock(class);
 
 	zspage = alloc_zspage(pool, class, gfp);
 	if (!zspage) {
@@ -1462,7 +1468,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
 		return (unsigned long)ERR_PTR(-ENOMEM);
 	}
 
-	spin_lock(&class->lock);
+	size_class_lock(class);
 	obj_malloc(pool, zspage, handle);
 	newfg = get_fullness_group(class, zspage);
 	insert_zspage(class, zspage, newfg);
@@ -1473,7 +1479,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
 	/* We completely set up zspage so mark them as movable */
 	SetZsPageMovable(pool, zspage);
 out:
-	spin_unlock(&class->lock);
+	size_class_unlock(class);
 
 	return handle;
 }
@@ -1527,7 +1533,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
 	obj_to_zpdesc(obj, &f_zpdesc);
 	zspage = get_zspage(f_zpdesc);
 	class = zspage_class(pool, zspage);
-	spin_lock(&class->lock);
+	size_class_lock(class);
 	pool_read_unlock(pool);
 
 	class_stat_sub(class, ZS_OBJS_INUSE, 1);
@@ -1537,7 +1543,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
 	if (fullness == ZS_INUSE_RATIO_0)
 		free_zspage(pool, class, zspage);
 
-	spin_unlock(&class->lock);
+	size_class_unlock(class);
 	cache_free_handle(pool, handle);
 }
 EXPORT_SYMBOL_GPL(zs_free);
@@ -1846,7 +1852,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
 	/*
 	 * the class lock protects zpage alloc/free in the zspage.
 	 */
-	spin_lock(&class->lock);
+	size_class_lock(class);
 	/* the zspage_write_lock protects zpage access via zs_map_object */
 	zspage_write_lock(zspage);
 
@@ -1878,7 +1884,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
 	 * it's okay to release migration_lock.
 	 */
 	pool_write_unlock(pool);
-	spin_unlock(&class->lock);
+	size_class_unlock(class);
 	zspage_write_unlock(zspage);
 
 	zpdesc_get(newzpdesc);
@@ -1922,10 +1928,10 @@ static void async_free_zspage(struct work_struct *work)
 		if (class->index != i)
 			continue;
 
-		spin_lock(&class->lock);
+		size_class_lock(class);
 		list_splice_init(&class->fullness_list[ZS_INUSE_RATIO_0],
 				 &free_pages);
-		spin_unlock(&class->lock);
+		size_class_unlock(class);
 	}
 
 	list_for_each_entry_safe(zspage, tmp, &free_pages, list) {
@@ -1933,10 +1939,10 @@ static void async_free_zspage(struct work_struct *work)
 		lock_zspage(zspage);
 
 		class = zspage_class(pool, zspage);
-		spin_lock(&class->lock);
+		size_class_lock(class);
 		class_stat_sub(class, ZS_INUSE_RATIO_0, 1);
 		__free_zspage(pool, class, zspage);
-		spin_unlock(&class->lock);
+		size_class_unlock(class);
 	}
 };
 
@@ -2001,7 +2007,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
 	 * as well as zpage allocation/free
 	 */
 	pool_write_lock(pool);
-	spin_lock(&class->lock);
+	size_class_lock(class);
 	while (zs_can_compact(class)) {
 		int fg;
 
@@ -2031,11 +2037,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
 			putback_zspage(class, dst_zspage);
 			dst_zspage = NULL;
 
-			spin_unlock(&class->lock);
+			size_class_unlock(class);
 			pool_write_unlock(pool);
 			cond_resched();
 			pool_write_lock(pool);
-			spin_lock(&class->lock);
+			size_class_lock(class);
 		}
 	}
 
@@ -2045,7 +2051,7 @@ static unsigned long __zs_compact(struct zs_pool *pool,
 	if (dst_zspage)
 		putback_zspage(class, dst_zspage);
 
-	spin_unlock(&class->lock);
+	size_class_unlock(class);
 	pool_write_unlock(pool);
 
 	return pages_freed;
@@ -2255,7 +2261,7 @@ struct zs_pool *zs_create_pool(const char *name)
 		class->index = i;
 		class->pages_per_zspage = pages_per_zspage;
 		class->objs_per_zspage = objs_per_zspage;
-		spin_lock_init(&class->lock);
+		mutex_init(&class->lock);
 		pool->size_class[i] = class;
 
 		fullness = ZS_INUSE_RATIO_0;
-- 
2.48.1.262.g85cc9f2d1e-goog



^ permalink raw reply	[flat|nested] 24+ messages in thread

* [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-27  7:59 [RFC PATCH 0/6] zsmalloc: make zsmalloc preemptible Sergey Senozhatsky
                   ` (3 preceding siblings ...)
  2025-01-27  7:59 ` [RFC PATCH 4/6] zsmalloc: make class lock sleepable Sergey Senozhatsky
@ 2025-01-27  7:59 ` Sergey Senozhatsky
  2025-01-27 21:26   ` Yosry Ahmed
  2025-01-27 21:58   ` Yosry Ahmed
  2025-01-27  7:59 ` [RFC PATCH 6/6] zram: switch over to zshandle " Sergey Senozhatsky
  5 siblings, 2 replies; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-27  7:59 UTC (permalink / raw)
  To: Andrew Morton, Minchan Kim, Johannes Weiner, Yosry Ahmed, Nhat Pham
  Cc: linux-mm, linux-kernel, Sergey Senozhatsky

Introduce new API to map/unmap zsmalloc handle/object.  The key
difference is that this API does not impose atomicity restrictions
on its users, unlike zs_map_object() which returns with page-faults
and preemption disabled - handle mapping API does not need a per-CPU
vm-area because the users are required to provide an aux buffer for
objects that span several physical pages.

Keep zs_map_object/zs_unmap_object for the time being, as there are
still users of it, but eventually old API will be removed.

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
 include/linux/zsmalloc.h |  29 ++++++++
 mm/zsmalloc.c            | 148 ++++++++++++++++++++++++++++-----------
 2 files changed, 138 insertions(+), 39 deletions(-)

diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index a48cd0ffe57d..72d84537dd38 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -58,4 +58,33 @@ unsigned long zs_compact(struct zs_pool *pool);
 unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size);
 
 void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats);
+
+struct zs_handle_mapping {
+	unsigned long handle;
+	/* Points to start of the object data either within local_copy or
+	 * within local_mapping. This is what callers should use to access
+	 * or modify handle data.
+	 */
+	void *handle_mem;
+
+	enum zs_mapmode mode;
+	union {
+		/*
+		 * Handle object data copied, because it spans across several
+		 * (non-contiguous) physical pages. This pointer should be
+		 * set by the zs_map_handle() caller beforehand and should
+		 * never be accessed directly.
+		 */
+		void *local_copy;
+		/*
+		 * Handle object mapped directly. Should never be used
+		 * directly.
+		 */
+		void *local_mapping;
+	};
+};
+
+int zs_map_handle(struct zs_pool *pool, struct zs_handle_mapping *map);
+void zs_unmap_handle(struct zs_pool *pool, struct zs_handle_mapping *map);
+
 #endif
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index a5c1f9852072..281bba4a3277 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1132,18 +1132,14 @@ static inline void __zs_cpu_down(struct mapping_area *area)
 	area->vm_buf = NULL;
 }
 
-static void *__zs_map_object(struct mapping_area *area,
-			struct zpdesc *zpdescs[2], int off, int size)
+static void zs_obj_copyin(void *buf, struct zpdesc *zpdesc, int off, int size)
 {
+	struct zpdesc *zpdescs[2];
 	size_t sizes[2];
-	char *buf = area->vm_buf;
-
-	/* disable page faults to match kmap_local_page() return conditions */
-	pagefault_disable();
 
-	/* no read fastpath */
-	if (area->vm_mm == ZS_MM_WO)
-		goto out;
+	zpdescs[0] = zpdesc;
+	zpdescs[1] = get_next_zpdesc(zpdesc);
+	BUG_ON(!zpdescs[1]);
 
 	sizes[0] = PAGE_SIZE - off;
 	sizes[1] = size - sizes[0];
@@ -1151,21 +1147,17 @@ static void *__zs_map_object(struct mapping_area *area,
 	/* copy object to per-cpu buffer */
 	memcpy_from_page(buf, zpdesc_page(zpdescs[0]), off, sizes[0]);
 	memcpy_from_page(buf + sizes[0], zpdesc_page(zpdescs[1]), 0, sizes[1]);
-out:
-	return area->vm_buf;
 }
 
-static void __zs_unmap_object(struct mapping_area *area,
-			struct zpdesc *zpdescs[2], int off, int size)
+static void zs_obj_copyout(void *buf, struct zpdesc *zpdesc, int off, int size)
 {
+	struct zpdesc *zpdescs[2];
 	size_t sizes[2];
-	char *buf;
 
-	/* no write fastpath */
-	if (area->vm_mm == ZS_MM_RO)
-		goto out;
+	zpdescs[0] = zpdesc;
+	zpdescs[1] = get_next_zpdesc(zpdesc);
+	BUG_ON(!zpdescs[1]);
 
-	buf = area->vm_buf;
 	buf = buf + ZS_HANDLE_SIZE;
 	size -= ZS_HANDLE_SIZE;
 	off += ZS_HANDLE_SIZE;
@@ -1176,10 +1168,6 @@ static void __zs_unmap_object(struct mapping_area *area,
 	/* copy per-cpu buffer to object */
 	memcpy_to_page(zpdesc_page(zpdescs[0]), off, buf, sizes[0]);
 	memcpy_to_page(zpdesc_page(zpdescs[1]), 0, buf + sizes[0], sizes[1]);
-
-out:
-	/* enable page faults to match kunmap_local() return conditions */
-	pagefault_enable();
 }
 
 static int zs_cpu_prepare(unsigned int cpu)
@@ -1260,6 +1248,8 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages);
  * against nested mappings.
  *
  * This function returns with preemption and page faults disabled.
+ *
+ * NOTE: this function is deprecated and will be removed.
  */
 void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 			enum zs_mapmode mm)
@@ -1268,10 +1258,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 	struct zpdesc *zpdesc;
 	unsigned long obj, off;
 	unsigned int obj_idx;
-
 	struct size_class *class;
 	struct mapping_area *area;
-	struct zpdesc *zpdescs[2];
 	void *ret;
 
 	/*
@@ -1309,12 +1297,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 		goto out;
 	}
 
-	/* this object spans two pages */
-	zpdescs[0] = zpdesc;
-	zpdescs[1] = get_next_zpdesc(zpdesc);
-	BUG_ON(!zpdescs[1]);
+	ret = area->vm_buf;
+	/* disable page faults to match kmap_local_page() return conditions */
+	pagefault_disable();
+	if (mm != ZS_MM_WO) {
+		/* this object spans two pages */
+		zs_obj_copyin(area->vm_buf, zpdesc, off, class->size);
+	}
 
-	ret = __zs_map_object(area, zpdescs, off, class->size);
 out:
 	if (likely(!ZsHugePage(zspage)))
 		ret += ZS_HANDLE_SIZE;
@@ -1323,13 +1313,13 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
 }
 EXPORT_SYMBOL_GPL(zs_map_object);
 
+/* NOTE: this function is deprecated and will be removed. */
 void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
 {
 	struct zspage *zspage;
 	struct zpdesc *zpdesc;
 	unsigned long obj, off;
 	unsigned int obj_idx;
-
 	struct size_class *class;
 	struct mapping_area *area;
 
@@ -1340,23 +1330,103 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
 	off = offset_in_page(class->size * obj_idx);
 
 	area = this_cpu_ptr(&zs_map_area);
-	if (off + class->size <= PAGE_SIZE)
+	if (off + class->size <= PAGE_SIZE) {
 		kunmap_local(area->vm_addr);
-	else {
-		struct zpdesc *zpdescs[2];
+		goto out;
+	}
 
-		zpdescs[0] = zpdesc;
-		zpdescs[1] = get_next_zpdesc(zpdesc);
-		BUG_ON(!zpdescs[1]);
+	if (area->vm_mm != ZS_MM_RO)
+		zs_obj_copyout(area->vm_buf, zpdesc, off, class->size);
+	/* enable page faults to match kunmap_local() return conditions */
+	pagefault_enable();
 
-		__zs_unmap_object(area, zpdescs, off, class->size);
-	}
+out:
 	local_unlock(&zs_map_area.lock);
-
 	zspage_read_unlock(zspage);
 }
 EXPORT_SYMBOL_GPL(zs_unmap_object);
 
+void zs_unmap_handle(struct zs_pool *pool, struct zs_handle_mapping *map)
+{
+	struct zspage *zspage;
+	struct zpdesc *zpdesc;
+	unsigned long obj, off;
+	unsigned int obj_idx;
+	struct size_class *class;
+
+	obj = handle_to_obj(map->handle);
+	obj_to_location(obj, &zpdesc, &obj_idx);
+	zspage = get_zspage(zpdesc);
+	class = zspage_class(pool, zspage);
+	off = offset_in_page(class->size * obj_idx);
+
+	if (off + class->size <= PAGE_SIZE) {
+		kunmap_local(map->local_mapping);
+		goto out;
+	}
+
+	if (map->mode != ZS_MM_RO)
+		zs_obj_copyout(map->local_copy, zpdesc, off, class->size);
+
+out:
+	zspage_read_unlock(zspage);
+}
+EXPORT_SYMBOL_GPL(zs_unmap_handle);
+
+int zs_map_handle(struct zs_pool *pool, struct zs_handle_mapping *map)
+{
+	struct zspage *zspage;
+	struct zpdesc *zpdesc;
+	unsigned long obj, off;
+	unsigned int obj_idx;
+	struct size_class *class;
+
+	WARN_ON(in_interrupt());
+
+	/* It guarantees it can get zspage from handle safely */
+	pool_read_lock(pool);
+	obj = handle_to_obj(map->handle);
+	obj_to_location(obj, &zpdesc, &obj_idx);
+	zspage = get_zspage(zpdesc);
+
+	/*
+	 * migration cannot move any zpages in this zspage. Here, class->lock
+	 * is too heavy since callers would take some time until they calls
+	 * zs_unmap_object API so delegate the locking from class to zspage
+	 * which is smaller granularity.
+	 */
+	zspage_read_lock(zspage);
+	pool_read_unlock(pool);
+
+	class = zspage_class(pool, zspage);
+	off = offset_in_page(class->size * obj_idx);
+
+	if (off + class->size <= PAGE_SIZE) {
+		/* this object is contained entirely within a page */
+		map->local_mapping = kmap_local_zpdesc(zpdesc);
+		map->handle_mem = map->local_mapping + off;
+		goto out;
+	}
+
+	if (WARN_ON_ONCE(!map->local_copy)) {
+		zspage_read_unlock(zspage);
+		return -EINVAL;
+	}
+
+	map->handle_mem = map->local_copy;
+	if (map->mode != ZS_MM_WO) {
+		/* this object spans two pages */
+		zs_obj_copyin(map->local_copy, zpdesc, off, class->size);
+	}
+
+out:
+	if (likely(!ZsHugePage(zspage)))
+		map->handle_mem += ZS_HANDLE_SIZE;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(zs_map_handle);
+
 /**
  * zs_huge_class_size() - Returns the size (in bytes) of the first huge
  *                        zsmalloc &size_class.
-- 
2.48.1.262.g85cc9f2d1e-goog



^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-27  7:59 ` [RFC PATCH 5/6] zsmalloc: introduce handle mapping API Sergey Senozhatsky
@ 2025-01-27 21:26   ` Yosry Ahmed
  2025-01-28  0:37     ` Sergey Senozhatsky
  2025-01-27 21:58   ` Yosry Ahmed
  1 sibling, 1 reply; 24+ messages in thread
From: Yosry Ahmed @ 2025-01-27 21:26 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, Minchan Kim, Johannes Weiner, Nhat Pham, linux-mm,
	linux-kernel

On Mon, Jan 27, 2025 at 04:59:30PM +0900, Sergey Senozhatsky wrote:
> Introduce new API to map/unmap zsmalloc handle/object.  The key
> difference is that this API does not impose atomicity restrictions
> on its users, unlike zs_map_object() which returns with page-faults
> and preemption disabled

I think that's not entirely accurate, see below.

[..]
> @@ -1309,12 +1297,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
>  		goto out;
>  	}
>  
> -	/* this object spans two pages */
> -	zpdescs[0] = zpdesc;
> -	zpdescs[1] = get_next_zpdesc(zpdesc);
> -	BUG_ON(!zpdescs[1]);
> +	ret = area->vm_buf;
> +	/* disable page faults to match kmap_local_page() return conditions */
> +	pagefault_disable();

Is this accurate/necessary? I am looking at kmap_local_page() and I
don't see it. Maybe that's remnant from the old code using
kmap_atomic()?

> +	if (mm != ZS_MM_WO) {
> +		/* this object spans two pages */
> +		zs_obj_copyin(area->vm_buf, zpdesc, off, class->size);
> +	}
>  
> -	ret = __zs_map_object(area, zpdescs, off, class->size);
>  out:
>  	if (likely(!ZsHugePage(zspage)))
>  		ret += ZS_HANDLE_SIZE;


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-27 21:26   ` Yosry Ahmed
@ 2025-01-28  0:37     ` Sergey Senozhatsky
  2025-01-28  0:49       ` Yosry Ahmed
  0 siblings, 1 reply; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-28  0:37 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Sergey Senozhatsky, Andrew Morton, Minchan Kim, Johannes Weiner,
	Nhat Pham, linux-mm, linux-kernel

On (25/01/27 21:26), Yosry Ahmed wrote:
> On Mon, Jan 27, 2025 at 04:59:30PM +0900, Sergey Senozhatsky wrote:
> > Introduce new API to map/unmap zsmalloc handle/object.  The key
> > difference is that this API does not impose atomicity restrictions
> > on its users, unlike zs_map_object() which returns with page-faults
> > and preemption disabled
> 
> I think that's not entirely accurate, see below.

Preemption is disabled via zspage-s rwlock_t - zs_map_object() returns
with it being locked and it's being unlocked in zs_unmap_object().  Then
the function disables pagefaults and per-CPU local lock (protects per-CPU
vm-area) additionally disables preemption.

> [..]
> > @@ -1309,12 +1297,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
> >  		goto out;
> >  	}
> >  
> > -	/* this object spans two pages */
> > -	zpdescs[0] = zpdesc;
> > -	zpdescs[1] = get_next_zpdesc(zpdesc);
> > -	BUG_ON(!zpdescs[1]);
> > +	ret = area->vm_buf;
> > +	/* disable page faults to match kmap_local_page() return conditions */
> > +	pagefault_disable();
> 
> Is this accurate/necessary? I am looking at kmap_local_page() and I
> don't see it. Maybe that's remnant from the old code using
> kmap_atomic()?

No, this does not look accuare nor neccesary to me.  I asume that's from
a very long time ago, but regardless of that I don't really understand
why that API wants to resemblwe kmap_atomic() (I think that was the
intention).  This interface if expected to be gone so I didn't want
to dig into it and fix it.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-28  0:37     ` Sergey Senozhatsky
@ 2025-01-28  0:49       ` Yosry Ahmed
  2025-01-28  1:13         ` Sergey Senozhatsky
  0 siblings, 1 reply; 24+ messages in thread
From: Yosry Ahmed @ 2025-01-28  0:49 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, Minchan Kim, Johannes Weiner, Nhat Pham, linux-mm,
	linux-kernel

On Tue, Jan 28, 2025 at 09:37:20AM +0900, Sergey Senozhatsky wrote:
> On (25/01/27 21:26), Yosry Ahmed wrote:
> > On Mon, Jan 27, 2025 at 04:59:30PM +0900, Sergey Senozhatsky wrote:
> > > Introduce new API to map/unmap zsmalloc handle/object.  The key
> > > difference is that this API does not impose atomicity restrictions
> > > on its users, unlike zs_map_object() which returns with page-faults
> > > and preemption disabled
> > 
> > I think that's not entirely accurate, see below.
> 
> Preemption is disabled via zspage-s rwlock_t - zs_map_object() returns
> with it being locked and it's being unlocked in zs_unmap_object().  Then
> the function disables pagefaults and per-CPU local lock (protects per-CPU
> vm-area) additionally disables preemption.

Right, I meant it does not always disable page faults.

> 
> > [..]
> > > @@ -1309,12 +1297,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
> > >  		goto out;
> > >  	}
> > >  
> > > -	/* this object spans two pages */
> > > -	zpdescs[0] = zpdesc;
> > > -	zpdescs[1] = get_next_zpdesc(zpdesc);
> > > -	BUG_ON(!zpdescs[1]);
> > > +	ret = area->vm_buf;
> > > +	/* disable page faults to match kmap_local_page() return conditions */
> > > +	pagefault_disable();
> > 
> > Is this accurate/necessary? I am looking at kmap_local_page() and I
> > don't see it. Maybe that's remnant from the old code using
> > kmap_atomic()?
> 
> No, this does not look accuare nor neccesary to me.  I asume that's from
> a very long time ago, but regardless of that I don't really understand
> why that API wants to resemblwe kmap_atomic() (I think that was the
> intention).  This interface if expected to be gone so I didn't want
> to dig into it and fix it.

My assumption has been that back when we were using kmap_atomic(), which
disables page faults, we wanted to make this API's behavior consistent
for users where or not we called kmap_atomic() -- so this makes sure it
always disables page faults.

Now that we switched to kmap_local_page(), which doesn't disable page
faults, this was left behind, ulitmately making the interface
inconsistent and contradicting the purpose of its existence.

This is 100% speculation on my end :)

Anyway, if this function will be removed soon then it's not worth
revisiting it now.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-28  0:49       ` Yosry Ahmed
@ 2025-01-28  1:13         ` Sergey Senozhatsky
  0 siblings, 0 replies; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-28  1:13 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Sergey Senozhatsky, Andrew Morton, Minchan Kim, Johannes Weiner,
	Nhat Pham, linux-mm, linux-kernel

On (25/01/28 00:49), Yosry Ahmed wrote:
> > Preemption is disabled via zspage-s rwlock_t - zs_map_object() returns
> > with it being locked and it's being unlocked in zs_unmap_object().  Then
> > the function disables pagefaults and per-CPU local lock (protects per-CPU
> > vm-area) additionally disables preemption.
> 
> Right, I meant it does not always disable page faults.

I'll add "sometimes" :)

[..]
> Anyway, if this function will be removed soon then it's not worth
> revisiting it now.

Ack.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-27  7:59 ` [RFC PATCH 5/6] zsmalloc: introduce handle mapping API Sergey Senozhatsky
  2025-01-27 21:26   ` Yosry Ahmed
@ 2025-01-27 21:58   ` Yosry Ahmed
  2025-01-28  0:59     ` Sergey Senozhatsky
  1 sibling, 1 reply; 24+ messages in thread
From: Yosry Ahmed @ 2025-01-27 21:58 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, Minchan Kim, Johannes Weiner, Nhat Pham, linux-mm,
	linux-kernel

On Mon, Jan 27, 2025 at 04:59:30PM +0900, Sergey Senozhatsky wrote:
> Introduce new API to map/unmap zsmalloc handle/object.  The key
> difference is that this API does not impose atomicity restrictions
> on its users, unlike zs_map_object() which returns with page-faults
> and preemption disabled - handle mapping API does not need a per-CPU
> vm-area because the users are required to provide an aux buffer for
> objects that span several physical pages.

I like the idea of supplying the buffer directly to zsmalloc, and zswap
already has per-CPU buffers allocated. This will help remove the special
case to handle not being able to sleep in zswap_decompress().

That being said, I am not a big fan of the new API for several reasons:
- The interface seems complicated, why do we need struct
zs_handle_mapping? Can't the user just pass an extra parameter to
zs_map_object/zs_unmap_object() to supply the buffer, and the return
value is the pointer to the data within the buffer?

- This seems to require an additional buffer on the compress side. Right
now, zswap compresses the page into its own buffer, maps the handle,
and copies to it. Now the map operation will require an extra buffer.
I guess in the WO case the buffer is not needed and we can just pass
NULL?

Taking a step back, it actually seems to me that the mapping interface
may not be the best, at least from a zswap perspective. In both cases,
we map, copy from/to the handle, then unmap. The special casing here is
essentially handling the copy direction. Zram looks fairly similar but I
didn't look too closely.

I wonder if the API should store/load instead. You either pass a buffer
to be stored (equivalent to today's alloc + map + copy), or pass a
buffer to load into (equivalent to today's map + copy). What we really
need on the zswap side is zs_store() and zs_load(), not zs_map() with
different mapping types and an optional buffer if we are going to
eventually store. I guess that's part of a larger overhaul and we'd need
to update other zpool allocators (or remove them, z3fold should be
coming soon).

Anyway this is mostly just me ranting because improving the interface to
avoid the atomicity requires making it even more complicated, when it's
really simple when you think about it in terms of what you really want
to do (i.e. store and load).

> Keep zs_map_object/zs_unmap_object for the time being, as there are
> still users of it, but eventually old API will be removed.
> 
> Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
> ---
>  include/linux/zsmalloc.h |  29 ++++++++
>  mm/zsmalloc.c            | 148 ++++++++++++++++++++++++++++-----------
>  2 files changed, 138 insertions(+), 39 deletions(-)
> 
> diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
> index a48cd0ffe57d..72d84537dd38 100644
> --- a/include/linux/zsmalloc.h
> +++ b/include/linux/zsmalloc.h
> @@ -58,4 +58,33 @@ unsigned long zs_compact(struct zs_pool *pool);
>  unsigned int zs_lookup_class_index(struct zs_pool *pool, unsigned int size);
>  
>  void zs_pool_stats(struct zs_pool *pool, struct zs_pool_stats *stats);
> +
> +struct zs_handle_mapping {
> +	unsigned long handle;
> +	/* Points to start of the object data either within local_copy or
> +	 * within local_mapping. This is what callers should use to access
> +	 * or modify handle data.
> +	 */
> +	void *handle_mem;
> +
> +	enum zs_mapmode mode;
> +	union {
> +		/*
> +		 * Handle object data copied, because it spans across several
> +		 * (non-contiguous) physical pages. This pointer should be
> +		 * set by the zs_map_handle() caller beforehand and should
> +		 * never be accessed directly.
> +		 */
> +		void *local_copy;
> +		/*
> +		 * Handle object mapped directly. Should never be used
> +		 * directly.
> +		 */
> +		void *local_mapping;
> +	};
> +};
> +
> +int zs_map_handle(struct zs_pool *pool, struct zs_handle_mapping *map);
> +void zs_unmap_handle(struct zs_pool *pool, struct zs_handle_mapping *map);
> +
>  #endif
> diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
> index a5c1f9852072..281bba4a3277 100644
> --- a/mm/zsmalloc.c
> +++ b/mm/zsmalloc.c
> @@ -1132,18 +1132,14 @@ static inline void __zs_cpu_down(struct mapping_area *area)
>  	area->vm_buf = NULL;
>  }
>  
> -static void *__zs_map_object(struct mapping_area *area,
> -			struct zpdesc *zpdescs[2], int off, int size)
> +static void zs_obj_copyin(void *buf, struct zpdesc *zpdesc, int off, int size)
>  {
> +	struct zpdesc *zpdescs[2];
>  	size_t sizes[2];
> -	char *buf = area->vm_buf;
> -
> -	/* disable page faults to match kmap_local_page() return conditions */
> -	pagefault_disable();
>  
> -	/* no read fastpath */
> -	if (area->vm_mm == ZS_MM_WO)
> -		goto out;
> +	zpdescs[0] = zpdesc;
> +	zpdescs[1] = get_next_zpdesc(zpdesc);
> +	BUG_ON(!zpdescs[1]);
>  
>  	sizes[0] = PAGE_SIZE - off;
>  	sizes[1] = size - sizes[0];
> @@ -1151,21 +1147,17 @@ static void *__zs_map_object(struct mapping_area *area,
>  	/* copy object to per-cpu buffer */
>  	memcpy_from_page(buf, zpdesc_page(zpdescs[0]), off, sizes[0]);
>  	memcpy_from_page(buf + sizes[0], zpdesc_page(zpdescs[1]), 0, sizes[1]);
> -out:
> -	return area->vm_buf;
>  }
>  
> -static void __zs_unmap_object(struct mapping_area *area,
> -			struct zpdesc *zpdescs[2], int off, int size)
> +static void zs_obj_copyout(void *buf, struct zpdesc *zpdesc, int off, int size)
>  {
> +	struct zpdesc *zpdescs[2];
>  	size_t sizes[2];
> -	char *buf;
>  
> -	/* no write fastpath */
> -	if (area->vm_mm == ZS_MM_RO)
> -		goto out;
> +	zpdescs[0] = zpdesc;
> +	zpdescs[1] = get_next_zpdesc(zpdesc);
> +	BUG_ON(!zpdescs[1]);
>  
> -	buf = area->vm_buf;
>  	buf = buf + ZS_HANDLE_SIZE;
>  	size -= ZS_HANDLE_SIZE;
>  	off += ZS_HANDLE_SIZE;
> @@ -1176,10 +1168,6 @@ static void __zs_unmap_object(struct mapping_area *area,
>  	/* copy per-cpu buffer to object */
>  	memcpy_to_page(zpdesc_page(zpdescs[0]), off, buf, sizes[0]);
>  	memcpy_to_page(zpdesc_page(zpdescs[1]), 0, buf + sizes[0], sizes[1]);
> -
> -out:
> -	/* enable page faults to match kunmap_local() return conditions */
> -	pagefault_enable();
>  }
>  
>  static int zs_cpu_prepare(unsigned int cpu)
> @@ -1260,6 +1248,8 @@ EXPORT_SYMBOL_GPL(zs_get_total_pages);
>   * against nested mappings.
>   *
>   * This function returns with preemption and page faults disabled.
> + *
> + * NOTE: this function is deprecated and will be removed.
>   */
>  void *zs_map_object(struct zs_pool *pool, unsigned long handle,
>  			enum zs_mapmode mm)
> @@ -1268,10 +1258,8 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
>  	struct zpdesc *zpdesc;
>  	unsigned long obj, off;
>  	unsigned int obj_idx;
> -
>  	struct size_class *class;
>  	struct mapping_area *area;
> -	struct zpdesc *zpdescs[2];
>  	void *ret;
>  
>  	/*
> @@ -1309,12 +1297,14 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
>  		goto out;
>  	}
>  
> -	/* this object spans two pages */
> -	zpdescs[0] = zpdesc;
> -	zpdescs[1] = get_next_zpdesc(zpdesc);
> -	BUG_ON(!zpdescs[1]);
> +	ret = area->vm_buf;
> +	/* disable page faults to match kmap_local_page() return conditions */
> +	pagefault_disable();
> +	if (mm != ZS_MM_WO) {
> +		/* this object spans two pages */
> +		zs_obj_copyin(area->vm_buf, zpdesc, off, class->size);
> +	}
>  
> -	ret = __zs_map_object(area, zpdescs, off, class->size);
>  out:
>  	if (likely(!ZsHugePage(zspage)))
>  		ret += ZS_HANDLE_SIZE;
> @@ -1323,13 +1313,13 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
>  }
>  EXPORT_SYMBOL_GPL(zs_map_object);
>  
> +/* NOTE: this function is deprecated and will be removed. */
>  void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
>  {
>  	struct zspage *zspage;
>  	struct zpdesc *zpdesc;
>  	unsigned long obj, off;
>  	unsigned int obj_idx;
> -
>  	struct size_class *class;
>  	struct mapping_area *area;
>  
> @@ -1340,23 +1330,103 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
>  	off = offset_in_page(class->size * obj_idx);
>  
>  	area = this_cpu_ptr(&zs_map_area);
> -	if (off + class->size <= PAGE_SIZE)
> +	if (off + class->size <= PAGE_SIZE) {
>  		kunmap_local(area->vm_addr);
> -	else {
> -		struct zpdesc *zpdescs[2];
> +		goto out;
> +	}
>  
> -		zpdescs[0] = zpdesc;
> -		zpdescs[1] = get_next_zpdesc(zpdesc);
> -		BUG_ON(!zpdescs[1]);
> +	if (area->vm_mm != ZS_MM_RO)
> +		zs_obj_copyout(area->vm_buf, zpdesc, off, class->size);
> +	/* enable page faults to match kunmap_local() return conditions */
> +	pagefault_enable();
>  
> -		__zs_unmap_object(area, zpdescs, off, class->size);
> -	}
> +out:
>  	local_unlock(&zs_map_area.lock);
> -
>  	zspage_read_unlock(zspage);
>  }
>  EXPORT_SYMBOL_GPL(zs_unmap_object);
>  
> +void zs_unmap_handle(struct zs_pool *pool, struct zs_handle_mapping *map)
> +{
> +	struct zspage *zspage;
> +	struct zpdesc *zpdesc;
> +	unsigned long obj, off;
> +	unsigned int obj_idx;
> +	struct size_class *class;
> +
> +	obj = handle_to_obj(map->handle);
> +	obj_to_location(obj, &zpdesc, &obj_idx);
> +	zspage = get_zspage(zpdesc);
> +	class = zspage_class(pool, zspage);
> +	off = offset_in_page(class->size * obj_idx);
> +
> +	if (off + class->size <= PAGE_SIZE) {
> +		kunmap_local(map->local_mapping);
> +		goto out;
> +	}
> +
> +	if (map->mode != ZS_MM_RO)
> +		zs_obj_copyout(map->local_copy, zpdesc, off, class->size);
> +
> +out:
> +	zspage_read_unlock(zspage);
> +}
> +EXPORT_SYMBOL_GPL(zs_unmap_handle);
> +
> +int zs_map_handle(struct zs_pool *pool, struct zs_handle_mapping *map)
> +{
> +	struct zspage *zspage;
> +	struct zpdesc *zpdesc;
> +	unsigned long obj, off;
> +	unsigned int obj_idx;
> +	struct size_class *class;
> +
> +	WARN_ON(in_interrupt());
> +
> +	/* It guarantees it can get zspage from handle safely */
> +	pool_read_lock(pool);
> +	obj = handle_to_obj(map->handle);
> +	obj_to_location(obj, &zpdesc, &obj_idx);
> +	zspage = get_zspage(zpdesc);
> +
> +	/*
> +	 * migration cannot move any zpages in this zspage. Here, class->lock
> +	 * is too heavy since callers would take some time until they calls
> +	 * zs_unmap_object API so delegate the locking from class to zspage
> +	 * which is smaller granularity.
> +	 */
> +	zspage_read_lock(zspage);
> +	pool_read_unlock(pool);
> +
> +	class = zspage_class(pool, zspage);
> +	off = offset_in_page(class->size * obj_idx);
> +
> +	if (off + class->size <= PAGE_SIZE) {
> +		/* this object is contained entirely within a page */
> +		map->local_mapping = kmap_local_zpdesc(zpdesc);
> +		map->handle_mem = map->local_mapping + off;
> +		goto out;
> +	}
> +
> +	if (WARN_ON_ONCE(!map->local_copy)) {
> +		zspage_read_unlock(zspage);
> +		return -EINVAL;
> +	}
> +
> +	map->handle_mem = map->local_copy;
> +	if (map->mode != ZS_MM_WO) {
> +		/* this object spans two pages */
> +		zs_obj_copyin(map->local_copy, zpdesc, off, class->size);
> +	}
> +
> +out:
> +	if (likely(!ZsHugePage(zspage)))
> +		map->handle_mem += ZS_HANDLE_SIZE;
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(zs_map_handle);
> +
>  /**
>   * zs_huge_class_size() - Returns the size (in bytes) of the first huge
>   *                        zsmalloc &size_class.
> -- 
> 2.48.1.262.g85cc9f2d1e-goog
> 


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-27 21:58   ` Yosry Ahmed
@ 2025-01-28  0:59     ` Sergey Senozhatsky
  2025-01-28  1:36       ` Yosry Ahmed
  0 siblings, 1 reply; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-28  0:59 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Sergey Senozhatsky, Andrew Morton, Minchan Kim, Johannes Weiner,
	Nhat Pham, linux-mm, linux-kernel

On (25/01/27 21:58), Yosry Ahmed wrote:
> On Mon, Jan 27, 2025 at 04:59:30PM +0900, Sergey Senozhatsky wrote:
> > Introduce new API to map/unmap zsmalloc handle/object.  The key
> > difference is that this API does not impose atomicity restrictions
> > on its users, unlike zs_map_object() which returns with page-faults
> > and preemption disabled - handle mapping API does not need a per-CPU
> > vm-area because the users are required to provide an aux buffer for
> > objects that span several physical pages.
> 
> I like the idea of supplying the buffer directly to zsmalloc, and zswap
> already has per-CPU buffers allocated. This will help remove the special
> case to handle not being able to sleep in zswap_decompress().

The interface, basically, is what we currently have, but the state
is moved out of zsmalloc internal per-CPU vm-area.

> That being said, I am not a big fan of the new API for several reasons:
> - The interface seems complicated, why do we need struct
> zs_handle_mapping? Can't the user just pass an extra parameter to
> zs_map_object/zs_unmap_object() to supply the buffer, and the return
> value is the pointer to the data within the buffer?

At least now we need to save some state - e.g. direction of the map()
so that during unmap zsmalloc determines if it needs to perform copy-out
or not.  It also needs that state in order to know if the buffer needs
to be unmapped.

zsmalloc MAP has two cases:
a) the object spans several physical non-contig pages: copy-in object into
  aux buffer and return (linear) pointer to that buffer
b) the object is contained within a physical page: kmap that page and
  return (linear) pointer to that mapping, unmap in zs_unmap_object().

> - This seems to require an additional buffer on the compress side. Right
> now, zswap compresses the page into its own buffer, maps the handle,
> and copies to it. Now the map operation will require an extra buffer.

Yes, for (a) mentioned above.

> I guess in the WO case the buffer is not needed and we can just pass
> NULL?

Yes.

> Taking a step back, it actually seems to me that the mapping interface
> may not be the best, at least from a zswap perspective. In both cases,
> we map, copy from/to the handle, then unmap. The special casing here is
> essentially handling the copy direction. Zram looks fairly similar but I
> didn't look too closely.
> 
> I wonder if the API should store/load instead. You either pass a buffer
> to be stored (equivalent to today's alloc + map + copy), or pass a
> buffer to load into (equivalent to today's map + copy). What we really
> need on the zswap side is zs_store() and zs_load(), not zs_map() with
> different mapping types and an optional buffer if we are going to
> eventually store. I guess that's part of a larger overhaul and we'd need
> to update other zpool allocators (or remove them, z3fold should be
> coming soon).

So I though about it: load and store.

zs_obj_load()
{
	zspage->page kmap, etc.
	memcpy buf page   # if direction is not WO
	unmap
}

zs_obj_store()
{
	zspage->page kmap, etc.
	memcpy page buf   # if direction is not RO
	unmap
}

load+store would not require zsmalloc to be preemptible internally, we
could just keep existing atomic locks and it would make things a little
simpler on the zram side (slot-free-notification is called from atomic
section).

But, and it's a big but.  And it's (b) from the above.  I wasn't brave
enough to just drop (b) optimization and replace it with memcpy(),
especially when we work with relatively large objects (say size-class
3600 bytes and above).  This certainly would not make battery powered
devices happier.  Maybe in zswap the page is only read once (is that
correct?), but in zram page can be read multiple times (e.g. when zram
is used as a raw block-dev, or has a mounted fs on it) which means
multiple extra memcpy()-s.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-28  0:59     ` Sergey Senozhatsky
@ 2025-01-28  1:36       ` Yosry Ahmed
  2025-01-28  5:29         ` Sergey Senozhatsky
  2025-01-29  5:40         ` Sergey Senozhatsky
  0 siblings, 2 replies; 24+ messages in thread
From: Yosry Ahmed @ 2025-01-28  1:36 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, Minchan Kim, Johannes Weiner, Nhat Pham, linux-mm,
	linux-kernel

On Tue, Jan 28, 2025 at 09:59:55AM +0900, Sergey Senozhatsky wrote:
> On (25/01/27 21:58), Yosry Ahmed wrote:
> > On Mon, Jan 27, 2025 at 04:59:30PM +0900, Sergey Senozhatsky wrote:
> > > Introduce new API to map/unmap zsmalloc handle/object.  The key
> > > difference is that this API does not impose atomicity restrictions
> > > on its users, unlike zs_map_object() which returns with page-faults
> > > and preemption disabled - handle mapping API does not need a per-CPU
> > > vm-area because the users are required to provide an aux buffer for
> > > objects that span several physical pages.
> > 
> > I like the idea of supplying the buffer directly to zsmalloc, and zswap
> > already has per-CPU buffers allocated. This will help remove the special
> > case to handle not being able to sleep in zswap_decompress().
> 
> The interface, basically, is what we currently have, but the state
> is moved out of zsmalloc internal per-CPU vm-area.
> 
> > That being said, I am not a big fan of the new API for several reasons:
> > - The interface seems complicated, why do we need struct
> > zs_handle_mapping? Can't the user just pass an extra parameter to
> > zs_map_object/zs_unmap_object() to supply the buffer, and the return
> > value is the pointer to the data within the buffer?
> 
> At least now we need to save some state - e.g. direction of the map()
> so that during unmap zsmalloc determines if it needs to perform copy-out
> or not.  It also needs that state in order to know if the buffer needs
> to be unmapped.
> 
> zsmalloc MAP has two cases:
> a) the object spans several physical non-contig pages: copy-in object into
>   aux buffer and return (linear) pointer to that buffer
> b) the object is contained within a physical page: kmap that page and
>   return (linear) pointer to that mapping, unmap in zs_unmap_object().

Ack. See below.

> > - This seems to require an additional buffer on the compress side. Right
> > now, zswap compresses the page into its own buffer, maps the handle,
> > and copies to it. Now the map operation will require an extra buffer.
> 
> Yes, for (a) mentioned above.
> 
> > I guess in the WO case the buffer is not needed and we can just pass
> > NULL?
> 
> Yes.

Perhaps we want to document this and enforce it (make sure that the
NULL-ness of the buffer matches the access type).

> > Taking a step back, it actually seems to me that the mapping interface
> > may not be the best, at least from a zswap perspective. In both cases,
> > we map, copy from/to the handle, then unmap. The special casing here is
> > essentially handling the copy direction. Zram looks fairly similar but I
> > didn't look too closely.
> > 
> > I wonder if the API should store/load instead. You either pass a buffer
> > to be stored (equivalent to today's alloc + map + copy), or pass a
> > buffer to load into (equivalent to today's map + copy). What we really
> > need on the zswap side is zs_store() and zs_load(), not zs_map() with
> > different mapping types and an optional buffer if we are going to
> > eventually store. I guess that's part of a larger overhaul and we'd need
> > to update other zpool allocators (or remove them, z3fold should be
> > coming soon).
> 
> So I though about it: load and store.
> 
> zs_obj_load()
> {
> 	zspage->page kmap, etc.
> 	memcpy buf page   # if direction is not WO
> 	unmap
> }
> 
> zs_obj_store()
> {
> 	zspage->page kmap, etc.
> 	memcpy page buf   # if direction is not RO
> 	unmap
> }
> 
> load+store would not require zsmalloc to be preemptible internally, we
> could just keep existing atomic locks and it would make things a little
> simpler on the zram side (slot-free-notification is called from atomic
> section).
> 
> But, and it's a big but.  And it's (b) from the above.  I wasn't brave
> enough to just drop (b) optimization and replace it with memcpy(),
> especially when we work with relatively large objects (say size-class
> 3600 bytes and above).  This certainly would not make battery powered
> devices happier.  Maybe in zswap the page is only read once (is that
> correct?), but in zram page can be read multiple times (e.g. when zram
> is used as a raw block-dev, or has a mounted fs on it) which means
> multiple extra memcpy()-s.

In zswap, because we use the crypto_acomp API, when we cannot sleep with
the object mapped (which is true for zsmalloc), we just copy the
compressed object into a preallocated buffer anyway. So having a
zs_obj_load() interface would move that copy inside zsmalloc.

With your series, zswap can drop the memcpy and save some cycles on the
compress side. I didn't realize that zram does not perform any copies on the
read/decompress side.

Maybe the load interface can still provide a buffer to avoid the copy
where possible? I suspect with that we don't need the state and can
just pass a pointer. We'd need another call to potentially unmap, so
maybe load_start/load_end, or read_start/read_end.

Something like:

zs_obj_read_start(.., buf)
{
	if (contained in one page)
		return kmapped obj
	else
		memcpy to buf
		return buf
}

zs_obj_read_end(.., buf)
{
	if (container in one page)
		kunmap
}

The interface is more straightforward and we can drop the map flags
entirely, unless I missed something here. Unfortunately you'd still need
the locking changes in zsmalloc to make zram reads fully preemptible.

I am not suggesting that we have to go this way, just throwing out
ideas.

BTW, are we positive that the locking changes made in this series are
not introducing regressions? I'd hate for us to avoid an extra copy but
end up paying for it in lock contention.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-28  1:36       ` Yosry Ahmed
@ 2025-01-28  5:29         ` Sergey Senozhatsky
  2025-01-28  9:38           ` Sergey Senozhatsky
  2025-01-28 11:10           ` Sergey Senozhatsky
  2025-01-29  5:40         ` Sergey Senozhatsky
  1 sibling, 2 replies; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-28  5:29 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Sergey Senozhatsky, Andrew Morton, Minchan Kim, Johannes Weiner,
	Nhat Pham, linux-mm, linux-kernel

On (25/01/28 01:36), Yosry Ahmed wrote:
> > Yes, for (a) mentioned above.
> > 
> > > I guess in the WO case the buffer is not needed and we can just pass
> > > NULL?
> > 
> > Yes.
> 
> Perhaps we want to document this and enforce it (make sure that the
> NULL-ness of the buffer matches the access type).

Right.

> > But, and it's a big but.  And it's (b) from the above.  I wasn't brave
> > enough to just drop (b) optimization and replace it with memcpy(),
> > especially when we work with relatively large objects (say size-class
> > 3600 bytes and above).  This certainly would not make battery powered
> > devices happier.  Maybe in zswap the page is only read once (is that
> > correct?), but in zram page can be read multiple times (e.g. when zram
> > is used as a raw block-dev, or has a mounted fs on it) which means
> > multiple extra memcpy()-s.
> 
> In zswap, because we use the crypto_acomp API, when we cannot sleep with
> the object mapped (which is true for zsmalloc), we just copy the
> compressed object into a preallocated buffer anyway. So having a
> zs_obj_load() interface would move that copy inside zsmalloc.

Yeah, I saw zpool_can_sleep_mapped() and had the same thought.  zram,
as of now, doesn't support algos that can/need schedule internally for
whatever reason - kmalloc, mutex, H/W wait, etc.

> With your series, zswap can drop the memcpy and save some cycles on the
> compress side. I didn't realize that zram does not perform any copies on the
> read/decompress side.
> 
> Maybe the load interface can still provide a buffer to avoid the copy
> where possible? I suspect with that we don't need the state and can
> just pass a pointer. We'd need another call to potentially unmap, so
> maybe load_start/load_end, or read_start/read_end.
> 
> Something like:
> 
> zs_obj_read_start(.., buf)
> {
> 	if (contained in one page)
> 		return kmapped obj
> 	else
> 		memcpy to buf
> 		return buf
> }
> 
> zs_obj_read_end(.., buf)
> {
> 	if (container in one page)
> 		kunmap
> }
> 
> The interface is more straightforward and we can drop the map flags
> entirely, unless I missed something here. Unfortunately you'd still need
> the locking changes in zsmalloc to make zram reads fully preemptible.

Agreed, the interface part is less of a problem, the atomicity of zsmalloc
is a much bigger issue.  We, technically, only need to mark zspage as "being
used, don't free" so that zsfree/compaction/migration don't mess with it,
but this is only "technically".  In practice we then have

	CPU0							CPU1

	zs_map_object
	set READ bit					migrate
	schedule						pool rwlock
									size class spin-lock
									wait for READ bit to clear
	...								set WRITE bit
	clear READ bit

and the whole thing collapses like a house of cards.  I wasn't able
to trigger a watchdog on my tests, but the pattern is there and it's
enough.  Maybe we can teach compaction and migration to try-WRITE and
bail out if the page is locked, but I don't know.

> I am not suggesting that we have to go this way, just throwing out
> ideas.

Sure, load+store is still an option.  While that zs_map_object()
optimization is nice, it may have two sides [in zram case].  On
one hand, we safe memcpy() [but only for certain objects], on the
other hand, we keep the page locked for the entire decompression
duration, which can be quite a while (e.g. when algorithm is
configured with a very high compression level):

	CPU0							CPU1

	zs_map_object
	read lock page rwlock			write lock page rwlock
									spin
	decompress()					... spin a lot
	read unlock page rwlock

Maybe copy-in is just an okay thing to do.  Let me try to measure.

> BTW, are we positive that the locking changes made in this series are
> not introducing regressions?

Cannot claim that with confidence.  Our workloads don't match, we don't
even use zsmalloc in the same way :)  Here be dragons.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-28  5:29         ` Sergey Senozhatsky
@ 2025-01-28  9:38           ` Sergey Senozhatsky
  2025-01-28 17:21             ` Yosry Ahmed
  2025-01-28 11:10           ` Sergey Senozhatsky
  1 sibling, 1 reply; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-28  9:38 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Andrew Morton, Minchan Kim, Johannes Weiner, Nhat Pham, linux-mm,
	linux-kernel, Sergey Senozhatsky

On (25/01/28 14:29), Sergey Senozhatsky wrote:
> Maybe copy-in is just an okay thing to do.  Let me try to measure.

Naaah, not really okay.  On our memory-pressure test (4GB device, 4
CPUs) that kmap_local thingy appears to save approx 6GB of memcpy().

CPY stats: 734954 1102903168 4926116 6566654656

There were 734954 cases when we memcpy() [object spans two pages] with
accumulated size of 1102903168 bytes, and 4926116 cases when we took
a shortcut via kmap_local and avoided memcpy(), with accumulated size
of 6566654656 bytes.

In both cases I counted only RO direction for map, and WO direction
for unmap.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-28  9:38           ` Sergey Senozhatsky
@ 2025-01-28 17:21             ` Yosry Ahmed
  2025-01-29  3:32               ` Sergey Senozhatsky
  0 siblings, 1 reply; 24+ messages in thread
From: Yosry Ahmed @ 2025-01-28 17:21 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, Minchan Kim, Johannes Weiner, Nhat Pham, linux-mm,
	linux-kernel

On Tue, Jan 28, 2025 at 06:38:35PM +0900, Sergey Senozhatsky wrote:
> On (25/01/28 14:29), Sergey Senozhatsky wrote:
> > Maybe copy-in is just an okay thing to do.  Let me try to measure.
> 
> Naaah, not really okay.  On our memory-pressure test (4GB device, 4
> CPUs) that kmap_local thingy appears to save approx 6GB of memcpy().
> 
> CPY stats: 734954 1102903168 4926116 6566654656
> 
> There were 734954 cases when we memcpy() [object spans two pages] with
> accumulated size of 1102903168 bytes, and 4926116 cases when we took
> a shortcut via kmap_local and avoided memcpy(), with accumulated size
> of 6566654656 bytes.
> 
> In both cases I counted only RO direction for map, and WO direction
> for unmap.

Yeah seems like the optimization is effective, at least on that
workload, unless the memcpy() is cheap and avoiding it is not buying as
much (do you know if that's the case?).

Anyway, we can keep the optimization and zswap could start making use of
it if zsmalloc becomes preemtible, so that's still a win.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-28 17:21             ` Yosry Ahmed
@ 2025-01-29  3:32               ` Sergey Senozhatsky
  0 siblings, 0 replies; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-29  3:32 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Sergey Senozhatsky, Andrew Morton, Minchan Kim, Johannes Weiner,
	Nhat Pham, linux-mm, linux-kernel

On (25/01/28 17:21), Yosry Ahmed wrote:
> On Tue, Jan 28, 2025 at 06:38:35PM +0900, Sergey Senozhatsky wrote:
> > On (25/01/28 14:29), Sergey Senozhatsky wrote:
> > > Maybe copy-in is just an okay thing to do.  Let me try to measure.
> 
> Yeah seems like the optimization is effective, at least on that
> workload

The workload is just lots of browser tabs (zram is configured as a
swap device), so it's quite representative.

> unless the memcpy() is cheap and avoiding it is not buying as
> much
>
> ...
>
> (do you know if that's the case?).

We run on arm64 and x86_64 on a variety of models from low-cost ones to
high-cost ones.  I probably wouldn't expect memcpy() of 6GB of random
sized objects (loop unrolling is possible only partially) to be cheap
in general.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-28  5:29         ` Sergey Senozhatsky
  2025-01-28  9:38           ` Sergey Senozhatsky
@ 2025-01-28 11:10           ` Sergey Senozhatsky
  2025-01-28 17:22             ` Yosry Ahmed
  1 sibling, 1 reply; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-28 11:10 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Andrew Morton, Minchan Kim, Johannes Weiner, Nhat Pham, linux-mm,
	linux-kernel, Sergey Senozhatsky

On (25/01/28 14:29), Sergey Senozhatsky wrote:
> Maybe we can teach compaction and migration to try-WRITE and
> bail out if the page is locked, but I don't know.

This seems to be working just fine.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-28 11:10           ` Sergey Senozhatsky
@ 2025-01-28 17:22             ` Yosry Ahmed
  2025-01-28 23:01               ` Sergey Senozhatsky
  0 siblings, 1 reply; 24+ messages in thread
From: Yosry Ahmed @ 2025-01-28 17:22 UTC (permalink / raw)
  To: Sergey Senozhatsky
  Cc: Andrew Morton, Minchan Kim, Johannes Weiner, Nhat Pham, linux-mm,
	linux-kernel

On Tue, Jan 28, 2025 at 08:10:10PM +0900, Sergey Senozhatsky wrote:
> On (25/01/28 14:29), Sergey Senozhatsky wrote:
> > Maybe we can teach compaction and migration to try-WRITE and
> > bail out if the page is locked, but I don't know.
> 
> This seems to be working just fine.

Does this mean we won't need as much locking changes to get zsmalloc to
be preemtible?

I am slightly worried about how these changes will affect performance
tbh.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-28 17:22             ` Yosry Ahmed
@ 2025-01-28 23:01               ` Sergey Senozhatsky
  0 siblings, 0 replies; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-28 23:01 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Sergey Senozhatsky, Andrew Morton, Minchan Kim, Johannes Weiner,
	Nhat Pham, linux-mm, linux-kernel

On (25/01/28 17:22), Yosry Ahmed wrote:
> On Tue, Jan 28, 2025 at 08:10:10PM +0900, Sergey Senozhatsky wrote:
> > On (25/01/28 14:29), Sergey Senozhatsky wrote:
> > > Maybe we can teach compaction and migration to try-WRITE and
> > > bail out if the page is locked, but I don't know.
> > 
> > This seems to be working just fine.
> 
> Does this mean we won't need as much locking changes to get zsmalloc to
> be preemtible?

Correct, only zspage lock is getting converted.


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [RFC PATCH 5/6] zsmalloc: introduce handle mapping API
  2025-01-28  1:36       ` Yosry Ahmed
  2025-01-28  5:29         ` Sergey Senozhatsky
@ 2025-01-29  5:40         ` Sergey Senozhatsky
  1 sibling, 0 replies; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-29  5:40 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Sergey Senozhatsky, Andrew Morton, Minchan Kim, Johannes Weiner,
	Nhat Pham, linux-mm, linux-kernel

On (25/01/28 01:36), Yosry Ahmed wrote:
> zs_obj_read_start(.., buf)
> {
> 	if (contained in one page)
> 		return kmapped obj
> 	else
> 		memcpy to buf
> 		return buf
> }
> 
> zs_obj_read_end(.., buf)
> {
> 	if (container in one page)
> 		kunmap
> }

So it seems we can optimize things a little further by avoiding more
memcpy() calls.  Namely, for WO directions [on objects that span two
pages] the caller first memcpy() data to a provided temp buffer (per-CPU
buffer at this moment) and then during zs_unmap() memcpy() from that temp
buffer to physical pages that object spans accross.  We can skip this memcpy()
and instead write to physical pages straight from the compression buffer.

I'll post an updated series shortly.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [RFC PATCH 6/6] zram: switch over to zshandle mapping API
  2025-01-27  7:59 [RFC PATCH 0/6] zsmalloc: make zsmalloc preemptible Sergey Senozhatsky
                   ` (4 preceding siblings ...)
  2025-01-27  7:59 ` [RFC PATCH 5/6] zsmalloc: introduce handle mapping API Sergey Senozhatsky
@ 2025-01-27  7:59 ` Sergey Senozhatsky
  5 siblings, 0 replies; 24+ messages in thread
From: Sergey Senozhatsky @ 2025-01-27  7:59 UTC (permalink / raw)
  To: Andrew Morton, Minchan Kim, Johannes Weiner, Yosry Ahmed, Nhat Pham
  Cc: linux-mm, linux-kernel, Sergey Senozhatsky

Use new zsmalloc handle mapping API so now zram read() becomes
preemptible.

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
 drivers/block/zram/zcomp.c    |   4 +-
 drivers/block/zram/zcomp.h    |   2 +
 drivers/block/zram/zram_drv.c | 103 ++++++++++++++++++----------------
 3 files changed, 61 insertions(+), 48 deletions(-)

diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
index efd5919808d9..9b373ab1ee0b 100644
--- a/drivers/block/zram/zcomp.c
+++ b/drivers/block/zram/zcomp.c
@@ -45,6 +45,7 @@ static const struct zcomp_ops *backends[] = {
 static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *strm)
 {
 	comp->ops->destroy_ctx(&strm->ctx);
+	vfree(strm->handle_mem_copy);
 	vfree(strm->buffer);
 	kfree(strm);
 }
@@ -66,12 +67,13 @@ static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp)
 		return NULL;
 	}
 
+	strm->handle_mem_copy = vzalloc(PAGE_SIZE);
 	/*
 	 * allocate 2 pages. 1 for compressed data, plus 1 extra in case if
 	 * compressed data is larger than the original one.
 	 */
 	strm->buffer = vzalloc(2 * PAGE_SIZE);
-	if (!strm->buffer) {
+	if (!strm->buffer || !strm->handle_mem_copy) {
 		zcomp_strm_free(comp, strm);
 		return NULL;
 	}
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
index 62330829db3f..f003f09820a5 100644
--- a/drivers/block/zram/zcomp.h
+++ b/drivers/block/zram/zcomp.h
@@ -34,6 +34,8 @@ struct zcomp_strm {
 	struct list_head entry;
 	/* compression buffer */
 	void *buffer;
+	/* handle object memory copy */
+	void *handle_mem_copy;
 	struct zcomp_ctx ctx;
 };
 
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9c72beb86ab0..120055b11520 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -1558,37 +1558,43 @@ static int read_same_filled_page(struct zram *zram, struct page *page,
 static int read_incompressible_page(struct zram *zram, struct page *page,
 				    u32 index)
 {
-	unsigned long handle;
-	void *src, *dst;
+	struct zs_handle_mapping hm;
+	void *dst;
 
-	handle = zram_get_handle(zram, index);
-	src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
+	hm.handle = zram_get_handle(zram, index);
+	hm.mode = ZS_MM_RO;
+
+	zs_map_handle(zram->mem_pool, &hm);
 	dst = kmap_local_page(page);
-	copy_page(dst, src);
+	copy_page(dst, hm.handle_mem);
 	kunmap_local(dst);
-	zs_unmap_object(zram->mem_pool, handle);
+	zs_unmap_handle(zram->mem_pool, &hm);
 
 	return 0;
 }
 
 static int read_compressed_page(struct zram *zram, struct page *page, u32 index)
 {
+	struct zs_handle_mapping hm;
 	struct zcomp_strm *zstrm;
-	unsigned long handle;
 	unsigned int size;
-	void *src, *dst;
+	void *dst;
 	int ret, prio;
 
-	handle = zram_get_handle(zram, index);
 	size = zram_get_obj_size(zram, index);
 	prio = zram_get_priority(zram, index);
 
 	zstrm = zcomp_stream_get(zram->comps[prio]);
-	src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
+	hm.handle = zram_get_handle(zram, index);
+	hm.mode = ZS_MM_RO;
+	hm.local_copy = zstrm->handle_mem_copy;
+
+	zs_map_handle(zram->mem_pool, &hm);
 	dst = kmap_local_page(page);
-	ret = zcomp_decompress(zram->comps[prio], zstrm, src, size, dst);
+	ret = zcomp_decompress(zram->comps[prio], zstrm,
+			       hm.handle_mem, size, dst);
 	kunmap_local(dst);
-	zs_unmap_object(zram->mem_pool, handle);
+	zs_unmap_handle(zram->mem_pool, &hm);
 	zcomp_stream_put(zram->comps[prio], zstrm);
 
 	return ret;
@@ -1683,33 +1689,34 @@ static int write_same_filled_page(struct zram *zram, unsigned long fill,
 static int write_incompressible_page(struct zram *zram, struct page *page,
 				     u32 index)
 {
-	unsigned long handle;
-	void *src, *dst;
+	struct zs_handle_mapping hm;
+	void *src;
 
 	/*
 	 * This function is called from preemptible context so we don't need
 	 * to do optimistic and fallback to pessimistic handle allocation,
 	 * like we do for compressible pages.
 	 */
-	handle = zs_malloc(zram->mem_pool, PAGE_SIZE,
-			   GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE);
-	if (IS_ERR_VALUE(handle))
-		return PTR_ERR((void *)handle);
+	hm.handle = zs_malloc(zram->mem_pool, PAGE_SIZE,
+			      GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE);
+	if (IS_ERR_VALUE(hm.handle))
+		return PTR_ERR((void *)hm.handle);
 
 	if (!zram_can_store_page(zram)) {
-		zs_free(zram->mem_pool, handle);
+		zs_free(zram->mem_pool, hm.handle);
 		return -ENOMEM;
 	}
 
-	dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
+	hm.mode = ZS_MM_WO;
+	zs_map_handle(zram->mem_pool, &hm);
 	src = kmap_local_page(page);
-	memcpy(dst, src, PAGE_SIZE);
+	memcpy(hm.handle_mem, src, PAGE_SIZE);
 	kunmap_local(src);
-	zs_unmap_object(zram->mem_pool, handle);
+	zs_unmap_handle(zram->mem_pool, &hm);
 
 	zram_slot_write_lock(zram, index);
 	zram_set_flag(zram, index, ZRAM_HUGE);
-	zram_set_handle(zram, index, handle);
+	zram_set_handle(zram, index, hm.handle);
 	zram_set_obj_size(zram, index, PAGE_SIZE);
 	zram_slot_write_unlock(zram, index);
 
@@ -1724,9 +1731,9 @@ static int write_incompressible_page(struct zram *zram, struct page *page,
 static int zram_write_page(struct zram *zram, struct page *page, u32 index)
 {
 	int ret = 0;
-	unsigned long handle;
+	struct zs_handle_mapping hm;
 	unsigned int comp_len;
-	void *dst, *mem;
+	void *mem;
 	struct zcomp_strm *zstrm;
 	unsigned long element;
 	bool same_filled;
@@ -1758,25 +1765,26 @@ static int zram_write_page(struct zram *zram, struct page *page, u32 index)
 		return write_incompressible_page(zram, page, index);
 	}
 
-	handle = zs_malloc(zram->mem_pool, comp_len,
-			   GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE);
-	if (IS_ERR_VALUE(handle))
-		return PTR_ERR((void *)handle);
+	hm.handle = zs_malloc(zram->mem_pool, comp_len,
+			      GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE);
+	if (IS_ERR_VALUE(hm.handle))
+		return PTR_ERR((void *)hm.handle);
 
 	if (!zram_can_store_page(zram)) {
 		zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP], zstrm);
-		zs_free(zram->mem_pool, handle);
+		zs_free(zram->mem_pool, hm.handle);
 		return -ENOMEM;
 	}
 
-	dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
-
-	memcpy(dst, zstrm->buffer, comp_len);
+	hm.mode = ZS_MM_WO;
+	hm.local_copy = zstrm->handle_mem_copy;
+	zs_map_handle(zram->mem_pool, &hm);
+	memcpy(hm.handle_mem, zstrm->buffer, comp_len);
+	zs_unmap_handle(zram->mem_pool, &hm);
 	zcomp_stream_put(zram->comps[ZRAM_PRIMARY_COMP], zstrm);
-	zs_unmap_object(zram->mem_pool, handle);
 
 	zram_slot_write_lock(zram, index);
-	zram_set_handle(zram, index, handle);
+	zram_set_handle(zram, index, hm.handle);
 	zram_set_obj_size(zram, index, comp_len);
 	zram_slot_write_unlock(zram, index);
 
@@ -1875,14 +1883,14 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page,
 			   u32 prio_max)
 {
 	struct zcomp_strm *zstrm = NULL;
+	struct zs_handle_mapping hm;
 	unsigned long handle_old;
-	unsigned long handle_new;
 	unsigned int comp_len_old;
 	unsigned int comp_len_new;
 	unsigned int class_index_old;
 	unsigned int class_index_new;
 	u32 num_recomps = 0;
-	void *src, *dst;
+	void *src;
 	int ret;
 
 	handle_old = zram_get_handle(zram, index);
@@ -2000,34 +2008,35 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page,
 
 	/* zsmalloc handle allocation can schedule, unlock slot's bucket */
 	zram_slot_write_unlock(zram, index);
-	handle_new = zs_malloc(zram->mem_pool, comp_len_new,
-			       GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE);
+	hm.handle = zs_malloc(zram->mem_pool, comp_len_new,
+			      GFP_NOIO | __GFP_HIGHMEM | __GFP_MOVABLE);
 	zram_slot_write_lock(zram, index);
 
 	/*
 	 * If we couldn't allocate memory for recompressed object then bail
 	 * out and simply keep the old (existing) object in mempool.
 	 */
-	if (IS_ERR_VALUE(handle_new)) {
+	if (IS_ERR_VALUE(hm.handle)) {
 		zcomp_stream_put(zram->comps[prio], zstrm);
-		return PTR_ERR((void *)handle_new);
+		return PTR_ERR((void *)hm.handle);
 	}
 
 	/* Slot has been modified concurrently */
 	if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) {
 		zcomp_stream_put(zram->comps[prio], zstrm);
-		zs_free(zram->mem_pool, handle_new);
+		zs_free(zram->mem_pool, hm.handle);
 		return 0;
 	}
 
-	dst = zs_map_object(zram->mem_pool, handle_new, ZS_MM_WO);
-	memcpy(dst, zstrm->buffer, comp_len_new);
+	hm.mode = ZS_MM_WO;
+	hm.local_copy = zstrm->handle_mem_copy;
+	zs_map_handle(zram->mem_pool, &hm);
+	memcpy(hm.handle_mem, zstrm->buffer, comp_len_new);
+	zs_unmap_handle(zram->mem_pool, &hm);
 	zcomp_stream_put(zram->comps[prio], zstrm);
 
-	zs_unmap_object(zram->mem_pool, handle_new);
-
 	zram_free_page(zram, index);
-	zram_set_handle(zram, index, handle_new);
+	zram_set_handle(zram, index, hm.handle);
 	zram_set_obj_size(zram, index, comp_len_new);
 	zram_set_priority(zram, index, prio);
 
-- 
2.48.1.262.g85cc9f2d1e-goog



^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2025-01-29  5:40 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-01-27  7:59 [RFC PATCH 0/6] zsmalloc: make zsmalloc preemptible Sergey Senozhatsky
2025-01-27  7:59 ` [RFC PATCH 1/6] zram: deffer slot free notification Sergey Senozhatsky
2025-01-27  7:59 ` [RFC PATCH 2/6] zsmalloc: make zspage lock preemptible Sergey Senozhatsky
2025-01-27 20:23   ` Uros Bizjak
2025-01-28  0:29     ` Sergey Senozhatsky
2025-01-27  7:59 ` [RFC PATCH 3/6] zsmalloc: convert to sleepable pool lock Sergey Senozhatsky
2025-01-27  7:59 ` [RFC PATCH 4/6] zsmalloc: make class lock sleepable Sergey Senozhatsky
2025-01-27  7:59 ` [RFC PATCH 5/6] zsmalloc: introduce handle mapping API Sergey Senozhatsky
2025-01-27 21:26   ` Yosry Ahmed
2025-01-28  0:37     ` Sergey Senozhatsky
2025-01-28  0:49       ` Yosry Ahmed
2025-01-28  1:13         ` Sergey Senozhatsky
2025-01-27 21:58   ` Yosry Ahmed
2025-01-28  0:59     ` Sergey Senozhatsky
2025-01-28  1:36       ` Yosry Ahmed
2025-01-28  5:29         ` Sergey Senozhatsky
2025-01-28  9:38           ` Sergey Senozhatsky
2025-01-28 17:21             ` Yosry Ahmed
2025-01-29  3:32               ` Sergey Senozhatsky
2025-01-28 11:10           ` Sergey Senozhatsky
2025-01-28 17:22             ` Yosry Ahmed
2025-01-28 23:01               ` Sergey Senozhatsky
2025-01-29  5:40         ` Sergey Senozhatsky
2025-01-27  7:59 ` [RFC PATCH 6/6] zram: switch over to zshandle " Sergey Senozhatsky

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox