linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed
From: Sergey Senozhatsky <senozhatsky@chromium.org>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>,
	Hillf Danton <hdanton@sina.com>, Kairui Song <ryncsn@gmail.com>,
	Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
	Minchan Kim <minchan@kernel.org>,
	linux-mm@kvack.org, linux-kernel@vger.kernel.org,
	Sergey Senozhatsky <senozhatsky@chromium.org>
Subject: [PATCH v9 01/19] zram: sleepable entry locking
Date: Thu, 27 Feb 2025 13:35:19 +0900	[thread overview]
Message-ID: <20250227043618.88380-2-senozhatsky@chromium.org> (raw)
In-Reply-To: <20250227043618.88380-1-senozhatsky@chromium.org>

Concurrent modifications of meta table entries is now handled
by per-entry spin-lock.  This has a number of shortcomings.

First, this imposes atomic requirements on compression backends.
zram can call both zcomp_compress() and zcomp_decompress() under
entry spin-lock, which implies that we can use only compression
algorithms that don't schedule/sleep/wait during compression and
decompression.  This, for instance, makes it impossible to use
some of the ASYNC compression algorithms (H/W compression, etc.)
implementations.

Second, this can potentially trigger watchdogs.  For example,
entry re-compression with secondary algorithms is performed
under entry spin-lock.  Given that we chain secondary
compression algorithms and that some of them can be configured
for best compression ratio (and worst compression speed) zram
can stay under spin-lock for quite some time.

Having a per-entry mutex (or, for instance, a rw-semaphore)
significantly increases sizeof() of each entry and hence the
meta table.  Therefore entry locking returns back to bit
locking, as before, however, this time also preempt-rt friendly,
because if waits-on-bit instead of spinning-on-bit.  Lock owners
are also now permitted to schedule, which is a first step on the
path of making zram non-atomic.

Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
---
 drivers/block/zram/zram_drv.c | 56 ++++++++++++++++++++++++++++-------
 drivers/block/zram/zram_drv.h | 16 ++++++----
 2 files changed, 55 insertions(+), 17 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index 9f5020b077c5..ddf03f6cbeed 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -58,19 +58,56 @@ static void zram_free_page(struct zram *zram, size_t index);
 static int zram_read_from_zspool(struct zram *zram, struct page *page,
 				 u32 index);
 
-static int zram_slot_trylock(struct zram *zram, u32 index)
+#define slot_dep_map(zram, index) (&(zram)->table[(index)].dep_map)
+
+static void zram_slot_lock_init(struct zram *zram, u32 index)
 {
-	return spin_trylock(&zram->table[index].lock);
+	lockdep_init_map(slot_dep_map(zram, index),
+			 "zram->table[index].lock",
+			 &zram->lock_class, 0);
+}
+
+/*
+ * entry locking rules:
+ *
+ * 1) Lock is exclusive
+ *
+ * 2) lock() function can sleep waiting for the lock
+ *
+ * 3) Lock owner can sleep
+ *
+ * 4) Use TRY lock variant when in atomic context
+ *    - must check return value and handle locking failers
+ */
+static __must_check bool zram_slot_trylock(struct zram *zram, u32 index)
+{
+	unsigned long *lock = &zram->table[index].flags;
+
+	if (!test_and_set_bit_lock(ZRAM_ENTRY_LOCK, lock)) {
+		mutex_acquire(slot_dep_map(zram, index), 0, 1, _RET_IP_);
+		lock_acquired(slot_dep_map(zram, index), _RET_IP_);
+		return true;
+	}
+
+	lock_contended(slot_dep_map(zram, index), _RET_IP_);
+	return false;
 }
 
 static void zram_slot_lock(struct zram *zram, u32 index)
 {
-	spin_lock(&zram->table[index].lock);
+	unsigned long *lock = &zram->table[index].flags;
+
+	mutex_acquire(slot_dep_map(zram, index), 0, 0, _RET_IP_);
+	wait_on_bit_lock(lock, ZRAM_ENTRY_LOCK, TASK_UNINTERRUPTIBLE);
+	lock_acquired(slot_dep_map(zram, index), _RET_IP_);
 }
 
 static void zram_slot_unlock(struct zram *zram, u32 index)
 {
-	spin_unlock(&zram->table[index].lock);
+	unsigned long *lock = &zram->table[index].flags;
+
+	mutex_release(slot_dep_map(zram, index), _RET_IP_);
+	clear_and_wake_up_bit(ZRAM_ENTRY_LOCK, lock);
 }
 
 static inline bool init_done(struct zram *zram)
@@ -93,7 +130,6 @@ static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
 	zram->table[index].handle = handle;
 }
 
-/* flag operations require table entry bit_spin_lock() being held */
 static bool zram_test_flag(struct zram *zram, u32 index,
 			enum zram_pageflags flag)
 {
@@ -1473,15 +1509,11 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
 		huge_class_size = zs_huge_class_size(zram->mem_pool);
 
 	for (index = 0; index < num_pages; index++)
-		spin_lock_init(&zram->table[index].lock);
+		zram_slot_lock_init(zram, index);
+
 	return true;
 }
 
-/*
- * To protect concurrent access to the same index entry,
- * caller should hold this table index entry's bit_spinlock to
- * indicate this index entry is accessing.
- */
 static void zram_free_page(struct zram *zram, size_t index)
 {
 	unsigned long handle;
@@ -2625,6 +2657,7 @@ static int zram_add(void)
 	if (ret)
 		goto out_cleanup_disk;
 
+	lockdep_register_key(&zram->lock_class);
 	zram_debugfs_register(zram);
 	pr_info("Added device: %s\n", zram->disk->disk_name);
 	return device_id;
@@ -2653,6 +2686,7 @@ static int zram_remove(struct zram *zram)
 		zram->claim = true;
 	mutex_unlock(&zram->disk->open_mutex);
 
+	lockdep_unregister_key(&zram->lock_class);
 	zram_debugfs_unregister(zram);
 
 	if (claimed) {
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
index db78d7c01b9a..8a7d52fbab4d 100644
--- a/drivers/block/zram/zram_drv.h
+++ b/drivers/block/zram/zram_drv.h
@@ -28,7 +28,6 @@
 #define ZRAM_SECTOR_PER_LOGICAL_BLOCK	\
 	(1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT))
 
-
 /*
  * ZRAM is mainly used for memory efficiency so we want to keep memory
  * footprint small and thus squeeze size and zram pageflags into a flags
@@ -46,6 +45,7 @@
 /* Flags for zram pages (table[page_no].flags) */
 enum zram_pageflags {
 	ZRAM_SAME = ZRAM_FLAG_SHIFT,	/* Page consists the same element */
+	ZRAM_ENTRY_LOCK, /* entry access lock bit */
 	ZRAM_WB,	/* page is stored on backing_device */
 	ZRAM_PP_SLOT,	/* Selected for post-processing */
 	ZRAM_HUGE,	/* Incompressible page */
@@ -58,16 +58,19 @@ enum zram_pageflags {
 	__NR_ZRAM_PAGEFLAGS,
 };
 
-/*-- Data structures */
-
-/* Allocated for each disk page */
+/*
+ * Allocated for each disk page.  We use bit-lock (ZRAM_ENTRY_LOCK bit
+ * of flags) to save memory.  There can be plenty of entries and standard
+ * locking primitives (e.g. mutex) will significantly increase sizeof()
+ * of each entry and hence of the meta table.
+ */
 struct zram_table_entry {
 	unsigned long handle;
-	unsigned int flags;
-	spinlock_t lock;
+	unsigned long flags;
 #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
 	ktime_t ac_time;
 #endif
+	struct lockdep_map dep_map;
 };
 
 struct zram_stats {
@@ -137,5 +140,6 @@ struct zram {
 	struct dentry *debugfs_dir;
 #endif
 	atomic_t pp_in_progress;
+	struct lock_class_key lock_class;
 };
 #endif
-- 
2.48.1.658.g4767266eb4-goog



  reply	other threads:[~2025-02-27  4:36 UTC|newest]

Thread overview: 26+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-02-27  4:35 [PATCH v9 00/19] zsmalloc/zram: there be preemption Sergey Senozhatsky
2025-02-27  4:35 ` Sergey Senozhatsky [this message]
2025-02-27  4:35 ` [PATCH v9 02/19] zram: permit preemption with active compression stream Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 03/19] zram: remove unused crypto include Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 04/19] zram: remove max_comp_streams device attr Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 05/19] zram: remove second stage of handle allocation Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 06/19] zram: no-warn about zsmalloc " Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 07/19] zram: remove writestall zram_stats member Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 08/19] zram: limit max recompress prio to num_active_comps Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 09/19] zram: filter out recomp targets based on priority Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 10/19] zram: rework recompression loop Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 11/19] zram: move post-processing target allocation Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 12/19] zsmalloc: rename pool lock Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 13/19] zsmalloc: make zspage lock preemptible Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 14/19] zsmalloc: introduce new object mapping API Sergey Senozhatsky
2025-02-27  5:48   ` Yosry Ahmed
2025-02-27  6:54     ` Sergey Senozhatsky
2025-02-27  7:09       ` Yosry Ahmed
2025-03-01  7:22   ` Herbert Xu
2025-03-03  2:42     ` Sergey Senozhatsky
2025-03-03  2:51       ` Herbert Xu
2025-02-27  4:35 ` [PATCH v9 15/19] zram: switch to new zsmalloc " Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 16/19] zram: permit reclaim in zstd custom allocator Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 17/19] zram: do not leak page on recompress_store error path Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 18/19] zram: do not leak page on writeback_store " Sergey Senozhatsky
2025-02-27  4:35 ` [PATCH v9 19/19] zram: add might_sleep to zcomp API Sergey Senozhatsky

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20250227043618.88380-2-senozhatsky@chromium.org \
    --to=senozhatsky@chromium.org \
    --cc=akpm@linux-foundation.org \
    --cc=bigeasy@linutronix.de \
    --cc=hdanton@sina.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=minchan@kernel.org \
    --cc=ryncsn@gmail.com \
    --cc=yosry.ahmed@linux.dev \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox