From: Sergey Senozhatsky <senozhatsky@chromium.org>
To: Andrew Morton <akpm@linux-foundation.org>
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>,
Kairui Song <ryncsn@gmail.com>, Minchan Kim <minchan@kernel.org>,
linux-mm@kvack.org, linux-kernel@vger.kernel.org,
Sergey Senozhatsky <senozhatsky@chromium.org>
Subject: [PATCH v5 12/18] zsmalloc: make zspage lock preemptible
Date: Wed, 12 Feb 2025 15:27:10 +0900
Message-ID: <20250212063153.179231-13-senozhatsky@chromium.org>
In-Reply-To: <20250212063153.179231-1-senozhatsky@chromium.org>
Switch over from rwlock_t to an atomic_t variable that takes a negative
value when the page is under migration, and positive values when the
page is used by zsmalloc users (object map, etc.). Using an rwsem
per zspage is a little too memory-heavy; a simple atomic_t should
suffice.
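For clarity, the lock value encodes the following states (an informal
summary of the scheme above; the actual constants appear in the diff
below):

  lock == -1 (ZS_PAGE_WRLOCKED)   one writer, always in atomic context
  lock ==  0 (ZS_PAGE_UNLOCKED)   lock is free
  lock  >  0                      that many readers, each may be preempted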
The zspage lock is a leaf lock for zs_map_object(), where it is read-acquired.
Since this lock now permits preemption, extra care needs to be taken when
it is write-acquired: all writers grab it in atomic context, so they
cannot spin and wait for a (potentially preempted) reader to unlock the
zspage. There are only two writers at the moment: migration and compaction.
In both cases we use write-try-lock and bail out if the zspage is read-locked.
Writers, on the other hand, never get preempted, so readers can spin
waiting for the writer to unlock the zspage.
With this we can implement a preemptible object mapping.
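As a rough illustration of how callers are expected to use the new lock
(a sketch only, using the helper names added by this patch; the real
call sites are in the diff below):

	/* writer side (migration, compaction): atomic context, no spinning */
	if (!zspage_try_write_lock(zspage))
		return -EINVAL;		/* bail out, a reader holds the lock */
	/* ... move/compact objects ... */
	zspage_write_unlock(zspage);

	/* reader side (object mapping): may be preempted while the lock is held */
	zspage_read_lock(zspage);
	/* ... access the mapped object ... */
	zspage_read_unlock(zspage);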
Signed-off-by: Sergey Senozhatsky <senozhatsky@chromium.org>
Cc: Yosry Ahmed <yosry.ahmed@linux.dev>
---
mm/zsmalloc.c | 183 +++++++++++++++++++++++++++++++++++---------------
1 file changed, 128 insertions(+), 55 deletions(-)
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index c82c24b8e6a4..80261bb78cf8 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -226,6 +226,9 @@ struct zs_pool {
/* protect page/zspage migration */
rwlock_t lock;
atomic_t compaction_in_progress;
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lock_class_key lockdep_key;
+#endif
};
static void pool_write_unlock(struct zs_pool *pool)
@@ -292,6 +295,9 @@ static inline void free_zpdesc(struct zpdesc *zpdesc)
__free_page(page);
}
+#define ZS_PAGE_UNLOCKED 0
+#define ZS_PAGE_WRLOCKED -1
+
struct zspage {
struct {
unsigned int huge:HUGE_BITS;
@@ -304,7 +310,11 @@ struct zspage {
struct zpdesc *first_zpdesc;
struct list_head list; /* fullness list */
struct zs_pool *pool;
- rwlock_t lock;
+ atomic_t lock;
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ struct lockdep_map lockdep_map;
+#endif
};
struct mapping_area {
@@ -314,6 +324,88 @@ struct mapping_area {
enum zs_mapmode vm_mm; /* mapping mode */
};
+static void zspage_lock_init(struct zspage *zspage)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ lockdep_init_map(&zspage->lockdep_map, "zsmalloc-page",
+ &zspage->pool->lockdep_key, 0);
+#endif
+
+ atomic_set(&zspage->lock, ZS_PAGE_UNLOCKED);
+}
+
+/*
+ * zspage locking rules:
+ *
+ * 1) writer-lock is exclusive
+ *
+ * 2) writer-lock owner cannot sleep
+ *
+ * 3) writer-lock owner cannot spin waiting for the lock
+ * - caller (e.g. compaction and migration) must check return value and
+ * handle locking failures
+ * - there is only TRY variant of writer-lock function
+ *
+ * 4) reader-lock owners (multiple) can sleep
+ *
+ * 5) reader-lock owners can spin waiting for the lock, in any context
+ * - existing readers (even preempted ones) don't block new readers
+ * - writer-lock owners never sleep, always unlock at some point
+ */
+static void zspage_read_lock(struct zspage *zspage)
+{
+ atomic_t *lock = &zspage->lock;
+ int old = atomic_read_acquire(lock);
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_acquire_read(&zspage->lockdep_map, 0, 0, _RET_IP_);
+#endif
+
+ do {
+ if (old == ZS_PAGE_WRLOCKED) {
+ cpu_relax();
+ old = atomic_read_acquire(lock);
+ continue;
+ }
+ } while (!atomic_try_cmpxchg_acquire(lock, &old, old + 1));
+}
+
+static void zspage_read_unlock(struct zspage *zspage)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_release(&zspage->lockdep_map, _RET_IP_);
+#endif
+ atomic_dec_return_release(&zspage->lock);
+}
+
+static __must_check bool zspage_try_write_lock(struct zspage *zspage)
+{
+ atomic_t *lock = &zspage->lock;
+ int old = ZS_PAGE_UNLOCKED;
+
+ WARN_ON_ONCE(preemptible());
+
+ preempt_disable();
+ if (atomic_try_cmpxchg_acquire(lock, &old, ZS_PAGE_WRLOCKED)) {
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_acquire(&zspage->lockdep_map, 0, 1, _RET_IP_);
+#endif
+ return true;
+ }
+
+ preempt_enable();
+ return false;
+}
+
+static void zspage_write_unlock(struct zspage *zspage)
+{
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ rwsem_release(&zspage->lockdep_map, _RET_IP_);
+#endif
+ atomic_set_release(&zspage->lock, ZS_PAGE_UNLOCKED);
+ preempt_enable();
+}
+
/* huge object: pages_per_zspage == 1 && maxobj_per_zspage == 1 */
static void SetZsHugePage(struct zspage *zspage)
{
@@ -325,12 +417,6 @@ static bool ZsHugePage(struct zspage *zspage)
return zspage->huge;
}
-static void migrate_lock_init(struct zspage *zspage);
-static void migrate_read_lock(struct zspage *zspage);
-static void migrate_read_unlock(struct zspage *zspage);
-static void migrate_write_lock(struct zspage *zspage);
-static void migrate_write_unlock(struct zspage *zspage);
-
#ifdef CONFIG_COMPACTION
static void kick_deferred_free(struct zs_pool *pool);
static void init_deferred_free(struct zs_pool *pool);
@@ -1026,7 +1112,9 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
return NULL;
zspage->magic = ZSPAGE_MAGIC;
- migrate_lock_init(zspage);
+ zspage->pool = pool;
+ zspage->class = class->index;
+ zspage_lock_init(zspage);
for (i = 0; i < class->pages_per_zspage; i++) {
struct zpdesc *zpdesc;
@@ -1049,8 +1137,6 @@ static struct zspage *alloc_zspage(struct zs_pool *pool,
create_page_chain(class, zspage, zpdescs);
init_zspage(class, zspage);
- zspage->pool = pool;
- zspage->class = class->index;
return zspage;
}
@@ -1251,7 +1337,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle,
* zs_unmap_object API so delegate the locking from class to zspage
* which is smaller granularity.
*/
- migrate_read_lock(zspage);
+ zspage_read_lock(zspage);
pool_read_unlock(pool);
class = zspage_class(pool, zspage);
@@ -1311,7 +1397,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle)
}
local_unlock(&zs_map_area.lock);
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
}
EXPORT_SYMBOL_GPL(zs_unmap_object);
@@ -1705,18 +1791,18 @@ static void lock_zspage(struct zspage *zspage)
/*
* Pages we haven't locked yet can be migrated off the list while we're
* trying to lock them, so we need to be careful and only attempt to
- * lock each page under migrate_read_lock(). Otherwise, the page we lock
+ * lock each page under zspage_read_lock(). Otherwise, the page we lock
* may no longer belong to the zspage. This means that we may wait for
* the wrong page to unlock, so we must take a reference to the page
- * prior to waiting for it to unlock outside migrate_read_lock().
+ * prior to waiting for it to unlock outside zspage_read_lock().
*/
while (1) {
- migrate_read_lock(zspage);
+ zspage_read_lock(zspage);
zpdesc = get_first_zpdesc(zspage);
if (zpdesc_trylock(zpdesc))
break;
zpdesc_get(zpdesc);
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
zpdesc_wait_locked(zpdesc);
zpdesc_put(zpdesc);
}
@@ -1727,41 +1813,16 @@ static void lock_zspage(struct zspage *zspage)
curr_zpdesc = zpdesc;
} else {
zpdesc_get(zpdesc);
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
zpdesc_wait_locked(zpdesc);
zpdesc_put(zpdesc);
- migrate_read_lock(zspage);
+ zspage_read_lock(zspage);
}
}
- migrate_read_unlock(zspage);
+ zspage_read_unlock(zspage);
}
#endif /* CONFIG_COMPACTION */
-static void migrate_lock_init(struct zspage *zspage)
-{
- rwlock_init(&zspage->lock);
-}
-
-static void migrate_read_lock(struct zspage *zspage) __acquires(&zspage->lock)
-{
- read_lock(&zspage->lock);
-}
-
-static void migrate_read_unlock(struct zspage *zspage) __releases(&zspage->lock)
-{
- read_unlock(&zspage->lock);
-}
-
-static void migrate_write_lock(struct zspage *zspage)
-{
- write_lock(&zspage->lock);
-}
-
-static void migrate_write_unlock(struct zspage *zspage)
-{
- write_unlock(&zspage->lock);
-}
-
#ifdef CONFIG_COMPACTION
static const struct movable_operations zsmalloc_mops;
@@ -1803,7 +1864,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
}
static int zs_page_migrate(struct page *newpage, struct page *page,
- enum migrate_mode mode)
+ enum migrate_mode mode)
{
struct zs_pool *pool;
struct size_class *class;
@@ -1819,15 +1880,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
VM_BUG_ON_PAGE(!zpdesc_is_isolated(zpdesc), zpdesc_page(zpdesc));
- /* We're committed, tell the world that this is a Zsmalloc page. */
- __zpdesc_set_zsmalloc(newzpdesc);
-
/* The page is locked, so this pointer must remain valid */
zspage = get_zspage(zpdesc);
pool = zspage->pool;
/*
- * The pool lock protects the race between zpage migration
+ * The pool->lock protects the race between zpage migration
* and zs_free.
*/
pool_write_lock(pool);
@@ -1837,8 +1895,15 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
* the class lock protects zpage alloc/free in the zspage.
*/
size_class_lock(class);
- /* the migrate_write_lock protects zpage access via zs_map_object */
- migrate_write_lock(zspage);
+ /* the zspage write_lock protects zpage access via zs_map_object */
+ if (!zspage_try_write_lock(zspage)) {
+ size_class_unlock(class);
+ pool_write_unlock(pool);
+ return -EINVAL;
+ }
+
+ /* We're committed, tell the world that this is a Zsmalloc page. */
+ __zpdesc_set_zsmalloc(newzpdesc);
offset = get_first_obj_offset(zpdesc);
s_addr = kmap_local_zpdesc(zpdesc);
@@ -1869,7 +1934,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
*/
pool_write_unlock(pool);
size_class_unlock(class);
- migrate_write_unlock(zspage);
+ zspage_write_unlock(zspage);
zpdesc_get(newzpdesc);
if (zpdesc_zone(newzpdesc) != zpdesc_zone(zpdesc)) {
@@ -2005,9 +2070,11 @@ static unsigned long __zs_compact(struct zs_pool *pool,
if (!src_zspage)
break;
- migrate_write_lock(src_zspage);
+ if (!zspage_try_write_lock(src_zspage))
+ break;
+
migrate_zspage(pool, src_zspage, dst_zspage);
- migrate_write_unlock(src_zspage);
+ zspage_write_unlock(src_zspage);
fg = putback_zspage(class, src_zspage);
if (fg == ZS_INUSE_RATIO_0) {
@@ -2267,7 +2334,9 @@ struct zs_pool *zs_create_pool(const char *name)
* trigger compaction manually. Thus, ignore return code.
*/
zs_register_shrinker(pool);
-
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ lockdep_register_key(&pool->lockdep_key);
+#endif
return pool;
err:
@@ -2304,6 +2373,10 @@ void zs_destroy_pool(struct zs_pool *pool)
kfree(class);
}
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+ lockdep_unregister_key(&pool->lockdep_key);
+#endif
+
destroy_cache(pool);
kfree(pool->name);
kfree(pool);
--
2.48.1.502.g6dc24dfdaf-goog