Re: [RFC PATCH v4 3/4] hazptr: Implement Hazard Pointers

linux-mm.kvack.org archive mirror
 help / color / mirror / Atom feed

From: Boqun Feng <boqun.feng@gmail.com>
To: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Joel Fernandes <joel@joelfernandes.org>,
	"Paul E. McKenney" <paulmck@kernel.org>,
	linux-kernel@vger.kernel.org, Nicholas Piggin <npiggin@gmail.com>,
	Michael Ellerman <mpe@ellerman.id.au>,
	Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	Sebastian Andrzej Siewior <bigeasy@linutronix.de>,
	Will Deacon <will@kernel.org>,
	Peter Zijlstra <peterz@infradead.org>,
	Alan Stern <stern@rowland.harvard.edu>,
	John Stultz <jstultz@google.com>,
	Neeraj Upadhyay <Neeraj.Upadhyay@amd.com>,
	Linus Torvalds <torvalds@linux-foundation.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	Frederic Weisbecker <frederic@kernel.org>,
	Josh Triplett <josh@joshtriplett.org>,
	Uladzislau Rezki <urezki@gmail.com>,
	Steven Rostedt <rostedt@goodmis.org>,
	Lai Jiangshan <jiangshanlai@gmail.com>,
	Zqiang <qiang.zhang1211@gmail.com>,
	Ingo Molnar <mingo@redhat.com>, Waiman Long <longman@redhat.com>,
	Mark Rutland <mark.rutland@arm.com>,
	Thomas Gleixner <tglx@linutronix.de>,
	Vlastimil Babka <vbabka@suse.cz>,
	maged.michael@gmail.com,	Mateusz Guzik <mjguzik@gmail.com>,
	Jonas Oberhauser <jonas.oberhauser@huaweicloud.com>,
	rcu@vger.kernel.org, linux-mm@kvack.org, lkmm@lists.linux.dev
Subject: Re: [RFC PATCH v4 3/4] hazptr: Implement Hazard Pointers
Date: Thu, 18 Dec 2025 17:36:57 +0900	[thread overview]
Message-ID: <aUO9KRaP0FXpm_l9@tardis-2.local> (raw)
In-Reply-To: <20251218014531.3793471-4-mathieu.desnoyers@efficios.com>

On Wed, Dec 17, 2025 at 08:45:30PM -0500, Mathieu Desnoyers wrote:
[...]
> +static inline
> +struct hazptr_slot *hazptr_get_free_percpu_slot(void)
> +{
> +	struct hazptr_percpu_slots *percpu_slots = this_cpu_ptr(&hazptr_percpu_slots);
> +	unsigned int idx;
> +
> +	for (idx = 0; idx < NR_HAZPTR_PERCPU_SLOTS; idx++) {
> +		struct hazptr_slot *slot = &percpu_slots->slots[idx];
> +
> +		if (!READ_ONCE(slot->addr))
> +			return slot;
> +	}
> +	/* All slots are in use. */
> +	return NULL;
> +}
> +
> +static inline
> +bool hazptr_slot_is_backup(struct hazptr_ctx *ctx, struct hazptr_slot *slot)
> +{
> +	return slot == &ctx->backup_slot.slot;
> +}
> +
> +/*
> + * hazptr_acquire: Load pointer at address and protect with hazard pointer.
> + *
> + * Load @addr_p, and protect the loaded pointer with hazard pointer.
> + *
> + * Returns a non-NULL protected address if the loaded pointer is non-NULL.
> + * Returns NULL if the loaded pointer is NULL.
> + *
> + * On success the protected hazptr slot is stored in @ctx->slot.
> + */
> +static inline
> +void *hazptr_acquire(struct hazptr_ctx *ctx, void * const * addr_p)
> +{
> +	struct hazptr_slot *slot = NULL;
> +	void *addr, *addr2;
> +
> +	/*
> +	 * Load @addr_p to know which address should be protected.
> +	 */
> +	addr = READ_ONCE(*addr_p);
> +	for (;;) {
> +		if (!addr)
> +			return NULL;
> +		guard(preempt)();
> +		if (likely(!hazptr_slot_is_backup(ctx, slot))) {
> +			slot = hazptr_get_free_percpu_slot();

I need to continue share my concerns about this "allocating slot while
protecting" pattern. Here realistically, we will go over a few of the
per-CPU hazard pointer slots *every time* instead of directly using a
pre-allocated hazard pointer slot. Could you utilize this[1] to see a
comparison of the reader-side performance against RCU/SRCU?

> +			/*
> +			 * If all the per-CPU slots are already in use, fallback
> +			 * to the backup slot.
> +			 */
> +			if (unlikely(!slot))
> +				slot = hazptr_chain_backup_slot(ctx);
> +		}
> +		WRITE_ONCE(slot->addr, addr);	/* Store B */
> +
> +		/* Memory ordering: Store B before Load A. */
> +		smp_mb();
> +
> +		/*
> +		 * Re-load @addr_p after storing it to the hazard pointer slot.
> +		 */
> +		addr2 = READ_ONCE(*addr_p);	/* Load A */
> +		if (likely(ptr_eq(addr2, addr)))
> +			break;
> +		/*
> +		 * If @addr_p content has changed since the first load,
> +		 * release the hazard pointer and try again.
> +		 */
> +		WRITE_ONCE(slot->addr, NULL);
> +		if (!addr2) {
> +			if (hazptr_slot_is_backup(ctx, slot))
> +				hazptr_unchain_backup_slot(ctx);
> +			return NULL;
> +		}
> +		addr = addr2;
> +	}
> +	ctx->slot = slot;
> +	/*
> +	 * Use addr2 loaded from the second READ_ONCE() to preserve
> +	 * address dependency ordering.
> +	 */
> +	return addr2;
> +}
> +
> +/* Release the protected hazard pointer from @slot. */
> +static inline
> +void hazptr_release(struct hazptr_ctx *ctx, void *addr)
> +{
> +	struct hazptr_slot *slot;
> +
> +	if (!addr)
> +		return;
> +	slot = ctx->slot;
> +	WARN_ON_ONCE(slot->addr != addr);
> +	smp_store_release(&slot->addr, NULL);
> +	if (unlikely(hazptr_slot_is_backup(ctx, slot)))
> +		hazptr_unchain_backup_slot(ctx);
> +}
> +
> +void hazptr_init(void);
> +
> +#endif /* _LINUX_HAZPTR_H */
> diff --git a/init/main.c b/init/main.c
> index 07a3116811c5..858eaa87bde7 100644
> --- a/init/main.c
> +++ b/init/main.c
> @@ -104,6 +104,7 @@
>  #include <linux/pidfs.h>
>  #include <linux/ptdump.h>
>  #include <linux/time_namespace.h>
> +#include <linux/hazptr.h>
>  #include <net/net_namespace.h>
>  
>  #include <asm/io.h>
> @@ -1002,6 +1003,7 @@ void start_kernel(void)
>  	workqueue_init_early();
>  
>  	rcu_init();
> +	hazptr_init();
>  	kvfree_rcu_init();
>  
>  	/* Trace events are available after this */
> diff --git a/kernel/Makefile b/kernel/Makefile
> index 9fe722305c9b..1178907fe0ea 100644
> --- a/kernel/Makefile
> +++ b/kernel/Makefile
> @@ -7,7 +7,7 @@ obj-y     = fork.o exec_domain.o panic.o \
>  	    cpu.o exit.o softirq.o resource.o \
>  	    sysctl.o capability.o ptrace.o user.o \
>  	    signal.o sys.o umh.o workqueue.o pid.o task_work.o \
> -	    extable.o params.o \
> +	    extable.o params.o hazptr.o \
>  	    kthread.o sys_ni.o nsproxy.o nstree.o nscommon.o \
>  	    notifier.o ksysfs.o cred.o reboot.o \
>  	    async.o range.o smpboot.o ucount.o regset.o ksyms_common.o
> diff --git a/kernel/hazptr.c b/kernel/hazptr.c
> new file mode 100644
> index 000000000000..2ec288bc1132
> --- /dev/null
> +++ b/kernel/hazptr.c
> @@ -0,0 +1,150 @@
> +// SPDX-FileCopyrightText: 2024 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
> +//
> +// SPDX-License-Identifier: LGPL-2.1-or-later
> +
> +/*
> + * hazptr: Hazard Pointers
> + */
> +
> +#include <linux/hazptr.h>
> +#include <linux/percpu.h>
> +#include <linux/spinlock.h>
> +#include <linux/list.h>
> +#include <linux/export.h>
> +
> +struct overflow_list {
> +	raw_spinlock_t lock;		/* Lock protecting overflow list and list generation. */
> +	struct list_head head;		/* Overflow list head. */
> +	uint64_t gen;			/* Overflow list generation. */
> +};
> +
> +static DEFINE_PER_CPU(struct overflow_list, percpu_overflow_list);
> +
> +DEFINE_PER_CPU(struct hazptr_percpu_slots, hazptr_percpu_slots);
> +EXPORT_PER_CPU_SYMBOL_GPL(hazptr_percpu_slots);
> +
> +/*
> + * Perform piecewise iteration on overflow list waiting until "addr" is
> + * not present. Raw spinlock is released and taken between each list
> + * item and busy loop iteration. The overflow list generation is checked
> + * each time the lock is taken to validate that the list has not changed
> + * before resuming iteration or busy wait. If the generation has
> + * changed, retry the entire list traversal.
> + */
> +static
> +void hazptr_synchronize_overflow_list(struct overflow_list *overflow_list, void *addr)
> +{
> +	struct hazptr_backup_slot *backup_slot;
> +	uint64_t snapshot_gen;
> +
> +	raw_spin_lock(&overflow_list->lock);
> +retry:
> +	snapshot_gen = overflow_list->gen;
> +	list_for_each_entry(backup_slot, &overflow_list->head, node) {
> +		/* Busy-wait if node is found. */
> +		while (smp_load_acquire(&backup_slot->slot.addr) == addr) { /* Load B */
> +			raw_spin_unlock(&overflow_list->lock);
> +			cpu_relax();

I think we should prioritize the scan thread solution [2] instead of
busy waiting hazrd pointer updaters, because when we have multiple
hazard pointer usages we would want to consolidate the scans from
updater side. If so, the whole ->gen can be avoided.

However this ->gen idea does seem ot resolve another issue for me, I'm
trying to make shazptr critical section preemptive by using a per-task
backup slot (if you recall, this is your idea from the hallway
discussions we had during LPC 2024), and currently I could not make it
work because the following sequeue:

1. CPU 0 already has one pointer protected.

2. CPU 1 begins the updater scan, and it scans the list of preempted
   hazard pointer readers, no reader.

3. CPU 0 does a context switch, it stores the current hazard pointer
   value to the current task's ->hazard_slot (let's say the task is task
   A), and add it to the list of preempted hazard pointer readers.

4. CPU 0 clears its percpu hazptr_slots for the next task (B).

5. CPU 1 continues the updater scan, and it scans the percpu slot of
   CPU 0, and finds no reader.

in this situation, updater will miss a reader. But if we add a
generation snapshotting at step 2 and generation increment at step 3, I
think it'll work.

IMO, if we make this work, it's better than the current backup slot
mechanism IMO, because we only need to acquire the lock if context
switch happens. I will look into the implementation of this and if I
could get it down, I will send it in my next version of shazptr. Mention
it here just to add this option into the discussion.

[1]: https://lore.kernel.org/lkml/20250625031101.12555-3-boqun.feng@gmail.com/
[2]: https://lore.kernel.org/lkml/20250625031101.12555-5-boqun.feng@gmail.com/

Regards,
Boqun

> +			raw_spin_lock(&overflow_list->lock);
> +			if (overflow_list->gen != snapshot_gen)
> +				goto retry;
> +		}
> +		raw_spin_unlock(&overflow_list->lock);
> +		/*
> +		 * Release raw spinlock, validate generation after
> +		 * re-acquiring the lock.
> +		 */
> +		raw_spin_lock(&overflow_list->lock);
> +		if (overflow_list->gen != snapshot_gen)
> +			goto retry;
> +	}
> +	raw_spin_unlock(&overflow_list->lock);
> +}
> +
> +static
> +void hazptr_synchronize_cpu_slots(int cpu, void *addr)
> +{
> +	struct hazptr_percpu_slots *percpu_slots = per_cpu_ptr(&hazptr_percpu_slots, cpu);
> +	unsigned int idx;
> +
> +	for (idx = 0; idx < NR_HAZPTR_PERCPU_SLOTS; idx++) {
> +		struct hazptr_slot *slot = &percpu_slots->slots[idx];
> +
> +		/* Busy-wait if node is found. */
> +		smp_cond_load_acquire(&slot->addr, VAL != addr); /* Load B */
> +	}
> +}
> +
> +/*
> + * hazptr_synchronize: Wait until @addr is released from all slots.
> + *
> + * Wait to observe that each slot contains a value that differs from
> + * @addr before returning.
> + * Should be called from preemptible context.
> + */
> +void hazptr_synchronize(void *addr)
> +{
> +	int cpu;
> +
> +	/*
> +	 * Busy-wait should only be done from preemptible context.
> +	 */
> +	lockdep_assert_preemption_enabled();
> +
> +	/*
> +	 * Store A precedes hazptr_scan(): it unpublishes addr (sets it to
> +	 * NULL or to a different value), and thus hides it from hazard
> +	 * pointer readers.
> +	 */
> +	if (!addr)
> +		return;
> +	/* Memory ordering: Store A before Load B. */
> +	smp_mb();
> +	/* Scan all CPUs slots. */
> +	for_each_possible_cpu(cpu) {
> +		/* Scan CPU slots. */
> +		hazptr_synchronize_cpu_slots(cpu, addr);
> +		/* Scan backup slots in percpu overflow list. */
> +		hazptr_synchronize_overflow_list(per_cpu_ptr(&percpu_overflow_list, cpu), addr);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(hazptr_synchronize);
> +
> +struct hazptr_slot *hazptr_chain_backup_slot(struct hazptr_ctx *ctx)
> +{
> +	struct overflow_list *overflow_list = this_cpu_ptr(&percpu_overflow_list);
> +	struct hazptr_slot *slot = &ctx->backup_slot.slot;
> +
> +	slot->addr = NULL;
> +
> +	raw_spin_lock(&overflow_list->lock);
> +	overflow_list->gen++;
> +	list_add(&ctx->backup_slot.node, &overflow_list->head);
> +	ctx->backup_slot.cpu = smp_processor_id();
> +	raw_spin_unlock(&overflow_list->lock);
> +	return slot;
> +}
> +EXPORT_SYMBOL_GPL(hazptr_chain_backup_slot);
> +
> +void hazptr_unchain_backup_slot(struct hazptr_ctx *ctx)
> +{
> +	struct overflow_list *overflow_list = per_cpu_ptr(&percpu_overflow_list, ctx->backup_slot.cpu);
> +
> +	raw_spin_lock(&overflow_list->lock);
> +	overflow_list->gen++;
> +	list_del(&ctx->backup_slot.node);
> +	raw_spin_unlock(&overflow_list->lock);
> +}
> +EXPORT_SYMBOL_GPL(hazptr_unchain_backup_slot);
> +
> +void __init hazptr_init(void)
> +{
> +	int cpu;
> +
> +	for_each_possible_cpu(cpu) {
> +		struct overflow_list *overflow_list = per_cpu_ptr(&percpu_overflow_list, cpu);
> +
> +		raw_spin_lock_init(&overflow_list->lock);
> +		INIT_LIST_HEAD(&overflow_list->head);
> +	}
> +}
> -- 
> 2.39.5
>

next prev parent reply	other threads:[~2025-12-18 10:29 UTC|newest]

Thread overview: 29+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-12-18  1:45 [RFC PATCH v4 0/4] " Mathieu Desnoyers
2025-12-18  1:45 ` [RFC PATCH v4 1/4] compiler.h: Introduce ptr_eq() to preserve address dependency Mathieu Desnoyers
2025-12-18  9:03   ` David Laight
2025-12-18 13:51     ` Mathieu Desnoyers
2025-12-18 15:54       ` David Laight
2025-12-18 14:27     ` Gary Guo
2025-12-18 16:12       ` David Laight
2025-12-18  1:45 ` [RFC PATCH v4 2/4] Documentation: RCU: Refer to ptr_eq() Mathieu Desnoyers
2025-12-18  1:45 ` [RFC PATCH v4 3/4] hazptr: Implement Hazard Pointers Mathieu Desnoyers
2025-12-18  8:36   ` Boqun Feng [this message]
2025-12-18 17:35     ` Mathieu Desnoyers
2025-12-18 20:22       ` Boqun Feng
2025-12-18 23:36         ` Mathieu Desnoyers
2025-12-19  0:25           ` Boqun Feng
2025-12-19  6:06             ` Joel Fernandes
2025-12-19 15:14             ` Mathieu Desnoyers
2025-12-19 15:42               ` Joel Fernandes
2025-12-19 22:19                 ` Mathieu Desnoyers
2025-12-19 22:39                   ` Joel Fernandes
2025-12-21  9:59                     ` Boqun Feng
2025-12-19  0:43       ` Boqun Feng
2025-12-19 14:22         ` Mathieu Desnoyers
2025-12-19  1:22   ` Joel Fernandes
2025-12-18  1:45 ` [RFC PATCH v4 4/4] hazptr: Migrate per-CPU slots to backup slot on context switch Mathieu Desnoyers
2025-12-18 16:20   ` Mathieu Desnoyers
2025-12-18 22:16   ` Boqun Feng
2025-12-19  0:21     ` Mathieu Desnoyers
2025-12-18 10:33 ` [RFC PATCH v4 0/4] Hazard Pointers Joel Fernandes
2025-12-18 17:54   ` Mathieu Desnoyers

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aUO9KRaP0FXpm_l9@tardis-2.local \
    --to=boqun.feng@gmail.com \
    --cc=Neeraj.Upadhyay@amd.com \
    --cc=akpm@linux-foundation.org \
    --cc=bigeasy@linutronix.de \
    --cc=frederic@kernel.org \
    --cc=gregkh@linuxfoundation.org \
    --cc=jiangshanlai@gmail.com \
    --cc=joel@joelfernandes.org \
    --cc=jonas.oberhauser@huaweicloud.com \
    --cc=josh@joshtriplett.org \
    --cc=jstultz@google.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=lkmm@lists.linux.dev \
    --cc=longman@redhat.com \
    --cc=maged.michael@gmail.com \
    --cc=mark.rutland@arm.com \
    --cc=mathieu.desnoyers@efficios.com \
    --cc=mingo@redhat.com \
    --cc=mjguzik@gmail.com \
    --cc=mpe@ellerman.id.au \
    --cc=npiggin@gmail.com \
    --cc=paulmck@kernel.org \
    --cc=peterz@infradead.org \
    --cc=qiang.zhang1211@gmail.com \
    --cc=rcu@vger.kernel.org \
    --cc=rostedt@goodmis.org \
    --cc=stern@rowland.harvard.edu \
    --cc=tglx@linutronix.de \
    --cc=torvalds@linux-foundation.org \
    --cc=urezki@gmail.com \
    --cc=vbabka@suse.cz \
    --cc=will@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox