* [PATCH RESEND v2 1/4] mm/gup: Add huge pte handling logic in follow_page_pte()
2025-03-18 3:59 [PATCH RESEND v2 0/4] riscv: iommu: Support Svnapot Xu Lu
@ 2025-03-18 3:59 ` Xu Lu
2025-03-18 3:59 ` [PATCH RESEND v2 2/4] iommu/riscv: Use pte_t to represent page table entry Xu Lu
` (2 subsequent siblings)
3 siblings, 0 replies; 6+ messages in thread
From: Xu Lu @ 2025-03-18 3:59 UTC (permalink / raw)
To: akpm, jhubbard, kirill.shutemov, tjeznach, joro, will, robin.murphy
Cc: lihangjing, xieyongji, linux-riscv, linux-kernel, linux-mm, Xu Lu
A page mapped at the pte level can also be a huge page when ARM
CONT_PTE or RISC-V SVNAPOT is applied. The lack of huge pte handling
logic in follow_page_pte() can lead to both performance and correctness
issues. For example, on RISC-V, all PTEs covering the same 64K huge
page carry the same value, so follow_page_pte() resolves every address
inside it to the same page via pte_pfn(). __get_user_pages() then
returns an array of pages that all share one pfn, and mapping those
pages maps the wrong memory. The error can be triggered by the
following code:
void *addr = mmap(NULL, 0x10000, PROT_READ | PROT_WRITE,
                  MAP_ANONYMOUS | MAP_PRIVATE | MAP_HUGETLB | MAP_HUGE_64KB,
                  -1, 0);

struct vfio_iommu_type1_dma_map dma_map = {
        .argsz = sizeof(dma_map),
        .flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
        .vaddr = (uint64_t)addr,
        .size = 0x10000,
};

ioctl(vfio_container_fd, VFIO_IOMMU_MAP_DMA, &dma_map);
This commit adds huge pte handling logic to follow_page_pte() to avoid
such problems.
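
Illustratively, the subpage selection added by the mm/gup.c hunk boils
down to the following arithmetic (a sketch only; the address below is
hypothetical and PAGE_SHIFT is assumed to be 12):

/*
 * Sketch of the fix for a 64K Svnapot folio (folio_order == 4).
 * Illustrative only, not part of the patch.
 */
unsigned long folio_sz = 16UL << 12;             /* 64K = 16 x 4K pages  */
unsigned long address  = 0x3ff7f85000UL;         /* hypothetical user VA */
unsigned long idx      = (address & (folio_sz - 1)) >> 12;
/* idx == 5: the sixth 4K page of the folio is returned, instead of
 * page 0 for every address inside the 64K mapping. */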
Signed-off-by: Xu Lu <luxu.kernel@bytedance.com>
---
arch/riscv/include/asm/pgtable.h | 6 ++++++
include/linux/pgtable.h | 8 ++++++++
mm/gup.c | 17 +++++++++++------
3 files changed, 25 insertions(+), 6 deletions(-)
diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
index 050fdc49b5ad7..40ae5979dd82c 100644
--- a/arch/riscv/include/asm/pgtable.h
+++ b/arch/riscv/include/asm/pgtable.h
@@ -800,6 +800,12 @@ static inline bool pud_user_accessible_page(pud_t pud)
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define pte_trans_huge pte_trans_huge
+static inline int pte_trans_huge(pte_t pte)
+{
+ return pte_huge(pte) && pte_napot(pte);
+}
+
static inline int pmd_trans_huge(pmd_t pmd)
{
return pmd_leaf(pmd);
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 94d267d02372e..3f57ee6dcf017 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1584,6 +1584,14 @@ static inline unsigned long my_zero_pfn(unsigned long addr)
#ifdef CONFIG_MMU
+#if (defined(CONFIG_TRANSPARENT_HUGEPAGE) && !defined(pte_trans_huge)) || \
+ (!defined(CONFIG_TRANSPARENT_HUGEPAGE))
+static inline int pte_trans_huge(pte_t pte)
+{
+ return 0;
+}
+#endif
+
#ifndef CONFIG_TRANSPARENT_HUGEPAGE
static inline int pmd_trans_huge(pmd_t pmd)
{
diff --git a/mm/gup.c b/mm/gup.c
index 3883b307780ea..67981ee28df86 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -838,7 +838,7 @@ static inline bool can_follow_write_pte(pte_t pte, struct page *page,
static struct page *follow_page_pte(struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, unsigned int flags,
- struct dev_pagemap **pgmap)
+ struct follow_page_context *ctx)
{
struct mm_struct *mm = vma->vm_mm;
struct folio *folio;
@@ -879,8 +879,8 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
* case since they are only valid while holding the pgmap
* reference.
*/
- *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
- if (*pgmap)
+ ctx->pgmap = get_dev_pagemap(pte_pfn(pte), ctx->pgmap);
+ if (ctx->pgmap)
page = pte_page(pte);
else
goto no_page;
@@ -940,6 +940,11 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
*/
folio_mark_accessed(folio);
}
+ if (is_vm_hugetlb_page(vma) || pte_trans_huge(pte)) {
+ ctx->page_mask = (1 << folio_order(folio)) - 1;
+ page = folio_page(folio, 0) +
+ ((address & (folio_size(folio) - 1)) >> PAGE_SHIFT);
+ }
out:
pte_unmap_unlock(ptep, ptl);
return page;
@@ -975,7 +980,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
return no_page_table(vma, flags, address);
}
if (likely(!pmd_leaf(pmdval)))
- return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ return follow_page_pte(vma, address, pmd, flags, ctx);
if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
return no_page_table(vma, flags, address);
@@ -988,14 +993,14 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
}
if (unlikely(!pmd_leaf(pmdval))) {
spin_unlock(ptl);
- return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ return follow_page_pte(vma, address, pmd, flags, ctx);
}
if (pmd_trans_huge(pmdval) && (flags & FOLL_SPLIT_PMD)) {
spin_unlock(ptl);
split_huge_pmd(vma, pmd, address);
/* If pmd was left empty, stuff a page table in there quickly */
return pte_alloc(mm, pmd) ? ERR_PTR(-ENOMEM) :
- follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
+ follow_page_pte(vma, address, pmd, flags, ctx);
}
page = follow_huge_pmd(vma, address, pmd, flags, ctx);
spin_unlock(ptl);
--
2.20.1
^ permalink raw reply [flat|nested] 6+ messages in thread

* [PATCH RESEND v2 2/4] iommu/riscv: Use pte_t to represent page table entry
2025-03-18 3:59 [PATCH RESEND v2 0/4] riscv: iommu: Support Svnapot Xu Lu
2025-03-18 3:59 ` [PATCH RESEND v2 1/4] mm/gup: Add huge pte handling logic in follow_page_pte() Xu Lu
@ 2025-03-18 3:59 ` Xu Lu
2025-03-18 3:59 ` [PATCH RESEND v2 3/4] iommu/riscv: Introduce IOMMU page table lock Xu Lu
2025-03-18 3:59 ` [PATCH RESEND v2 4/4] iommu/riscv: Add support for Svnapot Xu Lu
3 siblings, 0 replies; 6+ messages in thread
From: Xu Lu @ 2025-03-18 3:59 UTC (permalink / raw)
To: akpm, jhubbard, kirill.shutemov, tjeznach, joro, will, robin.murphy
Cc: lihangjing, xieyongji, linux-riscv, linux-kernel, linux-mm, Xu Lu
Since the RISC-V IOMMU has the same pte format and translation process
as the MMU, as specified in the RISC-V Privileged specification, use
pte_t to represent IOMMU ptes as well so that the existing pte helper
functions can be reused.
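
The conversion is mostly mechanical: each open-coded access to the
IOMMU PTE word is replaced with the corresponding pte_t accessor. A
condensed sketch of the pattern (not a literal hunk from the patch):

/* Before: the IOMMU PTE is a bare unsigned long.
 *     pte = READ_ONCE(*ptr);
 *     pfn = __page_val_to_pfn(pte);
 *     cmpxchg_relaxed(ptr, pte, 0);
 * After: pte_t plus the generic accessors already used by MMU code. */
pte_t pte = ptep_get(ptr);           /* replaces READ_ONCE(*ptr)         */
unsigned long pfn = pte_pfn(pte);    /* replaces __page_val_to_pfn(pte)  */
set_pte(ptr, __pte(0));              /* replaces the cmpxchg-based clear */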
Signed-off-by: Xu Lu <luxu.kernel@bytedance.com>
---
drivers/iommu/riscv/iommu.c | 79 ++++++++++++++++++-------------------
1 file changed, 39 insertions(+), 40 deletions(-)
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 8f049d4a0e2cb..3b0c934decd08 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -812,7 +812,7 @@ struct riscv_iommu_domain {
bool amo_enabled;
int numa_node;
unsigned int pgd_mode;
- unsigned long *pgd_root;
+ pte_t *pgd_root;
};
#define iommu_domain_to_riscv(iommu_domain) \
@@ -1081,27 +1081,29 @@ static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
#define PT_SHIFT (PAGE_SHIFT - ilog2(sizeof(pte_t)))
-#define _io_pte_present(pte) ((pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
-#define _io_pte_leaf(pte) ((pte) & _PAGE_LEAF)
-#define _io_pte_none(pte) ((pte) == 0)
-#define _io_pte_entry(pn, prot) ((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot))
+#define _io_pte_present(pte) (pte_val(pte) & (_PAGE_PRESENT | _PAGE_PROT_NONE))
+#define _io_pte_leaf(pte) (pte_val(pte) & _PAGE_LEAF)
+#define _io_pte_none(pte) (pte_val(pte) == 0)
+#define _io_pte_entry(pn, prot) (__pte((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot)))
static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
- unsigned long pte, struct list_head *freelist)
+ pte_t pte, struct list_head *freelist)
{
- unsigned long *ptr;
+ pte_t *ptr;
int i;
if (!_io_pte_present(pte) || _io_pte_leaf(pte))
return;
- ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+ ptr = (pte_t *)pfn_to_virt(pte_pfn(pte));
/* Recursively free all sub page table pages */
for (i = 0; i < PTRS_PER_PTE; i++) {
- pte = READ_ONCE(ptr[i]);
- if (!_io_pte_none(pte) && cmpxchg_relaxed(ptr + i, pte, 0) == pte)
+ pte = ptr[i];
+ if (!_io_pte_none(pte)) {
+ ptr[i] = __pte(0);
riscv_iommu_pte_free(domain, pte, freelist);
+ }
}
if (freelist)
@@ -1110,12 +1112,12 @@ static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
iommu_free_page(ptr);
}
-static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
- unsigned long iova, size_t pgsize,
- gfp_t gfp)
+static pte_t *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
+ unsigned long iova, size_t pgsize,
+ gfp_t gfp)
{
- unsigned long *ptr = domain->pgd_root;
- unsigned long pte, old;
+ pte_t *ptr = domain->pgd_root;
+ pte_t pte, old;
int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
void *addr;
@@ -1131,7 +1133,7 @@ static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
if (((size_t)1 << shift) == pgsize)
return ptr;
pte_retry:
- pte = READ_ONCE(*ptr);
+ pte = ptep_get(ptr);
/*
* This is very likely incorrect as we should not be adding
* new mapping with smaller granularity on top
@@ -1147,38 +1149,37 @@ static unsigned long *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
addr = iommu_alloc_page_node(domain->numa_node, gfp);
if (!addr)
return NULL;
- old = pte;
- pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
- if (cmpxchg_relaxed(ptr, old, pte) != old) {
- iommu_free_page(addr);
+ old = ptep_get(ptr);
+ if (!_io_pte_none(old))
goto pte_retry;
- }
+ pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
+ set_pte(ptr, pte);
}
- ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+ ptr = (pte_t *)pfn_to_virt(pte_pfn(pte));
} while (level-- > 0);
return NULL;
}
-static unsigned long *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain,
- unsigned long iova, size_t *pte_pgsize)
+static pte_t *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain,
+ unsigned long iova, size_t *pte_pgsize)
{
- unsigned long *ptr = domain->pgd_root;
- unsigned long pte;
+ pte_t *ptr = domain->pgd_root;
+ pte_t pte;
int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
do {
const int shift = PAGE_SHIFT + PT_SHIFT * level;
ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
- pte = READ_ONCE(*ptr);
+ pte = ptep_get(ptr);
if (_io_pte_present(pte) && _io_pte_leaf(pte)) {
*pte_pgsize = (size_t)1 << shift;
return ptr;
}
if (_io_pte_none(pte))
return NULL;
- ptr = (unsigned long *)pfn_to_virt(__page_val_to_pfn(pte));
+ ptr = (pte_t *)pfn_to_virt(pte_pfn(pte));
} while (level-- > 0);
return NULL;
@@ -1191,8 +1192,9 @@ static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
{
struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
size_t size = 0;
- unsigned long *ptr;
- unsigned long pte, old, pte_prot;
+ pte_t *ptr;
+ pte_t pte, old;
+ unsigned long pte_prot;
int rc = 0;
LIST_HEAD(freelist);
@@ -1210,10 +1212,9 @@ static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
break;
}
- old = READ_ONCE(*ptr);
+ old = ptep_get(ptr);
pte = _io_pte_entry(phys_to_pfn(phys), pte_prot);
- if (cmpxchg_relaxed(ptr, old, pte) != old)
- continue;
+ set_pte(ptr, pte);
riscv_iommu_pte_free(domain, old, &freelist);
@@ -1247,7 +1248,7 @@ static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
{
struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
size_t size = pgcount << __ffs(pgsize);
- unsigned long *ptr, old;
+ pte_t *ptr;
size_t unmapped = 0;
size_t pte_size;
@@ -1260,9 +1261,7 @@ static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
if (iova & (pte_size - 1))
return unmapped;
- old = READ_ONCE(*ptr);
- if (cmpxchg_relaxed(ptr, old, 0) != old)
- continue;
+ set_pte(ptr, __pte(0));
iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
pte_size);
@@ -1279,13 +1278,13 @@ static phys_addr_t riscv_iommu_iova_to_phys(struct iommu_domain *iommu_domain,
{
struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
size_t pte_size;
- unsigned long *ptr;
+ pte_t *ptr;
ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
- if (_io_pte_none(*ptr) || !_io_pte_present(*ptr))
+ if (_io_pte_none(ptep_get(ptr)) || !_io_pte_present(ptep_get(ptr)))
return 0;
- return pfn_to_phys(__page_val_to_pfn(*ptr)) | (iova & (pte_size - 1));
+ return pfn_to_phys(pte_pfn(ptep_get(ptr))) | (iova & (pte_size - 1));
}
static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
--
2.20.1
^ permalink raw reply [flat|nested] 6+ messages in thread

* [PATCH RESEND v2 3/4] iommu/riscv: Introduce IOMMU page table lock
2025-03-18 3:59 [PATCH RESEND v2 0/4] riscv: iommu: Support Svnapot Xu Lu
2025-03-18 3:59 ` [PATCH RESEND v2 1/4] mm/gup: Add huge pte handling logic in follow_page_pte() Xu Lu
2025-03-18 3:59 ` [PATCH RESEND v2 2/4] iommu/riscv: Use pte_t to represent page table entry Xu Lu
@ 2025-03-18 3:59 ` Xu Lu
2025-04-01 15:19 ` Jason Gunthorpe
2025-03-18 3:59 ` [PATCH RESEND v2 4/4] iommu/riscv: Add support for Svnapot Xu Lu
3 siblings, 1 reply; 6+ messages in thread
From: Xu Lu @ 2025-03-18 3:59 UTC (permalink / raw)
To: akpm, jhubbard, kirill.shutemov, tjeznach, joro, will, robin.murphy
Cc: lihangjing, xieyongji, linux-riscv, linux-kernel, linux-mm, Xu Lu
Introduce a page table lock to address races when multiple PTEs are
modified together, for example when applying Svnapot. Fine-grained page
table locks are used to minimize lock contention.
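
The scheme sketched below mirrors riscv_iommu_ptlock() in the patch:
PTE/PMD level page-table pages use the split per-page ptlock embedded
in their struct ptdesc, while higher levels fall back to a single
per-domain spinlock (a sketch only, using the same #ifdef as the
patch):

	spinlock_t *ptl;

#ifdef CONFIG_SPLIT_PTE_PTLOCKS
	if (level <= RISCV_IOMMU_PMD_LEVEL)
		ptl = ptlock_ptr(page_ptdesc(virt_to_page(pte)));  /* per PT page */
	else
#endif
		ptl = &domain->page_table_lock;        /* coarse per-domain fallback */
	spin_lock(ptl);
	/* ... read-modify-write one or more (napot-contiguous) PTEs ... */
	spin_unlock(ptl);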
Signed-off-by: Xu Lu <luxu.kernel@bytedance.com>
---
drivers/iommu/riscv/iommu.c | 123 +++++++++++++++++++++++++++++++-----
1 file changed, 107 insertions(+), 16 deletions(-)
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index 3b0c934decd08..ce4cf6569ffb4 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -808,6 +808,7 @@ struct riscv_iommu_domain {
struct iommu_domain domain;
struct list_head bonds;
spinlock_t lock; /* protect bonds list updates. */
+ spinlock_t page_table_lock; /* protect page table updates. */
int pscid;
bool amo_enabled;
int numa_node;
@@ -1086,8 +1087,80 @@ static void riscv_iommu_iotlb_sync(struct iommu_domain *iommu_domain,
#define _io_pte_none(pte) (pte_val(pte) == 0)
#define _io_pte_entry(pn, prot) (__pte((_PAGE_PFN_MASK & ((pn) << _PAGE_PFN_SHIFT)) | (prot)))
+#define RISCV_IOMMU_PMD_LEVEL 1
+
+static bool riscv_iommu_ptlock_init(struct ptdesc *ptdesc, int level)
+{
+ if (level <= RISCV_IOMMU_PMD_LEVEL)
+ return ptlock_init(ptdesc);
+ return true;
+}
+
+static void riscv_iommu_ptlock_free(struct ptdesc *ptdesc, int level)
+{
+ if (level <= RISCV_IOMMU_PMD_LEVEL)
+ ptlock_free(ptdesc);
+}
+
+static spinlock_t *riscv_iommu_ptlock(struct riscv_iommu_domain *domain,
+ pte_t *pte, int level)
+{
+ spinlock_t *ptl; /* page table page lock */
+
+#ifdef CONFIG_SPLIT_PTE_PTLOCKS
+ if (level <= RISCV_IOMMU_PMD_LEVEL)
+ ptl = ptlock_ptr(page_ptdesc(virt_to_page(pte)));
+ else
+#endif
+ ptl = &domain->page_table_lock;
+ spin_lock(ptl);
+
+ return ptl;
+}
+
+static void *riscv_iommu_alloc_pagetable_node(int numa_node, gfp_t gfp, int level)
+{
+ struct ptdesc *ptdesc;
+ void *addr;
+
+ addr = iommu_alloc_page_node(numa_node, gfp);
+ if (!addr)
+ return NULL;
+
+ ptdesc = page_ptdesc(virt_to_page(addr));
+ if (!riscv_iommu_ptlock_init(ptdesc, level)) {
+ iommu_free_page(addr);
+ addr = NULL;
+ }
+
+ return addr;
+}
+
+static void riscv_iommu_free_pagetable(void *addr, int level)
+{
+ struct ptdesc *ptdesc = page_ptdesc(virt_to_page(addr));
+
+ riscv_iommu_ptlock_free(ptdesc, level);
+ iommu_free_page(addr);
+}
+
+static int pgsize_to_level(size_t pgsize)
+{
+ int level = RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV57 -
+ RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
+ int shift = PAGE_SHIFT + PT_SHIFT * level;
+
+ while (pgsize < ((size_t)1 << shift)) {
+ shift -= PT_SHIFT;
+ level--;
+ }
+
+ return level;
+}
+
static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
- pte_t pte, struct list_head *freelist)
+ pte_t pte, int level,
+ struct list_head *freelist)
{
pte_t *ptr;
int i;
@@ -1102,10 +1175,11 @@ static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
pte = ptr[i];
if (!_io_pte_none(pte)) {
ptr[i] = __pte(0);
- riscv_iommu_pte_free(domain, pte, freelist);
+ riscv_iommu_pte_free(domain, pte, level - 1, freelist);
}
}
+ riscv_iommu_ptlock_free(page_ptdesc(virt_to_page(ptr)), level);
if (freelist)
list_add_tail(&virt_to_page(ptr)->lru, freelist);
else
@@ -1117,8 +1191,9 @@ static pte_t *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
gfp_t gfp)
{
pte_t *ptr = domain->pgd_root;
- pte_t pte, old;
+ pte_t pte;
int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
+ spinlock_t *ptl; /* page table page lock */
void *addr;
do {
@@ -1146,14 +1221,21 @@ static pte_t *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
* page table. This might race with other mappings, retry.
*/
if (_io_pte_none(pte)) {
- addr = iommu_alloc_page_node(domain->numa_node, gfp);
+ addr = riscv_iommu_alloc_pagetable_node(domain->numa_node, gfp,
+ level - 1);
if (!addr)
return NULL;
- old = ptep_get(ptr);
- if (!_io_pte_none(old))
+
+ ptl = riscv_iommu_ptlock(domain, ptr, level);
+ pte = ptep_get(ptr);
+ if (!_io_pte_none(pte)) {
+ spin_unlock(ptl);
+ riscv_iommu_free_pagetable(addr, level - 1);
goto pte_retry;
+ }
pte = _io_pte_entry(virt_to_pfn(addr), _PAGE_TABLE);
set_pte(ptr, pte);
+ spin_unlock(ptl);
}
ptr = (pte_t *)pfn_to_virt(pte_pfn(pte));
} while (level-- > 0);
@@ -1193,9 +1275,10 @@ static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
size_t size = 0;
pte_t *ptr;
- pte_t pte, old;
+ pte_t pte;
unsigned long pte_prot;
- int rc = 0;
+ int rc = 0, level;
+ spinlock_t *ptl; /* page table page lock */
LIST_HEAD(freelist);
if (!(prot & IOMMU_WRITE))
@@ -1212,11 +1295,12 @@ static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
break;
}
- old = ptep_get(ptr);
+ level = pgsize_to_level(pgsize);
+ ptl = riscv_iommu_ptlock(domain, ptr, level);
+ riscv_iommu_pte_free(domain, ptep_get(ptr), level, &freelist);
pte = _io_pte_entry(phys_to_pfn(phys), pte_prot);
set_pte(ptr, pte);
-
- riscv_iommu_pte_free(domain, old, &freelist);
+ spin_unlock(ptl);
size += pgsize;
iova += pgsize;
@@ -1251,6 +1335,7 @@ static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
pte_t *ptr;
size_t unmapped = 0;
size_t pte_size;
+ spinlock_t *ptl; /* page table page lock */
while (unmapped < size) {
ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
@@ -1261,7 +1346,9 @@ static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
if (iova & (pte_size - 1))
return unmapped;
+ ptl = riscv_iommu_ptlock(domain, ptr, pgsize_to_level(pte_size));
set_pte(ptr, __pte(0));
+ spin_unlock(ptl);
iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
pte_size);
@@ -1291,13 +1378,14 @@ static void riscv_iommu_free_paging_domain(struct iommu_domain *iommu_domain)
{
struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
const unsigned long pfn = virt_to_pfn(domain->pgd_root);
+ int level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
WARN_ON(!list_empty(&domain->bonds));
if ((int)domain->pscid > 0)
ida_free(&riscv_iommu_pscids, domain->pscid);
- riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), NULL);
+ riscv_iommu_pte_free(domain, _io_pte_entry(pfn, _PAGE_TABLE), level, NULL);
kfree(domain);
}
@@ -1358,7 +1446,7 @@ static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
struct riscv_iommu_device *iommu;
unsigned int pgd_mode;
dma_addr_t va_mask;
- int va_bits;
+ int va_bits, level;
iommu = dev_to_iommu(dev);
if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
@@ -1381,11 +1469,14 @@ static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
INIT_LIST_HEAD_RCU(&domain->bonds);
spin_lock_init(&domain->lock);
+ spin_lock_init(&domain->page_table_lock);
domain->numa_node = dev_to_node(iommu->dev);
domain->amo_enabled = !!(iommu->caps & RISCV_IOMMU_CAPABILITIES_AMO_HWAD);
domain->pgd_mode = pgd_mode;
- domain->pgd_root = iommu_alloc_page_node(domain->numa_node,
- GFP_KERNEL_ACCOUNT);
+ level = domain->pgd_mode - RISCV_IOMMU_DC_FSC_IOSATP_MODE_SV39 + 2;
+ domain->pgd_root = riscv_iommu_alloc_pagetable_node(domain->numa_node,
+ GFP_KERNEL_ACCOUNT,
+ level);
if (!domain->pgd_root) {
kfree(domain);
return ERR_PTR(-ENOMEM);
@@ -1394,7 +1485,7 @@ static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
domain->pscid = ida_alloc_range(&riscv_iommu_pscids, 1,
RISCV_IOMMU_MAX_PSCID, GFP_KERNEL);
if (domain->pscid < 0) {
- iommu_free_page(domain->pgd_root);
+ riscv_iommu_free_pagetable(domain->pgd_root, level);
kfree(domain);
return ERR_PTR(-ENOMEM);
}
--
2.20.1
^ permalink raw reply [flat|nested] 6+ messages in thread

* Re: [PATCH RESEND v2 3/4] iommu/riscv: Introduce IOMMU page table lock
2025-03-18 3:59 ` [PATCH RESEND v2 3/4] iommu/riscv: Introduce IOMMU page table lock Xu Lu
@ 2025-04-01 15:19 ` Jason Gunthorpe
0 siblings, 0 replies; 6+ messages in thread
From: Jason Gunthorpe @ 2025-04-01 15:19 UTC (permalink / raw)
To: Xu Lu
Cc: akpm, jhubbard, kirill.shutemov, tjeznach, joro, will,
robin.murphy, lihangjing, xieyongji, linux-riscv, linux-kernel,
linux-mm
On Tue, Mar 18, 2025 at 11:59:29AM +0800, Xu Lu wrote:
> Introduce a page table lock to address races when multiple PTEs are
> modified together, for example when applying Svnapot. Fine-grained page
> table locks are used to minimize lock contention.
This does not seem right, there is no need for locks to manage a cont
bit on the iommu side.
Jason
^ permalink raw reply [flat|nested] 6+ messages in thread
* [PATCH RESEND v2 4/4] iommu/riscv: Add support for Svnapot
2025-03-18 3:59 [PATCH RESEND v2 0/4] riscv: iommu: Support Svnapot Xu Lu
` (2 preceding siblings ...)
2025-03-18 3:59 ` [PATCH RESEND v2 3/4] iommu/riscv: Introduce IOMMU page table lock Xu Lu
@ 2025-03-18 3:59 ` Xu Lu
3 siblings, 0 replies; 6+ messages in thread
From: Xu Lu @ 2025-03-18 3:59 UTC (permalink / raw)
To: akpm, jhubbard, kirill.shutemov, tjeznach, joro, will, robin.murphy
Cc: lihangjing, xieyongji, linux-riscv, linux-kernel, linux-mm, Xu Lu
Add the Svnapot sizes as supported page sizes and apply Svnapot
whenever possible.
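
For reference, one napot order translates to mapping size and PTE count
as below (a sketch assuming 4K base pages; the only order defined by
the current RISC-V headers is NAPOT_CONT64KB_ORDER == 4):

/* Illustrative only: what a napot order means for an IOMMU mapping. */
unsigned int order    = 4;                    /* NAPOT_CONT64KB_ORDER          */
size_t cont_size      = 1UL << (order + 12);  /* napot_cont_size(order): 64K   */
unsigned long pte_num = 1UL << order;         /* napot_pte_num(order): 16 PTEs */
/* 64K is added to the domain's pgsize_bitmap, and map/unmap write all
 * 16 PTEs of a napot group with the same napot-encoded value. */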
Signed-off-by: Xu Lu <luxu.kernel@bytedance.com>
---
drivers/iommu/riscv/iommu.c | 86 +++++++++++++++++++++++++++++++++----
1 file changed, 77 insertions(+), 9 deletions(-)
diff --git a/drivers/iommu/riscv/iommu.c b/drivers/iommu/riscv/iommu.c
index ce4cf6569ffb4..7cc736abd2a61 100644
--- a/drivers/iommu/riscv/iommu.c
+++ b/drivers/iommu/riscv/iommu.c
@@ -1158,6 +1158,26 @@ static int pgsize_to_level(size_t pgsize)
return level;
}
+static unsigned long napot_size_to_order(unsigned long size)
+{
+ unsigned long order;
+
+ if (!has_svnapot())
+ return 0;
+
+ for_each_napot_order(order) {
+ if (size == napot_cont_size(order))
+ return order;
+ }
+
+ return 0;
+}
+
+static bool is_napot_size(unsigned long size)
+{
+ return napot_size_to_order(size) != 0;
+}
+
static void riscv_iommu_pte_free(struct riscv_iommu_domain *domain,
pte_t pte, int level,
struct list_head *freelist)
@@ -1205,7 +1225,8 @@ static pte_t *riscv_iommu_pte_alloc(struct riscv_iommu_domain *domain,
* existing mapping with smaller granularity. Up to the caller
* to replace and invalidate.
*/
- if (((size_t)1 << shift) == pgsize)
+ if ((((size_t)1 << shift) == pgsize) ||
+ (is_napot_size(pgsize) && pgsize_to_level(pgsize) == level))
return ptr;
pte_retry:
pte = ptep_get(ptr);
@@ -1256,7 +1277,10 @@ static pte_t *riscv_iommu_pte_fetch(struct riscv_iommu_domain *domain,
ptr += ((iova >> shift) & (PTRS_PER_PTE - 1));
pte = ptep_get(ptr);
if (_io_pte_present(pte) && _io_pte_leaf(pte)) {
- *pte_pgsize = (size_t)1 << shift;
+ if (pte_napot(pte))
+ *pte_pgsize = napot_cont_size(napot_cont_order(pte));
+ else
+ *pte_pgsize = (size_t)1 << shift;
return ptr;
}
if (_io_pte_none(pte))
@@ -1274,13 +1298,18 @@ static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
{
struct riscv_iommu_domain *domain = iommu_domain_to_riscv(iommu_domain);
size_t size = 0;
- pte_t *ptr;
- pte_t pte;
- unsigned long pte_prot;
- int rc = 0, level;
+ pte_t *ptr, old, pte;
+ unsigned long pte_prot, order = 0;
+ int rc = 0, level, i;
spinlock_t *ptl; /* page table page lock */
LIST_HEAD(freelist);
+ if (iova & (pgsize - 1))
+ return -EINVAL;
+
+ if (is_napot_size(pgsize))
+ order = napot_size_to_order(pgsize);
+
if (!(prot & IOMMU_WRITE))
pte_prot = _PAGE_BASE | _PAGE_READ;
else if (domain->amo_enabled)
@@ -1297,9 +1326,27 @@ static int riscv_iommu_map_pages(struct iommu_domain *iommu_domain,
level = pgsize_to_level(pgsize);
ptl = riscv_iommu_ptlock(domain, ptr, level);
- riscv_iommu_pte_free(domain, ptep_get(ptr), level, &freelist);
+
+ old = ptep_get(ptr);
+ if (pte_napot(old) && napot_cont_size(napot_cont_order(old)) > pgsize) {
+ spin_unlock(ptl);
+ rc = -EFAULT;
+ break;
+ }
+
pte = _io_pte_entry(phys_to_pfn(phys), pte_prot);
- set_pte(ptr, pte);
+ if (order) {
+ pte = pte_mknapot(pte, order);
+ for (i = 0; i < napot_pte_num(order); i++, ptr++) {
+ old = ptep_get(ptr);
+ riscv_iommu_pte_free(domain, old, level, &freelist);
+ set_pte(ptr, pte);
+ }
+ } else {
+ riscv_iommu_pte_free(domain, old, level, &freelist);
+ set_pte(ptr, pte);
+ }
+
spin_unlock(ptl);
size += pgsize;
@@ -1336,6 +1383,9 @@ static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
size_t unmapped = 0;
size_t pte_size;
spinlock_t *ptl; /* page table page lock */
+ unsigned long pte_num;
+ pte_t pte;
+ int i;
while (unmapped < size) {
ptr = riscv_iommu_pte_fetch(domain, iova, &pte_size);
@@ -1347,7 +1397,21 @@ static size_t riscv_iommu_unmap_pages(struct iommu_domain *iommu_domain,
return unmapped;
ptl = riscv_iommu_ptlock(domain, ptr, pgsize_to_level(pte_size));
- set_pte(ptr, __pte(0));
+ if (is_napot_size(pte_size)) {
+ pte = ptep_get(ptr);
+
+ if (!pte_napot(pte) ||
+ napot_cont_size(napot_cont_order(pte)) != pte_size) {
+ spin_unlock(ptl);
+ return unmapped;
+ }
+
+ pte_num = napot_pte_num(napot_cont_order(pte));
+ for (i = 0; i < pte_num; i++, ptr++)
+ set_pte(ptr, __pte(0));
+ } else {
+ set_pte(ptr, __pte(0));
+ }
spin_unlock(ptl);
iommu_iotlb_gather_add_page(&domain->domain, gather, iova,
@@ -1447,6 +1511,7 @@ static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
unsigned int pgd_mode;
dma_addr_t va_mask;
int va_bits, level;
+ size_t order;
iommu = dev_to_iommu(dev);
if (iommu->caps & RISCV_IOMMU_CAPABILITIES_SV57) {
@@ -1506,6 +1571,9 @@ static struct iommu_domain *riscv_iommu_alloc_paging_domain(struct device *dev)
domain->domain.geometry.aperture_end = va_mask;
domain->domain.geometry.force_aperture = true;
domain->domain.pgsize_bitmap = va_mask & (SZ_4K | SZ_2M | SZ_1G | SZ_512G);
+ if (has_svnapot())
+ for_each_napot_order(order)
+ domain->domain.pgsize_bitmap |= napot_cont_size(order) & va_mask;
domain->domain.ops = &riscv_iommu_paging_domain_ops;
--
2.20.1
^ permalink raw reply [flat|nested] 6+ messages in thread