From: Pasha Tatashin <pasha.tatashin@soleen.com>
To: akpm@linux-foundation.org, linux-mm@kvack.org,
pasha.tatashin@soleen.com, linux-kernel@vger.kernel.org,
rientjes@google.com, dwmw2@infradead.org,
baolu.lu@linux.intel.com, joro@8bytes.org, will@kernel.org,
robin.murphy@arm.com, iommu@lists.linux.dev
Subject: [RFC 1/3] iommu/intel: Use page->refcount to count number of entries in IOMMU
Date: Thu, 21 Dec 2023 03:19:13 +0000
Message-ID: <20231221031915.619337-2-pasha.tatashin@soleen.com>
In-Reply-To: <20231221031915.619337-1-pasha.tatashin@soleen.com>
In order to be able to efficiently free empty page table levels, count
the number of entries in each page table by incrementing and
decrementing the refcount every time a PTE is inserted into or removed
from the page table.

For this to work correctly, add two helper functions, dma_clear_pte()
and dma_set_pte(), where the counting is performed. Also, modify the
code so that every page table entry is always updated through these two
new functions.
Signed-off-by: Pasha Tatashin <pasha.tatashin@soleen.com>
---
drivers/iommu/intel/iommu.c | 40 +++++++++++++++++++++---------------
drivers/iommu/intel/iommu.h | 41 +++++++++++++++++++++++++++++++------
2 files changed, 58 insertions(+), 23 deletions(-)
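Reviewer note (below the cut line, not part of the commit message): the
intent is that the two helpers become the only writers of pte->val, so
the refcount cannot drift. A simplified map/unmap pairing, condensed
from the hunks below (context and error handling omitted):

	/* dma_set_pte() publishes the entry and takes a reference on the
	 * page-table page; dma_clear_pte() tears it down and drops it. */
	if (dma_set_pte(pte, pteval)) {
		/* Lost the race: someone else populated this slot. */
		free_pgtable_page(tmp_page);
	}

	if (dma_pte_present(pte))
		dma_clear_pte(pte);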
diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c
index 897159dba47d..4688ef797161 100644
--- a/drivers/iommu/intel/iommu.c
+++ b/drivers/iommu/intel/iommu.c
@@ -949,7 +949,7 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain,
if (domain->use_first_level)
pteval |= DMA_FL_PTE_XD | DMA_FL_PTE_US | DMA_FL_PTE_ACCESS;
- if (cmpxchg64(&pte->val, 0ULL, pteval))
+ if (dma_set_pte(pte, pteval))
/* Someone else set it while we were thinking; use theirs. */
free_pgtable_page(tmp_page);
else
@@ -1021,7 +1021,8 @@ static void dma_pte_clear_range(struct dmar_domain *domain,
continue;
}
do {
- dma_clear_pte(pte);
+ if (dma_pte_present(pte))
+ dma_clear_pte(pte);
start_pfn += lvl_to_nr_pages(large_page);
pte++;
} while (start_pfn <= last_pfn && !first_pte_in_page(pte));
@@ -1062,7 +1063,8 @@ static void dma_pte_free_level(struct dmar_domain *domain, int level,
*/
if (level < retain_level && !(start_pfn > level_pfn ||
last_pfn < level_pfn + level_size(level) - 1)) {
- dma_clear_pte(pte);
+ if (dma_pte_present(pte))
+ dma_clear_pte(pte);
domain_flush_cache(domain, pte, sizeof(*pte));
free_pgtable_page(level_pte);
}
@@ -1093,12 +1095,13 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain,
}
}
-/* When a page at a given level is being unlinked from its parent, we don't
- need to *modify* it at all. All we need to do is make a list of all the
- pages which can be freed just as soon as we've flushed the IOTLB and we
- know the hardware page-walk will no longer touch them.
- The 'pte' argument is the *parent* PTE, pointing to the page that is to
- be freed. */
+/*
+ * A given page at a given level is being unlinked from its parent.
+ * We need to make a list of all the pages which can be freed just as soon as
+ * we've flushed the IOTLB and we know the hardware page-walk will no longer
+ * touch them. The 'pte' argument is the *parent* PTE, pointing to the page
+ * that is to be freed.
+ */
static void dma_pte_list_pagetables(struct dmar_domain *domain,
int level, struct dma_pte *pte,
struct list_head *freelist)
@@ -1106,17 +1109,20 @@ static void dma_pte_list_pagetables(struct dmar_domain *domain,
struct page *pg;
pg = pfn_to_page(dma_pte_addr(pte) >> PAGE_SHIFT);
- list_add_tail(&pg->lru, freelist);
-
- if (level == 1)
- return;
-
pte = page_address(pg);
+
do {
- if (dma_pte_present(pte) && !dma_pte_superpage(pte))
- dma_pte_list_pagetables(domain, level - 1, pte, freelist);
+ if (dma_pte_present(pte)) {
+ if (level > 1 && !dma_pte_superpage(pte)) {
+ dma_pte_list_pagetables(domain, level - 1, pte,
+ freelist);
+ }
+ dma_clear_pte(pte);
+ }
pte++;
} while (!first_pte_in_page(pte));
+
+ list_add_tail(&pg->lru, freelist);
}
static void dma_pte_clear_level(struct dmar_domain *domain, int level,
@@ -2244,7 +2250,7 @@ __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn,
/* We don't need lock here, nobody else
* touches the iova range
*/
- tmp = cmpxchg64_local(&pte->val, 0ULL, pteval);
+ tmp = dma_set_pte(pte, pteval);
if (tmp) {
static int dumps = 5;
pr_crit("ERROR: DMA PTE for vPFN 0x%lx already set (to %llx not %llx)\n",
diff --git a/drivers/iommu/intel/iommu.h b/drivers/iommu/intel/iommu.h
index ce030c5b5772..f1ea508f45bd 100644
--- a/drivers/iommu/intel/iommu.h
+++ b/drivers/iommu/intel/iommu.h
@@ -802,11 +802,6 @@ struct dma_pte {
u64 val;
};
-static inline void dma_clear_pte(struct dma_pte *pte)
-{
- pte->val = 0;
-}
-
static inline u64 dma_pte_addr(struct dma_pte *pte)
{
#ifdef CONFIG_64BIT
@@ -818,9 +813,43 @@ static inline u64 dma_pte_addr(struct dma_pte *pte)
#endif
}
+#define DMA_PTEVAL_PRESENT(pteval) (((pteval) & 3) != 0)
static inline bool dma_pte_present(struct dma_pte *pte)
{
- return (pte->val & 3) != 0;
+ return DMA_PTEVAL_PRESENT(pte->val);
+}
+
+static inline void dma_clear_pte(struct dma_pte *pte)
+{
+ u64 old_pteval;
+
+ old_pteval = xchg(&pte->val, 0ULL);
+ if (DMA_PTEVAL_PRESENT(old_pteval)) {
+ struct page *pg = virt_to_page(pte);
+ int rc = page_ref_dec_return(pg);
+
+ WARN_ON_ONCE(rc > 512 || rc < 1);
+ } else {
+ /* Ensure that we cleared a valid entry from the page table */
+ WARN_ON(1);
+ }
+}
+
+static inline u64 dma_set_pte(struct dma_pte *pte, u64 pteval)
+{
+ u64 old_pteval;
+
+	/* Ensure we are about to set a valid entry in the page table */
+ WARN_ON(!DMA_PTEVAL_PRESENT(pteval));
+ old_pteval = cmpxchg64(&pte->val, 0ULL, pteval);
+ if (old_pteval == 0) {
+ struct page *pg = virt_to_page(pte);
+ int rc = page_ref_inc_return(pg);
+
+ WARN_ON_ONCE(rc > 513 || rc < 2);
+ }
+
+ return old_pteval;
}
static inline bool dma_sl_pte_test_and_clear_dirty(struct dma_pte *pte,
--
2.43.0.472.g3155946c3a-goog