From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
To: Theodore Ts'o <tytso@mit.edu>,
Andreas Dilger <adilger.kernel@dilger.ca>,
Jan Kara <jack@suse.com>,
Andrew Morton <akpm@linux-foundation.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>,
Hugh Dickins <hughd@google.com>,
Andrea Arcangeli <aarcange@redhat.com>,
Dave Hansen <dave.hansen@intel.com>,
Vlastimil Babka <vbabka@suse.cz>,
Matthew Wilcox <willy@infradead.org>,
Ross Zwisler <ross.zwisler@linux.intel.com>,
linux-ext4@vger.kernel.org, linux-fsdevel@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-mm@kvack.org,
linux-block@vger.kernel.org,
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
Subject: [PATCHv3 14/41] filemap: allocate huge page in page_cache_read(), if allowed
Date: Thu, 15 Sep 2016 14:54:56 +0300
Message-ID: <20160915115523.29737-15-kirill.shutemov@linux.intel.com>
In-Reply-To: <20160915115523.29737-1-kirill.shutemov@linux.intel.com>
This patch adds the basic functionality for putting a huge page into the
page cache.

At the moment we only add a huge page to the radix tree if the range
covered by it is empty.

We ignore shadow entries for now, and just remove them from the tree
before inserting the huge page.

Later we can add logic to accumulate information from the shadow entries
and return it to the caller (average eviction time?).
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
include/linux/fs.h | 5 ++
include/linux/pagemap.h | 21 ++++++-
mm/filemap.c | 148 +++++++++++++++++++++++++++++++++++++++++++-----
3 files changed, 157 insertions(+), 17 deletions(-)
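
Note: the S_HUGE_MODE bits added to fs.h below form a two-bit per-inode
policy field; nothing in this patch sets them yet. A minimal sketch of the
intended use, assuming a filesystem opts in per inode (the helper name and
the call site are hypothetical -- the real ext4 plumbing arrives with the
huge= mount option at the end of the series):

	/* Hypothetical helper, for illustration only -- not in this patch. */
	static inline void set_inode_huge_mode(struct inode *inode,
					       unsigned int mode)
	{
		/* mode is one of S_HUGE_NEVER, S_HUGE_ALWAYS,
		 * S_HUGE_WITHIN_SIZE or S_HUGE_ADVISE */
		WARN_ON_ONCE(mode & ~S_HUGE_MODE);
		inode->i_flags = (inode->i_flags & ~S_HUGE_MODE) | mode;
	}

	/* e.g. from a hypothetical mount-option handler: */
	set_inode_huge_mode(inode, S_HUGE_ALWAYS);

With S_HUGE_ALWAYS, page_cache_allow_huge() only checks that the covered
radix-tree range is free of real pages; S_HUGE_WITHIN_SIZE additionally
requires the whole PMD-sized range to fit within i_size.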
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 901e25d495cc..122024ccc739 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1829,6 +1829,11 @@ struct super_operations {
#else
#define S_DAX 0 /* Make all the DAX code disappear */
#endif
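+/* Two-bit per-inode huge page policy; S_HUGE_MODE masks the field */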
+#define S_HUGE_MODE 0xc000
+#define S_HUGE_NEVER 0x0000
+#define S_HUGE_ALWAYS 0x4000
+#define S_HUGE_WITHIN_SIZE 0x8000
+#define S_HUGE_ADVISE 0xc000
/*
* Note that nosuid etc flags are inode-specific: setting some file-system
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 66a1260b33de..a84f11a672f0 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -191,14 +191,20 @@ static inline int page_cache_add_speculative(struct page *page, int count)
}
#ifdef CONFIG_NUMA
-extern struct page *__page_cache_alloc(gfp_t gfp);
+extern struct page *__page_cache_alloc_order(gfp_t gfp, unsigned int order);
#else
-static inline struct page *__page_cache_alloc(gfp_t gfp)
+static inline struct page *__page_cache_alloc_order(gfp_t gfp,
+ unsigned int order)
{
- return alloc_pages(gfp, 0);
+ return alloc_pages(gfp, order);
}
#endif
+static inline struct page *__page_cache_alloc(gfp_t gfp)
+{
+ return __page_cache_alloc_order(gfp, 0);
+}
+
static inline struct page *page_cache_alloc(struct address_space *x)
{
return __page_cache_alloc(mapping_gfp_mask(x));
@@ -215,6 +221,15 @@ static inline gfp_t readahead_gfp_mask(struct address_space *x)
__GFP_COLD | __GFP_NORETRY | __GFP_NOWARN;
}
+extern bool __page_cache_allow_huge(struct address_space *x, pgoff_t offset);
+static inline bool page_cache_allow_huge(struct address_space *x,
+ pgoff_t offset)
+{
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
+ return false;
+ return __page_cache_allow_huge(x, offset);
+}
+
typedef int filler_t(void *, struct page *);
pgoff_t page_cache_next_hole(struct address_space *mapping,
diff --git a/mm/filemap.c b/mm/filemap.c
index 6f7f45f47d68..50afe17230e7 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -637,14 +637,14 @@ static int __add_to_page_cache_locked(struct page *page,
pgoff_t offset, gfp_t gfp_mask,
void **shadowp)
{
- int huge = PageHuge(page);
+ int hugetlb = PageHuge(page);
struct mem_cgroup *memcg;
int error;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
- if (!huge) {
+ if (!hugetlb) {
error = mem_cgroup_try_charge(page, current->mm,
gfp_mask, &memcg, false);
if (error)
@@ -653,7 +653,7 @@ static int __add_to_page_cache_locked(struct page *page,
error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
if (error) {
- if (!huge)
+ if (!hugetlb)
mem_cgroup_cancel_charge(page, memcg, false);
return error;
}
@@ -663,16 +663,55 @@ static int __add_to_page_cache_locked(struct page *page,
page->index = offset;
spin_lock_irq(&mapping->tree_lock);
- error = page_cache_tree_insert(mapping, page, shadowp);
+ if (PageTransHuge(page)) {
+ struct radix_tree_iter iter;
+ void **slot;
+ void *p;
+
+ error = 0;
+
+		/* Wipe shadow entries */
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, offset) {
+ if (iter.index >= offset + HPAGE_PMD_NR)
+ break;
+
+ p = radix_tree_deref_slot_protected(slot,
+ &mapping->tree_lock);
+ if (!p)
+ continue;
+
+ if (!radix_tree_exception(p)) {
+ error = -EEXIST;
+ break;
+ }
+
+ mapping->nrexceptional--;
+ rcu_assign_pointer(*slot, NULL);
+ }
+
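+		/* Insert one multi-order entry covering all HPAGE_PMD_NR slots */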
+ if (!error)
+ error = __radix_tree_insert(&mapping->page_tree, offset,
+ compound_order(page), page);
+
+ if (!error) {
+ count_vm_event(THP_FILE_ALLOC);
+ mapping->nrpages += HPAGE_PMD_NR;
+ *shadowp = NULL;
+ __inc_node_page_state(page, NR_FILE_THPS);
+ }
+ } else {
+ error = page_cache_tree_insert(mapping, page, shadowp);
+ }
radix_tree_preload_end();
if (unlikely(error))
goto err_insert;
/* hugetlb pages do not participate in page cache accounting. */
- if (!huge)
- __inc_node_page_state(page, NR_FILE_PAGES);
+ if (!hugetlb)
+ __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES,
+ hpage_nr_pages(page));
spin_unlock_irq(&mapping->tree_lock);
- if (!huge)
+ if (!hugetlb)
mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
return 0;
@@ -680,7 +719,7 @@ err_insert:
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
spin_unlock_irq(&mapping->tree_lock);
- if (!huge)
+ if (!hugetlb)
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
return error;
@@ -737,7 +776,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping,
EXPORT_SYMBOL_GPL(add_to_page_cache_lru);
#ifdef CONFIG_NUMA
-struct page *__page_cache_alloc(gfp_t gfp)
+struct page *__page_cache_alloc_order(gfp_t gfp, unsigned int order)
{
int n;
struct page *page;
@@ -747,14 +786,14 @@ struct page *__page_cache_alloc(gfp_t gfp)
do {
cpuset_mems_cookie = read_mems_allowed_begin();
n = cpuset_mem_spread_node();
- page = __alloc_pages_node(n, gfp, 0);
+ page = __alloc_pages_node(n, gfp, order);
} while (!page && read_mems_allowed_retry(cpuset_mems_cookie));
return page;
}
- return alloc_pages(gfp, 0);
+ return alloc_pages(gfp, order);
}
-EXPORT_SYMBOL(__page_cache_alloc);
+EXPORT_SYMBOL(__page_cache_alloc_order);
#endif
/*
@@ -1149,6 +1188,69 @@ repeat:
}
EXPORT_SYMBOL(find_lock_entry);
+bool __page_cache_allow_huge(struct address_space *mapping, pgoff_t offset)
+{
+ struct inode *inode = mapping->host;
+ struct radix_tree_iter iter;
+ void **slot;
+ struct page *page;
+
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ return false;
+
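+	/* Index of the first subpage of the huge page that would cover @offset */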
+ offset = round_down(offset, HPAGE_PMD_NR);
+
+ switch (inode->i_flags & S_HUGE_MODE) {
+ case S_HUGE_NEVER:
+ return false;
+ case S_HUGE_ALWAYS:
+ break;
+ case S_HUGE_WITHIN_SIZE:
+ if (DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
+ offset + HPAGE_PMD_NR)
+ return false;
+ break;
+ case S_HUGE_ADVISE:
+ /* TODO */
+ return false;
+ default:
+ WARN_ON_ONCE(1);
+ return false;
+ }
+
+ rcu_read_lock();
+ radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, offset) {
+ if (iter.index >= offset + HPAGE_PMD_NR)
+ break;
+
+		/* Shadow entries are fine */
+ page = radix_tree_deref_slot(slot);
+ if (page && !radix_tree_exception(page)) {
+ rcu_read_unlock();
+ return false;
+ }
+ }
+ rcu_read_unlock();
+
+ return true;
+}
+
+static struct page *page_cache_alloc_huge(struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp_mask)
+{
+ struct page *page;
+
+ if (!page_cache_allow_huge(mapping, offset))
+ return NULL;
+
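+	/*
+	 * Keep the huge allocation opportunistic: __GFP_NORETRY and
+	 * __GFP_NOWARN make failure cheap, __GFP_COMP gives us a
+	 * compound page we can then prep as THP.
+	 */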
+ gfp_mask |= __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN;
+ page = __page_cache_alloc_order(gfp_mask, HPAGE_PMD_ORDER);
+ if (page)
+ prep_transhuge_page(page);
+ return page;
+}
+
/**
* pagecache_get_page - find and get a page reference
* @mapping: the address_space to search
@@ -2022,19 +2124,37 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
{
struct address_space *mapping = file->f_mapping;
struct page *page;
+ pgoff_t hoffset;
int ret;
do {
- page = __page_cache_alloc(gfp_mask|__GFP_COLD);
+ page = page_cache_alloc_huge(mapping, offset, gfp_mask);
+no_huge:
+ if (!page)
+ page = __page_cache_alloc(gfp_mask|__GFP_COLD);
if (!page)
return -ENOMEM;
- ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask & GFP_KERNEL);
+ if (PageTransHuge(page))
+ hoffset = round_down(offset, HPAGE_PMD_NR);
+ else
+ hoffset = offset;
+
+ ret = add_to_page_cache_lru(page, mapping, hoffset,
+ gfp_mask & GFP_KERNEL);
if (ret == 0)
ret = mapping->a_ops->readpage(file, page);
else if (ret == -EEXIST)
ret = 0; /* losing race to add is OK */
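+	/*
+	 * The huge page didn't work out (e.g. ->readpage() failed):
+	 * drop it and retry the same offset with an order-0 page.
+	 */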
+ if (ret && PageTransHuge(page)) {
+ delete_from_page_cache(page);
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ goto no_huge;
+ }
+
put_page(page);
} while (ret == AOP_TRUNCATED_PAGE);
--
2.9.3
Thread overview: 75+ messages
2016-09-15 11:54 [PATCHv3 00/41] ext4: support of huge pages Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 01/41] tools: Add WARN_ON_ONCE Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 02/41] radix tree test suite: Allow GFP_ATOMIC allocations to fail Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 03/41] radix-tree: Add radix_tree_join Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 04/41] radix-tree: Add radix_tree_split Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 05/41] radix-tree: Add radix_tree_split_preload() Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 06/41] radix-tree: Handle multiorder entries being deleted by replace_clear_tags Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 07/41] mm, shmem: swich huge tmpfs to multi-order radix-tree entries Kirill A. Shutemov
2016-09-16 12:07 ` Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 08/41] Revert "radix-tree: implement radix_tree_maybe_preload_order()" Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 09/41] page-flags: relax page flag policy for few flags Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 10/41] mm, rmap: account file thp pages Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 11/41] thp: try to free page's buffers before attempt split Kirill A. Shutemov
2016-10-11 15:40 ` Jan Kara
2016-10-11 21:43 ` Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 12/41] thp: handle write-protection faults for file THP Kirill A. Shutemov
2016-10-11 15:47 ` Jan Kara
2016-10-11 21:47 ` Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 13/41] truncate: make sure invalidate_mapping_pages() can discard huge pages Kirill A. Shutemov
2016-10-11 15:58 ` Jan Kara
2016-10-11 21:53 ` Kirill A. Shutemov
2016-10-12 6:43 ` Jan Kara
2016-10-24 10:41 ` Kirill A. Shutemov
2016-09-15 11:54 ` Kirill A. Shutemov [this message]
2016-10-11 16:15 ` [PATCHv3 14/41] filemap: allocate huge page in page_cache_read(), if allowed Jan Kara
2016-10-11 21:57 ` Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 15/41] filemap: handle huge pages in do_generic_file_read() Kirill A. Shutemov
2016-10-13 9:33 ` Jan Kara
2016-10-31 18:10 ` Kirill A. Shutemov
2016-11-01 16:39 ` Jan Kara
2016-11-02 8:32 ` Kirill A. Shutemov
2016-11-02 14:37 ` Christoph Hellwig
2016-11-03 20:40 ` Jan Kara
2016-11-07 11:07 ` Kirill A. Shutemov
2016-11-07 14:59 ` Christoph Hellwig
2016-11-02 14:36 ` Christoph Hellwig
2016-11-03 17:56 ` Jan Kara
2016-11-07 11:13 ` Kirill A. Shutemov
2016-11-07 15:01 ` Christoph Hellwig
2016-11-07 16:03 ` Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 16/41] filemap: allocate huge page in pagecache_get_page(), if allowed Kirill A. Shutemov
2016-09-15 11:54 ` [PATCHv3 17/41] filemap: handle huge pages in filemap_fdatawait_range() Kirill A. Shutemov
2016-10-13 9:44 ` Jan Kara
2016-10-13 12:08 ` Kirill A. Shutemov
2016-10-13 13:18 ` Jan Kara
2016-10-24 11:36 ` Kirill A. Shutemov
2016-10-30 17:31 ` Jan Kara
2016-09-15 11:55 ` [PATCHv3 18/41] HACK: readahead: alloc huge pages, if allowed Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 19/41] block: define BIO_MAX_PAGES to HPAGE_PMD_NR if huge page cache enabled Kirill A. Shutemov
2016-09-16 12:09 ` Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 20/41] mm: make write_cache_pages() work on huge pages Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 21/41] thp: introduce hpage_size() and hpage_mask() Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 22/41] thp: do not threat slab pages as huge in hpage_{nr_pages,size,mask} Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 23/41] fs: make block_read_full_page() be able to read huge page Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 24/41] fs: make block_write_{begin,end}() be able to handle huge pages Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 25/41] fs: make block_page_mkwrite() aware about " Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 26/41] truncate: make truncate_inode_pages_range() " Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 27/41] truncate: make invalidate_inode_pages2_range() " Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 28/41] mm, hugetlb: switch hugetlbfs to multi-order radix-tree entries Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 29/41] ext4: make ext4_mpage_readpages() hugepage-aware Kirill A. Shutemov
2016-09-15 12:27 ` Andreas Dilger
2016-09-16 12:17 ` Kirill A. Shutemov
2016-09-16 12:10 ` Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 30/41] ext4: make ext4_writepage() work on huge pages Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 31/41] ext4: handle huge pages in ext4_page_mkwrite() Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 32/41] ext4: handle huge pages in __ext4_block_zero_page_range() Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 33/41] ext4: make ext4_block_write_begin() aware about huge pages Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 34/41] ext4: handle huge pages in ext4_da_write_end() Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 35/41] ext4: make ext4_da_page_release_reservation() aware about huge pages Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 36/41] ext4: handle writeback with " Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 37/41] ext4: make EXT4_IOC_MOVE_EXT work " Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 38/41] ext4: fix SEEK_DATA/SEEK_HOLE for " Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 39/41] ext4: make fallocate() operations work with " Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 40/41] mm, fs, ext4: expand use of page_mapping() and page_to_pgoff() Kirill A. Shutemov
2016-09-15 11:55 ` [PATCHv3 41/41] ext4, vfs: add huge= mount option Kirill A. Shutemov