From: Liu Bo <bo.li.liu@oracle.com>
To: Josef Bacik <josef@toxicpanda.com>
Cc: hannes@cmpxchg.org, linux-mm@kvack.org,
akpm@linux-foundation.org, jack@suse.cz,
linux-fsdevel@vger.kernel.org, kernel-team@fb.com,
linux-btrfs@vger.kernel.org, Josef Bacik <jbacik@fb.com>
Subject: Re: [PATCH 09/10] Btrfs: kill the btree_inode
Date: Thu, 16 Nov 2017 17:03:08 -0800
Message-ID: <20171117010307.GF23614@dhcp-whq-twvpn-1-vpnpool-10-159-142-193.vpn.oracle.com>
In-Reply-To: <1510696616-8489-9-git-send-email-josef@toxicpanda.com>
On Tue, Nov 14, 2017 at 04:56:55PM -0500, Josef Bacik wrote:
> From: Josef Bacik <jbacik@fb.com>
>
> In order to more efficiently support sub-page blocksizes we need to stop
> allocating pages from pagecache for our metadata. Instead switch to using the
> account_metadata* counters for making sure we are keeping the system aware of
> how much dirty metadata we have, and use the ->free_cached_objects super
> operation in order to handle freeing up extent buffers. This greatly simplifies
> how we deal with extent buffers as now we no longer have to tie the page cache
> reclamation stuff to the extent buffer stuff. This will also allow us to
> simply kmalloc() our data for sub-page blocksizes.
>
The patch is a bit too big to review in one pass, but so far it looks
good to me. A few comments below.
> Signed-off-by: Josef Bacik <jbacik@fb.com>
> ---
...
>
> -static int check_async_write(struct btrfs_inode *bi)
> +static int check_async_write(void)
> {
> - if (atomic_read(&bi->sync_writers))
> + if (current->journal_info)
Please add a comment explaining that a non-NULL current->journal_info
means we're being called from the transaction commit path.
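Something like this, perhaps (wording is only a suggestion):

	/*
	 * current->journal_info is non-NULL only while a transaction
	 * handle is running, i.e. we're in the commit path, where we
	 * want to write synchronously instead of bouncing the write
	 * to the async helpers.
	 */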
> return 0;
> #ifdef CONFIG_X86
> if (static_cpu_has(X86_FEATURE_XMM4_2))
...
> @@ -4977,12 +5054,12 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
> unsigned long len = fs_info->nodesize;
> unsigned long num_pages = num_extent_pages(start, len);
> unsigned long i;
> - unsigned long index = start >> PAGE_SHIFT;
> struct extent_buffer *eb;
> struct extent_buffer *exists = NULL;
> struct page *p;
> - struct address_space *mapping = fs_info->btree_inode->i_mapping;
> - int uptodate = 1;
> + struct btrfs_eb_info *eb_info = fs_info->eb_info;
> +// struct zone *last_zone = NULL;
> +// struct pg_data_t *last_pgdata = NULL;
hmm, leftover debugging code? Also, pg_data_t is already a typedef, so
"struct pg_data_t" won't compile if this gets uncommented, and I don't
see a page_pgdata() helper anywhere (presumably page_pgdat() was meant).
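If this block is meant to come back, I'd expect something more like the
following (untested sketch, assuming page_pgdat() is what was intended):

	struct zone *last_zone = NULL;
	pg_data_t *last_pgdat = NULL;
	...
	if (last_zone != page_zone(p) || last_pgdat != page_pgdat(p))
		set_bit(EXTENT_BUFFER_MIXED_PAGES, &eb->bflags);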
Thanks,
-liubo
> int ret;
>
> if (!IS_ALIGNED(start, fs_info->sectorsize)) {
> @@ -4990,62 +5067,36 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
> return ERR_PTR(-EINVAL);
> }
>
> - eb = find_extent_buffer(fs_info, start);
> + eb = find_extent_buffer(eb_info, start);
> if (eb)
> return eb;
>
> - eb = __alloc_extent_buffer(fs_info, start, len);
> + eb = __alloc_extent_buffer(eb_info, start, len);
> if (!eb)
> return ERR_PTR(-ENOMEM);
>
> - for (i = 0; i < num_pages; i++, index++) {
> - p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
> + for (i = 0; i < num_pages; i++) {
> + p = alloc_page(GFP_NOFS|__GFP_NOFAIL);
> if (!p) {
> exists = ERR_PTR(-ENOMEM);
> goto free_eb;
> }
>
> - spin_lock(&mapping->private_lock);
> - if (PagePrivate(p)) {
> - /*
> - * We could have already allocated an eb for this page
> - * and attached one so lets see if we can get a ref on
> - * the existing eb, and if we can we know it's good and
> - * we can just return that one, else we know we can just
> - * overwrite page->private.
> - */
> - exists = (struct extent_buffer *)p->private;
> - if (atomic_inc_not_zero(&exists->refs)) {
> - spin_unlock(&mapping->private_lock);
> - unlock_page(p);
> - put_page(p);
> - mark_extent_buffer_accessed(exists, p);
> - goto free_eb;
> - }
> - exists = NULL;
> -
> - /*
> - * Do this so attach doesn't complain and we need to
> - * drop the ref the old guy had.
> - */
> - ClearPagePrivate(p);
> - WARN_ON(PageDirty(p));
> - put_page(p);
> - }
> + /*
> + * If our pages span zones or numa nodes we have to do
> + * dirty/writeback accounting per page, otherwise we can do it
> + * in bulk and save us some looping.
> + *
> + if (!last_zone)
> + last_zone = page_zone(p);
> + if (!last_pgdata)
> + last_pgdata = page_pgdata(p);
> + if (last_zone != page_zone(p) || last_pgdata != page_pgdata(p))
> + set_bit(EXTENT_BUFFER_MIXED_PAGES, &eb->bflags);
> + */
> attach_extent_buffer_page(eb, p);
> - spin_unlock(&mapping->private_lock);
> - WARN_ON(PageDirty(p));
> eb->pages[i] = p;
> - if (!PageUptodate(p))
> - uptodate = 0;
> -
> - /*
> - * see below about how we avoid a nasty race with release page
> - * and why we unlock later
> - */
> }
> - if (uptodate)
> - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
> again:
> ret = radix_tree_preload(GFP_NOFS);
> if (ret) {
> @@ -5053,13 +5104,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
> goto free_eb;
> }
>
> - spin_lock(&fs_info->buffer_lock);
> - ret = radix_tree_insert(&fs_info->buffer_radix,
> + spin_lock_irq(&eb_info->buffer_lock);
> + ret = radix_tree_insert(&eb_info->buffer_radix,
> start >> PAGE_SHIFT, eb);
> - spin_unlock(&fs_info->buffer_lock);
> + spin_unlock_irq(&eb_info->buffer_lock);
> radix_tree_preload_end();
> if (ret == -EEXIST) {
> - exists = find_extent_buffer(fs_info, start);
> + exists = find_extent_buffer(eb_info, start);
> if (exists)
> goto free_eb;
> else
> @@ -5069,31 +5120,10 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
> check_buffer_tree_ref(eb);
> set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags);
>
> - /*
> - * there is a race where release page may have
> - * tried to find this extent buffer in the radix
> - * but failed. It will tell the VM it is safe to
> - * reclaim the, and it will clear the page private bit.
> - * We must make sure to set the page private bit properly
> - * after the extent buffer is in the radix tree so
> - * it doesn't get lost
> - */
> - SetPageChecked(eb->pages[0]);
> - for (i = 1; i < num_pages; i++) {
> - p = eb->pages[i];
> - ClearPageChecked(p);
> - unlock_page(p);
> - }
> - unlock_page(eb->pages[0]);
> return eb;
>
> free_eb:
> WARN_ON(!atomic_dec_and_test(&eb->refs));
> - for (i = 0; i < num_pages; i++) {
> - if (eb->pages[i])
> - unlock_page(eb->pages[i]);
> - }
> -
> btrfs_release_extent_buffer(eb);
> return exists;
> }
> @@ -5109,17 +5139,19 @@ static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
> /* Expects to have eb->eb_lock already held */
> static int release_extent_buffer(struct extent_buffer *eb)
> {
> + struct btrfs_eb_info *eb_info = eb->eb_info;
> +
> WARN_ON(atomic_read(&eb->refs) == 0);
> if (atomic_dec_and_test(&eb->refs)) {
> + if (eb_info)
> + list_lru_del(&eb_info->lru_list, &eb->lru);
> if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) {
> - struct btrfs_fs_info *fs_info = eb->fs_info;
> -
> spin_unlock(&eb->refs_lock);
>
> - spin_lock(&fs_info->buffer_lock);
> - radix_tree_delete(&fs_info->buffer_radix,
> - eb->start >> PAGE_SHIFT);
> - spin_unlock(&fs_info->buffer_lock);
> + spin_lock_irq(&eb_info->buffer_lock);
> + radix_tree_delete(&eb_info->buffer_radix,
> + eb_index(eb));
> + spin_unlock_irq(&eb_info->buffer_lock);
> } else {
> spin_unlock(&eb->refs_lock);
> }
> @@ -5134,6 +5166,8 @@ static int release_extent_buffer(struct extent_buffer *eb)
> #endif
> call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
> return 1;
> + } else if (eb_info && atomic_read(&eb->refs) == 1) {
> + list_lru_add(&eb_info->lru_list, &eb->lru);
> }
> spin_unlock(&eb->refs_lock);
>
> @@ -5167,10 +5201,6 @@ void free_extent_buffer(struct extent_buffer *eb)
> test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
> atomic_dec(&eb->refs);
>
> - /*
> - * I know this is terrible, but it's temporary until we stop tracking
> - * the uptodate bits and such for the extent buffers.
> - */
> release_extent_buffer(eb);
> }
>
> @@ -5188,82 +5218,156 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
> release_extent_buffer(eb);
> }
>
> -void clear_extent_buffer_dirty(struct extent_buffer *eb)
> +long btrfs_nr_ebs(struct super_block *sb, struct shrink_control *sc)
> {
> - unsigned long i;
> - unsigned long num_pages;
> - struct page *page;
> + struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> + struct btrfs_eb_info *eb_info = fs_info->eb_info;
>
> - num_pages = num_extent_pages(eb->start, eb->len);
> + return list_lru_shrink_count(&eb_info->lru_list, sc);
> +}
>
> - for (i = 0; i < num_pages; i++) {
> - page = eb->pages[i];
> - if (!PageDirty(page))
> - continue;
> +static enum lru_status eb_lru_isolate(struct list_head *item,
> + struct list_lru_one *lru,
> + spinlock_t *lru_lock, void *arg)
> +{
> + struct list_head *freeable = (struct list_head *)arg;
> + struct extent_buffer *eb = container_of(item, struct extent_buffer,
> + lru);
> + enum lru_status ret;
> + int refs;
>
> - lock_page(page);
> - WARN_ON(!PagePrivate(page));
> + if (!spin_trylock(&eb->refs_lock))
> + return LRU_SKIP;
>
> - clear_page_dirty_for_io(page);
> - spin_lock_irq(&page->mapping->tree_lock);
> - if (!PageDirty(page)) {
> - radix_tree_tag_clear(&page->mapping->page_tree,
> - page_index(page),
> - PAGECACHE_TAG_DIRTY);
> - }
> - spin_unlock_irq(&page->mapping->tree_lock);
> - ClearPageError(page);
> - unlock_page(page);
> + if (extent_buffer_under_io(eb)) {
> + ret = LRU_ROTATE;
> + goto out;
> }
> +
> + refs = atomic_read(&eb->refs);
> + /* We can race with somebody freeing us, just skip if this happens. */
> + if (refs == 0) {
> + ret = LRU_SKIP;
> + goto out;
> + }
> +
> + /* Eb is in use, don't kill it. */
> + if (refs > 1) {
> + ret = LRU_ROTATE;
> + goto out;
> + }
> +
> + /*
> + * If we don't clear the TREE_REF flag then this eb is going to
> + * disappear soon anyway. Otherwise we become responsible for dropping
> + * the last ref on this eb and we know it'll survive until we call
> + * dispose_list.
> + */
> + if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
> + ret = LRU_SKIP;
> + goto out;
> + }
> + list_lru_isolate_move(lru, &eb->lru, freeable);
> + ret = LRU_REMOVED;
> +out:
> + spin_unlock(&eb->refs_lock);
> + return ret;
> +}
> +
> +static void dispose_list(struct list_head *list)
> +{
> + struct extent_buffer *eb;
> +
> + while (!list_empty(list)) {
> + eb = list_first_entry(list, struct extent_buffer, lru);
> +
> + spin_lock(&eb->refs_lock);
> + list_del_init(&eb->lru);
> + spin_unlock(&eb->refs_lock);
> + free_extent_buffer(eb);
> + cond_resched();
> + }
> +}
> +
> +long btrfs_free_ebs(struct super_block *sb, struct shrink_control *sc)
> +{
> + struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> + struct btrfs_eb_info *eb_info = fs_info->eb_info;
> + LIST_HEAD(freeable);
> + long freed;
> +
> + freed = list_lru_shrink_walk(&eb_info->lru_list, sc, eb_lru_isolate,
> + &freeable);
> + dispose_list(&freeable);
> + return freed;
> +}
> +
> +void btrfs_invalidate_eb_info(struct btrfs_eb_info *eb_info)
> +{
> + LIST_HEAD(freeable);
> +
> + /*
> + * We should be able to free all the extent buffers at this point, if we
> + * can't there's a problem and we should complain loudly about it.
> + */
> + do {
> + list_lru_walk(&eb_info->lru_list, eb_lru_isolate, &freeable, LONG_MAX);
> + } while (WARN_ON(list_lru_count(&eb_info->lru_list)));
> + dispose_list(&freeable);
> + synchronize_rcu();
> +}
> +
> +int clear_extent_buffer_dirty(struct extent_buffer *eb)
> +{
> + struct btrfs_eb_info *eb_info = eb->eb_info;
> + struct super_block *sb = eb_info->fs_info->sb;
> + unsigned long num_pages;
> +
> + if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
> + return 0;
> +
> + spin_lock_irq(&eb_info->buffer_lock);
> + radix_tree_tag_clear(&eb_info->buffer_radix, eb_index(eb),
> + PAGECACHE_TAG_DIRTY);
> + spin_unlock_irq(&eb_info->buffer_lock);
> +
> + num_pages = num_extent_pages(eb->start, eb->len);
> + account_metadata_cleaned(eb->pages[0], sb->s_bdi, eb->len);
> WARN_ON(atomic_read(&eb->refs) == 0);
> + return 1;
> }
>
> int set_extent_buffer_dirty(struct extent_buffer *eb)
> {
> - unsigned long i;
> + struct btrfs_eb_info *eb_info = eb->eb_info;
> + struct super_block *sb = eb_info->fs_info->sb;
> unsigned long num_pages;
> int was_dirty = 0;
>
> check_buffer_tree_ref(eb);
>
> - was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
> -
> - num_pages = num_extent_pages(eb->start, eb->len);
> WARN_ON(atomic_read(&eb->refs) == 0);
> WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
> + if (test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags))
> + return 1;
>
> - for (i = 0; i < num_pages; i++)
> - set_page_dirty(eb->pages[i]);
> + num_pages = num_extent_pages(eb->start, eb->len);
> + account_metadata_dirtied(eb->pages[0], sb->s_bdi, eb->len);
> + spin_lock_irq(&eb_info->buffer_lock);
> + radix_tree_tag_set(&eb_info->buffer_radix, eb_index(eb),
> + PAGECACHE_TAG_DIRTY);
> + spin_unlock_irq(&eb_info->buffer_lock);
> return was_dirty;
> }
>
> void clear_extent_buffer_uptodate(struct extent_buffer *eb)
> {
> - unsigned long i;
> - struct page *page;
> - unsigned long num_pages;
> -
> clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
> - num_pages = num_extent_pages(eb->start, eb->len);
> - for (i = 0; i < num_pages; i++) {
> - page = eb->pages[i];
> - if (page)
> - ClearPageUptodate(page);
> - }
> }
>
> void set_extent_buffer_uptodate(struct extent_buffer *eb)
> {
> - unsigned long i;
> - struct page *page;
> - unsigned long num_pages;
> -
> set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
> - num_pages = num_extent_pages(eb->start, eb->len);
> - for (i = 0; i < num_pages; i++) {
> - page = eb->pages[i];
> - SetPageUptodate(page);
> - }
> }
>
> int extent_buffer_uptodate(struct extent_buffer *eb)
> @@ -5271,112 +5375,165 @@ int extent_buffer_uptodate(struct extent_buffer *eb)
> return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
> }
>
> -int read_extent_buffer_pages(struct extent_io_tree *tree,
> - struct extent_buffer *eb, int wait,
> - get_extent_t *get_extent, int mirror_num)
> +static void end_bio_extent_buffer_readpage(struct bio *bio)
> {
> + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
> + struct extent_io_tree *tree = NULL;
> + struct bio_vec *bvec;
> + u64 unlock_start = 0, unlock_len = 0;
> + int mirror_num = io_bio->mirror_num;
> + int uptodate = !bio->bi_status;
> + int i, ret;
> +
> + bio_for_each_segment_all(bvec, bio, i) {
> + struct page *page = bvec->bv_page;
> + struct btrfs_eb_info *eb_info;
> + struct extent_buffer *eb;
> +
> + eb = (struct extent_buffer *)page->private;
> + if (WARN_ON(!eb))
> + continue;
> +
> + eb_info = eb->eb_info;
> + if (!tree)
> + tree = &eb_info->io_tree;
> + if (uptodate) {
> + /*
> + * btree_readpage_end_io_hook doesn't care about
> + * start/end so just pass 0. We'll kill this later.
> + */
> + ret = tree->ops->readpage_end_io_hook(io_bio, 0,
> + page, 0, 0,
> + mirror_num);
> + if (ret) {
> + uptodate = 0;
> + } else {
> + u64 start = eb->start;
> + int c, num_pages;
> +
> + num_pages = num_extent_pages(eb->start,
> + eb->len);
> + for (c = 0; c < num_pages; c++) {
> + if (eb->pages[c] == page)
> + break;
> + start += PAGE_SIZE;
> + }
> + clean_io_failure(eb_info->fs_info,
> + &eb_info->io_failure_tree,
> + tree, start, page, 0, 0);
> + }
> + }
> + /*
> + * We never fix anything in btree_io_failed_hook.
> + *
> + * TODO: rework the io failed hook to not assume we can fix
> + * anything.
> + */
> + if (!uptodate)
> + tree->ops->readpage_io_failed_hook(page, mirror_num);
> +
> + if (unlock_start == 0) {
> + unlock_start = eb->start;
> + unlock_len = PAGE_SIZE;
> + } else {
> + unlock_len += PAGE_SIZE;
> + }
> + }
> +
> + if (unlock_start)
> + unlock_extent(tree, unlock_start,
> + unlock_start + unlock_len - 1);
> + if (io_bio->end_io)
> + io_bio->end_io(io_bio, blk_status_to_errno(bio->bi_status));
> + bio_put(bio);
> +}
> +
> +int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
> + int mirror_num)
> +{
> + struct btrfs_eb_info *eb_info = eb->eb_info;
> + struct extent_io_tree *io_tree = &eb_info->io_tree;
> + struct block_device *bdev = eb_info->fs_info->fs_devices->latest_bdev;
> + struct bio *bio = NULL;
> + u64 offset = eb->start;
> + u64 unlock_start = 0, unlock_len = 0;
> unsigned long i;
> struct page *page;
> int err;
> int ret = 0;
> - int locked_pages = 0;
> - int all_uptodate = 1;
> unsigned long num_pages;
> - unsigned long num_reads = 0;
> - struct bio *bio = NULL;
> - unsigned long bio_flags = 0;
>
> if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
> return 0;
>
> - num_pages = num_extent_pages(eb->start, eb->len);
> - for (i = 0; i < num_pages; i++) {
> - page = eb->pages[i];
> - if (wait == WAIT_NONE) {
> - if (!trylock_page(page))
> - goto unlock_exit;
> - } else {
> - lock_page(page);
> - }
> - locked_pages++;
> - }
> - /*
> - * We need to firstly lock all pages to make sure that
> - * the uptodate bit of our pages won't be affected by
> - * clear_extent_buffer_uptodate().
> - */
> - for (i = 0; i < num_pages; i++) {
> - page = eb->pages[i];
> - if (!PageUptodate(page)) {
> - num_reads++;
> - all_uptodate = 0;
> - }
> - }
> -
> - if (all_uptodate) {
> - set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
> - goto unlock_exit;
> + if (test_and_set_bit(EXTENT_BUFFER_READING, &eb->bflags)) {
> + if (wait != WAIT_COMPLETE)
> + return 0;
> + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
> + TASK_UNINTERRUPTIBLE);
> + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
> + ret = -EIO;
> + return ret;
> }
>
> + lock_extent(io_tree, eb->start, eb->start + eb->len - 1);
> + num_pages = num_extent_pages(eb->start, eb->len);
> clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
> eb->read_mirror = 0;
> - atomic_set(&eb->io_pages, num_reads);
> + atomic_set(&eb->io_pages, num_pages);
> for (i = 0; i < num_pages; i++) {
> page = eb->pages[i];
> -
> - if (!PageUptodate(page)) {
> - if (ret) {
> - atomic_dec(&eb->io_pages);
> - unlock_page(page);
> - continue;
> + if (ret) {
> + unlock_len += PAGE_SIZE;
> + if (atomic_dec_and_test(&eb->io_pages)) {
> + clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
> + smp_mb__after_atomic();
> + wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
> }
> + continue;
> + }
>
> - ClearPageError(page);
> - err = __extent_read_full_page(tree, page,
> - get_extent, &bio,
> - mirror_num, &bio_flags,
> - REQ_META);
> - if (err) {
> - ret = err;
> - /*
> - * We use &bio in above __extent_read_full_page,
> - * so we ensure that if it returns error, the
> - * current page fails to add itself to bio and
> - * it's been unlocked.
> - *
> - * We must dec io_pages by ourselves.
> - */
> - atomic_dec(&eb->io_pages);
> + err = submit_extent_page(REQ_OP_READ | REQ_META, io_tree, NULL,
> + page, offset >> 9, PAGE_SIZE, 0, bdev,
> + &bio, end_bio_extent_buffer_readpage,
> + mirror_num, 0, 0, 0, false);
> + if (err) {
> + ret = err;
> + /*
> + * We use &bio in above submit_extent_page
> + * so we ensure that if it returns error, the
> + * current page fails to add itself to bio and
> + * it's been unlocked.
> + *
> + * We must dec io_pages by ourselves.
> + */
> + if (atomic_dec_and_test(&eb->io_pages)) {
> + clear_bit(EXTENT_BUFFER_READING, &eb->bflags);
> + smp_mb__after_atomic();
> + wake_up_bit(&eb->bflags, EXTENT_BUFFER_READING);
> }
> - } else {
> - unlock_page(page);
> + unlock_start = eb->start;
> + unlock_len = PAGE_SIZE;
> }
> + offset += PAGE_SIZE;
> }
>
> if (bio) {
> - err = submit_one_bio(bio, mirror_num, bio_flags);
> + err = submit_one_bio(bio, mirror_num, 0);
> if (err)
> return err;
> }
>
> + if (ret && unlock_start)
> + unlock_extent(io_tree, unlock_start,
> + unlock_start + unlock_len - 1);
> if (ret || wait != WAIT_COMPLETE)
> return ret;
>
> - for (i = 0; i < num_pages; i++) {
> - page = eb->pages[i];
> - wait_on_page_locked(page);
> - if (!PageUptodate(page))
> - ret = -EIO;
> - }
> -
> - return ret;
> -
> -unlock_exit:
> - while (locked_pages > 0) {
> - locked_pages--;
> - page = eb->pages[locked_pages];
> - unlock_page(page);
> - }
> + wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING,
> + TASK_UNINTERRUPTIBLE);
> + if (!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))
> + ret = -EIO;
> return ret;
> }
>
> @@ -5533,7 +5690,6 @@ void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
> {
> char *kaddr;
>
> - WARN_ON(!PageUptodate(eb->pages[0]));
> kaddr = page_address(eb->pages[0]);
> memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
> BTRFS_FSID_SIZE);
> @@ -5543,7 +5699,6 @@ void write_extent_buffer_fsid(struct extent_buffer *eb, const void *srcv)
> {
> char *kaddr;
>
> - WARN_ON(!PageUptodate(eb->pages[0]));
> kaddr = page_address(eb->pages[0]);
> memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
> BTRFS_FSID_SIZE);
> @@ -5567,7 +5722,6 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
>
> while (len > 0) {
> page = eb->pages[i];
> - WARN_ON(!PageUptodate(page));
>
> cur = min(len, PAGE_SIZE - offset);
> kaddr = page_address(page);
> @@ -5597,7 +5751,6 @@ void memzero_extent_buffer(struct extent_buffer *eb, unsigned long start,
>
> while (len > 0) {
> page = eb->pages[i];
> - WARN_ON(!PageUptodate(page));
>
> cur = min(len, PAGE_SIZE - offset);
> kaddr = page_address(page);
> @@ -5642,7 +5795,6 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
>
> while (len > 0) {
> page = dst->pages[i];
> - WARN_ON(!PageUptodate(page));
>
> cur = min(len, (unsigned long)(PAGE_SIZE - offset));
>
> @@ -5745,7 +5897,6 @@ int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
>
> eb_bitmap_offset(eb, start, nr, &i, &offset);
> page = eb->pages[i];
> - WARN_ON(!PageUptodate(page));
> kaddr = page_address(page);
> return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
> }
> @@ -5770,7 +5921,6 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
>
> eb_bitmap_offset(eb, start, pos, &i, &offset);
> page = eb->pages[i];
> - WARN_ON(!PageUptodate(page));
> kaddr = page_address(page);
>
> while (len >= bits_to_set) {
> @@ -5781,7 +5931,6 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
> if (++offset >= PAGE_SIZE && len > 0) {
> offset = 0;
> page = eb->pages[++i];
> - WARN_ON(!PageUptodate(page));
> kaddr = page_address(page);
> }
> }
> @@ -5812,7 +5961,6 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
>
> eb_bitmap_offset(eb, start, pos, &i, &offset);
> page = eb->pages[i];
> - WARN_ON(!PageUptodate(page));
> kaddr = page_address(page);
>
> while (len >= bits_to_clear) {
> @@ -5823,7 +5971,6 @@ void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
> if (++offset >= PAGE_SIZE && len > 0) {
> offset = 0;
> page = eb->pages[++i];
> - WARN_ON(!PageUptodate(page));
> kaddr = page_address(page);
> }
> }
> @@ -5864,7 +6011,7 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
> void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
> unsigned long src_offset, unsigned long len)
> {
> - struct btrfs_fs_info *fs_info = dst->fs_info;
> + struct btrfs_fs_info *fs_info = dst->eb_info->fs_info;
> size_t cur;
> size_t dst_off_in_page;
> size_t src_off_in_page;
> @@ -5911,7 +6058,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
> void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
> unsigned long src_offset, unsigned long len)
> {
> - struct btrfs_fs_info *fs_info = dst->fs_info;
> + struct btrfs_fs_info *fs_info = dst->eb_info->fs_info;
> size_t cur;
> size_t dst_off_in_page;
> size_t src_off_in_page;
> @@ -5957,45 +6104,3 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
> len -= cur;
> }
> }
> -
> -int try_release_extent_buffer(struct page *page)
> -{
> - struct extent_buffer *eb;
> -
> - /*
> - * We need to make sure nobody is attaching this page to an eb right
> - * now.
> - */
> - spin_lock(&page->mapping->private_lock);
> - if (!PagePrivate(page)) {
> - spin_unlock(&page->mapping->private_lock);
> - return 1;
> - }
> -
> - eb = (struct extent_buffer *)page->private;
> - BUG_ON(!eb);
> -
> - /*
> - * This is a little awful but should be ok, we need to make sure that
> - * the eb doesn't disappear out from under us while we're looking at
> - * this page.
> - */
> - spin_lock(&eb->refs_lock);
> - if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) {
> - spin_unlock(&eb->refs_lock);
> - spin_unlock(&page->mapping->private_lock);
> - return 0;
> - }
> - spin_unlock(&page->mapping->private_lock);
> -
> - /*
> - * If tree ref isn't set then we know the ref on this eb is a real ref,
> - * so just return, this page will likely be freed soon anyway.
> - */
> - if (!test_and_clear_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) {
> - spin_unlock(&eb->refs_lock);
> - return 0;
> - }
> -
> - return release_extent_buffer(eb);
> -}
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 861dacb371c7..f18cbce1f2f1 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -47,6 +47,8 @@
> #define EXTENT_BUFFER_DUMMY 9
> #define EXTENT_BUFFER_IN_TREE 10
> #define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */
> +#define EXTENT_BUFFER_MIXED_PAGES 12 /* the pages span multiple zones or numa nodes. */
> +#define EXTENT_BUFFER_READING 13 /* currently reading this eb. */
>
> /* these are flags for __process_pages_contig */
> #define PAGE_UNLOCK (1 << 0)
> @@ -160,13 +162,25 @@ struct extent_state {
> #endif
> };
>
> +struct btrfs_eb_info {
> + struct btrfs_fs_info *fs_info;
> + struct extent_io_tree io_tree;
> + struct extent_io_tree io_failure_tree;
> +
> + /* Extent buffer radix tree */
> + spinlock_t buffer_lock;
> + struct radix_tree_root buffer_radix;
> + struct list_lru lru_list;
> + pgoff_t writeback_index;
> +};
> +
> #define INLINE_EXTENT_BUFFER_PAGES 16
> #define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE)
> struct extent_buffer {
> u64 start;
> unsigned long len;
> unsigned long bflags;
> - struct btrfs_fs_info *fs_info;
> + struct btrfs_eb_info *eb_info;
> spinlock_t refs_lock;
> atomic_t refs;
> atomic_t io_pages;
> @@ -201,6 +215,7 @@ struct extent_buffer {
> #ifdef CONFIG_BTRFS_DEBUG
> struct list_head leak_list;
> #endif
> + struct list_head lru;
> };
>
> /*
> @@ -408,8 +423,6 @@ int extent_writepages(struct extent_io_tree *tree,
> struct address_space *mapping,
> get_extent_t *get_extent,
> struct writeback_control *wbc);
> -int btree_write_cache_pages(struct address_space *mapping,
> - struct writeback_control *wbc);
> int extent_readpages(struct extent_io_tree *tree,
> struct address_space *mapping,
> struct list_head *pages, unsigned nr_pages,
> @@ -420,21 +433,18 @@ void set_page_extent_mapped(struct page *page);
>
> struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
> u64 start);
> -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
> - u64 start, unsigned long len);
> -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
> - u64 start);
> +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_eb_info *eb_info,
> + u64 start, unsigned long len);
> struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
> -struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
> +struct extent_buffer *find_extent_buffer(struct btrfs_eb_info *eb_info,
> u64 start);
> void free_extent_buffer(struct extent_buffer *eb);
> void free_extent_buffer_stale(struct extent_buffer *eb);
> #define WAIT_NONE 0
> #define WAIT_COMPLETE 1
> #define WAIT_PAGE_LOCK 2
> -int read_extent_buffer_pages(struct extent_io_tree *tree,
> - struct extent_buffer *eb, int wait,
> - get_extent_t *get_extent, int mirror_num);
> +int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
> + int mirror_num);
> void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
>
> static inline unsigned long num_extent_pages(u64 start, u64 len)
> @@ -448,6 +458,11 @@ static inline void extent_buffer_get(struct extent_buffer *eb)
> atomic_inc(&eb->refs);
> }
>
> +static inline unsigned long eb_index(struct extent_buffer *eb)
> +{
> + return eb->start >> PAGE_SHIFT;
> +}
> +
> int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
> unsigned long start, unsigned long len);
> void read_extent_buffer(const struct extent_buffer *eb, void *dst,
> @@ -478,7 +493,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
> unsigned long pos, unsigned long len);
> void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
> unsigned long pos, unsigned long len);
> -void clear_extent_buffer_dirty(struct extent_buffer *eb);
> +int clear_extent_buffer_dirty(struct extent_buffer *eb);
> int set_extent_buffer_dirty(struct extent_buffer *eb);
> void set_extent_buffer_uptodate(struct extent_buffer *eb);
> void clear_extent_buffer_uptodate(struct extent_buffer *eb);
> @@ -512,6 +527,14 @@ int clean_io_failure(struct btrfs_fs_info *fs_info,
> void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
> int repair_eb_io_failure(struct btrfs_fs_info *fs_info,
> struct extent_buffer *eb, int mirror_num);
> +void btree_flush(struct btrfs_fs_info *fs_info);
> +int btree_write_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
> +int btree_wait_range(struct btrfs_fs_info *fs_info, u64 start, u64 end);
> +long btrfs_free_ebs(struct super_block *sb, struct shrink_control *sc);
> +long btrfs_nr_ebs(struct super_block *sb, struct shrink_control *sc);
> +void btrfs_write_ebs(struct super_block *sb, struct writeback_control *wbc);
> +void btrfs_invalidate_eb_info(struct btrfs_eb_info *eb_info);
> +int btrfs_init_eb_info(struct btrfs_fs_info *fs_info);
>
> /*
> * When IO fails, either with EIO or csum verification fails, we
> @@ -552,6 +575,6 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
> struct page *locked_page, u64 *start,
> u64 *end, u64 max_bytes);
> #endif
> -struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
> - u64 start);
> +struct extent_buffer *alloc_test_extent_buffer(struct btrfs_eb_info *eb_info,
> + u64 start, u32 nodesize);
> #endif
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 46b5632a7c6d..27bc64fb6d3e 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -1877,9 +1877,9 @@ static void btrfs_clear_bit_hook(void *private_data,
> * return 0 if page can be merged to bio
> * return error otherwise
> */
> -int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
> - size_t size, struct bio *bio,
> - unsigned long bio_flags)
> +static int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
> + size_t size, struct bio *bio,
> + unsigned long bio_flags)
> {
> struct inode *inode = page->mapping->host;
> struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
> diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
> index 569205e651c7..f912c8166d94 100644
> --- a/fs/btrfs/print-tree.c
> +++ b/fs/btrfs/print-tree.c
> @@ -102,6 +102,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
> ptr = (unsigned long)iref;
> end = (unsigned long)ei + item_size;
> while (ptr < end) {
> + struct btrfs_fs_info *fs_info = eb->eb_info->fs_info;
> iref = (struct btrfs_extent_inline_ref *)ptr;
> type = btrfs_extent_inline_ref_type(eb, iref);
> offset = btrfs_extent_inline_ref_offset(eb, iref);
> @@ -116,9 +117,9 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
> * offset is supposed to be a tree block which
> * must be aligned to nodesize.
> */
> - if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
> + if (!IS_ALIGNED(offset, fs_info->nodesize))
> pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
> - offset, (unsigned long long)eb->fs_info->nodesize);
> + offset, (unsigned long long)fs_info->nodesize);
> break;
> case BTRFS_EXTENT_DATA_REF_KEY:
> dref = (struct btrfs_extent_data_ref *)(&iref->offset);
> @@ -132,9 +133,9 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
> * offset is supposed to be a tree block which
> * must be aligned to nodesize.
> */
> - if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
> + if (!IS_ALIGNED(offset, fs_info->nodesize))
> pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
> - offset, (unsigned long long)eb->fs_info->nodesize);
> + offset, (unsigned long long)fs_info->nodesize);
> break;
> default:
> pr_cont("(extent %llu has INVALID ref type %d)\n",
> @@ -199,7 +200,7 @@ void btrfs_print_leaf(struct extent_buffer *l)
> if (!l)
> return;
>
> - fs_info = l->fs_info;
> + fs_info = l->eb_info->fs_info;
> nr = btrfs_header_nritems(l);
>
> btrfs_info(fs_info, "leaf %llu total ptrs %d free space %d",
> @@ -347,7 +348,7 @@ void btrfs_print_tree(struct extent_buffer *c)
>
> if (!c)
> return;
> - fs_info = c->fs_info;
> + fs_info = c->eb_info->fs_info;
> nr = btrfs_header_nritems(c);
> level = btrfs_header_level(c);
> if (level == 0) {
> diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
> index ab852b8e3e37..c6244890085f 100644
> --- a/fs/btrfs/reada.c
> +++ b/fs/btrfs/reada.c
> @@ -210,7 +210,7 @@ static void __readahead_hook(struct btrfs_fs_info *fs_info,
>
> int btree_readahead_hook(struct extent_buffer *eb, int err)
> {
> - struct btrfs_fs_info *fs_info = eb->fs_info;
> + struct btrfs_fs_info *fs_info = eb->eb_info->fs_info;
> int ret = 0;
> struct reada_extent *re;
>
> diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
> index 3338407ef0f0..e40bd9a910dd 100644
> --- a/fs/btrfs/root-tree.c
> +++ b/fs/btrfs/root-tree.c
> @@ -45,7 +45,7 @@ static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
> if (!need_reset && btrfs_root_generation(item)
> != btrfs_root_generation_v2(item)) {
> if (btrfs_root_generation_v2(item) != 0) {
> - btrfs_warn(eb->fs_info,
> + btrfs_warn(eb->eb_info->fs_info,
> "mismatching generation and generation_v2 found in root item. This root was probably mounted with an older kernel. Resetting all new fields.");
> }
> need_reset = 1;
> diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
> index 8e74f7029e12..3b5fe791639d 100644
> --- a/fs/btrfs/super.c
> +++ b/fs/btrfs/super.c
> @@ -1198,7 +1198,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
> trace_btrfs_sync_fs(fs_info, wait);
>
> if (!wait) {
> - filemap_flush(fs_info->btree_inode->i_mapping);
> + btree_flush(fs_info);
> return 0;
> }
>
> @@ -2284,19 +2284,22 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
> }
>
> static const struct super_operations btrfs_super_ops = {
> - .drop_inode = btrfs_drop_inode,
> - .evict_inode = btrfs_evict_inode,
> - .put_super = btrfs_put_super,
> - .sync_fs = btrfs_sync_fs,
> - .show_options = btrfs_show_options,
> - .show_devname = btrfs_show_devname,
> - .write_inode = btrfs_write_inode,
> - .alloc_inode = btrfs_alloc_inode,
> - .destroy_inode = btrfs_destroy_inode,
> - .statfs = btrfs_statfs,
> - .remount_fs = btrfs_remount,
> - .freeze_fs = btrfs_freeze,
> - .unfreeze_fs = btrfs_unfreeze,
> + .drop_inode = btrfs_drop_inode,
> + .evict_inode = btrfs_evict_inode,
> + .put_super = btrfs_put_super,
> + .sync_fs = btrfs_sync_fs,
> + .show_options = btrfs_show_options,
> + .show_devname = btrfs_show_devname,
> + .write_inode = btrfs_write_inode,
> + .alloc_inode = btrfs_alloc_inode,
> + .destroy_inode = btrfs_destroy_inode,
> + .statfs = btrfs_statfs,
> + .remount_fs = btrfs_remount,
> + .freeze_fs = btrfs_freeze,
> + .unfreeze_fs = btrfs_unfreeze,
> + .nr_cached_objects = btrfs_nr_ebs,
> + .free_cached_objects = btrfs_free_ebs,
> + .write_metadata = btrfs_write_ebs,
> };
>
> static const struct file_operations btrfs_ctl_fops = {
> diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
> index d3f25376a0f8..dbf05b2ab9ee 100644
> --- a/fs/btrfs/tests/btrfs-tests.c
> +++ b/fs/btrfs/tests/btrfs-tests.c
> @@ -102,15 +102,32 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
>
> fs_info->nodesize = nodesize;
> fs_info->sectorsize = sectorsize;
> + fs_info->eb_info = kzalloc(sizeof(struct btrfs_eb_info),
> + GFP_KERNEL);
> + if (!fs_info->eb_info) {
> + kfree(fs_info->fs_devices);
> + kfree(fs_info->super_copy);
> + kfree(fs_info);
> + return NULL;
> + }
> +
> + if (btrfs_init_eb_info(fs_info)) {
> + kfree(fs_info->eb_info);
> + kfree(fs_info->fs_devices);
> + kfree(fs_info->super_copy);
> + kfree(fs_info);
> + return NULL;
> + }
>
> if (init_srcu_struct(&fs_info->subvol_srcu)) {
> + list_lru_destroy(&fs_info->eb_info->lru_list);
> + kfree(fs_info->eb_info);
> kfree(fs_info->fs_devices);
> kfree(fs_info->super_copy);
> kfree(fs_info);
> return NULL;
> }
>
> - spin_lock_init(&fs_info->buffer_lock);
> spin_lock_init(&fs_info->qgroup_lock);
> spin_lock_init(&fs_info->qgroup_op_lock);
> spin_lock_init(&fs_info->super_lock);
> @@ -126,7 +143,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
> INIT_LIST_HEAD(&fs_info->dirty_qgroups);
> INIT_LIST_HEAD(&fs_info->dead_roots);
> INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
> - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
> INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
> extent_io_tree_init(&fs_info->freed_extents[0], NULL);
> extent_io_tree_init(&fs_info->freed_extents[1], NULL);
> @@ -140,6 +156,7 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
>
> void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
> {
> + struct btrfs_eb_info *eb_info;
> struct radix_tree_iter iter;
> void **slot;
>
> @@ -150,13 +167,14 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
> &fs_info->fs_state)))
> return;
>
> + eb_info = fs_info->eb_info;
> test_mnt->mnt_sb->s_fs_info = NULL;
>
> - spin_lock(&fs_info->buffer_lock);
> - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
> + spin_lock_irq(&eb_info->buffer_lock);
> + radix_tree_for_each_slot(slot, &eb_info->buffer_radix, &iter, 0) {
> struct extent_buffer *eb;
>
> - eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
> + eb = radix_tree_deref_slot_protected(slot, &eb_info->buffer_lock);
> if (!eb)
> continue;
> /* Shouldn't happen but that kind of thinking creates CVE's */
> @@ -166,15 +184,17 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
> continue;
> }
> slot = radix_tree_iter_resume(slot, &iter);
> - spin_unlock(&fs_info->buffer_lock);
> + spin_unlock_irq(&eb_info->buffer_lock);
> free_extent_buffer_stale(eb);
> - spin_lock(&fs_info->buffer_lock);
> + spin_lock_irq(&eb_info->buffer_lock);
> }
> - spin_unlock(&fs_info->buffer_lock);
> + spin_unlock_irq(&eb_info->buffer_lock);
>
> btrfs_free_qgroup_config(fs_info);
> btrfs_free_fs_roots(fs_info);
> cleanup_srcu_struct(&fs_info->subvol_srcu);
> + list_lru_destroy(&eb_info->lru_list);
> + kfree(fs_info->eb_info);
> kfree(fs_info->super_copy);
> kfree(fs_info->fs_devices);
> kfree(fs_info);
> diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
> index b9142c614114..9a264b81a7b4 100644
> --- a/fs/btrfs/tests/extent-buffer-tests.c
> +++ b/fs/btrfs/tests/extent-buffer-tests.c
> @@ -61,7 +61,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
> goto out;
> }
>
> - path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize);
> + path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info->eb_info, 0,
> + nodesize);
> if (!eb) {
> test_msg("Could not allocate dummy buffer\n");
> ret = -ENOMEM;
> diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
> index d06b1c931d05..600c01ddf0d0 100644
> --- a/fs/btrfs/tests/extent-io-tests.c
> +++ b/fs/btrfs/tests/extent-io-tests.c
> @@ -406,7 +406,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
> return -ENOMEM;
> }
>
> - eb = __alloc_dummy_extent_buffer(fs_info, 0, len);
> + eb = alloc_dummy_extent_buffer(NULL, 0, len);
> if (!eb) {
> test_msg("Couldn't allocate test extent buffer\n");
> kfree(bitmap);
> @@ -419,7 +419,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize)
>
> /* Do it over again with an extent buffer which isn't page-aligned. */
> free_extent_buffer(eb);
> - eb = __alloc_dummy_extent_buffer(NULL, nodesize / 2, len);
> + eb = alloc_dummy_extent_buffer(NULL, nodesize / 2, len);
> if (!eb) {
> test_msg("Couldn't allocate test extent buffer\n");
> kfree(bitmap);
> diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
> index 8444a018cca2..afba937f4365 100644
> --- a/fs/btrfs/tests/free-space-tree-tests.c
> +++ b/fs/btrfs/tests/free-space-tree-tests.c
> @@ -474,7 +474,8 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
> root->fs_info->free_space_root = root;
> root->fs_info->tree_root = root;
>
> - root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
> + root->node = alloc_test_extent_buffer(fs_info->eb_info, nodesize,
> + nodesize);
> if (!root->node) {
> test_msg("Couldn't allocate dummy buffer\n");
> ret = -ENOMEM;
> diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
> index 11c77eafde00..486aa7fbfce2 100644
> --- a/fs/btrfs/tests/inode-tests.c
> +++ b/fs/btrfs/tests/inode-tests.c
> @@ -261,7 +261,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
> goto out;
> }
>
> - root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
> + root->node = alloc_dummy_extent_buffer(fs_info->eb_info, 0, nodesize);
> if (!root->node) {
> test_msg("Couldn't allocate dummy buffer\n");
> goto out;
> @@ -867,7 +867,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize)
> goto out;
> }
>
> - root->node = alloc_dummy_extent_buffer(fs_info, nodesize);
> + root->node = alloc_dummy_extent_buffer(fs_info->eb_info, 0, nodesize);
> if (!root->node) {
> test_msg("Couldn't allocate dummy buffer\n");
> goto out;
> diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
> index 0f4ce970d195..0ba27cd9ae4c 100644
> --- a/fs/btrfs/tests/qgroup-tests.c
> +++ b/fs/btrfs/tests/qgroup-tests.c
> @@ -486,7 +486,8 @@ int btrfs_test_qgroups(u32 sectorsize, u32 nodesize)
> * Can't use bytenr 0, some things freak out
> * *cough*backref walking code*cough*
> */
> - root->node = alloc_test_extent_buffer(root->fs_info, nodesize);
> + root->node = alloc_test_extent_buffer(fs_info->eb_info, nodesize,
> + nodesize);
> if (!root->node) {
> test_msg("Couldn't allocate dummy buffer\n");
> ret = -ENOMEM;
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index 9fed8c67b6e8..5df3963c413e 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -293,8 +293,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
> INIT_LIST_HEAD(&cur_trans->deleted_bgs);
> spin_lock_init(&cur_trans->dropped_roots_lock);
> list_add_tail(&cur_trans->list, &fs_info->trans_list);
> - extent_io_tree_init(&cur_trans->dirty_pages,
> - fs_info->btree_inode);
> + extent_io_tree_init(&cur_trans->dirty_pages, NULL);
> fs_info->generation++;
> cur_trans->transid = fs_info->generation;
> fs_info->running_transaction = cur_trans;
> @@ -944,12 +943,10 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
> {
> int err = 0;
> int werr = 0;
> - struct address_space *mapping = fs_info->btree_inode->i_mapping;
> struct extent_state *cached_state = NULL;
> u64 start = 0;
> u64 end;
>
> - atomic_inc(&BTRFS_I(fs_info->btree_inode)->sync_writers);
> while (!find_first_extent_bit(dirty_pages, start, &start, &end,
> mark, &cached_state)) {
> bool wait_writeback = false;
> @@ -975,17 +972,16 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info,
> wait_writeback = true;
> }
> if (!err)
> - err = filemap_fdatawrite_range(mapping, start, end);
> + err = btree_write_range(fs_info, start, end);
> if (err)
> werr = err;
> else if (wait_writeback)
> - werr = filemap_fdatawait_range(mapping, start, end);
> + werr = btree_wait_range(fs_info, start, end);
> free_extent_state(cached_state);
> cached_state = NULL;
> cond_resched();
> start = end + 1;
> }
> - atomic_dec(&BTRFS_I(fs_info->btree_inode)->sync_writers);
> return werr;
> }
>
> @@ -1000,7 +996,6 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
> {
> int err = 0;
> int werr = 0;
> - struct address_space *mapping = fs_info->btree_inode->i_mapping;
> struct extent_state *cached_state = NULL;
> u64 start = 0;
> u64 end;
> @@ -1021,7 +1016,7 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info,
> if (err == -ENOMEM)
> err = 0;
> if (!err)
> - err = filemap_fdatawait_range(mapping, start, end);
> + err = btree_wait_range(fs_info, start, end);
> if (err)
> werr = err;
> free_extent_state(cached_state);
> --
> 2.7.5
>