From: Xiubo Li <lixiubo@cmss.chinamobile.com>
To: Andy Grover <agrover@redhat.com>,
	nab@linux-iscsi.org, mchristi@redhat.com
Cc: shli@kernel.org, sheng@yasker.org, linux-scsi@vger.kernel.org,
	target-devel@vger.kernel.org, namei.unix@gmail.com,
	linux-mm@kvack.org
Subject: Re: [PATCHv2 2/5] target/user: Add global data block pool support
Date: Fri, 17 Mar 2017 16:04:26 +0800	[thread overview]
Message-ID: <f4c4e83a-d6b1-ed57-7a54-4277722e5a46@cmss.chinamobile.com> (raw)
In-Reply-To: <ddd797ea-43f0-b863-64e4-1e758f41dafe@cmss.chinamobile.com>


[...]
> What I have found these days is that unmap_mapping_range() could
> be used.
> At the same time I have dug into the mm code and fixed the double
> usage of the data blocks and the possible page fault call trace bugs
> mentioned above.
>
> Following is the V3 patch. I have tested this using 4 targets & fio
> for about 2 days; so far so good.
>
> I'm still testing this using more complex test cases.
>
I have been testing it the whole day today:
- using 4 targets
- setting TCMU_GLOBAL_MAX_BLOCKS = [512 1K 1M 1G 2G]
- each target needs more than 450 blocks when running
- fio: -iodepth [1 2 4 8 16] -thread -rw=[read write] -bs=[1K 2K 3K 5K 7K 16K 64K 1M] -size=20G -numjobs=10 -runtime=1000 ...


The results:
- no memory leaks in the system mm.
- the same blocks page fault again after being unmapped.
- touching blocks outside of the iov[N] produces no page fault call trace
(a minimal sketch of such a poke follows below).
- works well all day.
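
For reference, a check like the "touch blocks outside the iov[N]" one can
be done with a minimal poke along the following lines. This is only a
sketch: the /dev/uio0 path, the 4K page size and the block index are
assumptions, and a real handler (e.g. tcmu-runner) maps the ring via UIO
in the same basic way.

  #include <fcntl.h>
  #include <stdint.h>
  #include <stdio.h>
  #include <sys/mman.h>
  #include <unistd.h>

  #define PAGE_SZ    4096UL                       /* assumed page size */
  #define CMDR_SIZE  (8UL * 1024 * 1024)          /* data area starts after the cmd area */
  #define RING_SIZE  (CMDR_SIZE + 256UL * 1024 * PAGE_SZ)

  int main(void)
  {
          int fd = open("/dev/uio0", O_RDWR);     /* hypothetical uio node */
          if (fd < 0)
                  return 1;

          void *ring = mmap(NULL, RING_SIZE, PROT_READ | PROT_WRITE,
                            MAP_SHARED, fd, 0);
          if (ring == MAP_FAILED)
                  return 1;

          /* Read from a data block assumed not to be handed out via any
           * iov[]: with this patch the fault handler installs a zeroed
           * page instead of producing a call trace. */
          volatile uint8_t *blk = (uint8_t *)ring + CMDR_SIZE + 1000 * PAGE_SZ;
          printf("first byte of untouched block: %u\n", (unsigned)blk[0]);

          munmap(ring, RING_SIZE);
          close(fd);
          return 0;
  }
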

Thanks,

BRs
Xiubo

[...]
> From: Xiubo Li <lixiubo@cmss.chinamobile.com>
>
> For each target there will be one ring. As the number of targets
> grows larger and larger, the system could eventually run out of
> memory.
>
> In this patch, for each target ring the cmd area size is limited
> to 8MB and the data area size is limited to 256K * PAGE_SIZE.
>
> All the targets' data areas get their empty blocks from the
> "global data block pool", which is limited to 512K * PAGE_SIZE
> for now.
>
> When the "global data block pool" has been used up, any target can
> trigger the unmapping thread routine to shrink the targets' rings.
> For idle targets the unmapping routine will reserve at least 256
> blocks.
>
> When user space touches data blocks outside of the iov[N], the tcmu
> page fault handler (tcmu_vma_fault()) will return a zeroed block.
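
For reference, with a typical 4K PAGE_SIZE (an assumption here; the patch
itself only uses PAGE_SIZE), the constants above work out to about 8MB of
cmd area plus 1GB of data-area address space per ring, and a 2GB cap on
the data pages actually allocated across all rings. A small stand-alone
sketch of that arithmetic:

  #include <stdio.h>

  /* Constants taken from the patch; the 4K page size is an assumption. */
  #define ASSUMED_PAGE_SIZE       4096UL
  #define CMDR_SIZE               (8UL * 1024 * 1024)   /* cmd area per ring */
  #define DATA_BLOCK_BITS         (256UL * 1024)        /* blocks per ring */
  #define TCMU_GLOBAL_MAX_BLOCKS  (512UL * 1024)        /* global pool cap */

  int main(void)
  {
          unsigned long data_size = DATA_BLOCK_BITS * ASSUMED_PAGE_SIZE;
          unsigned long ring_size = CMDR_SIZE + data_size;
          unsigned long pool_size = TCMU_GLOBAL_MAX_BLOCKS * ASSUMED_PAGE_SIZE;

          /* The per-ring figure is address space only; data pages are
           * allocated on demand from the global pool. */
          printf("per-ring max:    %lu MB\n", ring_size >> 20);
          printf("global pool cap: %lu MB\n", pool_size >> 20);
          return 0;
  }
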
>
> Signed-off-by: Xiubo Li <lixiubo@cmss.chinamobile.com>
> Signed-off-by: Jianfei Hu <hujianfei@cmss.chinamobile.com>
> ---
>   drivers/target/target_core_user.c | 433 ++++++++++++++++++++++++++++++--------
>   1 file changed, 349 insertions(+), 84 deletions(-)
>
> diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
> index e904bc0..bbc52074 100644
> --- a/drivers/target/target_core_user.c
> +++ b/drivers/target/target_core_user.c
> @@ -30,6 +30,8 @@
>   #include <linux/stringify.h>
>   #include <linux/bitops.h>
>   #include <linux/highmem.h>
> +#include <linux/mutex.h>
> +#include <linux/kthread.h>
>   #include <net/genetlink.h>
>   #include <scsi/scsi_common.h>
>   #include <scsi/scsi_proto.h>
> @@ -66,17 +68,24 @@
>   
>   #define TCMU_TIME_OUT (30 * MSEC_PER_SEC)
>   
> -/* For cmd area, the size is fixed 2M */
> -#define CMDR_SIZE (2 * 1024 * 1024)
> +/* For cmd area, the size is fixed 8MB */
> +#define CMDR_SIZE (8 * 1024 * 1024)
>   
> -/* For data area, the size is fixed 32M */
> -#define DATA_BLOCK_BITS (8 * 1024)
> -#define DATA_BLOCK_SIZE 4096
> +/*
> + * For data area, the block size is PAGE_SIZE and
> + * the total size is 256K * PAGE_SIZE.
> + */
> +#define DATA_BLOCK_SIZE PAGE_SIZE
> +#define DATA_BLOCK_BITS (256 * 1024)
>   #define DATA_SIZE (DATA_BLOCK_BITS * DATA_BLOCK_SIZE)
> +#define DATA_BLOCK_RES_BITS 256
>   
> -/* The ring buffer size is 34M */
> +/* The total size of the ring is 8M + 256K * PAGE_SIZE */
>   #define TCMU_RING_SIZE (CMDR_SIZE + DATA_SIZE)
>   
> +/* Default maximum of the global data blocks (512K * PAGE_SIZE) */
> +#define TCMU_GLOBAL_MAX_BLOCKS (512 * 1024)
> +
>   static struct device *tcmu_root_device;
>   
>   struct tcmu_hba {
> @@ -86,6 +95,8 @@ struct tcmu_hba {
>   #define TCMU_CONFIG_LEN 256
>   
>   struct tcmu_dev {
> +	struct list_head node;
> +
>   	struct se_device se_dev;
>   
>   	char *name;
> @@ -97,6 +108,15 @@ struct tcmu_dev {
>   
>   	struct uio_info uio_info;
>   
> +	struct inode *inode;
> +
> +	bool unmapping;
> +	bool waiting_global;
> +	uint32_t dbi_cur;
> +	uint32_t dbi_thresh;
> +	DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS);
> +	struct radix_tree_root data_blocks;
> +
>   	struct tcmu_mailbox *mb_addr;
>   	size_t dev_size;
>   	u32 cmdr_size;
> @@ -110,10 +130,6 @@ struct tcmu_dev {
>   	/* TODO should this be a mutex? */
>   	spinlock_t cmdr_lock;
>   
> -	uint32_t dbi_cur;
> -	DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS);
> -	struct radix_tree_root data_blocks;
> -
>   	struct idr commands;
>   	spinlock_t commands_lock;
>   
> @@ -137,6 +153,11 @@ struct tcmu_cmd {
>   	uint32_t *dbi;
>   };
>   
> +static wait_queue_head_t g_wait;
> +static DEFINE_MUTEX(g_mutex);
> +static LIST_HEAD(root_udev);
> +static spinlock_t g_lock;
> +static unsigned long global_db_count;
>   static struct kmem_cache *tcmu_cmd_cache;
>   
>   /* multicast group */
> @@ -160,54 +181,89 @@ enum tcmu_multicast_groups {
>   	.netnsok = true,
>   };
>   
> -static int tcmu_db_get_empty_block(struct tcmu_dev *udev, void **addr)
> +#define tcmu_cmd_reset_dbi_cur(cmd) ((cmd)->dbi_cur = 0)
> +#define tcmu_cmd_set_dbi(cmd, index) ((cmd)->dbi[(cmd)->dbi_cur++] = (index))
> +#define tcmu_cmd_get_dbi(cmd) ((cmd)->dbi[(cmd)->dbi_cur++])
> +
> +static inline void tcmu_cmd_free_data(struct tcmu_cmd *tcmu_cmd, uint32_t len)
>   {
> -	void *p;
> -	uint32_t dbi;
> -	int ret;
> +	struct tcmu_dev *udev = tcmu_cmd->tcmu_dev;
> +	uint32_t i;
>   
> -	dbi = find_first_zero_bit(udev->data_bitmap, DATA_BLOCK_BITS);
> -	if (dbi > udev->dbi_cur)
> -		udev->dbi_cur = dbi;
> +	for (i = 0; i < len; i++)
> +		clear_bit(tcmu_cmd->dbi[i], udev->data_bitmap);
> +}
>   
> -	set_bit(dbi, udev->data_bitmap);
> +static inline bool get_empty_growing_block(struct tcmu_dev *udev,
> +					   struct tcmu_cmd *tcmu_cmd)
> +{
> +	struct page *page;
> +	int ret, dbi;
>   
> -	p = radix_tree_lookup(&udev->data_blocks, dbi);
> -	if (!p) {
> -		p = kzalloc(DATA_BLOCK_SIZE, GFP_ATOMIC);
> -		if (!p) {
> -			clear_bit(dbi, udev->data_bitmap);
> -			return -ENOMEM;
> +	dbi = find_first_zero_bit(udev->data_bitmap, udev->dbi_thresh);
> +	if (dbi == udev->dbi_thresh)
> +		return false;
> +
> +	page = radix_tree_lookup(&udev->data_blocks, dbi);
> +	if (!page) {
> +		/* try to get a new page from the mm */
> +		spin_lock_irq(&g_lock);
> +		if (global_db_count >= TCMU_GLOBAL_MAX_BLOCKS) {
> +			spin_unlock_irq(&g_lock);
> +			wake_up(&g_wait);
> +			return false;
> +		}
> +		global_db_count++;
> +		spin_unlock_irq(&g_lock);
> +
> +		page = alloc_page(GFP_ATOMIC);
> +		if (!page) {
> +			return false;
>   		}
>   
> -		ret = radix_tree_insert(&udev->data_blocks, dbi, p);
> +		ret = radix_tree_insert(&udev->data_blocks, dbi, page);
>   		if (ret) {
> -			kfree(p);
> -			clear_bit(dbi, udev->data_bitmap);
> -			return ret;
> +			__free_page(page);
> +			return false;
>   		}
>   	}
>   
> -	*addr = p;
> -	return dbi;
> +	if (dbi > udev->dbi_cur)
> +		udev->dbi_cur = dbi;
> +
> +	set_bit(dbi, udev->data_bitmap);
> +	tcmu_cmd_set_dbi(tcmu_cmd, dbi);
> +
> +	return true;
>   }
>   
> -static void *tcmu_db_get_block_addr(struct tcmu_dev *udev, uint32_t dbi)
> +static bool tcmu_db_get_empty_blocks(struct tcmu_dev *udev,
> +				     struct tcmu_cmd *tcmu_cmd)
>   {
> -	return radix_tree_lookup(&udev->data_blocks, dbi);
> -}
> +	int i;
>   
> -#define tcmu_cmd_reset_dbi_cur(cmd) ((cmd)->dbi_cur = 0)
> -#define tcmu_cmd_set_dbi(cmd, index) ((cmd)->dbi[(cmd)->dbi_cur++] = (index))
> -#define tcmu_cmd_get_dbi(cmd) ((cmd)->dbi[(cmd)->dbi_cur++])
> +	tcmu_cmd_reset_dbi_cur(tcmu_cmd);
> +	for (i = 0; i < tcmu_cmd->dbi_len; i++) {
> +		if (!get_empty_growing_block(udev, tcmu_cmd))
> +			goto err;
> +	}
> +	return true;
>   
> -static void tcmu_cmd_free_data(struct tcmu_cmd *tcmu_cmd)
> +err:
> +	tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cur);
> +	udev->waiting_global = true;
> +	return false;
> +}
> +
> +static struct page *tcmu_db_get_block_page(struct tcmu_dev *udev, uint32_t dbi)
>   {
> -	struct tcmu_dev *udev = tcmu_cmd->tcmu_dev;
> -	uint32_t bi;
> +	struct page *page;
>   
> -	for (bi = 0; bi < tcmu_cmd->dbi_len; bi++)
> -		clear_bit(tcmu_cmd->dbi[bi], udev->data_bitmap);
> +	page = radix_tree_lookup(&udev->data_blocks, dbi);
> +	if (!page)
> +		return NULL;
> +
> +	return page;
>   }
>   
>   static inline void tcmu_free_cmd(struct tcmu_cmd *tcmu_cmd)
> @@ -344,17 +400,20 @@ static int alloc_and_scatter_data_area(struct tcmu_dev *udev,
>   	void *from, *to = NULL;
>   	size_t copy_bytes, to_offset, offset;
>   	struct scatterlist *sg;
> +	struct page *page;
>   
>   	for_each_sg(data_sg, sg, data_nents, i) {
>   		int sg_remaining = sg->length;
>   		from = kmap_atomic(sg_page(sg)) + sg->offset;
>   		while (sg_remaining > 0) {
>   			if (block_remaining == 0) {
> +				if (to)
> +					kunmap_atomic(to);
> +
>   				block_remaining = DATA_BLOCK_SIZE;
> -				dbi = tcmu_db_get_empty_block(udev, &to);
> -				if (dbi < 0)
> -					return dbi;
> -				tcmu_cmd_set_dbi(tcmu_cmd, dbi);
> +				dbi = tcmu_cmd_get_dbi(tcmu_cmd);
> +				page = tcmu_db_get_block_page(udev, dbi);
> +				to = kmap_atomic(page);
>   			}
>   
>   			copy_bytes = min_t(size_t, sg_remaining,
> @@ -362,7 +421,7 @@ static int alloc_and_scatter_data_area(struct tcmu_dev *udev,
>   			to_offset = get_block_offset_user(udev, dbi,
>   					block_remaining);
>   			offset = DATA_BLOCK_SIZE - block_remaining;
> -			to = (void *)(unsigned long)to + offset;
> +			to = (void *)((unsigned long)to + offset);
>   
>   			if (*iov_cnt != 0 &&
>   			    to_offset == iov_tail(udev, *iov)) {
> @@ -382,6 +441,8 @@ static int alloc_and_scatter_data_area(struct tcmu_dev *udev,
>   		}
>   		kunmap_atomic(from - sg->offset);
>   	}
> +	if (to)
> +		kunmap_atomic(to);
>   
>   	return 0;
>   }
> @@ -391,23 +452,28 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *tcmu_cmd,
>   {
>   	int i, dbi;
>   	int block_remaining = 0;
> -	void *from, *to;
> +	void *from = NULL, *to;
>   	size_t copy_bytes, offset;
>   	struct scatterlist *sg;
> +	struct page *page;
>   
>   	for_each_sg(data_sg, sg, data_nents, i) {
>   		int sg_remaining = sg->length;
>   		to = kmap_atomic(sg_page(sg)) + sg->offset;
>   		while (sg_remaining > 0) {
>   			if (block_remaining == 0) {
> +				if (from)
> +					kunmap_atomic(from);
> +
>   				block_remaining = DATA_BLOCK_SIZE;
>   				dbi = tcmu_cmd_get_dbi(tcmu_cmd);
> -				from = tcmu_db_get_block_addr(udev, dbi);
> +				page = tcmu_db_get_block_page(udev, dbi);
> +				from = kmap_atomic(page);
>   			}
>   			copy_bytes = min_t(size_t, sg_remaining,
>   					block_remaining);
>   			offset = DATA_BLOCK_SIZE - block_remaining;
> -			from = (void *)(unsigned long)from + offset;
> +			from = (void *)((unsigned long)from + offset);
>   			tcmu_flush_dcache_range(from, copy_bytes);
>   			memcpy(to + sg->length - sg_remaining, from,
>   					copy_bytes);
> @@ -417,12 +483,13 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *tcmu_cmd,
>   		}
>   		kunmap_atomic(to - sg->offset);
>   	}
> +	if (from)
> +		kunmap_atomic(from);
>   }
>   
> -static inline size_t spc_bitmap_free(unsigned long *bitmap)
> +static inline size_t spc_bitmap_free(unsigned long *bitmap, uint32_t thresh)
>   {
> -	return DATA_BLOCK_SIZE * (DATA_BLOCK_BITS -
> -			bitmap_weight(bitmap, DATA_BLOCK_BITS));
> +	return DATA_BLOCK_SIZE * (thresh - bitmap_weight(bitmap, thresh));
>   }
>   
>   /*
> @@ -431,12 +498,14 @@ static inline size_t spc_bitmap_free(unsigned long *bitmap)
>    *
>    * Called with ring lock held.
>    */
> -static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t data_needed)
> +static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
> +		size_t cmd_size, size_t data_needed)
>   {
>   	struct tcmu_mailbox *mb = udev->mb_addr;
>   	size_t space, cmd_needed;
>   	u32 cmd_head;
>   
> +	udev->waiting_global = false;
>   	tcmu_flush_dcache_range(mb, sizeof(*mb));
>   
>   	cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */
> @@ -457,10 +526,24 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t d
>   		return false;
>   	}
>   
> -	space = spc_bitmap_free(udev->data_bitmap);
> +	/* try to check and get the data blocks as needed */
> +	space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh);
>   	if (space < data_needed) {
> -		pr_debug("no data space: only %zu available, but ask for %zu\n",
> -				space, data_needed);
> +		if (udev->unmapping) {
> +			pr_debug("no data space: only %zu available, but ask for %zu\n",
> +					space, data_needed);
> +			return false;
> +		} else {
> +			udev->dbi_thresh += udev->dbi_thresh / 2;
> +			udev->dbi_thresh = min((int)udev->dbi_thresh, DATA_BLOCK_BITS);
> +			space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh);
> +			if (space < data_needed)
> +				return false;
> +		}
> +	}
> +
> +	if (!tcmu_db_get_empty_blocks(udev, cmd)) {
> +		pr_debug("no data space: ask for %zu\n", data_needed);
>   		return false;
>   	}
>   
> @@ -519,7 +602,7 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t d
>   		return TCM_INVALID_CDB_FIELD;
>   	}
>   
> -	while (!is_ring_space_avail(udev, command_size, data_length)) {
> +	while (!is_ring_space_avail(udev, tcmu_cmd, command_size, data_length)) {
>   		int ret;
>   		DEFINE_WAIT(__wait);
>   
> @@ -567,6 +650,7 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t d
>   	entry->hdr.uflags = 0;
>   
>   	/* Handle allocating space from the data area */
> +	tcmu_cmd_reset_dbi_cur(tcmu_cmd);
>   	iov = &entry->req.iov[0];
>   	iov_cnt = 0;
>   	copy_to_data_area = (se_cmd->data_direction == DMA_TO_DEVICE
> @@ -664,7 +748,7 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *
>   	target_complete_cmd(cmd->se_cmd, entry->rsp.scsi_status);
>   
>   	cmd->se_cmd = NULL;
> -	tcmu_cmd_free_data(cmd);
> +	tcmu_cmd_free_data(cmd, cmd->dbi_len);
>   	tcmu_free_cmd(cmd);
>   }
>   
> @@ -783,41 +867,80 @@ static int tcmu_irqcontrol(struct uio_info *info, s32 irq_on)
>   
>   static void tcmu_db_release(struct tcmu_dev *udev, bool release_pending)
>   {
> -	uint32_t dbi, end;
> -	void *addr;
> +	int dbi = -1, end;
> +	struct page *page;
>   
>   	spin_lock_irq(&udev->cmdr_lock);
> -
>   	end = udev->dbi_cur + 1;
>   
> -	/* try to release all unused blocks */
> -	dbi = find_first_zero_bit(udev->data_bitmap, end);
> -	if (dbi >= end) {
> -		spin_unlock_irq(&udev->cmdr_lock);
> -		return;
> -	}
> +	/* try to release all unused but still mapped blocks */
>   	do {
> -		addr = radix_tree_delete(&udev->data_blocks, dbi);
> -		kfree(addr);
> -
>   		dbi = find_next_zero_bit(udev->data_bitmap, end, dbi + 1);
> -	} while (dbi < end);
> +		if (dbi == end)
> +			break;
>   
> -	if (!release_pending)
> -		return;
> +		/*
> +		 * When the bit is cleared and p != NULL, it means that
> +		 * this tcmu block has already been freed after use.
> +		 *
> +		 * If p->user == 0, the current ring buffer is the last
> +		 * or the only user of the tcmu block, and it must
> +		 * already be in the free list, so it can be removed
> +		 * from the list and its memory released.
> +		 *
> +		 * If p->user != 0, the current tcmu block is still
> +		 * referenced by other ring buffers, so just ignore it
> +		 * without doing anything.
> +		 */
> +		page = radix_tree_delete(&udev->data_blocks, dbi);
> +		if (page) {
> +				__free_page(page);
> +				spin_lock_irq(&g_lock);
> +				global_db_count--;
> +				spin_unlock_irq(&g_lock);
> +		}
> +	} while (1);
>   
> -	/* try to release all pending blocks */
> -	dbi = find_first_bit(udev->data_bitmap, end);
> -	if (dbi >= end) {
> +	if (!release_pending) {
>   		spin_unlock_irq(&udev->cmdr_lock);
>   		return;
>   	}
> -	do {
> -		addr = radix_tree_delete(&udev->data_blocks, dbi);
> -		kfree(addr);
>   
> +	/* try to release all pending blocks */
> +	dbi = -1;
> +	do {
>   		dbi = find_next_bit(udev->data_bitmap, end, dbi + 1);
> -	} while (dbi < end);
> +		if (dbi == end)
> +			break;
> +
> +		clear_bit(dbi, udev->data_bitmap);
> +
> +		/*
> +		 * When the bit is set and p != NULL, it means that this
> +		 * tcmu block is still being used here.
> +		 *
> +		 * If p->user == 0, the current ring buffer is the last
> +		 * or the only user of this tcmu block, and it won't be
> +		 * in the free list, so its memory can simply be
> +		 * released.
> +		 *
> +		 * If p->user != 0, we should insert it into the free
> +		 * list.
> +		 *
> +		 * p == NULL means that the current ring buffer is broken.
> +		 */
> +		page = radix_tree_delete(&udev->data_blocks, dbi);
> +		if (page) {
> +				__free_page(page);
> +				spin_lock_irq(&g_lock);
> +				global_db_count--;
> +				spin_unlock_irq(&g_lock);
> +		} else {
> +			pr_err("block page not found, ring is broken\n");
> +			set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags);
> +			break;
> +		}
> +	} while (1);
>   
>   	spin_unlock_irq(&udev->cmdr_lock);
>   }
> @@ -846,6 +969,43 @@ static int tcmu_find_mem_index(struct vm_area_struct *vma)
>   	return -1;
>   }
>   
> +/*
> + * Normally it shouldn't get here. This is just to avoid
> + * the page fault call trace, and it will return a zeroed page.
> + */
> +static struct page *tcmu_try_to_alloc_new_page(struct tcmu_dev *udev, uint32_t dbi)
> +{
> +	struct page *page;
> +	int ret;
> +
> +	if (dbi >= udev->dbi_thresh) {
> +		udev->dbi_thresh = dbi;
> +		udev->dbi_cur = dbi;
> +	}
> +
> +	page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
> +	if (!page) {
> +		return NULL;
> +	}
> +
> +	ret = radix_tree_insert(&udev->data_blocks, dbi, page);
> +	if (ret) {
> +		__free_page(page);
> +		return NULL;
> +	}
> +
> +	/*
> +	 * Since this case is rare in page fault routine, here we
> +	 * will allow the global_db_count >= TCMU_GLOBAL_MAX_BLOCKS
> +	 * to reduce possible page fault call trace.
> +	 */
> +	spin_lock_irq(&g_lock);
> +	global_db_count++;
> +	spin_unlock_irq(&g_lock);
> +
> +	return page;
> +}
> +
>   static int tcmu_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>   {
>   	struct tcmu_dev *udev = vma->vm_private_data;
> @@ -869,14 +1029,17 @@ static int tcmu_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>   		addr = (void *)(unsigned long)info->mem[mi].addr + offset;
>   		page = vmalloc_to_page(addr);
>   	} else {
> -		/* For the dynamically growing data area pages */
>   		uint32_t dbi;
>   
> +		/* For the dynamically growing data area pages */
>   		dbi = (offset - udev->data_off) / DATA_BLOCK_SIZE;
> -		addr = tcmu_db_get_block_addr(udev, dbi);
> -		if (!addr)
> +		spin_lock_irq(&udev->cmdr_lock);
> +		page = tcmu_db_get_block_page(udev, dbi);
> +		if (!page)
> +			page = tcmu_try_to_alloc_new_page(udev, dbi);
> +		spin_unlock_irq(&udev->cmdr_lock);
> +		if (!page)
>   			return VM_FAULT_NOPAGE;
> -		page = virt_to_page(addr);
>   	}
>   
>   	get_page(page);
> @@ -913,6 +1076,8 @@ static int tcmu_open(struct uio_info *info, struct inode *inode)
>   	if (test_and_set_bit(TCMU_DEV_BIT_OPEN, &udev->flags))
>   		return -EBUSY;
>   
> +	udev->inode = inode;
> +
>   	pr_debug("open\n");
>   
>   	return 0;
> @@ -1003,6 +1168,8 @@ static int tcmu_configure_device(struct se_device *dev)
>   	udev->cmdr_size = CMDR_SIZE - CMDR_OFF;
>   	udev->data_off = CMDR_SIZE;
>   	udev->data_size = DATA_SIZE;
> +	udev->dbi_thresh = DATA_BLOCK_BITS;
> +	udev->unmapping = false;
>   
>   	/* Initialise the mailbox of the ring buffer */
>   	mb = udev->mb_addr;
> @@ -1048,6 +1215,10 @@ static int tcmu_configure_device(struct se_device *dev)
>   	if (ret)
>   		goto err_netlink;
>   
> +	mutex_lock(&g_mutex);
> +	list_add(&udev->node, &root_udev);
> +	mutex_unlock(&g_mutex);
> +
>   	return 0;
>   
>   err_netlink:
> @@ -1072,6 +1243,10 @@ static void tcmu_free_device(struct se_device *dev)
>   {
>   	struct tcmu_dev *udev = TCMU_DEV(dev);
>   
> +	mutex_lock(&g_mutex);
> +	list_del(&udev->node);
> +	mutex_unlock(&g_mutex);
> +
>   	vfree(udev->mb_addr);
>   
>   	/* Upper layer should drain all requests before calling this */
> @@ -1235,12 +1410,90 @@ static sector_t tcmu_get_blocks(struct se_device *dev)
>   	.tb_dev_attrib_attrs	= passthrough_attrib_attrs,
>   };
>   
> +static struct task_struct *unmap_thread;
> +
> +/*
> + * The unmapping thread routine.
> + */
> +static int unmap_thread_fn(void *data)
> +{
> +	struct tcmu_dev *udev;
> +	loff_t offset;
> +	uint32_t start, end, dbi;
> +	struct page *page;
> +	bool unmapped;
> +	int i;
> +
> +	while (1) {
> +		DEFINE_WAIT(__wait);
> +
> +		prepare_to_wait(&g_wait, &__wait, TASK_INTERRUPTIBLE);
> +		schedule();
> +		finish_wait(&g_wait, &__wait);
> +
> +		unmapped = false;
> +		mutex_lock(&g_mutex);
> +		list_for_each_entry(udev, &root_udev, node) {
> +			spin_lock_irq(&udev->cmdr_lock);
> +			end = udev->dbi_cur + 1;
> +			dbi = find_last_bit(udev->data_bitmap, end);
> +			if (dbi == end) {
> +				/*
> +				 * Reserve DATA_BLOCK_RES_BITS
> +				 * blocks for an idle udev
> +				 */
> +				dbi = DATA_BLOCK_RES_BITS - 1;
> +				udev->dbi_cur = 0;
> +			} else {
> +				udev->dbi_cur = dbi;
> +			}
> +
> +			udev->dbi_thresh = start = dbi + 1;
> +			if (start >= end) {
> +				spin_unlock_irq(&udev->cmdr_lock);
> +				continue;
> +			}
> +			udev->unmapping = true;
> +			spin_unlock_irq(&udev->cmdr_lock);
> +
> +			/* Unmap the userspace mappings of the ring starting at offset */
> +			offset = udev->data_off + start * DATA_BLOCK_SIZE;
> +			unmap_mapping_range(udev->inode->i_mapping, offset, 0, 1);
> +			unmapped = true;
> +
> +			spin_lock_irq(&udev->cmdr_lock);
> +			for (i = start; i < end; i++) {
> +				page = radix_tree_delete(&udev->data_blocks, i);
> +				if (page) {
> +					__free_page(page);
> +					spin_lock_irq(&g_lock);
> +					global_db_count--;
> +					spin_unlock_irq(&g_lock);
> +				}
> +			}
> +			udev->unmapping = false;
> +			spin_unlock_irq(&udev->cmdr_lock);
> +		}
> +
> +		if (unmapped) {
> +			list_for_each_entry(udev, &root_udev, node)
> +				if (udev->waiting_global)
> +					wake_up(&udev->wait_cmdr);
> +		}
> +		mutex_unlock(&g_mutex);
> +	}
> +
> +	return 0;
> +}
> +
>   static int __init tcmu_module_init(void)
>   {
>   	int ret;
>   
>   	BUILD_BUG_ON((sizeof(struct tcmu_cmd_entry) % TCMU_OP_ALIGN_SIZE) != 0);
>   
> +	spin_lock_init(&g_lock);
> +
>   	tcmu_cmd_cache = kmem_cache_create("tcmu_cmd_cache",
>   				sizeof(struct tcmu_cmd),
>   				__alignof__(struct tcmu_cmd),
> @@ -1263,8 +1516,17 @@ static int __init tcmu_module_init(void)
>   	if (ret)
>   		goto out_unreg_genl;
>   
> +	init_waitqueue_head(&g_wait);
> +	unmap_thread = kthread_run(unmap_thread_fn, NULL, "tcmu_unmap");
> +	if (IS_ERR(unmap_thread)) {
> +		unmap_thread = NULL;
> +		goto out_unreg_transport;
> +	}
> +
>   	return 0;
>   
> +out_unreg_transport:
> +	target_backend_unregister(&tcmu_ops);
>   out_unreg_genl:
>   	genl_unregister_family(&tcmu_genl_family);
>   out_unreg_device:
> @@ -1277,6 +1539,9 @@ static int __init tcmu_module_init(void)
>   
>   static void __exit tcmu_module_exit(void)
>   {
> +	if (unmap_thread)
> +		kthread_stop(unmap_thread);
> +
>   	target_backend_unregister(&tcmu_ops);
>   	genl_unregister_family(&tcmu_genl_family);
>   	root_device_unregister(tcmu_root_device);
> --
> 1.8.3.1
>



