From: Xiubo Li <lixiubo@cmss.chinamobile.com>
To: Andy Grover <agrover@redhat.com>,
nab@linux-iscsi.org, mchristi@redhat.com
Cc: shli@kernel.org, sheng@yasker.org, linux-scsi@vger.kernel.org,
target-devel@vger.kernel.org, namei.unix@gmail.com,
linux-mm@kvack.org
Subject: Re: [PATCHv2 2/5] target/user: Add global data block pool support
Date: Fri, 17 Mar 2017 16:04:26 +0800 [thread overview]
Message-ID: <f4c4e83a-d6b1-ed57-7a54-4277722e5a46@cmss.chinamobile.com> (raw)
In-Reply-To: <ddd797ea-43f0-b863-64e4-1e758f41dafe@cmss.chinamobile.com>
[-- Attachment #1: Type: text/plain, Size: 23131 bytes --]
[...]
> Over the past few days I have found that unmap_mapping_range() could
> be used.
> At the same time I have dug into the mm code and fixed the double
> usage of
the data blocks and the possible page fault call trace bugs mentioned above.
>
> Following is the V3 patch. I have tested this using 4 targets & fio for
> about 2 days, so
> far so good.
>
> I'm still testing this using more complex test cases.
>
I have tested it for the whole day today:
- using 4 targets
- setting TCMU_GLOBAL_MAX_BLOCKS = [512 1K 1M 1G 2G]
- each target here needs more than 450 blocks when running
- fio: -iodepth [1 2 4 8 16] -thread -rw=[read write] -bs=[1K 2K 3K 5K
7K 16K 64K 1M] -size=20G -numjobs=10 -runtime=1000 ...
The results:
- system memory: no memory leaks happened.
- the same blocks page fault again after the unmap.
- tried touching blocks outside the iov[N]: no page fault call trace
was output.
- worked well all day.
Thanks,
BRs
Xiubo
[...]
> From: Xiubo Li<lixiubo@cmss.chinamobile.com>
>
> For each target there will be one ring. When the number of targets
> grows larger and larger, it could eventually run out of
> system memory.
>
> In this patch for each target ring, for the cmd area the size
> will be limited to 8MB and for the data area the size will be
> limited to 256K * PAGE_SIZE.
>
> For all the targets' data areas, they will get empty blocks
> from the "global data block pool", which is limited to 512K *
> PAGE_SIZE for now.
>
> When the "global data block pool" has been used up, any
> target could trigger the unmapping thread routine to shrink the
> targets' rings. For idle targets, the unmapping routine
> will reserve at least 256 blocks.
>
> When user space touches data blocks outside of the iov[N],
> tcmu_vma_fault() will return a zeroed block.
>
> Signed-off-by: Xiubo Li<lixiubo@cmss.chinamobile.com>
> Signed-off-by: Jianfei Hu<hujianfei@cmss.chinamobile.com>
> ---
> drivers/target/target_core_user.c | 433 ++++++++++++++++++++++++++++++--------
> 1 file changed, 349 insertions(+), 84 deletions(-)
>
> diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c
> index e904bc0..bbc52074 100644
> --- a/drivers/target/target_core_user.c
> +++ b/drivers/target/target_core_user.c
> @@ -30,6 +30,8 @@
> #include <linux/stringify.h>
> #include <linux/bitops.h>
> #include <linux/highmem.h>
> +#include <linux/mutex.h>
> +#include <linux/kthread.h>
> #include <net/genetlink.h>
> #include <scsi/scsi_common.h>
> #include <scsi/scsi_proto.h>
> @@ -66,17 +68,24 @@
>
> #define TCMU_TIME_OUT (30 * MSEC_PER_SEC)
>
> -/* For cmd area, the size is fixed 2M */
> -#define CMDR_SIZE (2 * 1024 * 1024)
> +/* For cmd area, the size is fixed 8MB */
> +#define CMDR_SIZE (8 * 1024 * 1024)
>
> -/* For data area, the size is fixed 32M */
> -#define DATA_BLOCK_BITS (8 * 1024)
> -#define DATA_BLOCK_SIZE 4096
> +/*
> + * For data area, the block size is PAGE_SIZE and
> + * the total size is 256K * PAGE_SIZE.
> + */
> +#define DATA_BLOCK_SIZE PAGE_SIZE
> +#define DATA_BLOCK_BITS (256 * 1024)
> #define DATA_SIZE (DATA_BLOCK_BITS * DATA_BLOCK_SIZE)
> +#define DATA_BLOCK_RES_BITS 256
>
> -/* The ring buffer size is 34M */
> +/* The total size of the ring is 8M + 256K * PAGE_SIZE */
> #define TCMU_RING_SIZE (CMDR_SIZE + DATA_SIZE)
>
> +/* Default maximum of the global data blocks(512K * PAGE_SIZE) */
> +#define TCMU_GLOBAL_MAX_BLOCKS (512 * 1024)
> +
> static struct device *tcmu_root_device;
>
> struct tcmu_hba {
> @@ -86,6 +95,8 @@ struct tcmu_hba {
> #define TCMU_CONFIG_LEN 256
>
> struct tcmu_dev {
> + struct list_head node;
> +
> struct se_device se_dev;
>
> char *name;
> @@ -97,6 +108,15 @@ struct tcmu_dev {
>
> struct uio_info uio_info;
>
> + struct inode *inode;
> +
> + bool unmapping;
> + bool waiting_global;
> + uint32_t dbi_cur;
> + uint32_t dbi_thresh;
> + DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS);
> + struct radix_tree_root data_blocks;
> +
> struct tcmu_mailbox *mb_addr;
> size_t dev_size;
> u32 cmdr_size;
> @@ -110,10 +130,6 @@ struct tcmu_dev {
> /* TODO should this be a mutex? */
> spinlock_t cmdr_lock;
>
> - uint32_t dbi_cur;
> - DECLARE_BITMAP(data_bitmap, DATA_BLOCK_BITS);
> - struct radix_tree_root data_blocks;
> -
> struct idr commands;
> spinlock_t commands_lock;
>
> @@ -137,6 +153,11 @@ struct tcmu_cmd {
> uint32_t *dbi;
> };
>
> +static wait_queue_head_t g_wait;
> +static DEFINE_MUTEX(g_mutex);
> +static LIST_HEAD(root_udev);
> +static spinlock_t g_lock;
> +static unsigned long global_db_count;
> static struct kmem_cache *tcmu_cmd_cache;
>
> /* multicast group */
> @@ -160,54 +181,89 @@ enum tcmu_multicast_groups {
> .netnsok = true,
> };
>
> -static int tcmu_db_get_empty_block(struct tcmu_dev *udev, void **addr)
> +#define tcmu_cmd_reset_dbi_cur(cmd) ((cmd)->dbi_cur = 0)
> +#define tcmu_cmd_set_dbi(cmd, index) ((cmd)->dbi[(cmd)->dbi_cur++] = (index))
> +#define tcmu_cmd_get_dbi(cmd) ((cmd)->dbi[(cmd)->dbi_cur++])
> +
> +static inline void tcmu_cmd_free_data(struct tcmu_cmd *tcmu_cmd, uint32_t len)
> {
> - void *p;
> - uint32_t dbi;
> - int ret;
> + struct tcmu_dev *udev = tcmu_cmd->tcmu_dev;
> + uint32_t i;
>
> - dbi = find_first_zero_bit(udev->data_bitmap, DATA_BLOCK_BITS);
> - if (dbi > udev->dbi_cur)
> - udev->dbi_cur = dbi;
> + for (i = 0; i < len; i++)
> + clear_bit(tcmu_cmd->dbi[i], udev->data_bitmap);
> +}
>
> - set_bit(dbi, udev->data_bitmap);
> +static inline bool get_empty_growing_block(struct tcmu_dev *udev,
> + struct tcmu_cmd *tcmu_cmd)
> +{
> + struct page *page;
> + int ret, dbi;
>
> - p = radix_tree_lookup(&udev->data_blocks, dbi);
> - if (!p) {
> - p = kzalloc(DATA_BLOCK_SIZE, GFP_ATOMIC);
> - if (!p) {
> - clear_bit(dbi, udev->data_bitmap);
> - return -ENOMEM;
> + dbi = find_first_zero_bit(udev->data_bitmap, udev->dbi_thresh);
> + if (dbi == udev->dbi_thresh)
> + return false;
> +
> + page = radix_tree_lookup(&udev->data_blocks, dbi);
> + if (!page) {
> + /* try to get new page from the mm */
> + spin_lock_irq(&g_lock);
> + if (global_db_count >= TCMU_GLOBAL_MAX_BLOCKS) {
> + spin_unlock_irq(&g_lock);
> + wake_up(&g_wait);
> + return false;
> + }
> + global_db_count++;
> + spin_unlock_irq(&g_lock);
> +
> + page = alloc_page(GFP_ATOMIC);
> + if (!page) {
> + return false;
> }
>
> - ret = radix_tree_insert(&udev->data_blocks, dbi, p);
> + ret = radix_tree_insert(&udev->data_blocks, dbi, page);
> if (ret) {
> - kfree(p);
> - clear_bit(dbi, udev->data_bitmap);
> - return ret;
> + __free_page(page);
> + return false;
> }
> }
>
> - *addr = p;
> - return dbi;
> + if (dbi > udev->dbi_cur)
> + udev->dbi_cur = dbi;
> +
> + set_bit(dbi, udev->data_bitmap);
> + tcmu_cmd_set_dbi(tcmu_cmd, dbi);
> +
> + return true;
> }
>
> -static void *tcmu_db_get_block_addr(struct tcmu_dev *udev, uint32_t dbi)
> +static bool tcmu_db_get_empty_blocks(struct tcmu_dev *udev,
> + struct tcmu_cmd *tcmu_cmd)
> {
> - return radix_tree_lookup(&udev->data_blocks, dbi);
> -}
> + int i;
>
> -#define tcmu_cmd_reset_dbi_cur(cmd) ((cmd)->dbi_cur = 0)
> -#define tcmu_cmd_set_dbi(cmd, index) ((cmd)->dbi[(cmd)->dbi_cur++] = (index))
> -#define tcmu_cmd_get_dbi(cmd) ((cmd)->dbi[(cmd)->dbi_cur++])
> + tcmu_cmd_reset_dbi_cur(tcmu_cmd);
> + for (i = 0; i < tcmu_cmd->dbi_len; i++) {
> + if (!get_empty_growing_block(udev, tcmu_cmd))
> + goto err;
> + }
> + return true;
>
> -static void tcmu_cmd_free_data(struct tcmu_cmd *tcmu_cmd)
> +err:
> + tcmu_cmd_free_data(tcmu_cmd, tcmu_cmd->dbi_cur);
> + udev->waiting_global = true;
> + return false;
> +}
> +
> +static struct page *tcmu_db_get_block_page(struct tcmu_dev *udev, uint32_t dbi)
> {
> - struct tcmu_dev *udev = tcmu_cmd->tcmu_dev;
> - uint32_t bi;
> + struct page *page;
>
> - for (bi = 0; bi < tcmu_cmd->dbi_len; bi++)
> - clear_bit(tcmu_cmd->dbi[bi], udev->data_bitmap);
> + page = radix_tree_lookup(&udev->data_blocks, dbi);
> + if (!page)
> + return NULL;
> +
> + return page;
> }
>
> static inline void tcmu_free_cmd(struct tcmu_cmd *tcmu_cmd)
> @@ -344,17 +400,20 @@ static int alloc_and_scatter_data_area(struct tcmu_dev *udev,
> void *from, *to = NULL;
> size_t copy_bytes, to_offset, offset;
> struct scatterlist *sg;
> + struct page *page;
>
> for_each_sg(data_sg, sg, data_nents, i) {
> int sg_remaining = sg->length;
> from = kmap_atomic(sg_page(sg)) + sg->offset;
> while (sg_remaining > 0) {
> if (block_remaining == 0) {
> + if (to)
> + kunmap_atomic(to);
> +
> block_remaining = DATA_BLOCK_SIZE;
> - dbi = tcmu_db_get_empty_block(udev, &to);
> - if (dbi < 0)
> - return dbi;
> - tcmu_cmd_set_dbi(tcmu_cmd, dbi);
> + dbi = tcmu_cmd_get_dbi(tcmu_cmd);
> + page = tcmu_db_get_block_page(udev, dbi);
> + to = kmap_atomic(page);
> }
>
> copy_bytes = min_t(size_t, sg_remaining,
> @@ -362,7 +421,7 @@ static int alloc_and_scatter_data_area(struct tcmu_dev *udev,
> to_offset = get_block_offset_user(udev, dbi,
> block_remaining);
> offset = DATA_BLOCK_SIZE - block_remaining;
> - to = (void *)(unsigned long)to + offset;
> + to = (void *)((unsigned long)to + offset);
>
> if (*iov_cnt != 0 &&
> to_offset == iov_tail(udev, *iov)) {
> @@ -382,6 +441,8 @@ static int alloc_and_scatter_data_area(struct tcmu_dev *udev,
> }
> kunmap_atomic(from - sg->offset);
> }
> + if (to)
> + kunmap_atomic(to);
>
> return 0;
> }
> @@ -391,23 +452,28 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *tcmu_cmd,
> {
> int i, dbi;
> int block_remaining = 0;
> - void *from, *to;
> + void *from = NULL, *to;
> size_t copy_bytes, offset;
> struct scatterlist *sg;
> + struct page *page;
>
> for_each_sg(data_sg, sg, data_nents, i) {
> int sg_remaining = sg->length;
> to = kmap_atomic(sg_page(sg)) + sg->offset;
> while (sg_remaining > 0) {
> if (block_remaining == 0) {
> + if (from)
> + kunmap_atomic(from);
> +
> block_remaining = DATA_BLOCK_SIZE;
> dbi = tcmu_cmd_get_dbi(tcmu_cmd);
> - from = tcmu_db_get_block_addr(udev, dbi);
> + page = tcmu_db_get_block_page(udev, dbi);
> + from = kmap_atomic(page);
> }
> copy_bytes = min_t(size_t, sg_remaining,
> block_remaining);
> offset = DATA_BLOCK_SIZE - block_remaining;
> - from = (void *)(unsigned long)from + offset;
> + from = (void *)((unsigned long)from + offset);
> tcmu_flush_dcache_range(from, copy_bytes);
> memcpy(to + sg->length - sg_remaining, from,
> copy_bytes);
> @@ -417,12 +483,13 @@ static void gather_data_area(struct tcmu_dev *udev, struct tcmu_cmd *tcmu_cmd,
> }
> kunmap_atomic(to - sg->offset);
> }
> + if (from)
> + kunmap_atomic(from);
> }
>
> -static inline size_t spc_bitmap_free(unsigned long *bitmap)
> +static inline size_t spc_bitmap_free(unsigned long *bitmap, uint32_t thresh)
> {
> - return DATA_BLOCK_SIZE * (DATA_BLOCK_BITS -
> - bitmap_weight(bitmap, DATA_BLOCK_BITS));
> + return DATA_BLOCK_SIZE * (thresh - bitmap_weight(bitmap, thresh));
> }
>
> /*
> @@ -431,12 +498,14 @@ static inline size_t spc_bitmap_free(unsigned long *bitmap)
> *
> * Called with ring lock held.
> */
> -static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t data_needed)
> +static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd,
> + size_t cmd_size, size_t data_needed)
> {
> struct tcmu_mailbox *mb = udev->mb_addr;
> size_t space, cmd_needed;
> u32 cmd_head;
>
> + udev->waiting_global = false;
> tcmu_flush_dcache_range(mb, sizeof(*mb));
>
> cmd_head = mb->cmd_head % udev->cmdr_size; /* UAM */
> @@ -457,10 +526,24 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t d
> return false;
> }
>
> - space = spc_bitmap_free(udev->data_bitmap);
> + /* try to check and get the data blocks as needed */
> + space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh);
> if (space < data_needed) {
> - pr_debug("no data space: only %zu available, but ask for %zu\n",
> - space, data_needed);
> + if (udev->unmapping) {
> + pr_debug("no data space: only %zu available, but ask for %zu\n",
> + space, data_needed);
> + return false;
> + } else {
> + udev->dbi_thresh += udev->dbi_thresh / 2;
> + udev->dbi_thresh = min((int)udev->dbi_thresh, DATA_BLOCK_BITS);
> + space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh);
> + if (space < data_needed)
> + return false;
> + }
> + }
> +
> + if (!tcmu_db_get_empty_blocks(udev, cmd)) {
> + pr_debug("no data space: ask for %zu\n", data_needed);
> return false;
> }
>
> @@ -519,7 +602,7 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t d
> return TCM_INVALID_CDB_FIELD;
> }
>
> - while (!is_ring_space_avail(udev, command_size, data_length)) {
> + while (!is_ring_space_avail(udev, tcmu_cmd, command_size, data_length)) {
> int ret;
> DEFINE_WAIT(__wait);
>
> @@ -567,6 +650,7 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, size_t cmd_size, size_t d
> entry->hdr.uflags = 0;
>
> /* Handle allocating space from the data area */
> + tcmu_cmd_reset_dbi_cur(tcmu_cmd);
> iov = &entry->req.iov[0];
> iov_cnt = 0;
> copy_to_data_area = (se_cmd->data_direction == DMA_TO_DEVICE
> @@ -664,7 +748,7 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry *
> target_complete_cmd(cmd->se_cmd, entry->rsp.scsi_status);
>
> cmd->se_cmd = NULL;
> - tcmu_cmd_free_data(cmd);
> + tcmu_cmd_free_data(cmd, cmd->dbi_len);
> tcmu_free_cmd(cmd);
> }
>
> @@ -783,41 +867,80 @@ static int tcmu_irqcontrol(struct uio_info *info, s32 irq_on)
>
> static void tcmu_db_release(struct tcmu_dev *udev, bool release_pending)
> {
> - uint32_t dbi, end;
> - void *addr;
> + int dbi = -1, end;
> + struct page *page;
>
> spin_lock_irq(&udev->cmdr_lock);
> -
> end = udev->dbi_cur + 1;
>
> - /* try to release all unused blocks */
> - dbi = find_first_zero_bit(udev->data_bitmap, end);
> - if (dbi >= end) {
> - spin_unlock_irq(&udev->cmdr_lock);
> - return;
> - }
> + /* try to release all unused but has mapped blocks */
> do {
> - addr = radix_tree_delete(&udev->data_blocks, dbi);
> - kfree(addr);
> -
> dbi = find_next_zero_bit(udev->data_bitmap, end, dbi + 1);
> - } while (dbi < end);
> + if (dbi == end)
> + break;
>
> - if (!release_pending)
> - return;
> + /*
> + * When the bit is cleared and p != NULL, meaning that
> + * this tcmu block had already freed-after-use.
> + *
> + * If p->user == 0, meaning that the current ring buffer
> + * is the last or the only user of the tcmu block, and
> + * it must already in the free list, so it could be
> + * remove from the list and then released its memories.
> + *
> + * If p->user != 0, meaning that the current tcmu block is
> + * still referenced by other ring buffers, so just ignore
> + * it without doing anyting.
> + */
> + page = radix_tree_delete(&udev->data_blocks, dbi);
> + if (page) {
> + __free_page(page);
> + spin_lock_irq(&g_lock);
> + global_db_count--;
> + spin_unlock_irq(&g_lock);
> + }
> + } while (1);
>
> - /* try to release all pending blocks */
> - dbi = find_first_bit(udev->data_bitmap, end);
> - if (dbi >= end) {
> + if (!release_pending) {
> spin_unlock_irq(&udev->cmdr_lock);
> return;
> }
> - do {
> - addr = radix_tree_delete(&udev->data_blocks, dbi);
> - kfree(addr);
>
> + /* try to release all pending blocks */
> + dbi = -1;
> + do {
> dbi = find_next_bit(udev->data_bitmap, end, dbi + 1);
> - } while (dbi < end);
> + if (dbi == end)
> + break;
> +
> + clear_bit(dbi, udev->data_bitmap);
> +
> + /*
> + * When the bit is set and p != NULL, meaning that this
> + * tcmu block is still being used here.
> + *
> + * If p->user == 0, meaning that the current ring buffer
> + * is the last or the only user of this tcmu block, and
> + * it won't in the free list, so could just release its
> + * memories.
> + *
> + * If the p->user != 0, we should insert it to the free
> + * list.
> + *
> + * p == NULL means that the current ring buffer is broken.
> + */
> + page = radix_tree_delete(&udev->data_blocks, dbi);
> + if (page) {
> + __free_page(page);
> + spin_lock_irq(&g_lock);
> + global_db_count--;
> + spin_unlock_irq(&g_lock);
> + } else {
> + pr_err("block page not found, ring is broken\n");
> + set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags);
> + break;
> + }
> + } while (1);
>
> spin_unlock_irq(&udev->cmdr_lock);
> }
> @@ -846,6 +969,43 @@ static int tcmu_find_mem_index(struct vm_area_struct *vma)
> return -1;
> }
>
> +/*
> + * Normally it shouldn't be here. This is just for avoid
> + * the page fault call trace, and will return zeroed page.
> + */
> +static struct page *tcmu_try_to_alloc_new_page(struct tcmu_dev *udev, uint32_t dbi)
> +{
> + struct page *page;
> + int ret;
> +
> + if (dbi >= udev->dbi_thresh) {
> + udev->dbi_thresh = dbi;
> + udev->dbi_cur = dbi;
> + }
> +
> + page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
> + if (!page) {
> + return NULL;
> + }
> +
> + ret = radix_tree_insert(&udev->data_blocks, dbi, page);
> + if (ret) {
> + __free_page(page);
> + return NULL;
> + }
> +
> + /*
> + * Since this case is rare in page fault routine, here we
> + * will allow the global_db_count >= TCMU_GLOBAL_MAX_BLOCKS
> + * to reduce possible page fault call trace.
> + */
> + spin_lock_irq(&g_lock);
> + global_db_count++;
> + spin_unlock_irq(&g_lock);
> +
> + return page;
> +}
> +
> static int tcmu_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> {
> struct tcmu_dev *udev = vma->vm_private_data;
> @@ -869,14 +1029,17 @@ static int tcmu_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
> addr = (void *)(unsigned long)info->mem[mi].addr + offset;
> page = vmalloc_to_page(addr);
> } else {
> - /* For the dynamically growing data area pages */
> uint32_t dbi;
>
> + /* For the dynamically growing data area pages */
> dbi = (offset - udev->data_off) / DATA_BLOCK_SIZE;
> - addr = tcmu_db_get_block_addr(udev, dbi);
> - if (!addr)
> + spin_lock_irq(&udev->cmdr_lock);
> + page = tcmu_db_get_block_page(udev, dbi);
> + if (!page)
> + page = tcmu_try_to_alloc_new_page(udev, dbi);
> + spin_unlock_irq(&udev->cmdr_lock);
> + if (!page)
> return VM_FAULT_NOPAGE;
> - page = virt_to_page(addr);
> }
>
> get_page(page);
> @@ -913,6 +1076,8 @@ static int tcmu_open(struct uio_info *info, struct inode *inode)
> if (test_and_set_bit(TCMU_DEV_BIT_OPEN, &udev->flags))
> return -EBUSY;
>
> + udev->inode = inode;
> +
> pr_debug("open\n");
>
> return 0;
> @@ -1003,6 +1168,8 @@ static int tcmu_configure_device(struct se_device *dev)
> udev->cmdr_size = CMDR_SIZE - CMDR_OFF;
> udev->data_off = CMDR_SIZE;
> udev->data_size = DATA_SIZE;
> + udev->dbi_thresh = DATA_BLOCK_BITS;
> + udev->unmapping = false;
>
> /* Initialise the mailbox of the ring buffer */
> mb = udev->mb_addr;
> @@ -1048,6 +1215,10 @@ static int tcmu_configure_device(struct se_device *dev)
> if (ret)
> goto err_netlink;
>
> + mutex_lock(&g_mutex);
> + list_add(&udev->node, &root_udev);
> + mutex_unlock(&g_mutex);
> +
> return 0;
>
> err_netlink:
> @@ -1072,6 +1243,10 @@ static void tcmu_free_device(struct se_device *dev)
> {
> struct tcmu_dev *udev = TCMU_DEV(dev);
>
> + mutex_lock(&g_mutex);
> + list_del(&udev->node);
> + mutex_unlock(&g_mutex);
> +
> vfree(udev->mb_addr);
>
> /* Upper layer should drain all requests before calling this */
> @@ -1235,12 +1410,90 @@ static sector_t tcmu_get_blocks(struct se_device *dev)
> .tb_dev_attrib_attrs = passthrough_attrib_attrs,
> };
>
> +static struct task_struct *unmap_thread;
> +
> +/*
> + * The unmapping thread routine.
> + */
> +static int unmap_thread_fn(void *data)
> +{
> + struct tcmu_dev *udev;
> + loff_t offset;
> + uint32_t start, end, dbi;
> + struct page *page;
> + bool unmapped;
> + int i;
> +
> + while (1) {
> + DEFINE_WAIT(__wait);
> +
> + prepare_to_wait(&g_wait, &__wait, TASK_INTERRUPTIBLE);
> + schedule();
> + finish_wait(&g_wait, &__wait);
> +
> + unmapped = false;
> + mutex_lock(&g_mutex);
> + list_for_each_entry(udev, &root_udev, node) {
> + spin_lock_irq(&udev->cmdr_lock);
> + end = udev->dbi_cur + 1;
> + dbi = find_last_bit(udev->data_bitmap, end);
> + if (dbi == end) {
> + /*
> + * Reserved for DATA_BLOCK_RES_BITS
> + * blocks for idle udev
> + */
> + dbi = DATA_BLOCK_RES_BITS - 1;
> + udev->dbi_cur = 0;
> + } else {
> + udev->dbi_cur = dbi;
> + }
> +
> + udev->dbi_thresh = start = dbi + 1;
> + if (start >= end) {
> + spin_unlock_irq(&udev->cmdr_lock);
> + continue;
> + }
> + udev->unmapping = true;
> + spin_unlock_irq(&udev->cmdr_lock);
> +
> + /* Here will truncate the ring from offset */
> + offset = udev->data_off + start * DATA_BLOCK_SIZE;
> + unmap_mapping_range(udev->inode->i_mapping, offset, 0, 1);
> + unmapped = true;
> +
> + spin_lock_irq(&udev->cmdr_lock);
> + for (i = start; i < end; i++) {
> + page = radix_tree_delete(&udev->data_blocks, i);
> + if (page) {
> + __free_page(page);
> + spin_lock_irq(&g_lock);
> + global_db_count--;
> + spin_unlock_irq(&g_lock);
> + }
> + }
> + udev->unmapping = false;
> + spin_unlock_irq(&udev->cmdr_lock);
> + }
> +
> + if (unmapped) {
> + list_for_each_entry(udev, &root_udev, node)
> + if (udev->waiting_global)
> + wake_up(&udev->wait_cmdr);
> + }
> + mutex_unlock(&g_mutex);
> + }
> +
> + return 0;
> +}
> +
> static int __init tcmu_module_init(void)
> {
> int ret;
>
> BUILD_BUG_ON((sizeof(struct tcmu_cmd_entry) % TCMU_OP_ALIGN_SIZE) != 0);
>
> + spin_lock_init(&g_lock);
> +
> tcmu_cmd_cache = kmem_cache_create("tcmu_cmd_cache",
> sizeof(struct tcmu_cmd),
> __alignof__(struct tcmu_cmd),
> @@ -1263,8 +1516,17 @@ static int __init tcmu_module_init(void)
> if (ret)
> goto out_unreg_genl;
>
> + init_waitqueue_head(&g_wait);
> + unmap_thread = kthread_run(unmap_thread_fn, NULL, "tcmu_unmap");
> + if (IS_ERR(unmap_thread)) {
> + unmap_thread = NULL;
> + goto out_unreg_transport;
> + }
> +
> return 0;
>
> +out_unreg_transport:
> + target_backend_unregister(&tcmu_ops);
> out_unreg_genl:
> genl_unregister_family(&tcmu_genl_family);
> out_unreg_device:
> @@ -1277,6 +1539,9 @@ static int __init tcmu_module_init(void)
>
> static void __exit tcmu_module_exit(void)
> {
> + if (unmap_thread)
> + kthread_stop(unmap_thread);
> +
> target_backend_unregister(&tcmu_ops);
> genl_unregister_family(&tcmu_genl_family);
> root_device_unregister(tcmu_root_device);
> -- 1.8.3.1
>
[-- Attachment #2: Type: text/html, Size: 23248 bytes --]
next prev parent reply other threads:[~2017-03-17 8:05 UTC|newest]
Thread overview: 5+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <1488962743-17028-1-git-send-email-lixiubo@cmss.chinamobile.com>
[not found] ` <1488962743-17028-3-git-send-email-lixiubo@cmss.chinamobile.com>
2017-03-08 20:20 ` Andy Grover
2017-03-16 9:39 ` Xiubo Li
2017-03-17 8:04 ` Xiubo Li [this message]
2017-03-17 17:11 ` Andy Grover
2017-03-17 22:06 ` 李秀波
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=f4c4e83a-d6b1-ed57-7a54-4277722e5a46@cmss.chinamobile.com \
--to=lixiubo@cmss.chinamobile.com \
--cc=agrover@redhat.com \
--cc=linux-mm@kvack.org \
--cc=linux-scsi@vger.kernel.org \
--cc=mchristi@redhat.com \
--cc=nab@linux-iscsi.org \
--cc=namei.unix@gmail.com \
--cc=sheng@yasker.org \
--cc=shli@kernel.org \
--cc=target-devel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox