diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a948a9c3c5564e60904f4374498a97e202584379..50e4fdd4ccda944b927262c48cbbe0b3a258231c 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -829,7 +829,10 @@ static inline unsigned long pmd_index(unsigned long address) * * this function returns the index of the entry in the pte page which would * control the given virtual address + * + * Also define macro so we can test if pte_index is defined for arch. */ +#define pte_index pte_index static inline unsigned long pte_index(unsigned long address) { return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); diff --git a/drivers/target/target_core_user.c b/drivers/target/target_core_user.c index 2e413fba45501c977bc5b0c8f6fe0c18f58f80c4..76f02b24e0cb2c8c63cad831f11360cab3f61baf 100644 --- a/drivers/target/target_core_user.c +++ b/drivers/target/target_core_user.c @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include #include #include @@ -86,6 +88,7 @@ #define DATA_BLOCK_SIZE PAGE_SIZE #define DATA_BLOCK_SHIFT PAGE_SHIFT #define DATA_BLOCK_BITS_DEF (256 * 1024) +#define ZC_DATA_BLOCK_BITS_DEF (256 * 1024) #define TCMU_MBS_TO_BLOCKS(_mbs) (_mbs << (20 - DATA_BLOCK_SHIFT)) #define TCMU_BLOCKS_TO_MBS(_blocks) (_blocks >> (20 - DATA_BLOCK_SHIFT)) @@ -133,6 +136,8 @@ struct tcmu_dev { #define TCMU_DEV_BIT_OPEN 0 #define TCMU_DEV_BIT_BROKEN 1 #define TCMU_DEV_BIT_BLOCKED 2 +#define TCMU_DEV_BIT_READ_BYPASS_DATA_AREA 3 +#define TCMU_DEV_BIT_WRITE_BYPASS_DATA_AREA 4 unsigned long flags; struct uio_info uio_info; @@ -147,10 +152,13 @@ struct tcmu_dev { /* Must add data_off and mb_addr to get the address */ size_t data_off; size_t data_size; + size_t zc_data_size; uint32_t max_blocks; + uint32_t zc_max_blocks; size_t ring_size; struct mutex cmdr_lock; + struct rw_semaphore i_mmap_sem; struct list_head qfull_queue; uint32_t dbi_max; @@ -158,6 +166,12 @@ struct tcmu_dev { unsigned long *data_bitmap; struct radix_tree_root data_blocks; + uint32_t zc_dbi_max; + uint32_t zc_dbi_thresh; + unsigned long *zc_data_bitmap; + uint32_t read_zc_size; + uint32_t write_zc_size; + struct idr commands; struct timer_list cmd_timer; @@ -183,6 +197,12 @@ struct tcmu_cmd { struct tcmu_dev *tcmu_dev; struct list_head queue_entry; + /* for zero_copy */ + struct mm_struct *vma_vm_mm; + struct vm_area_struct *vma; + struct iovec *iov; + int iov_cnt; + uint16_t cmd_id; /* Can't use se_cmd when cleaning up expired cmds, because if @@ -195,7 +215,11 @@ struct tcmu_cmd { #define TCMU_CMD_BIT_EXPIRED 0 #define TCMU_CMD_BIT_INFLIGHT 1 +#define TCMU_CMD_BIT_ZEROCOPY 2 +#define TCMU_CMD_BIT_BYPASS_DATA_AREA 3 unsigned long flags; + + struct mutex cmd_lock; }; /* * To avoid dead lock the mutex lock order should always be: @@ -488,10 +512,33 @@ static struct genl_family tcmu_genl_family __ro_after_init = { static void tcmu_cmd_free_data(struct tcmu_cmd *tcmu_cmd, uint32_t len) { struct tcmu_dev *udev = tcmu_cmd->tcmu_dev; + unsigned long *data_bitmap; uint32_t i; + if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags)) + data_bitmap = udev->zc_data_bitmap; + else + data_bitmap = udev->data_bitmap; + for (i = 0; i < len; i++) - clear_bit(tcmu_cmd->dbi[i], udev->data_bitmap); + clear_bit(tcmu_cmd->dbi[i], data_bitmap); +} + +static inline bool tcmu_get_zc_empty_block(struct tcmu_dev *udev, + struct tcmu_cmd *tcmu_cmd) +{ + int dbi; + + dbi = find_first_zero_bit(udev->zc_data_bitmap, udev->zc_dbi_thresh); + if (dbi == udev->zc_dbi_thresh) + return false; + + if (dbi > 
udev->zc_dbi_max) + udev->zc_dbi_max = dbi; + + set_bit(dbi, udev->zc_data_bitmap); + tcmu_cmd_set_dbi(tcmu_cmd, dbi); + return true; } static inline bool tcmu_get_empty_block(struct tcmu_dev *udev, @@ -535,12 +582,19 @@ static inline bool tcmu_get_empty_block(struct tcmu_dev *udev, } static bool tcmu_get_empty_blocks(struct tcmu_dev *udev, - struct tcmu_cmd *tcmu_cmd) + struct tcmu_cmd *tcmu_cmd, + bool zero_copy) { int i; + bool ret; for (i = tcmu_cmd->dbi_cur; i < tcmu_cmd->dbi_cnt; i++) { - if (!tcmu_get_empty_block(udev, tcmu_cmd)) + if (zero_copy) + ret = tcmu_get_zc_empty_block(udev, tcmu_cmd); + else + ret = tcmu_get_empty_block(udev, tcmu_cmd); + + if (!ret) return false; } return true; @@ -552,8 +606,40 @@ tcmu_get_block_page(struct tcmu_dev *udev, uint32_t dbi) return radix_tree_lookup(&udev->data_blocks, dbi); } +static void tcmu_cmd_zerocopy_unmap(struct tcmu_cmd *cmd) +{ + struct mm_struct *mm; + struct vm_area_struct *vma; + struct iovec *iov = cmd->iov; + unsigned long address; + int i; + + mm = cmd->vma_vm_mm; + vma = cmd->vma; + if (!mm) + return; + + if (mmget_not_zero(mm)) { + down_read(&mm->mmap_sem); + for (i = 0; i < cmd->iov_cnt; i++) { + address = (unsigned long)iov->iov_base; + zap_page_range(vma, address, iov->iov_len); + iov++; + } + up_read(&mm->mmap_sem); + mmput(mm); + } + + cmd->vma_vm_mm = NULL; + cmd->vma = NULL; + mmdrop(mm); + kfree(cmd->iov); +} + static inline void tcmu_free_cmd(struct tcmu_cmd *tcmu_cmd) { + if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags)) + tcmu_cmd_zerocopy_unmap(tcmu_cmd); kfree(tcmu_cmd->dbi); kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); } @@ -579,11 +665,67 @@ static inline uint32_t tcmu_cmd_get_block_cnt(struct tcmu_cmd *tcmu_cmd) return data_length / DATA_BLOCK_SIZE; } +static void tcmu_set_cmd_bypass_data_area(struct tcmu_cmd *tcmu_cmd) +{ + struct tcmu_dev *udev = tcmu_cmd->tcmu_dev; + struct se_cmd *se_cmd = tcmu_cmd->se_cmd; + + /* + * Zero copy maps the sg pages into userspace, while bypass data area + * copies data between the sg pages and a userspace buffer, so the + * two are completely different. + */ + if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags)) + return; + + if (se_cmd->data_direction == DMA_FROM_DEVICE && + test_bit(TCMU_DEV_BIT_READ_BYPASS_DATA_AREA, &udev->flags)) + set_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &tcmu_cmd->flags); + + if (se_cmd->data_direction == DMA_TO_DEVICE && + test_bit(TCMU_DEV_BIT_WRITE_BYPASS_DATA_AREA, &udev->flags)) + set_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &tcmu_cmd->flags); +} + +static void tcmu_set_cmd_do_zero_copy(struct tcmu_cmd *tcmu_cmd) +{ + struct tcmu_dev *udev = tcmu_cmd->tcmu_dev; + struct se_cmd *se_cmd = tcmu_cmd->se_cmd; + struct scatterlist *data_sg = se_cmd->t_data_sg, *sg; + unsigned int data_nents = se_cmd->t_data_nents; + int i; + + if ((se_cmd->se_cmd_flags & SCF_BIDI) || !se_cmd->data_length || + !IS_ALIGNED(se_cmd->data_length, PAGE_SIZE)) + return; + + if ((se_cmd->data_direction == DMA_FROM_DEVICE) && + (!udev->read_zc_size || + se_cmd->data_length < (udev->read_zc_size << 10))) + return; + + if ((se_cmd->data_direction == DMA_TO_DEVICE) && + (!udev->write_zc_size || + se_cmd->data_length < (udev->write_zc_size << 10))) + return; + + /* Now, check that every sg page is aligned. 
*/ + for_each_sg(data_sg, sg, data_nents, i) { + if ((sg->offset && !IS_ALIGNED(sg->offset, PAGE_SIZE)) || + !IS_ALIGNED(sg->length, PAGE_SIZE)) + break; + } + if (i == data_nents) + set_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags); +} + static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) { struct se_device *se_dev = se_cmd->se_dev; struct tcmu_dev *udev = TCMU_DEV(se_dev); struct tcmu_cmd *tcmu_cmd; + bool zero_copy; + bool bypass_data_area; tcmu_cmd = kmem_cache_zalloc(tcmu_cmd_cache, GFP_KERNEL); if (!tcmu_cmd) @@ -592,14 +734,25 @@ static struct tcmu_cmd *tcmu_alloc_cmd(struct se_cmd *se_cmd) INIT_LIST_HEAD(&tcmu_cmd->queue_entry); tcmu_cmd->se_cmd = se_cmd; tcmu_cmd->tcmu_dev = udev; + mutex_init(&tcmu_cmd->cmd_lock); + tcmu_set_cmd_do_zero_copy(tcmu_cmd); + tcmu_set_cmd_bypass_data_area(tcmu_cmd); + + zero_copy = test_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags); + bypass_data_area = test_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &tcmu_cmd->flags); tcmu_cmd_reset_dbi_cur(tcmu_cmd); - tcmu_cmd->dbi_cnt = tcmu_cmd_get_block_cnt(tcmu_cmd); - tcmu_cmd->dbi = kcalloc(tcmu_cmd->dbi_cnt, sizeof(uint32_t), - GFP_KERNEL); - if (!tcmu_cmd->dbi) { - kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); - return NULL; + if (zero_copy || !bypass_data_area) { + tcmu_cmd->dbi_cnt = tcmu_cmd_get_block_cnt(tcmu_cmd); + tcmu_cmd->dbi = kcalloc(tcmu_cmd->dbi_cnt, sizeof(uint32_t), + GFP_KERNEL); + if (!tcmu_cmd->dbi) { + kmem_cache_free(tcmu_cmd_cache, tcmu_cmd); + return NULL; + } + } else { + tcmu_cmd->dbi_cnt = 0; + tcmu_cmd->dbi = NULL; } return tcmu_cmd; @@ -824,13 +977,15 @@ static inline size_t spc_bitmap_free(unsigned long *bitmap, uint32_t thresh) * Called with ring lock held. */ static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd, - size_t cmd_size, size_t data_needed) + size_t cmd_size, size_t data_needed, bool zero_copy) { struct tcmu_mailbox *mb = udev->mb_addr; uint32_t blocks_needed = (data_needed + DATA_BLOCK_SIZE - 1) / DATA_BLOCK_SIZE; size_t space, cmd_needed; u32 cmd_head; + unsigned long *data_bitmap; + uint32_t *dbi_thresh, max_blocks; tcmu_flush_dcache_range(mb, sizeof(*mb)); @@ -852,25 +1007,37 @@ static bool is_ring_space_avail(struct tcmu_dev *udev, struct tcmu_cmd *cmd, return false; } + if (test_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &cmd->flags)) + return true; + + if (zero_copy) { + data_bitmap = udev->zc_data_bitmap; + dbi_thresh = &udev->zc_dbi_thresh; + max_blocks = udev->zc_max_blocks; + } else { + data_bitmap = udev->data_bitmap; + dbi_thresh = &udev->dbi_thresh; + max_blocks = udev->max_blocks; + } + /* try to check and get the data blocks as needed */ - space = spc_bitmap_free(udev->data_bitmap, udev->dbi_thresh); + space = spc_bitmap_free(data_bitmap, *dbi_thresh); if ((space * DATA_BLOCK_SIZE) < data_needed) { - unsigned long blocks_left = - (udev->max_blocks - udev->dbi_thresh) + space; + unsigned long blocks_left = max_blocks - *dbi_thresh + space; if (blocks_left < blocks_needed) { - pr_debug("no data space: only %lu available, but ask for %zu\n", - blocks_left * DATA_BLOCK_SIZE, - data_needed); + pr_debug("no data space[%s]: only %lu available, but ask for %zu\n", + zero_copy ? 
"zero copy" : "non zero copy", + blocks_left * DATA_BLOCK_SIZE, data_needed); return false; } - udev->dbi_thresh += blocks_needed; - if (udev->dbi_thresh > udev->max_blocks) - udev->dbi_thresh = udev->max_blocks; + *dbi_thresh += blocks_needed; + if (*dbi_thresh > max_blocks) + *dbi_thresh = max_blocks; } - return tcmu_get_empty_blocks(udev, cmd); + return tcmu_get_empty_blocks(udev, cmd, zero_copy); } static inline size_t tcmu_cmd_get_base_cmd_size(size_t iov_cnt) @@ -976,6 +1143,8 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err) uint64_t cdb_off; bool copy_to_data_area; size_t data_length = tcmu_cmd_get_data_length(tcmu_cmd); + bool zero_copy = test_bit(TCMU_CMD_BIT_ZEROCOPY, &tcmu_cmd->flags); + bool bypass_data_area = test_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &tcmu_cmd->flags); *scsi_err = TCM_NO_SENSE; @@ -1018,7 +1187,7 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err) return -1; } - if (!is_ring_space_avail(udev, tcmu_cmd, command_size, data_length)) { + if (!is_ring_space_avail(udev, tcmu_cmd, command_size, data_length, zero_copy)) { /* * Don't leave commands partially setup because the unmap * thread might need the blocks to make forward progress. @@ -1056,15 +1225,18 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err) iov = &entry->req.iov[0]; iov_cnt = 0; copy_to_data_area = (se_cmd->data_direction == DMA_TO_DEVICE - || se_cmd->se_cmd_flags & SCF_BIDI); - scatter_data_area(udev, tcmu_cmd, se_cmd->t_data_sg, - se_cmd->t_data_nents, &iov, &iov_cnt, - copy_to_data_area); + || se_cmd->se_cmd_flags & SCF_BIDI) && (!zero_copy); + + if (zero_copy || !bypass_data_area) { + scatter_data_area(udev, tcmu_cmd, se_cmd->t_data_sg, + se_cmd->t_data_nents, &iov, &iov_cnt, + copy_to_data_area); + } entry->req.iov_cnt = iov_cnt; /* Handle BIDI commands */ iov_cnt = 0; - if (se_cmd->se_cmd_flags & SCF_BIDI) { + if ((se_cmd->se_cmd_flags & SCF_BIDI) && !bypass_data_area) { iov++; scatter_data_area(udev, tcmu_cmd, se_cmd->t_bidi_data_sg, se_cmd->t_bidi_data_nents, &iov, &iov_cnt, @@ -1081,6 +1253,20 @@ static int queue_cmd_ring(struct tcmu_cmd *tcmu_cmd, sense_reason_t *scsi_err) return -1; } entry->hdr.cmd_id = tcmu_cmd->cmd_id; + if (zero_copy) { + int i; + struct iovec *tiov; + + tiov = &entry->req.iov[0]; + for (i = 0; i < entry->req.iov_cnt; i++) { + tiov->iov_base = tiov->iov_base + udev->data_size; + tiov++; + } + entry->hdr.kflags |= TCMU_KFLAG_ZERO_COPY; + } + + if (bypass_data_area) + entry->hdr.kflags |= TCMU_KFLAG_BYPASS_DATA_AREA; /* * Recalaulate the command's base size and size according @@ -1181,6 +1367,15 @@ static void tcmu_handle_completion(struct tcmu_cmd *cmd, struct tcmu_cmd_entry * else se_cmd->se_cmd_flags |= SCF_TREAT_READ_AS_NORMAL; } + + if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &cmd->flags)) { + tcmu_cmd_zerocopy_unmap(cmd); + goto done; + } + + if (test_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &cmd->flags)) + goto done; + if (se_cmd->se_cmd_flags & SCF_BIDI) { /* Get Data-In buffer before clean up */ gather_data_area(udev, cmd, true, read_len); @@ -1316,6 +1511,9 @@ static int tcmu_check_expired_cmd(int id, void *p, void *data) if (!udev->cmd_time_out) return 0; + mutex_lock(&cmd->cmd_lock); + if (test_bit(TCMU_CMD_BIT_ZEROCOPY, &cmd->flags)) + tcmu_cmd_zerocopy_unmap(cmd); set_bit(TCMU_CMD_BIT_EXPIRED, &cmd->flags); /* * target_complete_cmd will translate this to LUN COMM FAILURE @@ -1323,6 +1521,7 @@ static int tcmu_check_expired_cmd(int id, void *p, void *data) scsi_status = 
SAM_STAT_CHECK_CONDITION; list_del_init(&cmd->queue_entry); cmd->se_cmd = NULL; + mutex_unlock(&cmd->cmd_lock); } else { list_del_init(&cmd->queue_entry); idr_remove(&udev->commands, id); @@ -1404,7 +1603,11 @@ static struct se_device *tcmu_alloc_device(struct se_hba *hba, const char *name) udev->cmdr_size = CMDR_SIZE_DEF; udev->max_blocks = DATA_BLOCK_BITS_DEF; + udev->zc_max_blocks = ZC_DATA_BLOCK_BITS_DEF; mutex_init(&udev->cmdr_lock); + init_rwsem(&udev->i_mmap_sem); + udev->read_zc_size = 0; + udev->write_zc_size = 0; INIT_LIST_HEAD(&udev->node); INIT_LIST_HEAD(&udev->timedout_entry); @@ -1517,10 +1720,11 @@ static struct page *tcmu_try_get_block_page(struct tcmu_dev *udev, uint32_t dbi) { struct page *page; - mutex_lock(&udev->cmdr_lock); + down_read(&udev->i_mmap_sem); page = tcmu_get_block_page(udev, dbi); if (likely(page)) { - mutex_unlock(&udev->cmdr_lock); + get_page(page); + up_read(&udev->i_mmap_sem); return page; } @@ -1531,7 +1735,7 @@ static struct page *tcmu_try_get_block_page(struct tcmu_dev *udev, uint32_t dbi) pr_err("Invalid addr to data block mapping (dbi %u) on device %s\n", dbi, udev->name); page = NULL; - mutex_unlock(&udev->cmdr_lock); + up_read(&udev->i_mmap_sem); return page; } @@ -1558,6 +1762,7 @@ static vm_fault_t tcmu_vma_fault(struct vm_fault *vmf) /* For the vmalloc()ed cmd area pages */ addr = (void *)(unsigned long)info->mem[mi].addr + offset; page = vmalloc_to_page(addr); + get_page(page); } else { uint32_t dbi; @@ -1568,7 +1773,6 @@ static vm_fault_t tcmu_vma_fault(struct vm_fault *vmf) return VM_FAULT_SIGBUS; } - get_page(page); vmf->page = page; return 0; } @@ -1581,7 +1785,7 @@ static int tcmu_mmap(struct uio_info *info, struct vm_area_struct *vma) { struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP | VM_MIXEDMAP; vma->vm_ops = &tcmu_vm_ops; vma->vm_private_data = udev; @@ -1670,11 +1874,115 @@ static void tcmu_dev_kref_release(struct kref *kref) tcmu_blocks_release(&udev->data_blocks, 0, udev->dbi_max + 1); kfree(udev->data_bitmap); + kfree(udev->zc_data_bitmap); mutex_unlock(&udev->cmdr_lock); call_rcu(&dev->rcu_head, tcmu_dev_call_rcu); } +#define TCMU_ZEROCOPY_PAGE_BATCH 32 + +static inline int tcmu_zerocopy_one_seg(struct iovec *iov, + struct vm_area_struct *vma, + struct sg_page_iter *sgiter) +{ + struct page *pages[TCMU_ZEROCOPY_PAGE_BATCH]; + unsigned int len = iov->iov_len; + unsigned long address = (unsigned long)iov->iov_base; + unsigned long pages_remaining, pg_index = 0; + struct page *page; + int ret; + + while (len > 0) { + __sg_page_iter_next(sgiter); + page = sg_page_iter_page(sgiter); + pages[pg_index++] = page; + len -= PAGE_SIZE; + if (pg_index == TCMU_ZEROCOPY_PAGE_BATCH || !len) { + pages_remaining = pg_index; + ret = vm_insert_pages_mkspecial(vma, address, pages, + &pages_remaining); + if (ret < 0) { + pr_err("vm insert pages failed, error code: %d\n", ret); + return ret; + } + address = address + pg_index * PAGE_SIZE; + pg_index = 0; + } + } + + return 0; +} + +long tcmu_ioctl_cmd_zerocopy(struct tcmu_dev *udev, unsigned long arg) +{ + struct tcmu_cmd *cmd; + struct se_cmd *se_cmd; + struct scatterlist *data_sg; + unsigned int data_nents; + struct tcmu_cmd_zerocopy zc; + struct iovec *iov, *tiov; + struct sg_page_iter sgiter; + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + int i, ret = 0; + + if (copy_from_user(&zc, (struct tcmu_cmd_zerocopy __user *)arg, + sizeof(zc))) + return -EFAULT; + 
if (zc.iov_cnt <= 0) + return -EINVAL; + + iov = kmalloc_array(zc.iov_cnt, sizeof(struct iovec), GFP_KERNEL); + if (!iov) + return -ENOMEM; + if (copy_from_user(iov, zc.iov, sizeof(struct iovec) * zc.iov_cnt)) { + kfree(iov); + return -EFAULT; + } + + mutex_lock(&udev->cmdr_lock); + down_read(&mm->mmap_sem); + cmd = idr_find(&udev->commands, zc.cmd_id); + if (!cmd) { + ret = -EINVAL; + kfree(iov); + pr_err("tcmu zero copy: cmd_id %d not found\n", zc.cmd_id); + goto out; + } + se_cmd = cmd->se_cmd; + + vma = find_vma(current->mm, (unsigned long)iov->iov_base); + if (!vma) { + ret = -EINVAL; + kfree(iov); + pr_err("tcmu zero copy: invalid iov_base\n"); + goto out; + } + data_sg = se_cmd->t_data_sg; + data_nents = se_cmd->t_data_nents; + __sg_page_iter_start(&sgiter, data_sg, data_nents, 0); + tiov = iov; + for (i = 0; i < zc.iov_cnt; i++) { + ret = tcmu_zerocopy_one_seg(tiov, vma, &sgiter); + if (ret < 0) { + kfree(iov); + goto out; + } + tiov++; + } + + cmd->iov = iov; + cmd->iov_cnt = zc.iov_cnt; + cmd->vma_vm_mm = vma->vm_mm; + cmd->vma = vma; + mmgrab(cmd->vma_vm_mm); +out: + up_read(&mm->mmap_sem); + mutex_unlock(&udev->cmdr_lock); + return ret; +} + static int tcmu_release(struct uio_info *info, struct inode *inode) { struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); @@ -1687,6 +1995,107 @@ static int tcmu_release(struct uio_info *info, struct inode *inode) return 0; } +static long tcmu_do_copy_data(struct tcmu_cmd *tcmu_cmd, + struct iovec __user *uiovec, + unsigned int vcnt, + bool is_copy_to_sgl) +{ + struct iovec iovstack[UIO_FASTIOV]; + struct iovec *iov = iovstack; + struct iov_iter iter; + ssize_t ret; + struct se_cmd *se_cmd = tcmu_cmd->se_cmd; + struct scatterlist *data_sg, *sg; + int i; + unsigned int data_nents; + + if (se_cmd->se_cmd_flags & SCF_BIDI) { + data_sg = se_cmd->t_bidi_data_sg; + data_nents = se_cmd->t_bidi_data_nents; + } else { + data_sg = se_cmd->t_data_sg; + data_nents = se_cmd->t_data_nents; + } + + ret = import_iovec(READ, uiovec, vcnt, ARRAY_SIZE(iovstack), &iov, &iter); + if (ret < 0) { + pr_err("import iovec failed.\n"); + return -EFAULT; + } + + for_each_sg(data_sg, sg, data_nents, i) { + if (is_copy_to_sgl) + ret = copy_page_from_iter(sg_page(sg), sg->offset, sg->length, &iter); + else + ret = copy_page_to_iter(sg_page(sg), sg->offset, sg->length, &iter); + if (ret < 0) { + pr_err("copy failed.\n"); + break; + } + } + kfree(iov); + return ret < 0 ? 
-EFAULT : 0; +} + +static long tcmu_copy_data(struct tcmu_dev *udev, + unsigned long arg, + bool is_copy_to_sgl) +{ + struct tcmu_data_xfer __user *uxfer = (struct tcmu_data_xfer __user *)arg; + struct tcmu_data_xfer xfer; + struct tcmu_cmd *tcmu_cmd; + long ret; + + if (copy_from_user(&xfer, uxfer, sizeof(xfer))) + return -EFAULT; + + tcmu_cmd = idr_find(&udev->commands, xfer.cmd_id); + if (!tcmu_cmd) { + pr_err("Can not find tcmu command, cmd_id:%d\n", xfer.cmd_id); + set_bit(TCMU_DEV_BIT_BROKEN, &udev->flags); + return -EFAULT; + } + + mutex_lock(&tcmu_cmd->cmd_lock); + if (!test_bit(TCMU_CMD_BIT_BYPASS_DATA_AREA, &tcmu_cmd->flags)) { + ret = -EINVAL; + goto out; + } + + if (test_bit(TCMU_CMD_BIT_EXPIRED, &tcmu_cmd->flags)) { + pr_err("Command is expired, cmd_id:%d\n", xfer.cmd_id); + ret = -EFAULT; + goto out; + } + + ret = tcmu_do_copy_data(tcmu_cmd, xfer.iovec, + xfer.iov_cnt, is_copy_to_sgl); +out: + mutex_unlock(&tcmu_cmd->cmd_lock); + return ret; +} + +static long tcmu_ioctl(struct uio_info *info, unsigned int cmd, unsigned long arg) +{ + struct tcmu_dev *udev = container_of(info, struct tcmu_dev, uio_info); + long ret; + + switch (cmd) { + case TCMU_IOCTL_CMD_COPY_TO_SGL: + ret = tcmu_copy_data(udev, arg, true); + break; + case TCMU_IOCTL_CMD_COPY_FROM_SGL: + ret = tcmu_copy_data(udev, arg, false); + break; + case TCMU_IOCTL_CMD_ZEROCOPY: + ret = tcmu_ioctl_cmd_zerocopy(udev, arg); + break; + default: + ret = -EINVAL; + } + return ret; +} + static int tcmu_init_genl_cmd_reply(struct tcmu_dev *udev, int cmd) { struct tcmu_nl_cmd *nl_cmd = &udev->curr_nl_cmd; @@ -1879,10 +2288,14 @@ static int tcmu_configure_device(struct se_device *dev) udev->data_bitmap = kcalloc(BITS_TO_LONGS(udev->max_blocks), sizeof(unsigned long), GFP_KERNEL); + + udev->zc_data_bitmap = kcalloc(BITS_TO_LONGS(udev->zc_max_blocks), + sizeof(unsigned long), + GFP_KERNEL); mutex_unlock(&udev->cmdr_lock); - if (!udev->data_bitmap) { + if (!udev->data_bitmap || !udev->zc_data_bitmap) { ret = -ENOMEM; - goto err_bitmap_alloc; + goto err_vzalloc; } udev->mb_addr = vzalloc(udev->cmdr_size + CMDR_OFF); @@ -1894,7 +2307,9 @@ static int tcmu_configure_device(struct se_device *dev) /* mailbox fits in first part of CMDR space */ udev->data_off = udev->cmdr_size + CMDR_OFF; udev->data_size = udev->max_blocks * DATA_BLOCK_SIZE; + udev->zc_data_size = udev->zc_max_blocks * DATA_BLOCK_SIZE; udev->dbi_thresh = 0; /* Default in Idle state */ + udev->zc_dbi_thresh = 0; /* Default in Idle state */ /* Initialise the mailbox of the ring buffer */ mb = udev->mb_addr; @@ -1911,7 +2326,8 @@ static int tcmu_configure_device(struct se_device *dev) info->mem[0].name = "tcm-user command & data buffer"; info->mem[0].addr = (phys_addr_t)(uintptr_t)udev->mb_addr; - info->mem[0].size = udev->ring_size = udev->data_size + udev->cmdr_size + CMDR_OFF; + info->mem[0].size = udev->ring_size = udev->data_size + udev->zc_data_size + + udev->cmdr_size + CMDR_OFF; info->mem[0].memtype = UIO_MEM_NONE; info->irqcontrol = tcmu_irqcontrol; @@ -1920,6 +2336,7 @@ static int tcmu_configure_device(struct se_device *dev) info->mmap = tcmu_mmap; info->open = tcmu_open; info->release = tcmu_release; + info->ioctl = tcmu_ioctl; ret = uio_register_device(tcmu_root_device, info); if (ret) @@ -1966,7 +2383,8 @@ static int tcmu_configure_device(struct se_device *dev) err_vzalloc: kfree(udev->data_bitmap); udev->data_bitmap = NULL; -err_bitmap_alloc: + kfree(udev->zc_data_bitmap); + udev->zc_data_bitmap = NULL; kfree(info->name); info->name = NULL; @@ -2642,6 +3060,130 
@@ static ssize_t tcmu_reset_ring_store(struct config_item *item, const char *page, } CONFIGFS_ATTR_WO(tcmu_, reset_ring); +static ssize_t tcmu_read_bypass_data_area_show(struct config_item *item, char *page) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + + if (test_bit(TCMU_DEV_BIT_READ_BYPASS_DATA_AREA, &udev->flags)) + return snprintf(page, PAGE_SIZE, "%s\n", "true"); + else + return snprintf(page, PAGE_SIZE, "%s\n", "false"); +} + +static ssize_t tcmu_read_bypass_data_area_store(struct config_item *item, const char *page, + size_t count) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + bool bypass_data_area; + int ret; + + ret = strtobool(page, &bypass_data_area); + if (ret < 0) + return ret; + + if (bypass_data_area) + set_bit(TCMU_DEV_BIT_READ_BYPASS_DATA_AREA, &udev->flags); + else + clear_bit(TCMU_DEV_BIT_READ_BYPASS_DATA_AREA, &udev->flags); + + return count; +} +CONFIGFS_ATTR(tcmu_, read_bypass_data_area); + +static ssize_t tcmu_write_bypass_data_area_show(struct config_item *item, char *page) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + + if (test_bit(TCMU_DEV_BIT_WRITE_BYPASS_DATA_AREA, &udev->flags)) + return snprintf(page, PAGE_SIZE, "%s\n", "true"); + else + return snprintf(page, PAGE_SIZE, "%s\n", "false"); +} + +static ssize_t tcmu_write_bypass_data_area_store(struct config_item *item, const char *page, + size_t count) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + bool bypass_data_area; + int ret; + + ret = strtobool(page, &bypass_data_area); + if (ret < 0) + return ret; + + if (bypass_data_area) + set_bit(TCMU_DEV_BIT_WRITE_BYPASS_DATA_AREA, &udev->flags); + else + clear_bit(TCMU_DEV_BIT_WRITE_BYPASS_DATA_AREA, &udev->flags); + + return count; +} +CONFIGFS_ATTR(tcmu_, write_bypass_data_area); + +static ssize_t tcmu_read_zc_size_show(struct config_item *item, char *page) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + + return snprintf(page, PAGE_SIZE, "%ukb\n", udev->read_zc_size); +} + +static ssize_t tcmu_read_zc_size_store(struct config_item *item, const char *page, + size_t count) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + uint32_t read_zc_size; + int ret; + + ret = kstrtou32(page, 0, &read_zc_size); + if (ret < 0) + return ret; + + udev->read_zc_size = read_zc_size; + + return count; +} +CONFIGFS_ATTR(tcmu_, read_zc_size); + +static ssize_t tcmu_write_zc_size_show(struct config_item *item, char *page) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + + return snprintf(page, PAGE_SIZE, "%ukb\n", udev->write_zc_size); +} + +static ssize_t tcmu_write_zc_size_store(struct config_item *item, const char *page, + size_t count) +{ + struct se_dev_attrib *da = container_of(to_config_group(item), + struct se_dev_attrib, da_group); + struct tcmu_dev *udev = TCMU_DEV(da->da_dev); + uint32_t write_zc_size; + int ret; + + ret = kstrtou32(page, 0, 
&write_zc_size); + if (ret < 0) + return ret; + + udev->write_zc_size = write_zc_size; + + return count; +} +CONFIGFS_ATTR(tcmu_, write_zc_size); + static struct configfs_attribute *tcmu_attrib_attrs[] = { &tcmu_attr_cmd_time_out, &tcmu_attr_qfull_time_out, @@ -2651,6 +3193,10 @@ static struct configfs_attribute *tcmu_attrib_attrs[] = { &tcmu_attr_dev_size, &tcmu_attr_emulate_write_cache, &tcmu_attr_nl_reply_supported, + &tcmu_attr_read_bypass_data_area, + &tcmu_attr_write_bypass_data_area, + &tcmu_attr_read_zc_size, + &tcmu_attr_write_zc_size, NULL, }; @@ -2707,6 +3253,7 @@ static void find_free_blocks(void) continue; } + down_write(&udev->i_mmap_sem); end = udev->dbi_max + 1; block = find_last_bit(udev->data_bitmap, end); if (block == udev->dbi_max) { @@ -2714,6 +3261,7 @@ static void find_free_blocks(void) * The last bit is dbi_max, so it is not possible * reclaim any blocks. */ + up_write(&udev->i_mmap_sem); mutex_unlock(&udev->cmdr_lock); continue; } else if (block == end) { @@ -2731,6 +3279,7 @@ static void find_free_blocks(void) /* Release the block pages */ tcmu_blocks_release(&udev->data_blocks, start, end); + up_write(&udev->i_mmap_sem); mutex_unlock(&udev->cmdr_lock); total_freed += end - start; diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c index 2762148c169df206fae40da2230435f515b7d282..e7f449fe688c0f1fc1ea231dd06fbad455ddbbff 100644 --- a/drivers/uio/uio.c +++ b/drivers/uio/uio.c @@ -12,6 +12,8 @@ * Base Functions */ +#include +#include #include #include #include @@ -216,7 +218,9 @@ static ssize_t name_show(struct device *dev, struct uio_device *idev = dev_get_drvdata(dev); int ret; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + if (!idev->info) { ret = -EINVAL; dev_err(dev, "the device has been unregistered\n"); @@ -226,7 +230,7 @@ static ssize_t name_show(struct device *dev, ret = sprintf(buf, "%s\n", idev->info->name); out: - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); return ret; } static DEVICE_ATTR_RO(name); @@ -237,7 +241,9 @@ static ssize_t version_show(struct device *dev, struct uio_device *idev = dev_get_drvdata(dev); int ret; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + if (!idev->info) { ret = -EINVAL; dev_err(dev, "the device has been unregistered\n"); @@ -247,7 +253,7 @@ static ssize_t version_show(struct device *dev, ret = sprintf(buf, "%s\n", idev->info->version); out: - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); return ret; } static DEVICE_ATTR_RO(version); @@ -487,16 +493,20 @@ static int uio_open(struct inode *inode, struct file *filep) listener->event_count = atomic_read(&idev->event); filep->private_data = listener; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) { + ret = -EINVAL; + goto err_infoopen; + } + if (!idev->info) { - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); ret = -EINVAL; - goto err_alloc_listener; + goto err_infoopen; } if (idev->info && idev->info->open) ret = idev->info->open(idev->info, inode); - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); if (ret) goto err_infoopen; @@ -529,10 +539,12 @@ static int uio_release(struct inode *inode, struct file *filep) struct uio_listener *listener = filep->private_data; struct uio_device *idev = listener->dev; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + if (idev->info && idev->info->release) ret = 
idev->info->release(idev->info, inode); - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); module_put(idev->owner); kfree(listener); @@ -546,10 +558,12 @@ static __poll_t uio_poll(struct file *filep, poll_table *wait) struct uio_device *idev = listener->dev; __poll_t ret = 0; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return EPOLLERR; + if (!idev->info || !idev->info->irq) - ret = -EIO; - mutex_unlock(&idev->info_lock); + ret = EPOLLERR; + percpu_ref_put(&idev->info_ref); if (ret) return ret; @@ -569,10 +583,12 @@ static ssize_t uio_read(struct file *filep, char __user *buf, ssize_t retval = 0; s32 event_count; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + if (!idev->info || !idev->info->irq) retval = -EIO; - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); if (retval) return retval; @@ -629,7 +645,9 @@ static ssize_t uio_write(struct file *filep, const char __user *buf, if (copy_from_user(&irq_on, buf, count)) return -EFAULT; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + if (!idev->info) { retval = -EINVAL; goto out; @@ -648,7 +666,7 @@ static ssize_t uio_write(struct file *filep, const char __user *buf, retval = idev->info->irqcontrol(idev->info, irq_on); out: - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); return retval ? retval : sizeof(s32); } @@ -673,7 +691,9 @@ static vm_fault_t uio_vma_fault(struct vm_fault *vmf) int ret = 0; int mi; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return VM_FAULT_SIGBUS; + if (!idev->info) { ret = VM_FAULT_SIGBUS; goto out; @@ -700,8 +720,7 @@ static vm_fault_t uio_vma_fault(struct vm_fault *vmf) vmf->page = page; out: - mutex_unlock(&idev->info_lock); - + percpu_ref_put(&idev->info_ref); return ret; } @@ -769,7 +788,9 @@ static int uio_mmap(struct file *filep, struct vm_area_struct *vma) vma->vm_private_data = idev; - mutex_lock(&idev->info_lock); + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + if (!idev->info) { ret = -EINVAL; goto out; @@ -807,10 +828,30 @@ static int uio_mmap(struct file *filep, struct vm_area_struct *vma) } out: - mutex_unlock(&idev->info_lock); + percpu_ref_put(&idev->info_ref); return ret; } +static long uio_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + struct uio_listener *listener = filp->private_data; + struct uio_device *idev = listener->dev; + long retval = 0; + + if (!percpu_ref_tryget_live(&idev->info_ref)) + return -EINVAL; + + if (!idev->info || !idev->info->ioctl) { + retval = -EINVAL; + goto out; + } + + retval = idev->info->ioctl(idev->info, cmd, arg); +out: + percpu_ref_put(&idev->info_ref); + return retval; +} + static const struct file_operations uio_fops = { .owner = THIS_MODULE, .open = uio_open, @@ -821,6 +862,8 @@ static const struct file_operations uio_fops = { .poll = uio_poll, .fasync = uio_fasync, .llseek = noop_llseek, + .unlocked_ioctl = uio_ioctl, + .compat_ioctl = uio_ioctl, }; static int uio_major_init(void) @@ -903,6 +946,13 @@ static void uio_device_release(struct device *dev) kfree(idev); } +static void uio_info_free(struct percpu_ref *ref) +{ + struct uio_device *idev = container_of(ref, struct uio_device, info_ref); + + complete(&idev->free_done); +} + /** * uio_register_device - register a new userspace IO device * @owner: module that creates the new device @@ -933,10 +983,17 @@ int __uio_register_device(struct module 
*owner, idev->owner = owner; idev->info = info; - mutex_init(&idev->info_lock); init_waitqueue_head(&idev->wait); atomic_set(&idev->event, 0); + ret = percpu_ref_init(&idev->info_ref, uio_info_free, 0, GFP_KERNEL); + if (ret) { + pr_err("percpu_ref init failed!\n"); + return ret; + } + init_completion(&idev->confirm_done); + init_completion(&idev->free_done); + ret = uio_get_minor(idev); if (ret) return ret; @@ -990,6 +1047,13 @@ int __uio_register_device(struct module *owner, } EXPORT_SYMBOL_GPL(__uio_register_device); +static void uio_confirm_info(struct percpu_ref *ref) +{ + struct uio_device *idev = container_of(ref, struct uio_device, info_ref); + + complete(&idev->confirm_done); +} + /** * uio_unregister_device - unregister a industrial IO device * @info: UIO device capabilities @@ -1006,14 +1070,16 @@ void uio_unregister_device(struct uio_info *info) uio_free_minor(idev); - mutex_lock(&idev->info_lock); + percpu_ref_kill_and_confirm(&idev->info_ref, uio_confirm_info); + wait_for_completion(&idev->confirm_done); + wait_for_completion(&idev->free_done); + /* now, we can set info to NULL */ uio_dev_del_attributes(idev); if (info->irq && info->irq != UIO_IRQ_CUSTOM) free_irq(info->irq, idev); idev->info = NULL; - mutex_unlock(&idev->info_lock); device_unregister(&idev->dev); diff --git a/include/linux/mm.h b/include/linux/mm.h index 2f9381ead615990171ca12dc9bc484799f57ce4b..4637f0f691af719e7e3a5b3b7ebc4ac761f1f35a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2639,6 +2639,10 @@ struct vm_area_struct *find_extend_vma(struct mm_struct *, unsigned long addr); int remap_pfn_range(struct vm_area_struct *, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t); int vm_insert_page(struct vm_area_struct *, unsigned long addr, struct page *); +int vm_insert_page_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page *page); +int vm_insert_pages_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num); int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn); int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, diff --git a/include/linux/uio_driver.h b/include/linux/uio_driver.h index 6f8b68cd460f8c2b0aff758848a5de5a3ad65d6c..6a8314f8fb173641f3d9e2a1cbd3cabb45d745be 100644 --- a/include/linux/uio_driver.h +++ b/include/linux/uio_driver.h @@ -17,6 +17,7 @@ #include #include #include +#include struct module; struct uio_map; @@ -75,9 +76,11 @@ struct uio_device { struct fasync_struct *async_queue; wait_queue_head_t wait; struct uio_info *info; - struct mutex info_lock; struct kobject *map_dir; struct kobject *portio_dir; + struct percpu_ref info_ref; + struct completion confirm_done; + struct completion free_done; }; /** @@ -110,6 +113,7 @@ struct uio_info { int (*open)(struct uio_info *info, struct inode *inode); int (*release)(struct uio_info *info, struct inode *inode); int (*irqcontrol)(struct uio_info *info, s32 irq_on); + long (*ioctl)(struct uio_info *info, unsigned int cmd, unsigned long arg); }; extern int __must_check diff --git a/include/uapi/linux/target_core_user.h b/include/uapi/linux/target_core_user.h index b7b57967d90f09cd428d90e12b0035e3ecbcfc67..da8b79ef638ddbc0e124b3a74f947c9585ff8733 100644 --- a/include/uapi/linux/target_core_user.h +++ b/include/uapi/linux/target_core_user.h @@ -70,6 +70,8 @@ enum tcmu_opcode { struct tcmu_cmd_entry_hdr { __u32 len_op; __u16 cmd_id; +#define TCMU_KFLAG_ZERO_COPY 0x1 +#define TCMU_KFLAG_BYPASS_DATA_AREA 0x2 
__u8 kflags; #define TCMU_UFLAG_UNKNOWN_OP 0x1 #define TCMU_UFLAG_READ_LEN 0x2 @@ -158,4 +160,21 @@ enum tcmu_genl_attr { }; #define TCMU_ATTR_MAX (__TCMU_ATTR_MAX - 1) +struct tcmu_data_xfer { + __u16 cmd_id; + __u16 __pad1; + __u32 iov_cnt; + struct iovec __user *iovec; +}; + +#define TCMU_IOCTL_CMD_COPY_TO_SGL _IOW('T', 0xe0, struct tcmu_data_xfer) +#define TCMU_IOCTL_CMD_COPY_FROM_SGL _IOR('T', 0xe1, struct tcmu_data_xfer) +#define TCMU_IOCTL_CMD_ZEROCOPY _IOW('T', 0xe2, struct tcmu_cmd_zerocopy) + +struct tcmu_cmd_zerocopy { + struct iovec __user *iov; + __u32 iov_cnt; + __u16 cmd_id; +}; + #endif diff --git a/mm/memory.c b/mm/memory.c index a862260513887ecf07591ab6f506a50ea021865b..39e152961954894b1437589af8f0e59cf65df77f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -40,6 +40,7 @@ #include #include +#include #include #include #include @@ -1638,6 +1639,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } +EXPORT_SYMBOL_GPL(zap_page_range); /** * zap_page_range_single - remove user pages in a given range @@ -1686,8 +1688,7 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(zap_vma_ptes); -pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, - spinlock_t **ptl) +static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; @@ -1706,6 +1707,16 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, return NULL; VM_BUG_ON(pmd_trans_huge(*pmd)); + return pmd; +} + +pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, + spinlock_t **ptl) +{ + pmd_t *pmd = walk_to_pmd(mm, addr); + + if (!pmd) + return NULL; return pte_alloc_map_lock(mm, pmd, addr, ptl); } @@ -2215,6 +2226,174 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long } EXPORT_SYMBOL(vm_iomap_memory); +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL +static int insert_page_into_pte_locked_mkspecial(struct mm_struct *mm, pte_t *pte, + unsigned long addr, struct page *page, pgprot_t prot) +{ + /* + * The page to be inserted should be either an anonymous page or a file page. + * + * In general, an anonymous page used in dio should be pinned, while + * a file page used in buffered IO is either locked (read) or under + * writeback (sync). On the other hand, a file page used in an IO metadata + * read (e.g., ext4_get_inode_loc) can be unlocked, and the buffer_head is + * locked instead. + * + * Finally, it is the caller's responsibility to ensure the validity of + * pages to be inserted, i.e., that such pages are used for IO requests. 
+ */ + if (!PageAnon(page) && !page_is_file_cache(page)) + return -EINVAL; + + flush_dcache_page(page); + + if (!pte_none(*pte)) + return -EBUSY; + set_pte_at(mm, addr, pte, pte_mkspecial(mk_pte(page, prot))); + return 0; +} + +static int insert_page_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page *page, pgprot_t prot) +{ + struct mm_struct *mm = vma->vm_mm; + int retval; + pte_t *pte; + spinlock_t *ptl; + + retval = -ENOMEM; + pte = get_locked_pte(mm, addr, &ptl); + if (!pte) + goto out; + retval = insert_page_into_pte_locked_mkspecial(mm, pte, addr, page, prot); + pte_unmap_unlock(pte, ptl); +out: + return retval; +} + +int vm_insert_page_mkspecial(struct vm_area_struct *vma, unsigned long addr, struct page *page) +{ + if (addr < vma->vm_start || addr >= vma->vm_end) + return -EFAULT; + if (!(vma->vm_flags & VM_MIXEDMAP)) { + BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); + BUG_ON(vma->vm_flags & VM_PFNMAP); + vma->vm_flags |= VM_MIXEDMAP; + } + return insert_page_mkspecial(vma, addr, page, vma->vm_page_prot); +} + +#ifdef pte_index +/* + * insert_pages_mkspecial() amortizes the cost of spinlock operations + * when inserting pages in a loop. Arch *must* define pte_index. + */ +static int insert_pages_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num, pgprot_t prot) +{ + pmd_t *pmd = NULL; + pte_t *start_pte, *pte; + spinlock_t *pte_lock; + struct mm_struct *const mm = vma->vm_mm; + unsigned long curr_page_idx = 0; + unsigned long remaining_pages_total = *num; + unsigned long pages_to_write_in_pmd; + int ret; +more: + ret = -EFAULT; + pmd = walk_to_pmd(mm, addr); + if (!pmd) + goto out; + + pages_to_write_in_pmd = min_t(unsigned long, + remaining_pages_total, PTRS_PER_PTE - pte_index(addr)); + + /* Allocate the PTE if necessary; takes PMD lock once only. */ + ret = -ENOMEM; + if (pte_alloc(mm, pmd, addr)) + goto out; + + while (pages_to_write_in_pmd) { + int pte_idx = 0; + const int batch_size = min_t(int, pages_to_write_in_pmd, 8); + + start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); + for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { + int err = insert_page_into_pte_locked_mkspecial(mm, pte, + addr, pages[curr_page_idx], prot); + if (unlikely(err)) { + pte_unmap_unlock(start_pte, pte_lock); + ret = err; + remaining_pages_total -= pte_idx; + goto out; + } + addr += PAGE_SIZE; + ++curr_page_idx; + } + pte_unmap_unlock(start_pte, pte_lock); + pages_to_write_in_pmd -= batch_size; + remaining_pages_total -= batch_size; + } + if (remaining_pages_total) + goto more; + ret = 0; +out: + *num = remaining_pages_total; + return ret; +} +#endif /* pte_index */ + +/* + * vm_insert_pages_mkspecial - variant of vm_insert_pages using insert_pfn. + * + * The main purpose of vm_insert_pages_mkspecial is to combine the advantages of + * vm_insert_pages (batching the pmd lock) and remap_pfn_range_notrack (skipping + * track_pfn_insert). + * + * The caller should ensure the isolation (refcounted, PG_locked, PG_writeback, etc.) + * of @pages, and account for error case where a subset of @pages are mapped. 
+ */ +int vm_insert_pages_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num) +{ +#ifdef pte_index + const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; + + if (addr < vma->vm_start || end_addr >= vma->vm_end) + return -EFAULT; + if (!(vma->vm_flags & VM_MIXEDMAP)) { + BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem)); + BUG_ON(vma->vm_flags & VM_PFNMAP); + vma->vm_flags |= VM_MIXEDMAP; + } + return insert_pages_mkspecial(vma, addr, pages, num, vma->vm_page_prot); +#else + unsigned long idx = 0, pgcount = *num; + int err = -EINVAL; + + for (; idx < pgcount; ++idx) { + err = vm_insert_page_mkspecial(vma, addr + (PAGE_SIZE * idx), pages[idx]); + if (err) + break; + } + *num = pgcount - idx; + return err; +#endif /* pte_index */ +} +#else +int vm_insert_page_mkspecial(struct vm_area_struct *vma, unsigned long addr, struct page *page) +{ + return -EINVAL; +} +int vm_insert_pages_mkspecial(struct vm_area_struct *vma, unsigned long addr, + struct page **pages, unsigned long *num) +{ + return -EINVAL; +} +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ +EXPORT_SYMBOL(vm_insert_page_mkspecial); +EXPORT_SYMBOL(vm_insert_pages_mkspecial); + static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data)
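For context, the sketch below shows how a tcm-user backstore daemon might consume the new kflags and ioctls added by this patch. It is only an illustration built from the uapi additions above (TCMU_KFLAG_ZERO_COPY, TCMU_KFLAG_BYPASS_DATA_AREA, TCMU_IOCTL_CMD_ZEROCOPY, TCMU_IOCTL_CMD_COPY_FROM_SGL/COPY_TO_SGL and their structures); the helpers do_backend_io(), cmd_is_write() and cmd_is_read() are hypothetical placeholders, not part of any kernel or libtcmu interface.

#include <sys/ioctl.h>
#include <sys/uio.h>
#include <linux/target_core_user.h>

/* Hypothetical helpers standing in for a real backstore implementation. */
extern void do_backend_io(struct iovec *iov, unsigned int iov_cnt);
extern int cmd_is_write(struct tcmu_cmd_entry *ent);
extern int cmd_is_read(struct tcmu_cmd_entry *ent);

static void handle_entry(int uio_fd, struct tcmu_cmd_entry *ent,
			 void *bounce_buf, size_t bounce_len)
{
	if (ent->hdr.kflags & TCMU_KFLAG_ZERO_COPY) {
		/*
		 * The kernel has already rewritten ent->req.iov[] to point
		 * into the zero-copy window of the mmap (offset by the
		 * regular data-area size). Ask it to map the command's sg
		 * pages at those addresses, then do I/O on them in place.
		 * The kernel zaps the mappings when the command completes
		 * or expires.
		 */
		struct tcmu_cmd_zerocopy zc = {
			.iov	 = ent->req.iov,
			.iov_cnt = ent->req.iov_cnt,
			.cmd_id	 = ent->hdr.cmd_id,
		};

		if (ioctl(uio_fd, TCMU_IOCTL_CMD_ZEROCOPY, &zc) == 0)
			do_backend_io(ent->req.iov, ent->req.iov_cnt);
	} else if (ent->hdr.kflags & TCMU_KFLAG_BYPASS_DATA_AREA) {
		/*
		 * No data-area blocks were allocated for this command, so
		 * move the payload directly between the command's sg list
		 * and a private buffer via the copy ioctls.
		 */
		struct iovec iov = { .iov_base = bounce_buf, .iov_len = bounce_len };
		struct tcmu_data_xfer xfer = {
			.cmd_id	 = ent->hdr.cmd_id,
			.iov_cnt = 1,
			.iovec	 = &iov,
		};

		if (cmd_is_write(ent))	/* pull WRITE payload: sg list -> buffer */
			ioctl(uio_fd, TCMU_IOCTL_CMD_COPY_FROM_SGL, &xfer);
		do_backend_io(&iov, 1);
		if (cmd_is_read(ent))	/* push READ data: buffer -> sg list */
			ioctl(uio_fd, TCMU_IOCTL_CMD_COPY_TO_SGL, &xfer);
	}
	/* Otherwise: classic data-area handling, unchanged by this patch. */
}

The non-zero-copy, non-bypass path is untouched, so existing handlers keep working; the new paths are only taken when the corresponding configfs attributes (read_zc_size/write_zc_size, read_bypass_data_area/write_bypass_data_area) are enabled and the command qualifies.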