From 6fccf9383b280d463a7dfe1e0d048aff8df8a25e Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 18 Jun 2014 13:58:57 -0600 Subject: NVMe: Async event request Submits NVMe asynchronous event requests, one event up to the controller maximum or number of possible different event types (8), whichever is smaller. Events successfully returned by the controller are logged. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- include/linux/nvme.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 2bf403195c09..974efd04a4b1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -99,6 +99,7 @@ struct nvme_dev { u32 stripe_size; u16 oncs; u16 abort_limit; + u8 event_limit; u8 vwc; u8 initialized; }; -- cgit v1.2.3 From a7dd7957acf798ac406afd6631e64a27ac4a5bf1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Sat, 28 Jun 2014 09:00:27 -0400 Subject: NVMe: Update list of status codes Taken from the draft NVMe 1.1b specification. Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- include/uapi/linux/nvme.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include') diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 29a7d8619d8d..134518b0100b 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -440,9 +440,15 @@ enum { NVME_SC_FUSED_MISSING = 0xa, NVME_SC_INVALID_NS = 0xb, NVME_SC_CMD_SEQ_ERROR = 0xc, + NVME_SC_SGL_INVALID_LAST = 0xd, + NVME_SC_SGL_INVALID_COUNT = 0xe, + NVME_SC_SGL_INVALID_DATA = 0xf, + NVME_SC_SGL_INVALID_METADATA = 0x10, + NVME_SC_SGL_INVALID_TYPE = 0x11, NVME_SC_LBA_RANGE = 0x80, NVME_SC_CAP_EXCEEDED = 0x81, NVME_SC_NS_NOT_READY = 0x82, + NVME_SC_RESERVATION_CONFLICT = 0x83, NVME_SC_CQ_INVALID = 0x100, NVME_SC_QID_INVALID = 0x101, NVME_SC_QUEUE_SIZE = 0x102, @@ -454,7 +460,15 @@ enum { NVME_SC_INVALID_VECTOR = 0x108, NVME_SC_INVALID_LOG_PAGE = 0x109, NVME_SC_INVALID_FORMAT = 0x10a, + NVME_SC_FIRMWARE_NEEDS_RESET = 0x10b, + NVME_SC_INVALID_QUEUE = 0x10c, + NVME_SC_FEATURE_NOT_SAVEABLE = 0x10d, + NVME_SC_FEATURE_NOT_CHANGEABLE = 0x10e, + NVME_SC_FEATURE_NOT_PER_NS = 0x10f, + NVME_SC_FW_NEEDS_RESET_SUBSYS = 0x110, NVME_SC_BAD_ATTRIBUTES = 0x180, + NVME_SC_INVALID_PI = 0x181, + NVME_SC_READ_ONLY = 0x182, NVME_SC_WRITE_FAULT = 0x280, NVME_SC_READ_ERROR = 0x281, NVME_SC_GUARD_CHECK = 0x282, -- cgit v1.2.3 From 1d0906246095184d1624c643c2088152d330c40a Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 23 Jun 2014 11:34:01 -0600 Subject: NVMe: Mismatched host/device page size support Adds support for devices with max page size smaller than the host's. In the case we encounter such a host/device combination, the driver will split a page into as many PRP entries as necessary for the device's page size capabilities. If the device's reported minimum page size is greater than the host's, the driver will not attempt to enable the device and return an error instead. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 59 ++++++++++++++++++++++++++++++++--------------- include/linux/nvme.h | 2 ++ 2 files changed, 42 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 42a62bbf4a11..e60bb0fec7e3 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -373,17 +373,17 @@ static __le64 **iod_list(struct nvme_iod *iod) * as it only leads to a small amount of wasted memory for the lifetime of * the I/O. */ -static int nvme_npages(unsigned size) +static int nvme_npages(unsigned size, struct nvme_dev *dev) { - unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); - return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); + unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); + return DIV_ROUND_UP(8 * nprps, dev->page_size - 8); } static struct nvme_iod * -nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) +nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp) { struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(nbytes) + + sizeof(__le64 *) * nvme_npages(nbytes, dev) + sizeof(struct scatterlist) * nseg, gfp); if (iod) { @@ -400,7 +400,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { - const int last_prp = PAGE_SIZE / 8 - 1; + const int last_prp = dev->page_size / 8 - 1; int i; __le64 **list = iod_list(iod); dma_addr_t prp_dma = iod->first_dma; @@ -491,26 +491,27 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, __le64 **list = iod_list(iod); dma_addr_t prp_dma; int nprps, i; + u32 page_size = dev->page_size; - length -= (PAGE_SIZE - offset); + length -= (page_size - offset); if (length <= 0) return total_len; - dma_len -= (PAGE_SIZE - offset); + dma_len -= (page_size - offset); if (dma_len) { - dma_addr += (PAGE_SIZE - offset); + dma_addr += (page_size - offset); } else { sg = sg_next(sg); dma_addr = sg_dma_address(sg); dma_len = sg_dma_len(sg); } - if (length <= PAGE_SIZE) { + if (length <= page_size) { iod->first_dma = dma_addr; return total_len; } - nprps = DIV_ROUND_UP(length, PAGE_SIZE); + nprps = DIV_ROUND_UP(length, page_size); if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; iod->npages = 0; @@ -523,13 +524,13 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, if (!prp_list) { iod->first_dma = dma_addr; iod->npages = -1; - return (total_len - length) + PAGE_SIZE; + return (total_len - length) + page_size; } list[0] = prp_list; iod->first_dma = prp_dma; i = 0; for (;;) { - if (i == PAGE_SIZE / 8) { + if (i == page_size >> 3) { __le64 *old_prp_list = prp_list; prp_list = dma_pool_alloc(pool, gfp, &prp_dma); if (!prp_list) @@ -540,9 +541,9 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, i = 1; } prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= PAGE_SIZE; - dma_addr += PAGE_SIZE; - length -= PAGE_SIZE; + dma_len -= page_size; + dma_addr += page_size; + length -= page_size; if (length <= 0) break; if (dma_len > 0) @@ -749,7 +750,7 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, if ((bio->bi_rw & REQ_FLUSH) && psegs) return nvme_split_flush_data(nvmeq, bio); - iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC); + iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, ns->dev, GFP_ATOMIC); if (!iod) return -ENOMEM; @@ -1463,6 +1464,24 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) u32 aqa; u64 cap = readq(&dev->bar->cap); struct nvme_queue *nvmeq; + unsigned page_shift = PAGE_SHIFT; + unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12; + unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12; + + if (page_shift < dev_page_min) { + dev_err(&dev->pci_dev->dev, + "Minimum device page size (%u) too large for " + "host (%u)\n", 1 << dev_page_min, + 1 << page_shift); + return -ENODEV; + } + if (page_shift > dev_page_max) { + dev_info(&dev->pci_dev->dev, + "Device maximum page size (%u) smaller than " + "host (%u); enabling work-around\n", + 1 << dev_page_max, 1 << page_shift); + page_shift = dev_page_max; + } result = nvme_disable_ctrl(dev, cap); if (result < 0) @@ -1478,8 +1497,10 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; + dev->page_size = 1 << page_shift; + dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; - dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; + dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; @@ -1529,7 +1550,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, } err = -ENOMEM; - iod = nvme_alloc_iod(count, length, GFP_KERNEL); + iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL); if (!iod) goto put_pages; diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 974efd04a4b1..ed09074e5554 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -38,6 +38,7 @@ struct nvme_bar { #define NVME_CAP_TIMEOUT(cap) (((cap) >> 24) & 0xff) #define NVME_CAP_STRIDE(cap) (((cap) >> 32) & 0xf) #define NVME_CAP_MPSMIN(cap) (((cap) >> 48) & 0xf) +#define NVME_CAP_MPSMAX(cap) (((cap) >> 52) & 0xf) enum { NVME_CC_ENABLE = 1 << 0, @@ -97,6 +98,7 @@ struct nvme_dev { char firmware_rev[8]; u32 max_hw_sectors; u32 stripe_size; + u32 page_size; u16 oncs; u16 abort_limit; u8 event_limit; -- cgit v1.2.3 From 7963e521811ed19396d47793cc77d87c643ab891 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 12 Sep 2014 16:07:20 -0600 Subject: NVMe: Passthrough IOCTL for IO commands The NVME_IOCTL_SUBMIT_IO only works for IO commands with block data transfers and isn't usable for other NVMe commands like flush, data set management, or any sort of vendor unique command. The NVME_IOCTL_ADMIN_CMD, however, can easily be modified to accept arbitrary IO commands in addition to arbitrary admin commands without breaking backward compatibility. This patch just adds a new IOCTL to distinguish if the driver should submit the command on an IO or Admin queue. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 17 ++++++++++++----- include/uapi/linux/nvme.h | 5 ++++- 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 93ee55ee3775..70be3a151eb7 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -1732,10 +1732,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } -static int nvme_user_admin_cmd(struct nvme_dev *dev, - struct nvme_admin_cmd __user *ucmd) +static int nvme_user_cmd(struct nvme_dev *dev, + struct nvme_passthru_cmd __user *ucmd, bool ioq) { - struct nvme_admin_cmd cmd; + struct nvme_passthru_cmd cmd; struct nvme_command c; int status, length; struct nvme_iod *uninitialized_var(iod); @@ -1774,6 +1774,9 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, ADMIN_TIMEOUT; if (length != cmd.data_len) status = -ENOMEM; + else if (ioq) + status = nvme_submit_sync_cmd(dev, this_cpu_read(*dev->io_queue), &c, + &cmd.result, timeout); else status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout); @@ -1799,7 +1802,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, force_successful_syscall_return(); return ns->ns_id; case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(ns->dev, (void __user *)arg); + return nvme_user_cmd(ns->dev, (void __user *)arg, false); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->dev, (void __user *)arg, true); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); case SG_GET_VERSION_NUM: @@ -2743,7 +2748,9 @@ static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) struct nvme_dev *dev = f->private_data; switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(dev, (void __user *)arg); + return nvme_user_cmd(dev, (void __user *)arg, false); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(dev, (void __user *)arg, true); default: return -ENOTTY; } diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 134518b0100b..97106556408f 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -503,7 +503,7 @@ struct nvme_user_io { __u16 appmask; }; -struct nvme_admin_cmd { +struct nvme_passthru_cmd { __u8 opcode; __u8 flags; __u16 rsvd1; @@ -524,8 +524,11 @@ struct nvme_admin_cmd { __u32 result; }; +#define nvme_admin_cmd nvme_passthru_cmd + #define NVME_IOCTL_ID _IO('N', 0x40) #define NVME_IOCTL_ADMIN_CMD _IOWR('N', 0x41, struct nvme_admin_cmd) #define NVME_IOCTL_SUBMIT_IO _IOW('N', 0x42, struct nvme_user_io) +#define NVME_IOCTL_IO_CMD _IOWR('N', 0x43, struct nvme_passthru_cmd) #endif /* _UAPI_LINUX_NVME_H */ -- cgit v1.2.3 From 60e0545cc92ce4172c7069d559568717f64af332 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Mon, 15 Sep 2014 14:08:38 -0600 Subject: NVMe: Updates for 1.1 spec Updating commands and structures for NVMe 1.1 updates, mostly for nvme reservations. There are no additional in-kernel uses, but this is for the uapi. While doing this, I noticed that the software progress features was using the wrong value, so updating that value as well. Signed-off-by: Keith Busch Signed-off-by: Matthew Wilcox Signed-off-by: Jens Axboe --- include/uapi/linux/nvme.h | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/uapi/linux/nvme.h b/include/uapi/linux/nvme.h index 97106556408f..26386cf3db44 100644 --- a/include/uapi/linux/nvme.h +++ b/include/uapi/linux/nvme.h @@ -181,6 +181,22 @@ enum { NVME_LBART_ATTRIB_HIDE = 1 << 1, }; +struct nvme_reservation_status { + __le32 gen; + __u8 rtype; + __u8 regctl[2]; + __u8 resv5[2]; + __u8 ptpls; + __u8 resv10[13]; + struct { + __le16 cntlid; + __u8 rcsts; + __u8 resv3[5]; + __le64 hostid; + __le64 rkey; + } regctl_ds[]; +}; + /* I/O commands */ enum nvme_opcode { @@ -189,7 +205,12 @@ enum nvme_opcode { nvme_cmd_read = 0x02, nvme_cmd_write_uncor = 0x04, nvme_cmd_compare = 0x05, + nvme_cmd_write_zeroes = 0x08, nvme_cmd_dsm = 0x09, + nvme_cmd_resv_register = 0x0d, + nvme_cmd_resv_report = 0x0e, + nvme_cmd_resv_acquire = 0x11, + nvme_cmd_resv_release = 0x15, }; struct nvme_common_command { @@ -305,7 +326,11 @@ enum { NVME_FEAT_IRQ_CONFIG = 0x09, NVME_FEAT_WRITE_ATOMIC = 0x0a, NVME_FEAT_ASYNC_EVENT = 0x0b, - NVME_FEAT_SW_PROGRESS = 0x0c, + NVME_FEAT_AUTO_PST = 0x0c, + NVME_FEAT_SW_PROGRESS = 0x80, + NVME_FEAT_HOST_ID = 0x81, + NVME_FEAT_RESV_MASK = 0x82, + NVME_FEAT_RESV_PERSIST = 0x83, NVME_LOG_ERROR = 0x01, NVME_LOG_SMART = 0x02, NVME_LOG_FW_SLOT = 0x03, -- cgit v1.2.3 From a4aea5623d4a54682b6ff5c18196d7802f3e478f Mon Sep 17 00:00:00 2001 From: Matias Bjørling Date: Tue, 4 Nov 2014 08:20:14 -0700 Subject: NVMe: Convert to blk-mq This converts the NVMe driver to a blk-mq request-based driver. The NVMe driver is currently bio-based and implements queue logic within itself. By using blk-mq, a lot of these responsibilities can be moved and simplified. The patch is divided into the following blocks: * Per-command data and cmdid have been moved into the struct request field. The cmdid_data can be retrieved using blk_mq_rq_to_pdu() and id maintenance are now handled by blk-mq through the rq->tag field. * The logic for splitting bio's has been moved into the blk-mq layer. The driver instead notifies the block layer about limited gap support in SG lists. * blk-mq handles timeouts and is reimplemented within nvme_timeout(). This both includes abort handling and command cancelation. * Assignment of nvme queues to CPUs are replaced with the blk-mq version. The current blk-mq strategy is to assign the number of mapped queues and CPUs to provide synergy, while the nvme driver assign as many nvme hw queues as possible. This can be implemented in blk-mq if needed. * NVMe queues are merged with the tags structure of blk-mq. * blk-mq takes care of setup/teardown of nvme queues and guards invalid accesses. Therefore, RCU-usage for nvme queues can be removed. * IO tracing and accounting are handled by blk-mq and therefore removed. * Queue suspension logic is replaced with the logic from the block layer. Contributions in this patch from: Sam Bradshaw Jens Axboe Keith Busch Robert Nelson Acked-by: Keith Busch Acked-by: Jens Axboe Updated for new ->queue_rq() prototype. Signed-off-by: Jens Axboe --- drivers/block/nvme-core.c | 1366 ++++++++++++++++++--------------------------- drivers/block/nvme-scsi.c | 8 +- include/linux/nvme.h | 15 +- 3 files changed, 570 insertions(+), 819 deletions(-) (limited to 'include') diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index c70eff3673d0..39050a3d10fd 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -13,9 +13,9 @@ */ #include -#include #include #include +#include #include #include #include @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -42,9 +41,8 @@ #include #include -#include - #define NVME_Q_DEPTH 1024 +#define NVME_AQ_DEPTH 64 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define ADMIN_TIMEOUT (admin_timeout * HZ) @@ -81,10 +79,12 @@ static wait_queue_head_t nvme_kthread_wait; static struct notifier_block nvme_nb; static void nvme_reset_failed_dev(struct work_struct *ws); +static int nvme_process_cq(struct nvme_queue *nvmeq); struct async_cmd_info { struct kthread_work work; struct kthread_worker *worker; + struct request *req; u32 result; int status; void *ctx; @@ -104,10 +104,6 @@ struct nvme_queue { volatile struct nvme_completion *cqes; dma_addr_t sq_dma_addr; dma_addr_t cq_dma_addr; - wait_queue_head_t sq_full; - wait_queue_t sq_cong_wait; - struct bio_list sq_cong; - struct list_head iod_bio; u32 __iomem *q_db; u16 q_depth; u16 cq_vector; @@ -117,10 +113,8 @@ struct nvme_queue { u16 qid; u8 cq_phase; u8 cqe_seen; - u8 q_suspended; - cpumask_var_t cpu_mask; struct async_cmd_info cmdinfo; - unsigned long cmdid_data[]; + struct blk_mq_hw_ctx *hctx; }; /* @@ -148,62 +142,72 @@ typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, struct nvme_cmd_info { nvme_completion_fn fn; void *ctx; - unsigned long timeout; int aborted; + struct nvme_queue *nvmeq; }; -static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) +static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) { - return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = dev->queues[0]; + + WARN_ON(nvmeq->hctx); + nvmeq->hctx = hctx; + hctx->driver_data = nvmeq; + return 0; } -static unsigned nvme_queue_extra(int depth) +static int nvme_admin_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) { - return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info)); + struct nvme_dev *dev = data; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = dev->queues[0]; + + BUG_ON(!nvmeq); + cmd->nvmeq = nvmeq; + return 0; } -/** - * alloc_cmdid() - Allocate a Command ID - * @nvmeq: The queue that will be used for this command - * @ctx: A pointer that will be passed to the handler - * @handler: The function to call on completion - * - * Allocate a Command ID for a queue. The data passed in will - * be passed to the completion handler. This is implemented by using - * the bottom two bits of the ctx pointer to store the handler ID. - * Passing in a pointer that's not 4-byte aligned will cause a BUG. - * We can change this if it becomes a problem. - * - * May be called with local interrupts disabled and the q_lock held, - * or with interrupts enabled and no locks held. - */ -static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, - nvme_completion_fn handler, unsigned timeout) +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) { - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - int cmdid; + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = dev->queues[ + (hctx_idx % dev->queue_count) + 1]; - do { - cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); - if (cmdid >= depth) - return -EBUSY; - } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); + if (!nvmeq->hctx) + nvmeq->hctx = hctx; + + /* nvmeq queues are shared between namespaces. We assume here that + * blk-mq map the tags so they match up with the nvme queue tags. */ + WARN_ON(nvmeq->hctx->tags != hctx->tags); - info[cmdid].fn = handler; - info[cmdid].ctx = ctx; - info[cmdid].timeout = jiffies + timeout; - info[cmdid].aborted = 0; - return cmdid; + hctx->driver_data = nvmeq; + return 0; } -static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, - nvme_completion_fn handler, unsigned timeout) +static int nvme_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) { - int cmdid; - wait_event_killable(nvmeq->sq_full, - (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0); - return (cmdid < 0) ? -EINTR : cmdid; + struct nvme_dev *dev = data; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; + + BUG_ON(!nvmeq); + cmd->nvmeq = nvmeq; + return 0; +} + +static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, + nvme_completion_fn handler) +{ + cmd->fn = handler; + cmd->ctx = ctx; + cmd->aborted = 0; } /* Special values must be less than 0x1000 */ @@ -211,18 +215,12 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) -#define CMD_CTX_ABORT (0x318 + CMD_CTX_BASE) -#define CMD_CTX_ASYNC (0x31C + CMD_CTX_BASE) static void special_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { if (ctx == CMD_CTX_CANCELLED) return; - if (ctx == CMD_CTX_ABORT) { - ++nvmeq->dev->abort_limit; - return; - } if (ctx == CMD_CTX_COMPLETED) { dev_warn(nvmeq->q_dmadev, "completed id %d twice on queue %d\n", @@ -235,110 +233,89 @@ static void special_completion(struct nvme_queue *nvmeq, void *ctx, cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } - if (ctx == CMD_CTX_ASYNC) { - u32 result = le32_to_cpup(&cqe->result); - u16 status = le16_to_cpup(&cqe->status) >> 1; - - if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) - ++nvmeq->dev->event_limit; - if (status == NVME_SC_SUCCESS) - dev_warn(nvmeq->q_dmadev, - "async event result %08x\n", result); - return; - } - dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); } -static void async_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct async_cmd_info *cmdinfo = ctx; - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - queue_kthread_work(cmdinfo->worker, &cmdinfo->work); -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. - */ -static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, - nvme_completion_fn *fn) +static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn) { void *ctx; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) { - if (fn) - *fn = special_completion; - return CMD_CTX_INVALID; - } if (fn) - *fn = info[cmdid].fn; - ctx = info[cmdid].ctx; - info[cmdid].fn = special_completion; - info[cmdid].ctx = CMD_CTX_COMPLETED; - clear_bit(cmdid, nvmeq->cmdid_data); - wake_up(&nvmeq->sq_full); + *fn = cmd->fn; + ctx = cmd->ctx; + cmd->fn = special_completion; + cmd->ctx = CMD_CTX_CANCELLED; return ctx; } -static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, - nvme_completion_fn *fn) +static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) { - void *ctx; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (fn) - *fn = info[cmdid].fn; - ctx = info[cmdid].ctx; - info[cmdid].fn = special_completion; - info[cmdid].ctx = CMD_CTX_CANCELLED; - return ctx; -} + struct request *req = ctx; -static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid) -{ - return rcu_dereference_raw(dev->queues[qid]); + u32 result = le32_to_cpup(&cqe->result); + u16 status = le16_to_cpup(&cqe->status) >> 1; + + if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) + ++nvmeq->dev->event_limit; + if (status == NVME_SC_SUCCESS) + dev_warn(nvmeq->q_dmadev, + "async event result %08x\n", result); + + blk_put_request(req); } -static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) +static void abort_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) { - struct nvme_queue *nvmeq; - unsigned queue_id = get_cpu_var(*dev->io_queue); + struct request *req = ctx; + + u16 status = le16_to_cpup(&cqe->status) >> 1; + u32 result = le32_to_cpup(&cqe->result); - rcu_read_lock(); - nvmeq = rcu_dereference(dev->queues[queue_id]); - if (nvmeq) - return nvmeq; + blk_put_request(req); - rcu_read_unlock(); - put_cpu_var(*dev->io_queue); - return NULL; + dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); + ++nvmeq->dev->abort_limit; } -static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) +static void async_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) { - rcu_read_unlock(); - put_cpu_var(nvmeq->dev->io_queue); + struct async_cmd_info *cmdinfo = ctx; + cmdinfo->result = le32_to_cpup(&cqe->result); + cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; + queue_kthread_work(cmdinfo->worker, &cmdinfo->work); + blk_put_request(cmdinfo->req); } -static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx) - __acquires(RCU) +static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, + unsigned int tag) { - struct nvme_queue *nvmeq; - - rcu_read_lock(); - nvmeq = rcu_dereference(dev->queues[q_idx]); - if (nvmeq) - return nvmeq; + struct blk_mq_hw_ctx *hctx = nvmeq->hctx; + struct request *req = blk_mq_tag_to_rq(hctx->tags, tag); - rcu_read_unlock(); - return NULL; + return blk_mq_rq_to_pdu(req); } -static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. + */ +static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag, + nvme_completion_fn *fn) { - rcu_read_unlock(); + struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag); + void *ctx; + if (tag >= nvmeq->q_depth) { + *fn = special_completion; + return CMD_CTX_INVALID; + } + if (fn) + *fn = cmd->fn; + ctx = cmd->ctx; + cmd->fn = special_completion; + cmd->ctx = CMD_CTX_COMPLETED; + return ctx; } /** @@ -348,26 +325,29 @@ static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) * * Safe to use from interrupt context */ -static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) { - unsigned long flags; - u16 tail; - spin_lock_irqsave(&nvmeq->q_lock, flags); - if (nvmeq->q_suspended) { - spin_unlock_irqrestore(&nvmeq->q_lock, flags); - return -EBUSY; - } - tail = nvmeq->sq_tail; + u16 tail = nvmeq->sq_tail; + memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); if (++tail == nvmeq->q_depth) tail = 0; writel(tail, nvmeq->q_db); nvmeq->sq_tail = tail; - spin_unlock_irqrestore(&nvmeq->q_lock, flags); return 0; } +static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +{ + unsigned long flags; + int ret; + spin_lock_irqsave(&nvmeq->q_lock, flags); + ret = __nvme_submit_cmd(nvmeq, cmd); + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + return ret; +} + static __le64 **iod_list(struct nvme_iod *iod) { return ((void *)iod) + iod->offset; @@ -397,7 +377,6 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp) iod->length = nbytes; iod->nents = 0; iod->first_dma = 0ULL; - iod->start_time = jiffies; } return iod; @@ -421,35 +400,6 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) kfree(iod); } -static void nvme_start_io_acct(struct bio *bio) -{ - struct gendisk *disk = bio->bi_bdev->bd_disk; - if (blk_queue_io_stat(disk->queue)) { - const int rw = bio_data_dir(bio); - int cpu = part_stat_lock(); - part_round_stats(cpu, &disk->part0); - part_stat_inc(cpu, &disk->part0, ios[rw]); - part_stat_add(cpu, &disk->part0, sectors[rw], - bio_sectors(bio)); - part_inc_in_flight(&disk->part0, rw); - part_stat_unlock(); - } -} - -static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) -{ - struct gendisk *disk = bio->bi_bdev->bd_disk; - if (blk_queue_io_stat(disk->queue)) { - const int rw = bio_data_dir(bio); - unsigned long duration = jiffies - start_time; - int cpu = part_stat_lock(); - part_stat_add(cpu, &disk->part0, ticks[rw], duration); - part_round_stats(cpu, &disk->part0); - part_dec_in_flight(&disk->part0, rw); - part_stat_unlock(); - } -} - static int nvme_error_status(u16 status) { switch (status & 0x7ff) { @@ -462,36 +412,37 @@ static int nvme_error_status(u16 status) } } -static void bio_completion(struct nvme_queue *nvmeq, void *ctx, +static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct nvme_iod *iod = ctx; - struct bio *bio = iod->private; + struct request *req = iod->private; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + u16 status = le16_to_cpup(&cqe->status) >> 1; - int error = 0; if (unlikely(status)) { - if (!(status & NVME_SC_DNR || - bio->bi_rw & REQ_FAILFAST_MASK) && - (jiffies - iod->start_time) < IOD_TIMEOUT) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - list_add_tail(&iod->node, &nvmeq->iod_bio); - wake_up(&nvmeq->sq_full); + if (!(status & NVME_SC_DNR || blk_noretry_request(req)) + && (jiffies - req->start_time) < req->timeout) { + blk_mq_requeue_request(req); + blk_mq_kick_requeue_list(req->q); return; } - error = nvme_error_status(status); - } - if (iod->nents) { - dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents, - bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - nvme_end_io_acct(bio, iod->start_time); - } + req->errors = nvme_error_status(status); + } else + req->errors = 0; + + if (cmd_rq->aborted) + dev_warn(&nvmeq->dev->pci_dev->dev, + "completing aborted command with status:%04x\n", + status); + + if (iod->nents) + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents, + rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); nvme_free_iod(nvmeq->dev, iod); - trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, error); - bio_endio(bio, error); + blk_mq_complete_request(req); } /* length is in bytes. gfp flags indicates whether we may sleep. */ @@ -574,88 +525,25 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, return total_len; } -static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, - int len) -{ - struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL); - if (!split) - return -ENOMEM; - - trace_block_split(bdev_get_queue(bio->bi_bdev), bio, - split->bi_iter.bi_sector); - bio_chain(split, bio); - - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, split); - bio_list_add(&nvmeq->sq_cong, bio); - wake_up(&nvmeq->sq_full); - - return 0; -} - -/* NVMe scatterlists require no holes in the virtual address */ -#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ - (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) - -static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, - struct bio *bio, enum dma_data_direction dma_dir, int psegs) -{ - struct bio_vec bvec, bvprv; - struct bvec_iter iter; - struct scatterlist *sg = NULL; - int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size; - int first = 1; - - if (nvmeq->dev->stripe_size) - split_len = nvmeq->dev->stripe_size - - ((bio->bi_iter.bi_sector << 9) & - (nvmeq->dev->stripe_size - 1)); - - sg_init_table(iod->sg, psegs); - bio_for_each_segment(bvec, bio, iter) { - if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) { - sg->length += bvec.bv_len; - } else { - if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec)) - return nvme_split_and_submit(bio, nvmeq, - length); - - sg = sg ? sg + 1 : iod->sg; - sg_set_page(sg, bvec.bv_page, - bvec.bv_len, bvec.bv_offset); - nsegs++; - } - - if (split_len - length < bvec.bv_len) - return nvme_split_and_submit(bio, nvmeq, split_len); - length += bvec.bv_len; - bvprv = bvec; - first = 0; - } - iod->nents = nsegs; - sg_mark_end(sg); - if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0) - return -ENOMEM; - - BUG_ON(length != bio->bi_iter.bi_size); - return length; -} - -static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio, struct nvme_iod *iod, int cmdid) +/* + * We reuse the small pool to allocate the 16-byte range here as it is not + * worth having a special pool for these or additional cases to handle freeing + * the iod. + */ +static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct request *req, struct nvme_iod *iod) { struct nvme_dsm_range *range = (struct nvme_dsm_range *)iod_list(iod)[0]; struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift); - range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); + range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); + range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; - cmnd->dsm.command_id = cmdid; + cmnd->dsm.command_id = req->tag; cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); cmnd->dsm.nr = 0; @@ -664,11 +552,9 @@ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; } -static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, +static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, int cmdid) { struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; @@ -681,49 +567,34 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; } -static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) +static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, + struct nvme_ns *ns) { - struct bio *bio = iod->private; - struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; + struct request *req = iod->private; struct nvme_command *cmnd; - int cmdid; - u16 control; - u32 dsmgmt; + u16 control = 0; + u32 dsmgmt = 0; - cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); - if (unlikely(cmdid < 0)) - return cmdid; - - if (bio->bi_rw & REQ_DISCARD) - return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); - if (bio->bi_rw & REQ_FLUSH) - return nvme_submit_flush(nvmeq, ns, cmdid); - - control = 0; - if (bio->bi_rw & REQ_FUA) + if (req->cmd_flags & REQ_FUA) control |= NVME_RW_FUA; - if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) control |= NVME_RW_LR; - dsmgmt = 0; - if (bio->bi_rw & REQ_RAHEAD) + if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; memset(cmnd, 0, sizeof(*cmnd)); - cmnd->rw.opcode = bio_data_dir(bio) ? nvme_cmd_write : nvme_cmd_read; - cmnd->rw.command_id = cmdid; + cmnd->rw.opcode = (rq_data_dir(req) ? nvme_cmd_write : nvme_cmd_read); + cmnd->rw.command_id = req->tag; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); - cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); - cmnd->rw.length = - cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); @@ -734,47 +605,37 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) return 0; } -static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio) -{ - struct bio *split = bio_clone(bio, GFP_ATOMIC); - if (!split) - return -ENOMEM; - - split->bi_iter.bi_size = 0; - split->bi_phys_segments = 0; - bio->bi_rw &= ~REQ_FLUSH; - bio_chain(split, bio); - - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, split); - bio_list_add(&nvmeq->sq_cong, bio); - wake_up_process(nvme_thread); - - return 0; -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. - */ -static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio) +static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_queue *nvmeq = hctx->driver_data; + struct request *req = bd->rq; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_iod *iod; - int psegs = bio_phys_segments(ns->queue, bio); - int result; - unsigned size = !(bio->bi_rw & REQ_DISCARD) ? bio->bi_iter.bi_size : + int psegs = req->nr_phys_segments; + int result = BLK_MQ_RQ_QUEUE_BUSY; + enum dma_data_direction dma_dir; + unsigned size = !(req->cmd_flags & REQ_DISCARD) ? blk_rq_bytes(req) : sizeof(struct nvme_dsm_range); - if ((bio->bi_rw & REQ_FLUSH) && psegs) - return nvme_split_flush_data(nvmeq, bio); + /* + * Requeued IO has already been prepped + */ + iod = req->special; + if (iod) + goto submit_iod; iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC); if (!iod) - return -ENOMEM; + return result; + + iod->private = req; + req->special = iod; - iod->private = bio; - if (bio->bi_rw & REQ_DISCARD) { + nvme_set_info(cmd, iod, req_completion); + + if (req->cmd_flags & REQ_DISCARD) { void *range; /* * We reuse the small pool to allocate the 16-byte range here @@ -784,33 +645,45 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, &iod->first_dma); - if (!range) { - result = -ENOMEM; - goto free_iod; - } + if (!range) + goto finish_cmd; iod_list(iod)[0] = (__le64 *)range; iod->npages = 0; } else if (psegs) { - result = nvme_map_bio(nvmeq, iod, bio, - bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, - psegs); - if (result <= 0) - goto free_iod; - if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) != - result) { - result = -ENOMEM; - goto free_iod; + dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; + + sg_init_table(iod->sg, psegs); + iod->nents = blk_rq_map_sg(req->q, req, iod->sg); + if (!iod->nents) { + result = BLK_MQ_RQ_QUEUE_ERROR; + goto finish_cmd; } - nvme_start_io_acct(bio); - } - if (unlikely(nvme_submit_iod(nvmeq, iod))) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - list_add_tail(&iod->node, &nvmeq->iod_bio); + + if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir)) + goto finish_cmd; + + if (blk_rq_bytes(req) != nvme_setup_prps(nvmeq->dev, iod, + blk_rq_bytes(req), GFP_ATOMIC)) + goto finish_cmd; } - return 0; - free_iod: + blk_mq_start_request(req); + + submit_iod: + spin_lock_irq(&nvmeq->q_lock); + if (req->cmd_flags & REQ_DISCARD) + nvme_submit_discard(nvmeq, ns, req, iod); + else if (req->cmd_flags & REQ_FLUSH) + nvme_submit_flush(nvmeq, ns, req->tag); + else + nvme_submit_iod(nvmeq, iod, ns); + + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + return BLK_MQ_RQ_QUEUE_OK; + + finish_cmd: + nvme_finish_cmd(nvmeq, req->tag, NULL); nvme_free_iod(nvmeq->dev, iod); return result; } @@ -833,8 +706,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq) head = 0; phase = !phase; } - - ctx = free_cmdid(nvmeq, cqe.command_id, &fn); + ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn); fn(nvmeq, ctx, &cqe); } @@ -855,29 +727,13 @@ static int nvme_process_cq(struct nvme_queue *nvmeq) return 1; } -static void nvme_make_request(struct request_queue *q, struct bio *bio) +/* Admin queue isn't initialized as a request queue. If at some point this + * happens anyway, make sure to notify the user */ +static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct nvme_ns *ns = q->queuedata; - struct nvme_queue *nvmeq = get_nvmeq(ns->dev); - int result = -EBUSY; - - if (!nvmeq) { - bio_endio(bio, -EIO); - return; - } - - spin_lock_irq(&nvmeq->q_lock); - if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong)) - result = nvme_submit_bio_queue(nvmeq, ns, bio); - if (unlikely(result)) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, bio); - } - - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); - put_nvmeq(nvmeq); + WARN_ON_ONCE(1); + return BLK_MQ_RQ_QUEUE_ERROR; } static irqreturn_t nvme_irq(int irq, void *data) @@ -901,10 +757,11 @@ static irqreturn_t nvme_irq_check(int irq, void *data) return IRQ_WAKE_THREAD; } -static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) +static void nvme_abort_cmd_info(struct nvme_queue *nvmeq, struct nvme_cmd_info * + cmd_info) { spin_lock_irq(&nvmeq->q_lock); - cancel_cmdid(nvmeq, cmdid, NULL); + cancel_cmd_info(cmd_info, NULL); spin_unlock_irq(&nvmeq->q_lock); } @@ -927,45 +784,31 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, * Returns 0 on success. If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ -static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, - struct nvme_command *cmd, +static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd, u32 *result, unsigned timeout) { - int cmdid, ret; + int ret; struct sync_cmd_info cmdinfo; - struct nvme_queue *nvmeq; - - nvmeq = lock_nvmeq(dev, q_idx); - if (!nvmeq) - return -ENODEV; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd_rq->nvmeq; cmdinfo.task = current; cmdinfo.status = -EINTR; - cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout); - if (cmdid < 0) { - unlock_nvmeq(nvmeq); - return cmdid; - } - cmd->common.command_id = cmdid; + cmd->common.command_id = req->tag; + + nvme_set_info(cmd_rq, &cmdinfo, sync_completion); set_current_state(TASK_KILLABLE); ret = nvme_submit_cmd(nvmeq, cmd); if (ret) { - free_cmdid(nvmeq, cmdid, NULL); - unlock_nvmeq(nvmeq); + nvme_finish_cmd(nvmeq, req->tag, NULL); set_current_state(TASK_RUNNING); - return ret; } - unlock_nvmeq(nvmeq); schedule_timeout(timeout); if (cmdinfo.status == -EINTR) { - nvmeq = lock_nvmeq(dev, q_idx); - if (nvmeq) { - nvme_abort_command(nvmeq, cmdid); - unlock_nvmeq(nvmeq); - } + nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req)); return -EINTR; } @@ -975,59 +818,99 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, return cmdinfo.status; } -static int nvme_submit_async_cmd(struct nvme_queue *nvmeq, +static int nvme_submit_async_admin_req(struct nvme_dev *dev) +{ + struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_command c; + struct nvme_cmd_info *cmd_info; + struct request *req; + + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + if (!req) + return -ENOMEM; + + cmd_info = blk_mq_rq_to_pdu(req); + nvme_set_info(cmd_info, req, async_req_completion); + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_async_event; + c.common.command_id = req->tag; + + return __nvme_submit_cmd(nvmeq, &c); +} + +static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, struct nvme_command *cmd, struct async_cmd_info *cmdinfo, unsigned timeout) { - int cmdid; + struct nvme_queue *nvmeq = dev->queues[0]; + struct request *req; + struct nvme_cmd_info *cmd_rq; - cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout); - if (cmdid < 0) - return cmdid; + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + if (!req) + return -ENOMEM; + + req->timeout = timeout; + cmd_rq = blk_mq_rq_to_pdu(req); + cmdinfo->req = req; + nvme_set_info(cmd_rq, cmdinfo, async_completion); cmdinfo->status = -EINTR; - cmd->common.command_id = cmdid; + + cmd->common.command_id = req->tag; + return nvme_submit_cmd(nvmeq, cmd); } -int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, - u32 *result) +int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, + u32 *result, unsigned timeout) { - return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT); + int res; + struct request *req; + + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + if (!req) + return -ENOMEM; + res = nvme_submit_sync_cmd(req, cmd, result, timeout); + blk_put_request(req); + return res; } -int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd, +int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { - return nvme_submit_sync_cmd(dev, this_cpu_read(*dev->io_queue), cmd, - result, NVME_IO_TIMEOUT); + return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT); } -static int nvme_submit_admin_cmd_async(struct nvme_dev *dev, - struct nvme_command *cmd, struct async_cmd_info *cmdinfo) +int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns, + struct nvme_command *cmd, u32 *result) { - return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo, - ADMIN_TIMEOUT); + int res; + struct request *req; + + req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT), + false); + if (!req) + return -ENOMEM; + res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT); + blk_put_request(req); + return res; } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) { - int status; struct nvme_command c; memset(&c, 0, sizeof(c)); c.delete_queue.opcode = opcode; c.delete_queue.qid = cpu_to_le16(id); - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; + return nvme_submit_admin_cmd(dev, &c, NULL); } static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, struct nvme_queue *nvmeq) { - int status; struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; @@ -1039,16 +922,12 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cq_flags = cpu_to_le16(flags); c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; + return nvme_submit_admin_cmd(dev, &c, NULL); } static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, struct nvme_queue *nvmeq) { - int status; struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; @@ -1060,10 +939,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, c.create_sq.sq_flags = cpu_to_le16(flags); c.create_sq.cqid = cpu_to_le16(qid); - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; + return nvme_submit_admin_cmd(dev, &c, NULL); } static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) @@ -1119,28 +995,27 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, } /** - * nvme_abort_cmd - Attempt aborting a command - * @cmdid: Command id of a timed out IO - * @queue: The queue with timed out IO + * nvme_abort_req - Attempt aborting a request * * Schedule controller reset if the command was already aborted once before and * still hasn't been returned to the driver, or if this is the admin queue. */ -static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) +static void nvme_abort_req(struct request *req) { - int a_cmdid; - struct nvme_command cmd; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd_rq->nvmeq; struct nvme_dev *dev = nvmeq->dev; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - struct nvme_queue *adminq; + struct request *abort_req; + struct nvme_cmd_info *abort_cmd; + struct nvme_command cmd; - if (!nvmeq->qid || info[cmdid].aborted) { + if (!nvmeq->qid || cmd_rq->aborted) { if (work_busy(&dev->reset_work)) return; list_del_init(&dev->node); dev_warn(&dev->pci_dev->dev, - "I/O %d QID %d timeout, reset controller\n", cmdid, - nvmeq->qid); + "I/O %d QID %d timeout, reset controller\n", + req->tag, nvmeq->qid); dev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &dev->reset_work); return; @@ -1149,89 +1024,79 @@ static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) if (!dev->abort_limit) return; - adminq = rcu_dereference(dev->queues[0]); - a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion, - ADMIN_TIMEOUT); - if (a_cmdid < 0) + abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, + false); + if (!abort_req) return; + abort_cmd = blk_mq_rq_to_pdu(abort_req); + nvme_set_info(abort_cmd, abort_req, abort_completion); + memset(&cmd, 0, sizeof(cmd)); cmd.abort.opcode = nvme_admin_abort_cmd; - cmd.abort.cid = cmdid; + cmd.abort.cid = req->tag; cmd.abort.sqid = cpu_to_le16(nvmeq->qid); - cmd.abort.command_id = a_cmdid; + cmd.abort.command_id = abort_req->tag; --dev->abort_limit; - info[cmdid].aborted = 1; - info[cmdid].timeout = jiffies + ADMIN_TIMEOUT; + cmd_rq->aborted = 1; - dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid, + dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag, nvmeq->qid); - nvme_submit_cmd(adminq, &cmd); + if (nvme_submit_cmd(dev->queues[0], &cmd) < 0) { + dev_warn(nvmeq->q_dmadev, + "Could not abort I/O %d QID %d", + req->tag, nvmeq->qid); + blk_put_request(req); + } } -/** - * nvme_cancel_ios - Cancel outstanding I/Os - * @queue: The queue to cancel I/Os on - * @timeout: True to only cancel I/Os which have timed out - */ -static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) +static void nvme_cancel_queue_ios(struct blk_mq_hw_ctx *hctx, + struct request *req, void *data, bool reserved) { - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - unsigned long now = jiffies; - int cmdid; + struct nvme_queue *nvmeq = data; + void *ctx; + nvme_completion_fn fn; + struct nvme_cmd_info *cmd; + static struct nvme_completion cqe = { + .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1), + }; - for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { - void *ctx; - nvme_completion_fn fn; - static struct nvme_completion cqe = { - .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1), - }; + cmd = blk_mq_rq_to_pdu(req); - if (timeout && !time_after(now, info[cmdid].timeout)) - continue; - if (info[cmdid].ctx == CMD_CTX_CANCELLED) - continue; - if (timeout && info[cmdid].ctx == CMD_CTX_ASYNC) - continue; - if (timeout && nvmeq->dev->initialized) { - nvme_abort_cmd(cmdid, nvmeq); - continue; - } - dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid, - nvmeq->qid); - ctx = cancel_cmdid(nvmeq, cmdid, &fn); - fn(nvmeq, ctx, &cqe); - } + if (cmd->ctx == CMD_CTX_CANCELLED) + return; + + dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", + req->tag, nvmeq->qid); + ctx = cancel_cmd_info(cmd, &fn); + fn(nvmeq, ctx, &cqe); } -static void nvme_free_queue(struct nvme_queue *nvmeq) +static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { - spin_lock_irq(&nvmeq->q_lock); - while (bio_list_peek(&nvmeq->sq_cong)) { - struct bio *bio = bio_list_pop(&nvmeq->sq_cong); - bio_endio(bio, -EIO); - } - while (!list_empty(&nvmeq->iod_bio)) { - static struct nvme_completion cqe = { - .status = cpu_to_le16( - (NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1), - }; - struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio, - struct nvme_iod, - node); - list_del(&iod->node); - bio_completion(nvmeq, iod, &cqe); - } - spin_unlock_irq(&nvmeq->q_lock); + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd->nvmeq; + + dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, + nvmeq->qid); + if (nvmeq->dev->initialized) + nvme_abort_req(req); + + /* + * The aborted req will be completed on receiving the abort req. + * We enable the timer again. If hit twice, it'll cause a device reset, + * as the device then is in a faulty state. + */ + return BLK_EH_RESET_TIMER; +} +static void nvme_free_queue(struct nvme_queue *nvmeq) +{ dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), nvmeq->sq_cmds, nvmeq->sq_dma_addr); - if (nvmeq->qid) - free_cpumask_var(nvmeq->cpu_mask); kfree(nvmeq); } @@ -1243,10 +1108,10 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) int i; for (i = dev->queue_count - 1; i >= lowest; i--) { - nvmeq = raw_nvmeq(dev, i); - RCU_INIT_POINTER(dev->queues[i], NULL); + struct nvme_queue *nvmeq = dev->queues[i]; llist_add(&nvmeq->node, &q_list); dev->queue_count--; + dev->queues[i] = NULL; } synchronize_rcu(); entry = llist_del_all(&q_list); @@ -1257,19 +1122,12 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) /** * nvme_suspend_queue - put queue into suspended state * @nvmeq - queue to suspend - * - * Returns 1 if already suspended, 0 otherwise. */ static int nvme_suspend_queue(struct nvme_queue *nvmeq) { int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector; spin_lock_irq(&nvmeq->q_lock); - if (nvmeq->q_suspended) { - spin_unlock_irq(&nvmeq->q_lock); - return 1; - } - nvmeq->q_suspended = 1; nvmeq->dev->online_queues--; spin_unlock_irq(&nvmeq->q_lock); @@ -1281,15 +1139,18 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) static void nvme_clear_queue(struct nvme_queue *nvmeq) { + struct blk_mq_hw_ctx *hctx = nvmeq->hctx; + spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); - nvme_cancel_ios(nvmeq, false); + if (hctx && hctx->tags) + blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq); spin_unlock_irq(&nvmeq->q_lock); } static void nvme_disable_queue(struct nvme_dev *dev, int qid) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, qid); + struct nvme_queue *nvmeq = dev->queues[qid]; if (!nvmeq) return; @@ -1309,8 +1170,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth, int vector) { struct device *dmadev = &dev->pci_dev->dev; - unsigned extra = nvme_queue_extra(depth); - struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); + struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); if (!nvmeq) return NULL; @@ -1324,9 +1184,6 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, if (!nvmeq->sq_cmds) goto free_cqdma; - if (qid && !zalloc_cpumask_var(&nvmeq->cpu_mask, GFP_KERNEL)) - goto free_sqdma; - nvmeq->q_dmadev = dmadev; nvmeq->dev = dev; snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", @@ -1334,22 +1191,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, spin_lock_init(&nvmeq->q_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; - init_waitqueue_head(&nvmeq->sq_full); - bio_list_init(&nvmeq->sq_cong); - INIT_LIST_HEAD(&nvmeq->iod_bio); nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; nvmeq->q_depth = depth; nvmeq->cq_vector = vector; nvmeq->qid = qid; - nvmeq->q_suspended = 1; dev->queue_count++; - rcu_assign_pointer(dev->queues[qid], nvmeq); + dev->queues[qid] = nvmeq; return nvmeq; - free_sqdma: - dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq->sq_cmds, - nvmeq->sq_dma_addr); free_cqdma: dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); @@ -1372,18 +1222,13 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) { struct nvme_dev *dev = nvmeq->dev; - unsigned extra = nvme_queue_extra(nvmeq->q_depth); spin_lock_irq(&nvmeq->q_lock); - init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); nvmeq->sq_tail = 0; nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; - memset(nvmeq->cmdid_data, 0, extra); memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); - nvme_cancel_ios(nvmeq, false); - nvmeq->q_suspended = 0; dev->online_queues++; spin_unlock_irq(&nvmeq->q_lock); } @@ -1486,6 +1331,52 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev) return 0; } +static struct blk_mq_ops nvme_mq_admin_ops = { + .queue_rq = nvme_admin_queue_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = nvme_admin_init_hctx, + .init_request = nvme_admin_init_request, + .timeout = nvme_timeout, +}; + +static struct blk_mq_ops nvme_mq_ops = { + .queue_rq = nvme_queue_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = nvme_init_hctx, + .init_request = nvme_init_request, + .timeout = nvme_timeout, +}; + +static int nvme_alloc_admin_tags(struct nvme_dev *dev) +{ + if (!dev->admin_q) { + dev->admin_tagset.ops = &nvme_mq_admin_ops; + dev->admin_tagset.nr_hw_queues = 1; + dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; + dev->admin_tagset.timeout = ADMIN_TIMEOUT; + dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev); + dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->admin_tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->admin_tagset)) + return -ENOMEM; + + dev->admin_q = blk_mq_init_queue(&dev->admin_tagset); + if (!dev->admin_q) { + blk_mq_free_tag_set(&dev->admin_tagset); + return -ENOMEM; + } + } + + return 0; +} + +static void nvme_free_admin_tags(struct nvme_dev *dev) +{ + if (dev->admin_q) + blk_mq_free_tag_set(&dev->admin_tagset); +} + static int nvme_configure_admin_queue(struct nvme_dev *dev) { int result; @@ -1515,9 +1406,9 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) if (result < 0) return result; - nvmeq = raw_nvmeq(dev, 0); + nvmeq = dev->queues[0]; if (!nvmeq) { - nvmeq = nvme_alloc_queue(dev, 0, 64, 0); + nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, 0); if (!nvmeq) return -ENOMEM; } @@ -1538,13 +1429,23 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) result = nvme_enable_ctrl(dev, cap); if (result) - return result; + goto free_nvmeq; + + result = nvme_alloc_admin_tags(dev); + if (result) + goto free_nvmeq; result = queue_request_irq(dev, nvmeq, nvmeq->irqname); if (result) - return result; + goto free_tags; return result; + + free_tags: + nvme_free_admin_tags(dev); + free_nvmeq: + nvme_free_queues(dev, 0); + return result; } struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, @@ -1702,7 +1603,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) if (length != (io.nblocks + 1) << ns->lba_shift) status = -ENOMEM; else - status = nvme_submit_io_cmd(dev, &c, NULL); + status = nvme_submit_io_cmd(dev, ns, &c, NULL); if (meta_len) { if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) { @@ -1734,8 +1635,8 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } -static int nvme_user_cmd(struct nvme_dev *dev, - struct nvme_passthru_cmd __user *ucmd, bool ioq) +static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) { struct nvme_passthru_cmd cmd; struct nvme_command c; @@ -1774,13 +1675,23 @@ static int nvme_user_cmd(struct nvme_dev *dev, timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) : ADMIN_TIMEOUT; + if (length != cmd.data_len) status = -ENOMEM; - else if (ioq) - status = nvme_submit_sync_cmd(dev, this_cpu_read(*dev->io_queue), &c, - &cmd.result, timeout); - else - status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout); + else if (ns) { + struct request *req; + + req = blk_mq_alloc_request(ns->queue, WRITE, + (GFP_KERNEL|__GFP_WAIT), false); + if (!req) + status = -ENOMEM; + else { + status = nvme_submit_sync_cmd(req, &c, &cmd.result, + timeout); + blk_put_request(req); + } + } else + status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout); if (cmd.data_len) { nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); @@ -1804,9 +1715,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, force_successful_syscall_return(); return ns->ns_id; case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ns->dev, (void __user *)arg, false); + return nvme_user_cmd(ns->dev, NULL, (void __user *)arg); case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->dev, (void __user *)arg, true); + return nvme_user_cmd(ns->dev, ns, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); case SG_GET_VERSION_NUM: @@ -1906,62 +1817,6 @@ static const struct block_device_operations nvme_fops = { .revalidate_disk= nvme_revalidate_disk, }; -static void nvme_resubmit_iods(struct nvme_queue *nvmeq) -{ - struct nvme_iod *iod, *next; - - list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) { - if (unlikely(nvme_submit_iod(nvmeq, iod))) - break; - list_del(&iod->node); - if (bio_list_empty(&nvmeq->sq_cong) && - list_empty(&nvmeq->iod_bio)) - remove_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - } -} - -static void nvme_resubmit_bios(struct nvme_queue *nvmeq) -{ - while (bio_list_peek(&nvmeq->sq_cong)) { - struct bio *bio = bio_list_pop(&nvmeq->sq_cong); - struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; - - if (bio_list_empty(&nvmeq->sq_cong) && - list_empty(&nvmeq->iod_bio)) - remove_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - if (nvme_submit_bio_queue(nvmeq, ns, bio)) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - bio_list_add_head(&nvmeq->sq_cong, bio); - break; - } - } -} - -static int nvme_submit_async_req(struct nvme_queue *nvmeq) -{ - struct nvme_command *c; - int cmdid; - - cmdid = alloc_cmdid(nvmeq, CMD_CTX_ASYNC, special_completion, 0); - if (cmdid < 0) - return cmdid; - - c = &nvmeq->sq_cmds[nvmeq->sq_tail]; - memset(c, 0, sizeof(*c)); - c->common.opcode = nvme_admin_async_event; - c->common.command_id = cmdid; - - if (++nvmeq->sq_tail == nvmeq->q_depth) - nvmeq->sq_tail = 0; - writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; -} - static int nvme_kthread(void *data) { struct nvme_dev *dev, *next; @@ -1977,34 +1832,26 @@ static int nvme_kthread(void *data) continue; list_del_init(&dev->node); dev_warn(&dev->pci_dev->dev, - "Failed status, reset controller\n"); + "Failed status: %x, reset controller\n", + readl(&dev->bar->csts)); dev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &dev->reset_work); continue; } - rcu_read_lock(); for (i = 0; i < dev->queue_count; i++) { - struct nvme_queue *nvmeq = - rcu_dereference(dev->queues[i]); + struct nvme_queue *nvmeq = dev->queues[i]; if (!nvmeq) continue; spin_lock_irq(&nvmeq->q_lock); - if (nvmeq->q_suspended) - goto unlock; nvme_process_cq(nvmeq); - nvme_cancel_ios(nvmeq, true); - nvme_resubmit_bios(nvmeq); - nvme_resubmit_iods(nvmeq); while ((i == 0) && (dev->event_limit > 0)) { - if (nvme_submit_async_req(nvmeq)) + if (nvme_submit_async_admin_req(dev)) break; dev->event_limit--; } - unlock: spin_unlock_irq(&nvmeq->q_lock); } - rcu_read_unlock(); } spin_unlock(&dev_list_lock); schedule_timeout(round_jiffies_relative(HZ)); @@ -2027,29 +1874,29 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, { struct nvme_ns *ns; struct gendisk *disk; + int node = dev_to_node(&dev->pci_dev->dev); int lbaf; if (rt->attributes & NVME_LBART_ATTRIB_HIDE) return NULL; - ns = kzalloc(sizeof(*ns), GFP_KERNEL); + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) return NULL; - ns->queue = blk_alloc_queue(GFP_KERNEL); + ns->queue = blk_mq_init_queue(&dev->tagset); if (!ns->queue) goto out_free_ns; - ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; - queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); - queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, ns->queue); - blk_queue_make_request(ns->queue, nvme_make_request); + queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue); + queue_flag_clear_unlocked(QUEUE_FLAG_IO_STAT, ns->queue); ns->dev = dev; ns->queue->queuedata = ns; - disk = alloc_disk(0); + disk = alloc_disk_node(0, node); if (!disk) goto out_free_queue; + ns->ns_id = nsid; ns->disk = disk; lbaf = id->flbas & 0xf; @@ -2058,6 +1905,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); if (dev->max_hw_sectors) blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); + if (dev->stripe_size) + blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); if (dev->vwc & NVME_CTRL_VWC_PRESENT) blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); @@ -2083,143 +1932,19 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, return NULL; } -static int nvme_find_closest_node(int node) -{ - int n, val, min_val = INT_MAX, best_node = node; - - for_each_online_node(n) { - if (n == node) - continue; - val = node_distance(node, n); - if (val < min_val) { - min_val = val; - best_node = n; - } - } - return best_node; -} - -static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq, - int count) -{ - int cpu; - for_each_cpu(cpu, qmask) { - if (cpumask_weight(nvmeq->cpu_mask) >= count) - break; - if (!cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask)) - *per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid; - } -} - -static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus, - const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue) -{ - int next_cpu; - for_each_cpu(next_cpu, new_mask) { - cpumask_or(mask, mask, get_cpu_mask(next_cpu)); - cpumask_or(mask, mask, topology_thread_cpumask(next_cpu)); - cpumask_and(mask, mask, unassigned_cpus); - nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue); - } -} - static void nvme_create_io_queues(struct nvme_dev *dev) { - unsigned i, max; + unsigned i; - max = min(dev->max_qid, num_online_cpus()); - for (i = dev->queue_count; i <= max; i++) + for (i = dev->queue_count; i <= dev->max_qid; i++) if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1)) break; - max = min(dev->queue_count - 1, num_online_cpus()); - for (i = dev->online_queues; i <= max; i++) - if (nvme_create_queue(raw_nvmeq(dev, i), i)) + for (i = dev->online_queues; i <= dev->queue_count - 1; i++) + if (nvme_create_queue(dev->queues[i], i)) break; } -/* - * If there are fewer queues than online cpus, this will try to optimally - * assign a queue to multiple cpus by grouping cpus that are "close" together: - * thread siblings, core, socket, closest node, then whatever else is - * available. - */ -static void nvme_assign_io_queues(struct nvme_dev *dev) -{ - unsigned cpu, cpus_per_queue, queues, remainder, i; - cpumask_var_t unassigned_cpus; - - nvme_create_io_queues(dev); - - queues = min(dev->online_queues - 1, num_online_cpus()); - if (!queues) - return; - - cpus_per_queue = num_online_cpus() / queues; - remainder = queues - (num_online_cpus() - queues * cpus_per_queue); - - if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL)) - return; - - cpumask_copy(unassigned_cpus, cpu_online_mask); - cpu = cpumask_first(unassigned_cpus); - for (i = 1; i <= queues; i++) { - struct nvme_queue *nvmeq = lock_nvmeq(dev, i); - cpumask_t mask; - - cpumask_clear(nvmeq->cpu_mask); - if (!cpumask_weight(unassigned_cpus)) { - unlock_nvmeq(nvmeq); - break; - } - - mask = *get_cpu_mask(cpu); - nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - topology_thread_cpumask(cpu), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - topology_core_cpumask(cpu), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - cpumask_of_node(cpu_to_node(cpu)), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - cpumask_of_node( - nvme_find_closest_node( - cpu_to_node(cpu))), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - unassigned_cpus, - nvmeq, cpus_per_queue); - - WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue, - "nvme%d qid:%d mis-matched queue-to-cpu assignment\n", - dev->instance, i); - - irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, - nvmeq->cpu_mask); - cpumask_andnot(unassigned_cpus, unassigned_cpus, - nvmeq->cpu_mask); - cpu = cpumask_next(cpu, unassigned_cpus); - if (remainder && !--remainder) - cpus_per_queue++; - unlock_nvmeq(nvmeq); - } - WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n", - dev->instance); - i = 0; - cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask); - for_each_cpu(cpu, unassigned_cpus) - *per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1; - free_cpumask_var(unassigned_cpus); -} - static int set_queue_count(struct nvme_dev *dev, int count) { int status; @@ -2243,33 +1968,9 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); } -static void nvme_cpu_workfn(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, cpu_work); - if (dev->initialized) - nvme_assign_io_queues(dev); -} - -static int nvme_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - struct nvme_dev *dev; - - switch (action) { - case CPU_ONLINE: - case CPU_DEAD: - spin_lock(&dev_list_lock); - list_for_each_entry(dev, &dev_list, node) - schedule_work(&dev->cpu_work); - spin_unlock(&dev_list_lock); - break; - } - return NOTIFY_OK; -} - static int nvme_setup_io_queues(struct nvme_dev *dev) { - struct nvme_queue *adminq = raw_nvmeq(dev, 0); + struct nvme_queue *adminq = dev->queues[0]; struct pci_dev *pdev = dev->pci_dev; int result, i, vecs, nr_io_queues, size; @@ -2321,14 +2022,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->max_qid = nr_io_queues; result = queue_request_irq(dev, adminq, adminq->irqname); - if (result) { - adminq->q_suspended = 1; + if (result) goto free_queues; - } /* Free previously allocated queues that are no longer usable */ nvme_free_queues(dev, nr_io_queues + 1); - nvme_assign_io_queues(dev); + nvme_create_io_queues(dev); return 0; @@ -2378,8 +2077,30 @@ static int nvme_dev_add(struct nvme_dev *dev) if (ctrl->mdts) dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && - (pdev->device == 0x0953) && ctrl->vs[3]) + (pdev->device == 0x0953) && ctrl->vs[3]) { + unsigned int max_hw_sectors; + dev->stripe_size = 1 << (ctrl->vs[3] + shift); + max_hw_sectors = dev->stripe_size >> (shift - 9); + if (dev->max_hw_sectors) { + dev->max_hw_sectors = min(max_hw_sectors, + dev->max_hw_sectors); + } else + dev->max_hw_sectors = max_hw_sectors; + } + + dev->tagset.ops = &nvme_mq_ops; + dev->tagset.nr_hw_queues = dev->online_queues - 1; + dev->tagset.timeout = NVME_IO_TIMEOUT; + dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev); + dev->tagset.queue_depth = + min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; + dev->tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; + dev->tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->tagset)) + goto out; id_ns = mem; for (i = 1; i <= nn; i++) { @@ -2529,7 +2250,8 @@ static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, c.delete_queue.qid = cpu_to_le16(nvmeq->qid); init_kthread_work(&nvmeq->cmdinfo.work, fn); - return nvme_submit_admin_cmd_async(nvmeq->dev, &c, &nvmeq->cmdinfo); + return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo, + ADMIN_TIMEOUT); } static void nvme_del_cq_work_handler(struct kthread_work *work) @@ -2592,7 +2314,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) atomic_set(&dq.refcount, 0); dq.worker = &worker; for (i = dev->queue_count - 1; i > 0; i--) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, i); + struct nvme_queue *nvmeq = dev->queues[i]; if (nvme_suspend_queue(nvmeq)) continue; @@ -2637,7 +2359,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) csts = readl(&dev->bar->csts); if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { for (i = dev->queue_count - 1; i >= 0; i--) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, i); + struct nvme_queue *nvmeq = dev->queues[i]; nvme_suspend_queue(nvmeq); nvme_clear_queue(nvmeq); } @@ -2649,6 +2371,12 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) nvme_dev_unmap(dev); } +static void nvme_dev_remove_admin(struct nvme_dev *dev) +{ + if (dev->admin_q && !blk_queue_dying(dev->admin_q)) + blk_cleanup_queue(dev->admin_q); +} + static void nvme_dev_remove(struct nvme_dev *dev) { struct nvme_ns *ns; @@ -2736,7 +2464,7 @@ static void nvme_free_dev(struct kref *kref) pci_dev_put(dev->pci_dev); nvme_free_namespaces(dev); - free_percpu(dev->io_queue); + blk_mq_free_tag_set(&dev->tagset); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2761,11 +2489,16 @@ static int nvme_dev_release(struct inode *inode, struct file *f) static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) { struct nvme_dev *dev = f->private_data; + struct nvme_ns *ns; + switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(dev, (void __user *)arg, false); + return nvme_user_cmd(dev, NULL, (void __user *)arg); case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(dev, (void __user *)arg, true); + if (list_empty(&dev->namespaces)) + return -ENOTTY; + ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); + return nvme_user_cmd(dev, ns, (void __user *)arg); default: return -ENOTTY; } @@ -2779,6 +2512,22 @@ static const struct file_operations nvme_dev_fops = { .compat_ioctl = nvme_dev_ioctl, }; +static void nvme_set_irq_hints(struct nvme_dev *dev) +{ + struct nvme_queue *nvmeq; + int i; + + for (i = 0; i < dev->online_queues; i++) { + nvmeq = dev->queues[i]; + + if (!nvmeq->hctx) + continue; + + irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, + nvmeq->hctx->cpumask); + } +} + static int nvme_dev_start(struct nvme_dev *dev) { int result; @@ -2810,12 +2559,15 @@ static int nvme_dev_start(struct nvme_dev *dev) result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; goto disable; } - nvme_init_queue(raw_nvmeq(dev, 0), 0); + + nvme_init_queue(dev->queues[0], 0); result = nvme_setup_io_queues(dev); if (result) goto disable; + nvme_set_irq_hints(dev); + return result; disable: @@ -2866,7 +2618,7 @@ static void nvme_dev_reset(struct nvme_dev *dev) { nvme_dev_shutdown(dev); if (nvme_dev_resume(dev)) { - dev_err(&dev->pci_dev->dev, "Device failed to resume\n"); + dev_warn(&dev->pci_dev->dev, "Device failed to resume\n"); kref_get(&dev->kref); if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", dev->instance))) { @@ -2891,28 +2643,28 @@ static void nvme_reset_workfn(struct work_struct *work) static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - int result = -ENOMEM; + int node, result = -ENOMEM; struct nvme_dev *dev; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); + node = dev_to_node(&pdev->dev); + if (node == NUMA_NO_NODE) + set_dev_node(&pdev->dev, 0); + + dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) return -ENOMEM; - dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), - GFP_KERNEL); + dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry), + GFP_KERNEL, node); if (!dev->entry) goto free; - dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), - GFP_KERNEL); + dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), + GFP_KERNEL, node); if (!dev->queues) goto free; - dev->io_queue = alloc_percpu(unsigned short); - if (!dev->io_queue) - goto free; INIT_LIST_HEAD(&dev->namespaces); dev->reset_workfn = nvme_reset_failed_dev; INIT_WORK(&dev->reset_work, nvme_reset_workfn); - INIT_WORK(&dev->cpu_work, nvme_cpu_workfn); dev->pci_dev = pci_dev_get(pdev); pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); @@ -2942,11 +2694,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto remove; + nvme_set_irq_hints(dev); + dev->initialized = 1; return 0; remove: nvme_dev_remove(dev); + nvme_dev_remove_admin(dev); nvme_free_namespaces(dev); shutdown: nvme_dev_shutdown(dev); @@ -2958,7 +2713,6 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) put_pci: pci_dev_put(dev->pci_dev); free: - free_percpu(dev->io_queue); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2991,11 +2745,12 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); flush_work(&dev->reset_work); - flush_work(&dev->cpu_work); misc_deregister(&dev->miscdev); + nvme_dev_remove(dev); nvme_dev_shutdown(dev); + nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); - nvme_dev_remove(dev); + nvme_free_admin_tags(dev); nvme_release_instance(dev); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); @@ -3079,18 +2834,11 @@ static int __init nvme_init(void) else if (result > 0) nvme_major = result; - nvme_nb.notifier_call = &nvme_cpu_notify; - result = register_hotcpu_notifier(&nvme_nb); - if (result) - goto unregister_blkdev; - result = pci_register_driver(&nvme_driver); if (result) - goto unregister_hotcpu; + goto unregister_blkdev; return 0; - unregister_hotcpu: - unregister_hotcpu_notifier(&nvme_nb); unregister_blkdev: unregister_blkdev(nvme_major, "nvme"); kill_workq: diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index 046ae3321c5e..49f86d1a5aa2 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -2105,7 +2105,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, nvme_offset += unit_num_blocks; - nvme_sc = nvme_submit_io_cmd(dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL); if (nvme_sc != NVME_SC_SUCCESS) { nvme_unmap_user_pages(dev, (is_write) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, @@ -2658,7 +2658,7 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.common.opcode = nvme_cmd_flush; c.common.nsid = cpu_to_le32(ns->ns_id); - nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto out; @@ -2686,7 +2686,7 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns, c.common.opcode = nvme_cmd_flush; c.common.nsid = cpu_to_le32(ns->ns_id); - nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); if (res) @@ -2894,7 +2894,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.dsm.nr = cpu_to_le32(ndesc - 1); c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - nvme_sc = nvme_submit_io_cmd(dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), diff --git a/include/linux/nvme.h b/include/linux/nvme.h index ed09074e5554..258945fcabf1 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -19,6 +19,7 @@ #include #include #include +#include struct nvme_bar { __u64 cap; /* Controller Capabilities */ @@ -71,8 +72,10 @@ extern unsigned char nvme_io_timeout; */ struct nvme_dev { struct list_head node; - struct nvme_queue __rcu **queues; - unsigned short __percpu *io_queue; + struct nvme_queue **queues; + struct request_queue *admin_q; + struct blk_mq_tag_set tagset; + struct blk_mq_tag_set admin_tagset; u32 __iomem *dbs; struct pci_dev *pci_dev; struct dma_pool *prp_page_pool; @@ -91,7 +94,6 @@ struct nvme_dev { struct miscdevice miscdev; work_func_t reset_workfn; struct work_struct reset_work; - struct work_struct cpu_work; char name[12]; char serial[20]; char model[40]; @@ -135,7 +137,6 @@ struct nvme_iod { int offset; /* Of PRP list */ int nents; /* Used in scatterlist */ int length; /* Of data, in bytes */ - unsigned long start_time; dma_addr_t first_dma; struct list_head node; struct scatterlist sg[0]; @@ -153,12 +154,14 @@ static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) */ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod); -int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int , gfp_t); +int nvme_setup_prps(struct nvme_dev *, struct nvme_iod *, int, gfp_t); struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, unsigned long addr, unsigned length); void nvme_unmap_user_pages(struct nvme_dev *dev, int write, struct nvme_iod *iod); -int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_command *, u32 *); +int nvme_submit_io_cmd(struct nvme_dev *, struct nvme_ns *, + struct nvme_command *, u32 *); +int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns); int nvme_submit_admin_cmd(struct nvme_dev *, struct nvme_command *, u32 *result); int nvme_identify(struct nvme_dev *, unsigned nsid, unsigned cns, -- cgit v1.2.3 From 179e20b8df97e0c7541a1bae30cad53ecc7a5e86 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 10 Nov 2014 17:21:09 +0100 Subject: drbd: Minor cleanups . Update comments . drbd_set_{in,out_of}_sync(): Remove unused parameters . Move common code into adm_del_resource() . Redefine ERR_MINOR_EXISTS -> ERR_MINOR_OR_VOLUME_EXISTS Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg Signed-off-by: Jens Axboe --- drivers/block/drbd/drbd_actlog.c | 3 +- drivers/block/drbd/drbd_int.h | 9 +++--- drivers/block/drbd/drbd_main.c | 6 ++-- drivers/block/drbd/drbd_nl.c | 60 +++++++++++++++++----------------------- include/linux/drbd.h | 2 +- 5 files changed, 35 insertions(+), 45 deletions(-) (limited to 'include') diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index a2dfa169237d..1318e3217cb0 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -827,8 +827,7 @@ static int update_sync_bits(struct drbd_device *device, * */ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, - enum update_sync_bits_mode mode, - const char *file, const unsigned int line) + enum update_sync_bits_mode mode) { /* Is called from worker and receiver context _only_ */ unsigned long sbnr, ebnr, lbnr; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 9b22f8f01b57..c14b718c2fab 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1662,14 +1662,13 @@ extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long stil enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC }; extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, - enum update_sync_bits_mode mode, - const char *file, const unsigned int line); + enum update_sync_bits_mode mode); #define drbd_set_in_sync(device, sector, size) \ - __drbd_change_sync(device, sector, size, SET_IN_SYNC, __FILE__, __LINE__) + __drbd_change_sync(device, sector, size, SET_IN_SYNC) #define drbd_set_out_of_sync(device, sector, size) \ - __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC, __FILE__, __LINE__) + __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC) #define drbd_rs_failed_io(device, sector, size) \ - __drbd_change_sync(device, sector, size, RECORD_RS_FAILED, __FILE__, __LINE__) + __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) extern void drbd_al_shrink(struct drbd_device *device); extern int drbd_initialize_al(struct drbd_device *, void *); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 973c185c9cfe..cce25a5eb157 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2731,7 +2731,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig device = minor_to_device(minor); if (device) - return ERR_MINOR_EXISTS; + return ERR_MINOR_OR_VOLUME_EXISTS; /* GFP_KERNEL, we are outside of all write-out paths */ device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL); @@ -2794,7 +2794,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL); if (id < 0) { if (id == -ENOSPC) { - err = ERR_MINOR_EXISTS; + err = ERR_MINOR_OR_VOLUME_EXISTS; drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); } goto out_no_minor_idr; @@ -2804,7 +2804,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL); if (id < 0) { if (id == -ENOSPC) { - err = ERR_MINOR_EXISTS; + err = ERR_MINOR_OR_VOLUME_EXISTS; drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); } goto out_idr_remove_minor; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 1cd47df44bda..c145619f1ccb 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -2052,7 +2052,7 @@ check_net_options(struct drbd_connection *connection, struct net_conf *new_net_c rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_net_conf); rcu_read_unlock(); - /* connection->volumes protected by genl_lock() here */ + /* connection->peer_devices protected by genl_lock() here */ idr_for_each_entry(&connection->peer_devices, peer_device, i) { struct drbd_device *device = peer_device->device; if (!device->bitmap) { @@ -3483,7 +3483,7 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) * that first_peer_device(device)->connection and device->vnr match the request. */ if (adm_ctx.device) { if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) - retcode = ERR_MINOR_EXISTS; + retcode = ERR_MINOR_OR_VOLUME_EXISTS; /* else: still NO_ERROR */ goto out; } @@ -3530,6 +3530,27 @@ out: return 0; } +static int adm_del_resource(struct drbd_resource *resource) +{ + struct drbd_connection *connection; + + for_each_connection(connection, resource) { + if (connection->cstate > C_STANDALONE) + return ERR_NET_CONFIGURED; + } + if (!idr_is_empty(&resource->devices)) + return ERR_RES_IN_USE; + + list_del_rcu(&resource->resources); + /* Make sure all threads have actually stopped: state handling only + * does drbd_thread_stop_nowait(). */ + list_for_each_entry(connection, &resource->connections, connections) + drbd_thread_stop(&connection->worker); + synchronize_rcu(); + drbd_free_resource(resource); + return NO_ERROR; +} + int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -3575,14 +3596,6 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) } } - /* If we reach this, all volumes (of this connection) are Secondary, - * Disconnected, Diskless, aka Unconfigured. Make sure all threads have - * actually stopped, state handling only does drbd_thread_stop_nowait(). */ - for_each_connection(connection, resource) - drbd_thread_stop(&connection->worker); - - /* Now, nothing can fail anymore */ - /* delete volumes */ idr_for_each_entry(&resource->devices, device, i) { retcode = adm_del_minor(device); @@ -3593,10 +3606,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) } } - list_del_rcu(&resource->resources); - synchronize_rcu(); - drbd_free_resource(resource); - retcode = NO_ERROR; + retcode = adm_del_resource(resource); out: mutex_unlock(&resource->adm_mutex); finish: @@ -3608,7 +3618,6 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; struct drbd_resource *resource; - struct drbd_connection *connection; enum drbd_ret_code retcode; retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); @@ -3616,27 +3625,10 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) return retcode; if (retcode != NO_ERROR) goto finish; - resource = adm_ctx.resource; - mutex_lock(&resource->adm_mutex); - for_each_connection(connection, resource) { - if (connection->cstate > C_STANDALONE) { - retcode = ERR_NET_CONFIGURED; - goto out; - } - } - if (!idr_is_empty(&resource->devices)) { - retcode = ERR_RES_IN_USE; - goto out; - } - list_del_rcu(&resource->resources); - for_each_connection(connection, resource) - drbd_thread_stop(&connection->worker); - synchronize_rcu(); - drbd_free_resource(resource); - retcode = NO_ERROR; -out: + mutex_lock(&resource->adm_mutex); + retcode = adm_del_resource(resource); mutex_unlock(&resource->adm_mutex); finish: drbd_adm_finish(&adm_ctx, info, retcode); diff --git a/include/linux/drbd.h b/include/linux/drbd.h index debb70d40547..8723f2a99e15 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -172,7 +172,7 @@ enum drbd_ret_code { ERR_RES_NOT_KNOWN = 158, ERR_RES_IN_USE = 159, ERR_MINOR_CONFIGURED = 160, - ERR_MINOR_EXISTS = 161, + ERR_MINOR_OR_VOLUME_EXISTS = 161, ERR_INVALID_REQUEST = 162, ERR_NEED_APV_100 = 163, ERR_NEED_ALLOW_TWO_PRI = 164, -- cgit v1.2.3