From abfc426d1b2fb2176df59851a64223b58ddae7e7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 2 Feb 2022 17:01:09 +0100 Subject: block: pass a block_device to bio_clone_fast Pass a block_device to bio_clone_fast and __bio_clone_fast and give the functions more suitable names. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220202160109.108149-14-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 1adfe4824ef5..4b868e792ba4 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2975,10 +2975,10 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { - bio = bio_clone_fast(bio_src, gfp_mask, bs); + bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, + bs); if (!bio) goto free_and_out; - bio->bi_bdev = rq->q->disk->part0; if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; -- cgit v1.2.3 From d5869fdc189f0f12a954a48d58a48104a2f5d044 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 10 Feb 2022 14:52:22 -0800 Subject: block: introduce block_rq_error tracepoint Currently, rasdaemon uses the existing tracepoint block_rq_complete and filters out non-error cases in order to capture block disk errors. But there are a few problems with this approach: 1. Even kernel trace filter could do the filtering work, there is still some overhead after we enable this tracepoint. 2. The filter is merely based on errno, which does not align with kernel logic to check the errors for print_req_error(). 3. block_rq_complete only provides dev major and minor to identify the block device, it is not convenient to use in user-space. So introduce a new tracepoint block_rq_error just for the error case. With this patch, rasdaemon could switch to block_rq_error. Since the new tracepoint has the similar implementation with block_rq_complete, so move the existing code from TRACE_EVENT block_rq_complete() into new event class block_rq_completion(). Then add event for block_rq_complete and block_rq_err respectively from the newly created event class per the suggestion from Chaitanya Kulkarni. Cc: Jens Axboe Cc: Christoph Hellwig Reviewed-by: Steven Rostedt Signed-off-by: Cong Wang Signed-off-by: Chaitanya Kulkarni Signed-off-by: Yang Shi Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220210225222.260069-1-shy828301@gmail.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 4b868e792ba4..6c59ffe765fd 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -789,8 +789,10 @@ bool blk_update_request(struct request *req, blk_status_t error, #endif if (unlikely(error && !blk_rq_is_passthrough(req) && - !(req->rq_flags & RQF_QUIET))) + !(req->rq_flags & RQF_QUIET))) { blk_print_req_error(req, error); + trace_block_rq_error(req, error, nr_bytes); + } blk_account_io_completion(req, nr_bytes); -- cgit v1.2.3 From 248c793359daacd826a7507a258ffe41653efef7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 11:05:36 +0100 Subject: blk-mq: make the blk-mq stacking code optional The code to stack blk-mq drivers is only used by dm-multipath, and will preferably stay that way. 
Make it optional and only selected by device mapper, so that the buildbots more easily catch abuses like the one that slipped in in the ufs driver in the last merged window. Another positive side effects is that kernel builds without device mapper shrink a little bit as well. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220215100540.3892965-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 6c59ffe765fd..db62d34afb63 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2840,6 +2840,7 @@ void blk_mq_submit_bio(struct bio *bio) blk_mq_try_issue_directly(rq->mq_hctx, rq)); } +#ifdef CONFIG_BLK_MQ_STACKING /** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for the new queue limits @@ -3017,6 +3018,7 @@ free_and_out: return -ENOMEM; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); +#endif /* CONFIG_BLK_MQ_STACKING */ /* * Steal bios from a request and add them to a bio list. -- cgit v1.2.3 From a5efda3c46a1db9a579d953667906933d5037bf9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 11:05:37 +0100 Subject: blk-mq: fold blk_cloned_rq_check_limits into blk_insert_cloned_request Fold blk_cloned_rq_check_limits into its only caller. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220215100540.3892965-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 38 +++++--------------------------------- 1 file changed, 5 insertions(+), 33 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index db62d34afb63..fc132933397f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2842,26 +2842,14 @@ void blk_mq_submit_bio(struct bio *bio) #ifdef CONFIG_BLK_MQ_STACKING /** - * blk_cloned_rq_check_limits - Helper function to check a cloned request - * for the new queue limits - * @q: the queue - * @rq: the request being checked - * - * Description: - * @rq may have been made based on weaker limitations of upper-level queues - * in request stacking drivers, and it may violate the limitation of @q. - * Since the block layer and the underlying device driver trust @rq - * after it is inserted to @q, it should be checked against @q before - * the insertion using this generic function. - * - * Request stacking drivers like request-based dm may change the queue - * limits when retrying requests on other queues. Those requests need - * to be checked against the new queue limits again during dispatch. 
+ * blk_insert_cloned_request - Helper for stacking drivers to submit a request + * @q: the queue to submit the request + * @rq: the request being queued */ -static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, - struct request *rq) +blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) { unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + blk_status_t ret; if (blk_rq_sectors(rq) > max_sectors) { /* @@ -2893,22 +2881,6 @@ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, return BLK_STS_IOERR; } - return BLK_STS_OK; -} - -/** - * blk_insert_cloned_request - Helper for stacking drivers to submit a request - * @q: the queue to submit the request - * @rq: the request being queued - */ -blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) -{ - blk_status_t ret; - - ret = blk_cloned_rq_check_limits(q, rq); - if (ret != BLK_STS_OK) - return ret; - if (rq->q->disk && should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; -- cgit v1.2.3 From 28db4711bf48303814dcfd8d41a41106e90bc374 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 15 Feb 2022 11:05:38 +0100 Subject: blk-mq: remove the request_queue argument to blk_insert_cloned_request The request must be submitted to the queue it was allocated for, so remove the extra request_queue argument. Signed-off-by: Christoph Hellwig Reviewed-by: Mike Snitzer Link: https://lore.kernel.org/r/20220215100540.3892965-4-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index fc132933397f..886836a54064 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2843,11 +2843,11 @@ void blk_mq_submit_bio(struct bio *bio) #ifdef CONFIG_BLK_MQ_STACKING /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request - * @q: the queue to submit the request * @rq: the request being queued */ -blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) +blk_status_t blk_insert_cloned_request(struct request *rq) { + struct request_queue *q = rq->q; unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); blk_status_t ret; @@ -2881,8 +2881,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * return BLK_STS_IOERR; } - if (rq->q->disk && - should_fail_request(rq->q->disk->part0, blk_rq_bytes(rq))) + if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) @@ -2895,7 +2894,7 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * * bypass a potential scheduler on the bottom device for * insert. */ - blk_mq_run_dispatch_ops(rq->q, + blk_mq_run_dispatch_ops(q, ret = blk_mq_request_issue_directly(rq, true)); if (ret) blk_account_io_done(rq, ktime_get_ns()); -- cgit v1.2.3 From 7f36b7d02a287ed18d02ae821868aa07b0235521 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Wed, 16 Feb 2022 12:45:08 +0800 Subject: block: move blk_crypto_bio_prep() out of blk-mq.c blk_crypto_bio_prep() is called for both bio based and blk-mq drivers, so move it out of blk-mq.c, then we can unify this kind of handling. 
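For context, once this check is hoisted out of blk_mq_submit_bio(), the single shared call site sits in the generic submission path. A simplified sketch of what that looks like is below; the exact placement in __submit_bio() in block/blk-core.c, and the omission of details such as bio_queue_enter() handling, are assumptions based on the rest of this series rather than part of this hunk:

static void __submit_bio(struct bio *bio)
{
        /* one shared inline-encryption prep for bio based and blk-mq drivers */
        if (unlikely(!blk_crypto_bio_prep(&bio)))
                return;

        if (!bio->bi_bdev->bd_disk->fops->submit_bio)
                blk_mq_submit_bio(bio);                         /* request based (blk-mq) path */
        else
                bio->bi_bdev->bd_disk->fops->submit_bio(bio);   /* bio based driver */
}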
Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20220216044514.2903784-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 886836a54064..7ca0b47246a6 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2788,9 +2788,6 @@ void blk_mq_submit_bio(struct bio *bio) unsigned int nr_segs = 1; blk_status_t ret; - if (unlikely(!blk_crypto_bio_prep(&bio))) - return; - blk_queue_bounce(q, &bio); if (blk_may_split(q, bio)) __blk_queue_split(q, &bio, &nr_segs); -- cgit v1.2.3 From 8f5fea65b06de1cc51d4fc23fb4d378d1abd6ed7 Mon Sep 17 00:00:00 2001 From: David Jeffery Date: Mon, 31 Jan 2022 15:33:37 -0500 Subject: blk-mq: avoid extending delays of active hctx from blk_mq_delay_run_hw_queues When blk_mq_delay_run_hw_queues sets an hctx to run in the future, it can reset the delay length for an already pending delayed work run_work. This creates a scenario where multiple hctx may have their queues set to run, but if one runs first and finds nothing to do, it can reset the delay of another hctx and stall the other hctx's ability to run requests. To avoid this I/O stall when an hctx's run_work is already pending, leave it untouched to run at its current designated time rather than extending its delay. The work will still run which keeps closed the race calling blk_mq_delay_run_hw_queues is needed for while also avoiding the I/O stall. Signed-off-by: David Jeffery Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20220131203337.GA17666@redhat Signed-off-by: Jens Axboe --- block/blk-mq.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 7ca0b47246a6..a05ce7725031 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2179,6 +2179,14 @@ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; + /* + * If there is already a run_work pending, leave the + * pending delay untouched. Otherwise, a hctx can stall + * if another hctx is re-delaying the other's work + * before the work executes. + */ + if (delayed_work_pending(&hctx->run_work)) + continue; /* * Dispatch from this hctx either if there's no hctx preferred * by IO scheduler or if it has requests that bypass the -- cgit v1.2.3 From 4d805131abf219e0019715f1cf29763c613aae07 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 8 Mar 2022 15:32:14 +0800 Subject: blk-mq: figure out correct numa node for hw queue The current code always uses default queue map and hw queue index for figuring out the numa node for hw queue, this way isn't correct because blk-mq supports three queue maps, and the correct queue map should be used for the specified hw queue. 
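As a worked example of the lookup added below (hctx_idx_to_type() plus blk_mq_get_hctx_node()), the following standalone toy model shows which queue map owns a given hw queue index; the queue counts are invented for illustration and this is plain userspace C, not kernel code:

#include <stdio.h>

struct queue_map { unsigned int queue_offset, nr_queues; };

int main(void)
{
        /* hypothetical layout: 8 default, 8 read and 4 poll queues */
        struct queue_map map[] = { { 0, 8 }, { 8, 8 }, { 16, 4 } };
        const char *name[] = { "DEFAULT", "READ", "POLL" };
        unsigned int hctx_idx = 18, i;

        for (i = 0; i < 3; i++)
                if (hctx_idx >= map[i].queue_offset &&
                    hctx_idx < map[i].queue_offset + map[i].nr_queues)
                        break;
        if (i == 3)
                i = 0;  /* fall back to the default map, as the patch does */

        printf("hctx %u belongs to the %s map\n", hctx_idx, name[i]);
        return 0;
}

With these numbers hw queue 18 lands in the poll map, so its NUMA node must come from set->map[HCTX_TYPE_POLL] rather than from the default map as the old code assumed.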
Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220308073219.91173-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 36 ++++++++++++++++++++++++++++++------ 1 file changed, 30 insertions(+), 6 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index a05ce7725031..5d25abf9e551 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3107,15 +3107,41 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags) blk_mq_free_tags(tags); } +static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + int i; + + for (i = 0; i < set->nr_maps; i++) { + unsigned int start = set->map[i].queue_offset; + unsigned int end = start + set->map[i].nr_queues; + + if (hctx_idx >= start && hctx_idx < end) + break; + } + + if (i >= set->nr_maps) + i = HCTX_TYPE_DEFAULT; + + return i; +} + +static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + enum hctx_type type = hctx_idx_to_type(set, hctx_idx); + + return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx); +} + static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int nr_tags, unsigned int reserved_tags) { + int node = blk_mq_get_hctx_node(set, hctx_idx); struct blk_mq_tags *tags; - int node; - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -3164,10 +3190,9 @@ static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; + int node = blk_mq_get_hctx_node(set, hctx_idx); size_t rq_size, left; - int node; - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -3941,10 +3966,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { - int node; + int node = blk_mq_get_hctx_node(set, i); struct blk_mq_hw_ctx *hctx; - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i); /* * If the hw queue has been mapped to another numa node, * we need to realloc the hctx. If allocation fails, fallback -- cgit v1.2.3 From 306f13ee16424805d602d427a66ea38078d473b0 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 8 Mar 2022 15:32:15 +0800 Subject: blk-mq: simplify reallocation of hw ctxs a bit blk_mq_alloc_and_init_hctx() has already taken reuse into account, so no need to do it outside, then we can simplify blk_mq_realloc_hw_ctxs(). 
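The reuse referred to here lives in blk_mq_alloc_and_init_hctx(), which is not part of this hunk. Paraphrased from the surrounding blk-mq code, it behaves roughly like the sketch below; treat the details, including the helper names it calls, as assumptions:

static struct blk_mq_hw_ctx *alloc_and_init_hctx_sketch(struct blk_mq_tag_set *set,
                struct request_queue *q, int hctx_idx, int node)
{
        struct blk_mq_hw_ctx *hctx = NULL, *tmp;

        /* prefer a previously exited hctx parked on the matching NUMA node */
        spin_lock(&q->unused_hctx_lock);
        list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) {
                if (tmp->numa_node == node) {
                        hctx = tmp;
                        break;
                }
        }
        if (hctx)
                list_del_init(&hctx->hctx_list);
        spin_unlock(&q->unused_hctx_lock);

        if (!hctx)
                hctx = blk_mq_alloc_hctx(q, set, node); /* fresh allocation */
        if (!hctx)
                return NULL;
        if (blk_mq_init_hctx(q, set, hctx, hctx_idx))
                return NULL;    /* the real code releases the hctx here; elided */
        return hctx;
}

Because the reuse-or-allocate decision already happens there, the caller in blk_mq_realloc_hw_ctxs() can simply exit the old hctx and ask for a new one, which is what the diff below does.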
Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220308073219.91173-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 5d25abf9e551..64fddb36c93c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3966,29 +3966,24 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { + int old_node; int node = blk_mq_get_hctx_node(set, i); - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *old_hctx = hctxs[i]; - /* - * If the hw queue has been mapped to another numa node, - * we need to realloc the hctx. If allocation fails, fallback - * to use the previous one. - */ - if (hctxs[i] && (hctxs[i]->numa_node == node)) - continue; + if (old_hctx) { + old_node = old_hctx->numa_node; + blk_mq_exit_hctx(q, set, old_hctx, i); + } - hctx = blk_mq_alloc_and_init_hctx(set, q, i, node); - if (hctx) { - if (hctxs[i]) - blk_mq_exit_hctx(q, set, hctxs[i], i); - hctxs[i] = hctx; - } else { - if (hctxs[i]) - pr_warn("Allocate new hctx on node %d fails,\ - fallback to previous one on node %d\n", - node, hctxs[i]->numa_node); - else + hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node); + if (!hctxs[i]) { + if (!old_hctx) break; + pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", + node, old_node); + hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, + old_node); + WARN_ON_ONCE(!hctxs[i]); } } /* -- cgit v1.2.3 From 42ee3061293e206bc0f3a084feea1304c61c137a Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 8 Mar 2022 15:32:16 +0800 Subject: blk-mq: reconfigure poll after queue map is changed queue map can be changed when updating nr_hw_queues, so we need to reconfigure queue's poll capability. Add one helper for doing this job. 
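The poll flag can become stale because a driver's ->map_queues() callback may hand out a different number of poll queues after a controller reconfiguration. Below is a hypothetical driver sketch (all foo_* names are invented for illustration) of how the HCTX_TYPE_POLL map gets repopulated during blk_mq_update_nr_hw_queues():

static int foo_map_queues(struct blk_mq_tag_set *set)
{
        struct foo_dev *foo = set->driver_data;

        set->map[HCTX_TYPE_DEFAULT].nr_queues = foo->nr_io_queues;
        set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
        blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);

        /* may have dropped to 0 since the queue was first initialized */
        set->map[HCTX_TYPE_POLL].nr_queues = foo->nr_poll_queues;
        set->map[HCTX_TYPE_POLL].queue_offset = foo->nr_io_queues;
        blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
        return 0;
}

If nr_poll_queues drops to zero, QUEUE_FLAG_POLL has to be cleared again, which is exactly what the new blk_mq_update_poll_flag() helper below takes care of.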
Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220308073219.91173-4-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 64fddb36c93c..57ae9df0f4dc 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -4010,6 +4010,17 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, mutex_unlock(&q->sysfs_lock); } +static void blk_mq_update_poll_flag(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + + if (set->nr_maps > HCTX_TYPE_POLL && + set->map[HCTX_TYPE_POLL].nr_queues) + blk_queue_flag_set(QUEUE_FLAG_POLL, q); + else + blk_queue_flag_clear(QUEUE_FLAG_POLL, q); +} + int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { @@ -4044,9 +4055,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->tag_set = set; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; - if (set->nr_maps > HCTX_TYPE_POLL && - set->map[HCTX_TYPE_POLL].nr_queues) - blk_queue_flag_set(QUEUE_FLAG_POLL, q); + blk_mq_update_poll_flag(q); INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->requeue_list); @@ -4512,6 +4521,7 @@ fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); + blk_mq_update_poll_flag(q); if (q->nr_hw_queues != set->nr_hw_queues) { int i = prev_nr_hw_queues; -- cgit v1.2.3 From 4f481208749a22d3570073e629dbc27d7d27c8da Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 8 Mar 2022 15:32:18 +0800 Subject: blk-mq: prepare for implementing hctx table via xarray It is inevitable to cause use-after-free on q->queue_hw_ctx between queue_for_each_hw_ctx() and blk_mq_update_nr_hw_queues(). And converting to xarray can fix the uaf, meantime code gets cleaner. Prepare for converting q->queue_hctx_ctx into xarray, one thing is that xa_for_each() can only accept 'unsigned long' as index, so changes type of hctx index of queue_for_each_hw_ctx() into 'unsigned long'. 
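The type change is forced by the xarray API: xa_for_each() walks entries by index and takes the index variable as an unsigned long, so an int loop counter no longer works. A minimal kernel-context fragment against the q->hctx_table xarray that the following patch introduces (illustrative only, not part of this series):

static void show_hctxs(struct request_queue *q)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned long i;        /* must be unsigned long for xa_for_each() */

        xa_for_each(&q->hctx_table, i, hctx)
                pr_info("hctx %lu sits on NUMA node %d\n", i, hctx->numa_node);
}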
Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20220308073219.91173-6-ming.lei@redhat.com Signed-off-by: Jens Axboe --- block/blk-mq.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 57ae9df0f4dc..bffdd71c670d 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -312,7 +312,7 @@ EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) @@ -1442,7 +1442,7 @@ static void blk_mq_timeout_work(struct work_struct *work) container_of(work, struct request_queue, timeout_work); unsigned long next = 0; struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; /* A deadlock might occur if a request is stuck requiring a * timeout at the same time a queue freeze is waiting @@ -2143,7 +2143,7 @@ static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) void blk_mq_run_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx, *sq_hctx; - int i; + unsigned long i; sq_hctx = NULL; if (blk_mq_has_sqsched(q)) @@ -2171,7 +2171,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { struct blk_mq_hw_ctx *hctx, *sq_hctx; - int i; + unsigned long i; sq_hctx = NULL; if (blk_mq_has_sqsched(q)) @@ -2209,7 +2209,7 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); bool blk_mq_queue_stopped(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hctx_stopped(hctx)) @@ -2248,7 +2248,7 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queue); void blk_mq_stop_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_stop_hw_queue(hctx); @@ -2266,7 +2266,7 @@ EXPORT_SYMBOL(blk_mq_start_hw_queue); void blk_mq_start_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_hw_queue(hctx); @@ -2286,7 +2286,7 @@ EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_stopped_hw_queue(hctx, async); @@ -3446,7 +3446,7 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set, int nr_queue) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) @@ -3637,7 +3637,8 @@ static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, static void blk_mq_map_swqueue(struct request_queue *q) { - unsigned int i, j, hctx_idx; + unsigned int j, hctx_idx; + unsigned long i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; @@ -3744,7 +3745,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (shared) { @@ -3844,7 +3845,7 @@ static int blk_mq_alloc_ctxs(struct request_queue *q) void blk_mq_release(struct request_queue *q) { struct blk_mq_hw_ctx *hctx, *next; - int i; + unsigned long i; 
queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); @@ -4362,7 +4363,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; - int i, ret; + int ret; + unsigned long i; if (!set) return -EINVAL; @@ -4738,7 +4740,7 @@ void blk_mq_cancel_work_sync(struct request_queue *q) { if (queue_is_mq(q)) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; cancel_delayed_work_sync(&q->requeue_work); -- cgit v1.2.3 From 4e5cc99e1e485954a9c09872e0eeea570fb2b5a5 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 8 Mar 2022 15:32:19 +0800 Subject: blk-mq: manage hctx map via xarray First code becomes more clean by switching to xarray from plain array. Second use-after-free on q->queue_hw_ctx can be fixed because queue_for_each_hw_ctx() may be run when updating nr_hw_queues is in-progress. With this patch, q->hctx_table is defined as xarray, and this structure will share same lifetime with request queue, so queue_for_each_hw_ctx() can use q->hctx_table to lookup hctx reliably. Reported-by: Yu Kuai Signed-off-by: Ming Lei Reviewed-by: Hannes Reinecke Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220308073219.91173-7-ming.lei@redhat.com [axboe: fix blk_mq_hw_ctx forward declaration] Signed-off-by: Jens Axboe --- block/blk-mq.c | 62 +++++++++++++++++++++++----------------------------------- 1 file changed, 24 insertions(+), 38 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index bffdd71c670d..e8d6d7a05a43 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -71,7 +71,8 @@ static int blk_mq_poll_stats_bkt(const struct request *rq) static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, blk_qc_t qc) { - return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT]; + return xa_load(&q->hctx_table, + (qc & ~BLK_QC_T_INTERNAL) >> BLK_QC_T_SHIFT); } static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx, @@ -573,7 +574,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, * If not tell the caller that it should skip this queue. 
*/ ret = -EXDEV; - data.hctx = q->queue_hw_ctx[hctx_idx]; + data.hctx = xa_load(&q->hctx_table, hctx_idx); if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); @@ -3437,6 +3438,8 @@ static void blk_mq_exit_hctx(struct request_queue *q, blk_mq_remove_cpuhp(hctx); + xa_erase(&q->hctx_table, hctx_idx); + spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); @@ -3476,8 +3479,15 @@ static int blk_mq_init_hctx(struct request_queue *q, if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) goto exit_hctx; + + if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) + goto exit_flush_rq; + return 0; + exit_flush_rq: + if (set->ops->exit_request) + set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); @@ -3856,7 +3866,7 @@ void blk_mq_release(struct request_queue *q) kobject_put(&hctx->kobj); } - kfree(q->queue_hw_ctx); + xa_destroy(&q->hctx_table); /* * release .mq_kobj and sw queue's kobject now because @@ -3945,46 +3955,28 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { - int i, j, end; - struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; - - if (q->nr_hw_queues < set->nr_hw_queues) { - struct blk_mq_hw_ctx **new_hctxs; - - new_hctxs = kcalloc_node(set->nr_hw_queues, - sizeof(*new_hctxs), GFP_KERNEL, - set->numa_node); - if (!new_hctxs) - return; - if (hctxs) - memcpy(new_hctxs, hctxs, q->nr_hw_queues * - sizeof(*hctxs)); - q->queue_hw_ctx = new_hctxs; - kfree(hctxs); - hctxs = new_hctxs; - } + struct blk_mq_hw_ctx *hctx; + unsigned long i, j; /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { int old_node; int node = blk_mq_get_hctx_node(set, i); - struct blk_mq_hw_ctx *old_hctx = hctxs[i]; + struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); if (old_hctx) { old_node = old_hctx->numa_node; blk_mq_exit_hctx(q, set, old_hctx, i); } - hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, node); - if (!hctxs[i]) { + if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { if (!old_hctx) break; pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", node, old_node); - hctxs[i] = blk_mq_alloc_and_init_hctx(set, q, i, - old_node); - WARN_ON_ONCE(!hctxs[i]); + hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); + WARN_ON_ONCE(!hctx); } } /* @@ -3993,21 +3985,13 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; - end = i; } else { j = i; - end = q->nr_hw_queues; q->nr_hw_queues = set->nr_hw_queues; } - for (; j < end; j++) { - struct blk_mq_hw_ctx *hctx = hctxs[j]; - - if (hctx) { - blk_mq_exit_hctx(q, set, hctx, j); - hctxs[j] = NULL; - } - } + xa_for_each_start(&q->hctx_table, j, hctx, j) + blk_mq_exit_hctx(q, set, hctx, j); mutex_unlock(&q->sysfs_lock); } @@ -4046,6 +4030,8 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); + xa_init(&q->hctx_table); + blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; @@ -4075,7 +4061,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, return 0; err_hctxs: - kfree(q->queue_hw_ctx); + xa_destroy(&q->hctx_table); q->nr_hw_queues = 0; blk_mq_sysfs_deinit(q); 
err_poll: -- cgit v1.2.3 From 41fa722239b46f0c033da94746212344175cdaa0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 8 Mar 2022 06:51:47 +0100 Subject: blk-mq: do not include passthrough requests in I/O accounting I/O accounting buckets I/O into the read/write/discard categories into which passthrough I/O does not fit at all. It also accounts to the block_device, which may not even exist for passthrough I/O. Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Chaitanya Kulkarni Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220308055200.735835-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index e8d6d7a05a43..40f634ab7026 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -884,10 +884,15 @@ static inline void blk_account_io_done(struct request *req, u64 now) static void __blk_account_io_start(struct request *rq) { - /* passthrough requests can hold bios that do not have ->bi_bdev set */ - if (rq->bio && rq->bio->bi_bdev) + /* + * All non-passthrough requests are created from a bio with one + * exception: when a flush command that is part of a flush sequence + * generated by the state machine in blk-flush.c is cloned onto the + * lower device by dm-multipath we can get here without a bio. + */ + if (rq->bio) rq->part = rq->bio->bi_bdev; - else if (rq->q->disk) + else rq->part = rq->q->disk->part0; part_stat_lock(); -- cgit v1.2.3 From e02657ea7b86d1c41e095f49e4f7c7bb24bbee64 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 8 Mar 2022 06:51:48 +0100 Subject: blk-mq: handle already freed tags gracefully in blk_mq_free_rqs To simplify further changes allow for double calling blk_mq_free_rqs on a queue. Signed-off-by: Ming Lei [hch: split out from a larger patch] Signed-off-by: Christoph Hellwig Reviewed-by: Bart Van Assche Reviewed-by: Martin K. Petersen Link: https://lore.kernel.org/r/20220308055200.735835-3-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 40f634ab7026..f7ef8e2ab935 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3071,6 +3071,9 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, struct blk_mq_tags *drv_tags; struct page *page; + if (list_empty(&tags->page_list)) + return; + if (blk_mq_is_shared_tags(set->flags)) drv_tags = set->shared_tags; else -- cgit v1.2.3 From de3d347f7b8a1a997fbc9267454ed6065068b969 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 8 Mar 2022 06:51:56 +0100 Subject: block: don't remove hctx debugfs dir from blk_mq_exit_queue The queue's top debugfs dir is removed from blk_release_queue(), so all hctx's debugfs dirs are removed from there. Given blk_mq_exit_queue() is only called from blk_cleanup_queue(), it isn't necessary to remove hctx debugfs from blk_mq_exit_queue(). So remove it from blk_mq_exit_queue(). 
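A call-chain outline of the teardown ordering this relies on, sketched from the commit message and the surrounding code rather than quoted literally:

/*
 *   blk_cleanup_queue(q)
 *       blk_mq_exit_queue(q)
 *           blk_mq_exit_hw_queues()   <- no blk_mq_debugfs_unregister_hctx() needed
 *   ... last reference on the queue is dropped ...
 *   blk_release_queue(q)
 *       removes the queue's top debugfs dir, and with it every hctx dir
 */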
Signed-off-by: Ming Lei Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220308055200.735835-11-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 - 1 file changed, 1 deletion(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index f7ef8e2ab935..862d91c6112e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3462,7 +3462,6 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; - blk_mq_debugfs_unregister_hctx(hctx); blk_mq_exit_hctx(q, set, hctx, i); } } -- cgit v1.2.3 From 26fed4ac4eab09c27fbae1859696cc38f0536407 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 11 Mar 2022 10:24:17 -0700 Subject: block: flush plug based on hardware and software queue order We used to sort the plug list if we had multiple queues before dispatching requests to the IO scheduler. This usually isn't needed, but for certain workloads that interleave requests to disks, it's a less efficient to process the plug list one-by-one if everything is interleaved. Don't sort the list, but skip through it and flush out entries that have the same target at the same time. Fixes: df87eb0fce8f ("block: get rid of plug list sorting") Reported-and-tested-by: Song Liu Reviewed-by: Song Liu Signed-off-by: Jens Axboe --- block/blk-mq.c | 59 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 28 insertions(+), 31 deletions(-) (limited to 'block/blk-mq.c') diff --git a/block/blk-mq.c b/block/blk-mq.c index 862d91c6112e..213bb5979bed 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2573,13 +2573,36 @@ static void __blk_mq_flush_plug_list(struct request_queue *q, q->mq_ops->queue_rqs(&plug->mq_list); } +static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) +{ + struct blk_mq_hw_ctx *this_hctx = NULL; + struct blk_mq_ctx *this_ctx = NULL; + struct request *requeue_list = NULL; + unsigned int depth = 0; + LIST_HEAD(list); + + do { + struct request *rq = rq_list_pop(&plug->mq_list); + + if (!this_hctx) { + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { + rq_list_add(&requeue_list, rq); + continue; + } + list_add_tail(&rq->queuelist, &list); + depth++; + } while (!rq_list_empty(plug->mq_list)); + + plug->mq_list = requeue_list; + trace_block_unplug(this_hctx->queue, depth, !from_sched); + blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, from_sched); +} + void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { - struct blk_mq_hw_ctx *this_hctx; - struct blk_mq_ctx *this_ctx; struct request *rq; - unsigned int depth; - LIST_HEAD(list); if (rq_list_empty(plug->mq_list)) return; @@ -2615,35 +2638,9 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) return; } - this_hctx = NULL; - this_ctx = NULL; - depth = 0; do { - rq = rq_list_pop(&plug->mq_list); - - if (!this_hctx) { - this_hctx = rq->mq_hctx; - this_ctx = rq->mq_ctx; - } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { - trace_block_unplug(this_hctx->queue, depth, - !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, - &list, from_schedule); - depth = 0; - this_hctx = rq->mq_hctx; - this_ctx = rq->mq_ctx; - - } - - list_add(&rq->queuelist, &list); - depth++; + blk_mq_dispatch_plug_list(plug, from_schedule); } while (!rq_list_empty(plug->mq_list)); - - if (!list_empty(&list)) { - trace_block_unplug(this_hctx->queue, depth, !from_schedule); - 
blk_mq_sched_insert_requests(this_hctx, this_ctx, &list, - from_schedule); - } } void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, -- cgit v1.2.3
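For context, the plug list that blk_mq_flush_plug_list() drains is built by submitters through the plugging API, roughly as in the sketch below; bio construction is elided and bio_a/bio_b are placeholders:

        struct blk_plug plug;

        blk_start_plug(&plug);
        /* each submit_bio() parks the resulting request on plug.mq_list
         * instead of dispatching it right away */
        submit_bio(bio_a);
        submit_bio(bio_b);
        blk_finish_plug(&plug);         /* ends up in blk_mq_flush_plug_list() */

With the new blk_mq_dispatch_plug_list(), each pass peels off the run of requests that share a (hctx, ctx) pair, re-queues the rest onto the plug list, and repeats until the list is empty, so interleaved targets are no longer handled one request at a time.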